In [None]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

## **Data Preprocessing**

### **1. Train Data**

In [None]:
# Load the training data. The raw frame is kept untouched in `house_df_org`;
# all preprocessing below operates on the working copy `house_df`.

train_csv_path = '/kaggle/input/house-prices-advanced-regression-techniques/train.csv'
house_df_org = pd.read_csv(train_csv_path)
house_df = house_df_org.copy()
house_df.head(n=3)

In [None]:
# Quick overview: frame dimensions and how many columns there are per dtype.

train_shape = house_df.shape
dtype_counts = house_df.dtypes.value_counts()

print('Train Data Shape:', train_shape)
print('\nTotal Feature type: \n', dtype_counts)

In [None]:
# correlation matrix
#
# Only numeric columns can be correlated. Select them explicitly: pandas >= 2.0
# no longer drops string columns silently inside DataFrame.corr() and raises
# instead, while select_dtypes behaves the same on old and new versions.

corrmat = house_df.select_dtypes(include='number').corr()
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=.8, square=True, ax=ax);

In [None]:
# Heatmap restricted to the k features most correlated with SalePrice
# (SalePrice itself is included, as it correlates perfectly with itself).
k = 10
cols = corrmat['SalePrice'].nlargest(k).index
cm = np.corrcoef(house_df[cols].values, rowvar=False)
sns.set(font_scale=1.25)
hm = sns.heatmap(
    cm,
    cbar=True,
    annot=True,
    square=True,
    fmt='.2f',
    annot_kws={'size': 10},
    yticklabels=cols.values,
    xticklabels=cols.values,
)
plt.show()

# 'OverallQual' and 'GrLivArea' are strongly correlated with 'SalePrice'

In [None]:
# Count missing values per column and show only the columns that have any,
# largest count first.

isnull_series = house_df.isnull().sum()
columns_with_nulls = isnull_series[isnull_series > 0].sort_values(ascending=False)
print('\nNull Column: \n', columns_with_nulls)

In [None]:
# Rebuild `house_df` from the untouched raw frame so this cell can be re-run
# safely (the previous in-place drop of 'Id' failed on a second execution).

# replace int/float Null column
# numeric_only=True is required on pandas >= 2.0, where DataFrame.mean()
# raises on string columns instead of silently skipping them.
numeric_means = house_df_org.mean(numeric_only=True)

# 'Id' carries no signal; the remaining columns are dropped for having too many nulls
cols_to_drop = ['Id', 'PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu']

# NOTE(review): dropna() also removes rows whose only missing values are in
# categorical columns that a later cell drops anyway — confirm this loss of
# training rows is intended.
house_df = (
    house_df_org
    .fillna(numeric_means)       # fill numeric nulls with the column mean
    .drop(columns=cols_to_drop)  # drop too many null column
    .dropna(axis=0)              # drop null data
)

In [None]:
# Distribution of the raw target.
# NOTE(review): distplot is deprecated in recent seaborn releases; consider
# histplot(..., kde=True) when upgrading — confirm against the installed version.

fig, ax = plt.subplots()
ax.set_title('Original Price Histogram')
sns.distplot(house_df['SalePrice'], ax=ax)

In [None]:
# Log-transform the target: keep a handle on the original prices, compute
# log1p once, plot it, then store it back as the training target.

original_SalePrice = house_df['SalePrice']
log_SalePrice = np.log1p(original_SalePrice)

plt.title('Log Transformed Sale Price Histogram')
sns.distplot(log_SalePrice)

house_df['SalePrice'] = np.log1p(original_SalePrice)

In [None]:
# feature distribution

from scipy.stats import skew

# Skewness of every non-object column; columns with skew > 1 are kept in
# `skew_features_top` for the log transform applied later.
features_index = house_df.select_dtypes(exclude='object').columns
skew_features = house_df[features_index].apply(skew)

is_highly_skewed = skew_features > 1
skew_features_top = skew_features[is_highly_skewed]
print(skew_features_top.sort_values(ascending=False))

In [None]:
# Apply log1p to the highly skewed features selected above.

skewed_cols = skew_features_top.index
house_df[skewed_cols] = np.log1p(house_df[skewed_cols])

In [None]:
# Keep only non-object features: every object-dtype column is removed.

object_cols = house_df.select_dtypes(include='object').columns.tolist()
house_df.drop(columns=object_cols, inplace=True)

In [None]:
# Outlier inspection on the raw (untransformed) data.

# OverallQual vs. SalePrice
fig, ax = plt.subplots()
ax.scatter(house_df_org['OverallQual'], house_df_org['SalePrice'])
ax.set_ylabel('SalePrice', fontsize=15)
ax.set_xlabel('OverallQual', fontsize=15)
plt.show()

In [None]:
# GrLivArea vs. SalePrice on the raw (untransformed) data.

fig, ax = plt.subplots()
ax.scatter(house_df_org['GrLivArea'], house_df_org['SalePrice'])
ax.set_ylabel('SalePrice', fontsize=15)
ax.set_xlabel('GrLivArea', fontsize=15)
plt.show()

In [None]:
# delete outlier
#
# Identify outliers on the ORIGINAL scale (very large living area sold cheaply),
# using the raw rows that are still present in house_df. Comparing against
# np.log1p(threshold) on house_df only works if GrLivArea happened to be among
# the log-transformed skewed features; reading the raw values removes that
# hidden coupling and selects the same rows whenever the column was transformed.

raw_rows = house_df_org.loc[house_df.index]
cond1 = raw_rows['GrLivArea'] > 4000
cond2 = raw_rows['SalePrice'] < 500000
outlier_index = raw_rows[cond1 & cond2].index

house_df.drop(outlier_index, axis=0, inplace=True)

In [None]:
# Shape of the training frame after the preprocessing and outlier removal above.
house_df.shape

### **2. Test Data**

In [None]:
# Load the test data; `test_org` stays untouched, `test_df` is the working copy.
test_csv_path = '/kaggle/input/house-prices-advanced-regression-techniques/test.csv'
test_org = pd.read_csv(test_csv_path)
test_df = test_org.copy()
test_df.head(n=3)

In [None]:
# test data preprocessing
#
# Mirrors the training pipeline: numeric nulls are filled with the TRAINING
# column means before the log transform (previously they were filled with 0
# after it, so test rows were imputed differently from training rows).
# The frame is rebuilt from `test_org`, which makes this cell safe to re-run.

train_means = house_df_org.mean(numeric_only=True)

test_df = (
    test_org
    .set_index('Id')
    .drop(columns=['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu'])
    .fillna(train_means)  # only columns present in train_means are filled
)

# same log transform as applied to the skewed training features
test_df[skew_features_top.index] = np.log1p(test_df[skew_features_top.index])

# drop object columns, as done for the training data
test_df = test_df.drop(columns=test_df.select_dtypes(include='object').columns)

# last-resort guard so the models never receive NaN
test_df = test_df.fillna(0)

In [None]:
# Preview the preprocessed test features.
test_df.head()

In [None]:
# Shape of the preprocessed test frame.
test_df.shape

## **Model - Linear, Lasso, Ridge**

In [None]:
# evaluation function

from sklearn.metrics import mean_squared_error

def get_rmse(model):
    """Print and return the RMSE of a fitted model on the hold-out split.

    Reads the notebook-level `X_test` and `y_test`, so the train/test split
    cell must have been executed first.

    Args:
        model: fitted estimator exposing `predict`.

    Returns:
        The root mean squared error between `y_test` and the predictions.
    """
    predictions = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    print(model.__class__.__name__, np.round(rmse, 3))
    return rmse

def get_rmses(models):
    """Evaluate each model with `get_rmse` and return the RMSEs in input order.

    Args:
        models: iterable of fitted estimators.

    Returns:
        List with one RMSE per model.
    """
    return [get_rmse(model) for model in models]


# coefficients function

def get_top_bottom_coef(model, n=10, feature_names=None):
    """Return the n largest and n smallest coefficients of a linear model.

    Args:
        model: fitted estimator exposing a 1-D `coef_`.
        n: number of coefficients to take from each end.
        feature_names: labels for the coefficients, in the order the model was
            trained on. Defaults to the notebook-level `X_features.columns`,
            which preserves the previous behavior.

    Returns:
        Tuple `(coef_high, coef_low)` of pandas Series, both sorted in
        descending order of coefficient value.
    """
    if feature_names is None:
        # fall back to the training frame defined in the notebook
        feature_names = X_features.columns

    coef = pd.Series(model.coef_, index=feature_names)

    # sort once and slice both ends
    coef_sorted = coef.sort_values(ascending=False)
    coef_high = coef_sorted.head(n)
    coef_low = coef_sorted.tail(n)
    return coef_high, coef_low

def visualize_coefficient(models):
    """Draw one bar chart per model with its top and bottom coefficients.

    The number of panels follows the number of models (it used to be fixed at
    three, which raised IndexError for more models and left empty panels for
    fewer).

    Args:
        models: sequence of fitted linear models accepted by
            `get_top_bottom_coef`.
    """
    models = list(models)
    if not models:
        # nothing to draw; subplots(ncols=0) would raise
        return

    # squeeze=False keeps `axs` two-dimensional even for a single model
    fig, axs = plt.subplots(figsize=(24, 10), nrows=1, ncols=len(models), squeeze=False)
    fig.tight_layout()

    for ax, model in zip(axs[0], models):

        coef_high, coef_low = get_top_bottom_coef(model)
        coef_concat = pd.concat([coef_high, coef_low])

        # leading space added so the title matches the feature-importance plots
        ax.set_title(model.__class__.__name__ + ' Coefficients', size=25)
        # negative pad places the y tick labels inside the plot area
        ax.tick_params(axis='y', direction='in', pad=-120)
        for label in (ax.get_xticklabels() + ax.get_yticklabels()):
            label.set_fontsize(22)
        sns.barplot(x=coef_concat.values, y=coef_concat.index, ax=ax)

def get_top_features(model, feature_names=None, n=20):
    """Return the n most important features of a fitted tree-based model.

    Args:
        model: fitted estimator exposing `feature_importances_`.
        feature_names: labels for the importances, in training column order.
            Defaults to the notebook-level `X_features.columns`, which
            preserves the previous behavior.
        n: number of features to return (default 20, as before).

    Returns:
        pandas Series of importances sorted in descending order, at most n long.
    """
    if feature_names is None:
        # fall back to the training frame defined in the notebook
        feature_names = X_features.columns

    ftr_importances = pd.Series(model.feature_importances_, index=feature_names)
    return ftr_importances.sort_values(ascending=False)[:n]
        
def visualize_ftr_importances(models):
    """Draw one bar chart per model with its top feature importances.

    The number of panels follows the number of models (it used to be fixed at
    two, which raised IndexError for more models and left an empty panel for
    a single one).

    Args:
        models: sequence of fitted models accepted by `get_top_features`.
    """
    models = list(models)
    if not models:
        # nothing to draw; subplots(ncols=0) would raise
        return

    # squeeze=False keeps `axs` two-dimensional even for a single model
    fig, axs = plt.subplots(figsize=(24, 10), nrows=1, ncols=len(models), squeeze=False)
    fig.tight_layout()

    for ax, model in zip(axs[0], models):

        ftr_top20 = get_top_features(model)
        ax.set_title(model.__class__.__name__ + ' Feature Importances', size=25)

        for label in (ax.get_xticklabels() + ax.get_yticklabels()):
            label.set_fontsize(22)
        sns.barplot(x=ftr_top20.values, y=ftr_top20.index, ax=ax)

        
# cross_val_score

from sklearn.model_selection import cross_val_score

def get_avg_rmse_cv(models):
    """Print per-fold and average 5-fold CV RMSE for each model.

    Cross-validation runs on the notebook-level `X_features` / `y_target`.

    Args:
        models: iterable of estimators to evaluate.
    """
    for model in models:
        model_name = model.__class__.__name__

        # the scorer returns negated MSE, so flip the sign before the root
        neg_mse_scores = cross_val_score(
            model, X_features, y_target,
            scoring='neg_mean_squared_error', cv=5,
        )
        rmse_list = np.sqrt(-neg_mse_scores)
        rmse_avg = np.mean(rmse_list)

        print('\n{0} CV RMSE List: {1}'.format(model_name, np.round(rmse_list, 3)))
        print('{0} CV average RMSE: {1}'.format(model_name, np.round(rmse_avg, 3)))

        
# GridSearchCV

from sklearn.model_selection import GridSearchCV

def get_best_params(model, params):
    """Grid-search `params` for `model` with 5-fold CV and return the best estimator.

    The search is fitted on the notebook-level `X_features` / `y_target` and
    scored with negated MSE; the best average RMSE and the winning parameters
    are printed.

    Args:
        model: estimator to tune.
        params: parameter grid passed to GridSearchCV.

    Returns:
        The best estimator found by the search.
    """
    search = GridSearchCV(
        model,
        param_grid=params,
        scoring='neg_mean_squared_error',
        cv=5,
    )
    search.fit(X_features, y_target)

    # best_score_ is a negated MSE; convert it to RMSE for reporting
    best_rmse = np.sqrt(-1 * search.best_score_)
    model_name = model.__class__.__name__
    print('{0} 5 CV best average RMSE: {1}, best alpha: {2}'.format(model_name, np.round(best_rmse, 4), search.best_params_))

    return search.best_estimator_

In [None]:
from sklearn.model_selection import train_test_split

# Target and feature matrix for modelling.
y_target = house_df['SalePrice']
X_features = house_df.drop('SalePrice', axis=1, inplace=False)

# Fix random_state so the hold-out split, and therefore every RMSE reported
# below, is reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X_features, y_target, test_size=0.2, random_state=42
)

In [None]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso

# Baseline linear models with default hyper-parameters, fitted on the training split.
lr_reg = LinearRegression()
ridge_reg = Ridge()
lasso_reg = Lasso()

models = [lr_reg, ridge_reg, lasso_reg]
for estimator in models:
    estimator.fit(X_train, y_train)

get_rmses(models)

In [None]:
# Plot the largest and smallest coefficients of each fitted linear model.
visualize_coefficient(models)

In [None]:
# cross_val_score

# 5-fold cross-validated RMSE for each model on the full training frame.
get_avg_rmse_cv(models)

In [None]:
# Tune the regularisation strength of Ridge and Lasso with GridSearchCV.

ridge_params = {
    'alpha': [0.05, 0.1, 1, 5, 8, 10, 12, 15, 20],
}
lasso_params = {
    'alpha': [0.001, 0.005, 0.008, 0.05, 0.03, 0.1, 0.5, 1, 5, 10],
}

get_best_params(ridge_reg, ridge_params)
get_best_params(lasso_reg, lasso_params)

In [None]:
# Refit the three linear models, using the alpha values chosen for Ridge and
# Lasso (hard-coded here), then report hold-out RMSE and plot coefficients.

lr_reg = LinearRegression()
ridge_reg = Ridge(alpha=0.1)
lasso_reg = Lasso(alpha=0.001)

models = [lr_reg, ridge_reg, lasso_reg]
for estimator in models:
    estimator.fit(X_train, y_train)

get_rmses(models)
visualize_coefficient(models)

In [None]:
# delete additional outlier (1stFlrSF)

# 1stFlrSF & SalePrice scatter plot

fig, ax = plt.subplots()
ax.scatter(house_df_org['1stFlrSF'], house_df_org['SalePrice'])
ax.set_ylabel('SalePrice', fontsize=15)
ax.set_xlabel('1stFlrSF', fontsize=15)
plt.show()

In [None]:
# delete outlier
#
# Identify outliers on the ORIGINAL scale (very large first floor sold cheaply),
# using the raw rows that are still present in house_df. Comparing against
# np.log1p(threshold) on house_df only works if 1stFlrSF happened to be among
# the log-transformed skewed features; reading the raw values removes that
# hidden coupling and selects the same rows whenever the column was transformed.

raw_rows = house_df_org.loc[house_df.index]
cond1 = raw_rows['1stFlrSF'] > 4000
cond2 = raw_rows['SalePrice'] < 200000
outlier_index = raw_rows[cond1 & cond2].index

house_df.drop(outlier_index, axis=0, inplace=True)

In [None]:
# Rebuild target/features after the additional outlier removal and refit.
y_target = house_df['SalePrice']
X_features = house_df.drop('SalePrice', axis=1, inplace=False)

# Fix random_state so the hold-out split, and therefore the RMSE values
# printed below, is reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X_features, y_target, test_size=0.2, random_state=42
)

lr_reg = LinearRegression()
lr_reg.fit(X_train, y_train)

ridge_reg = Ridge(alpha=0.1)
ridge_reg.fit(X_train, y_train)

lasso_reg = Lasso(alpha=0.001)
lasso_reg.fit(X_train, y_train)

models = [lr_reg, ridge_reg, lasso_reg]
get_rmses(models)
visualize_coefficient(models)

## **Model - Regression Tree**

In [None]:
from xgboost import XGBRegressor

# XGBoost regressor evaluated through the shared grid-search helper
# (the grid holds a single n_estimators value).
xgb_params = {'n_estimators': [1000]}
xgb_reg = XGBRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    colsample_bytree=0.5,
    subsample=0.8,
)
get_best_params(xgb_reg, xgb_params)

In [None]:
from lightgbm import LGBMRegressor

# LightGBM regressor evaluated through the shared grid-search helper
# (the grid holds a single n_estimators value).
lgbm_params = {'n_estimators': [1000]}
lgbm_reg = LGBMRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    num_leaves=4,
    subsample=0.6,
    colsample_bytree=0.4,
    reg_lambda=10,
    n_jobs=-1,
)
get_best_params(lgbm_reg, lgbm_params)

In [None]:
# Obtain the best XGBoost / LightGBM estimators and plot their feature importances.
# NOTE(review): these two calls repeat the grid searches run in the two
# preceding cells, whose return values are discarded there.

best_xgb = get_best_params(xgb_reg, xgb_params)
best_lgbm = get_best_params(lgbm_reg, lgbm_params)

models = [best_xgb, best_lgbm]
visualize_ftr_importances(models)

## **Submission**

In [None]:
# Select exactly the training feature columns, in training order. This also
# keeps the cell re-runnable once 'SalePrice' has been added to test_df.
feature_cols = X_features.columns

# The models were trained on log1p(SalePrice), so predictions come back on the
# log scale; invert with expm1 to obtain actual prices for the submission.
predict = np.expm1(lasso_reg.predict(test_df[feature_cols]))
test_df['SalePrice'] = predict

In [None]:
# Preview the test frame with the new 'SalePrice' prediction column.
test_df.head()

In [None]:
# Submission frame: 'Id' (currently the index of test_df) plus the predicted 'SalePrice'.
submission = test_df['SalePrice'].reset_index()
submission = submission[['Id', 'SalePrice']]
submission.head(n=3)

In [None]:
# Write the submission to 'submission.csv' without the DataFrame index.
submission.to_csv('submission.csv', index=False)