In [1]:
import numpy as np # linear algebra
import pandas as pd
# Raise pandas' display limits so wide/long frames are shown in full.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('max_colwidth', None),
):
    pd.set_option(option_name, option_value)

import seaborn as sns
import matplotlib.pyplot as plt


import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subfolders, files_here in os.walk('/kaggle/input'):
    for file_name in files_here:
        print(os.path.join(folder, file_name))

In [1]:
# Load the train and test CSV files of the house-prices competition.
# The alternative AmesHousing.csv source is left commented out:
# train = pd.read_csv('/kaggle/input/ames-housing-dataset/AmesHousing.csv')
train2 = pd.read_csv(
    '/kaggle/input/house-prices-advanced-regression-techniques/train.csv'
)
test = pd.read_csv(
    '/kaggle/input/house-prices-advanced-regression-techniques/test.csv'
)

In [1]:
# Preview the first rows of the training frame as loaded.
train2.head()

In [1]:
# Normalise column labels: strip embedded spaces, then rename the one label
# that contains a slash ("YearRemod/Add") to a plain identifier.
train2 = train2.rename(columns=lambda label: label.replace(' ', ''))
train2 = train2.rename(columns={"YearRemod/Add": "YearRemodAdd"})

In [1]:
# Preview the training frame again after the column-label clean-up.
train2.head()

In [1]:
# Preview the first rows of the test frame.
test.head()

In [1]:
# Report the number of rows in each loaded dataset.
# print("Size of the Ames Dataset",len(train))
for size_label, frame in (("Size of the Housing Dataset", train2),
                          ("Size of the Housing Test Dataset", test)):
    print(size_label, len(frame))

Finding and removing duplicate rows in the data

In [1]:
# Show the frame's shape before and after removing fully duplicated rows.
shape_before = train2.shape
train2 = train2.drop_duplicates()
print(shape_before)
print(train2.shape)


In [1]:
# 'Id' is dropped from the training frame before any analysis.
useless = ['Id']
train2 = train2.drop(columns=useless)
train2.shape

In [1]:
from scipy.stats import norm
# Fit a normal distribution to SalePrice and overlay it on the empirical
# histogram/KDE so the departure from normality is visible.
(mu, sigma) = norm.fit(train2['SalePrice'])
plt.figure(figsize=(12, 6))
sns.distplot(train2['SalePrice'], kde=True, hist=True, fit=norm)
plt.title('SalePrice distribution vs Normal Distribution', fontsize=13)
plt.xlabel("House's sale Price in $", fontsize=12)
# Raw string: "\m" and "\s" are not valid Python escape sequences, so the
# mathtext backslashes must not be interpreted by the Python parser.
plt.legend(
    [r'Normal dist ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma),
     'actual price dist'],
    loc='best',
)
plt.show()

In the literature, acceptable values for skewness lie between -0.5 and 0.5, and for kurtosis between -2 and 2. Looking at the plot, we can clearly see that the distribution is not normal but highly right-skewed. The non-normality of our distribution is also supported by the Shapiro test for normality (a very small p-value, which allows us to reject the hypothesis of normality). Despite that, let's leave the target as it is for now; we'll deal with it later in the notebook.

In [1]:
from scipy import stats
# Quantify the non-normality of SalePrice: skewness, kurtosis and the
# Shapiro test. The result is deliberately NOT named `shap`, because the
# `shap` package is imported under that name later in the notebook.
shapiro_result = stats.shapiro(train2['SalePrice'])
print('Skewness : %f' % train2['SalePrice'].skew())
print('Kurtosis : %f' % train2['SalePrice'].kurt())
print('Shapiro_Test_statistic : %f' % shapiro_result.statistic)
# Scientific notation: with %f a very small p-value is rendered as 0.000000.
print('Shapiro_Test_pvalue : %e' % shapiro_result.pvalue)

In [1]:
# Pearson correlation heatmap of the numeric columns (lower triangle only).
f, ax = plt.subplots(figsize=(50, 35))
# Restrict to numeric columns explicitly: recent pandas versions raise when
# DataFrame.corr meets string columns instead of silently skipping them.
mat = train2.select_dtypes(include=[np.number]).corr('pearson')
# Mask the upper triangle (diagonal included) so each pair is drawn once.
mask = np.triu(np.ones_like(mat, dtype=bool))
cmap = sns.diverging_palette(230, 20, as_cmap=True)
sns.heatmap(mat, mask=mask, cmap=cmap, vmax=1, center=0, annot=True,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})
plt.show()

In [1]:
# OverallQual vs SalePrice [Pearson = 0.8]
# Three views of the same relationship, left to right: box, strip, violin.
fig, ax = plt.subplots(1, 3, figsize=(20, 10))
for draw, panel in zip((sns.boxplot, sns.stripplot, sns.violinplot), ax):
    draw(data=train2, x='OverallQual', y='SalePrice', ax=panel)
plt.show()

In [1]:
# BsmtFinSF2 vs SalePrice [Pearson = -0.011]
# Three views of the same relationship, left to right: box, strip, violin.
fig, ax = plt.subplots(1, 3, figsize=(20, 10))
for draw, panel in zip((sns.boxplot, sns.stripplot, sns.violinplot), ax):
    draw(data=train2, x='BsmtFinSF2', y='SalePrice', ax=panel)
plt.show()

In [1]:
# GrLivArea vs SalePrice

# Compute the coefficient from the data instead of hard-coding it, so the
# legend always matches the frame that is actually plotted.
Pearson_GrLiv = train2['GrLivArea'].corr(train2['SalePrice'])
plt.figure(figsize=(12, 6))
sns.regplot(data=train2, x='GrLivArea', y='SalePrice', scatter_kws={'alpha': 0.2})
plt.title('GrLivArea vs SalePrice', fontsize=12)
plt.legend(['$Pearson=$ {:.2f}'.format(Pearson_GrLiv)], loc='best')
plt.show()

In [1]:
# YearBuilt vs SalePrice

# Compute the coefficient from the data instead of hard-coding it, so the
# legend always matches the frame that is actually plotted.
Pearson_YrBlt = train2['YearBuilt'].corr(train2['SalePrice'])
plt.figure(figsize=(12, 6))
sns.regplot(data=train2, x='YearBuilt', y='SalePrice', scatter_kws={'alpha': 0.2})
plt.title('YearBuilt vs SalePrice', fontsize=12)
plt.legend(['$Pearson=$ {:.2f}'.format(Pearson_YrBlt)], loc='best')
plt.show()

In [1]:
# Median sale price per selling year (the bar height is the median, not the
# default mean, because estimator=np.median is passed).
plt.figure(figsize=(15, 10))
yearly_ax = sns.barplot(data=train2, x='YrSold', y='SalePrice', estimator=np.median)
yearly_ax.set_title('Median of Sale Price by Year')
yearly_ax.set_xlabel('Year of Selling')
yearly_ax.set_ylabel('Median of Price')
plt.show()

In [1]:
# Separating Target and Features

# Training side: keep SalePrice as the target and drop it from the features.
target = train2['SalePrice']
train2_1 = train2.drop(columns=['SalePrice'])

# Test side: remember the ids for the submission file, then drop the column.
test_id = test['Id']
test = test.drop(columns=['Id'])

print("train_datasets shape",train2.shape)
print("test_datasets shape",test.shape)

# Stack train features on top of test features so both receive identical
# preprocessing; they are split apart again by row position later.
train_test = pd.concat([train2_1, test], axis=0, sort=False)
print(train_test.shape)

In [1]:
# Per-feature count and percentage of missing values in the combined frame.
nan = pd.DataFrame(train_test.isna().sum(), columns=['Nan_sum'])
nan['feat'] = nan.index
# .copy() so the column assignments below act on an independent frame
# rather than on a filtered view of the original.
nan = nan[nan['Nan_sum'] > 0].copy()
# Percentage relative to the frame being inspected (train + test rows),
# not to a hard-coded row count.
nan['Percentage'] = (nan['Nan_sum'] / len(train_test)) * 100

nan = nan.sort_values(by=['Nan_sum'])
nan.insert(0, 'Serial No.', range(1, len(nan) + 1))
nan

In [1]:
# Bar chart of the missing-value percentage for each affected feature.
plt.figure(figsize=(20, 10))
missing_ax = sns.barplot(x=nan['feat'], y=nan['Percentage'])
missing_ax.set_title('Features Containing Nan')
missing_ax.set_xlabel('Features')
missing_ax.set_ylabel('% of Missing Data')
plt.xticks(rotation=40)
plt.show()

In [1]:
# Converting non-numeric predictors stored as numbers into string, so that
# they are dummy-encoded later instead of being treated as quantities.
for coded_feature in ('MSSubClass', 'YrSold', 'MoSold', 'OverallQual', 'OverallCond'):
    train_test[coded_feature] = train_test[coded_feature].apply(str)

In [1]:
# Filling NaN for the features whose replacement value is known (the original
# author attributes these choices to the dataset's description file).

# Fixed replacement category per feature.
fixed_category = {'Functional': 'Typ', 'Electrical': "SBrkr", 'KitchenQual': "TA"}
for feature, replacement in fixed_category.items():
    train_test[feature] = train_test[feature].fillna(replacement)

# Most frequent category of the combined frame.
for feature in ('Exterior1st', 'Exterior2nd', 'SaleType'):
    most_frequent = train_test[feature].mode()[0]
    train_test[feature] = train_test[feature].fillna(most_frequent)

# Categorical features that receive the literal category "None".
none_category = (
    "PoolQC", "Alley", 'FireplaceQu', 'Fence', 'MiscFeature',
    'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
)
for feature in none_category:
    train_test[feature] = train_test[feature].fillna("None")

# Numeric features that receive 0.
zero_value = (
    'GarageArea', 'GarageCars',
    'BsmtFinSF1', 'BsmtFinSF2', 'BsmtFullBath', 'BsmtHalfBath',
    'MasVnrArea', 'BsmtUnfSF', 'TotalBsmtSF',
)
for feature in zero_value:
    train_test[feature] = train_test[feature].fillna(0)

# LotFrontage: median taken from the training frame only.
lot_frontage_median = train2['LotFrontage'].median()
train_test['LotFrontage'] = train_test['LotFrontage'].fillna(lot_frontage_median)

# Checking the features with NaN remained out
still_missing = train_test.isna().sum()
for feature, n_missing in still_missing.items():
    if n_missing > 0:
        print(n_missing, '::::', feature)

In [1]:
# Engineered features derived from the raw columns.

# Living area divided by the number of above-ground rooms, baths and kitchens.
train_test["SqFtPerRoom"] = train_test["GrLivArea"] / (train_test["TotRmsAbvGrd"] +
                                                       train_test["FullBath"] +
                                                       train_test["HalfBath"] +
                                                       train_test["KitchenAbvGr"])

# OverallQual / OverallCond were converted to strings in an earlier cell, so
# a plain `+` would concatenate them ("7" + "5" -> "75") and yield a
# categorical. Cast back to int to get the intended numeric sum.
train_test['Total_Home_Quality'] = (train_test['OverallQual'].astype(int) +
                                    train_test['OverallCond'].astype(int))

# Half baths count as 0.5 of a bathroom.
train_test['Total_Bathrooms'] = (train_test['FullBath'] + (0.5 * train_test['HalfBath']) +
                               train_test['BsmtFullBath'] + (0.5 * train_test['BsmtHalfBath']))

train_test["HighQualSF"] = train_test["1stFlrSF"] + train_test["2ndFlrSF"]
# NOTE(review): this is the SUM of the two years, not a "was renovated" flag
# or a year difference - confirm this is the intended feature.
train_test['renovated'] = train_test['YearRemodAdd'] + train_test['YearBuilt']

In [1]:
# Removing the useless variables (GarageYrBlt is dropped rather than imputed).
useless = ['GarageYrBlt']
train_test = train_test.drop(columns=useless)

In [1]:
from scipy.stats import skew

# Creating dummy variables from categorical features
train_test_dummy = pd.get_dummies(train_test)

# Skew candidates are the columns that were numeric BEFORE dummy encoding.
# Selecting on the encoded frame would also pick up the 0/1 indicator
# columns, which would then be log-transformed for no benefit.
numeric_features = train_test.select_dtypes(include=[np.number]).columns
skewed_features = (train_test_dummy[numeric_features]
                   .apply(lambda x: skew(x))
                   .sort_values(ascending=False))
# Features whose skewness exceeds 0.5 are log-transformed in a later cell.
high_skew = skewed_features[skewed_features > 0.5]
skew_index = high_skew.index


In [1]:
# Reduce the skew of the selected features with log(1 + x).
for skewed_name in skew_index:
    column_values = train_test_dummy[skewed_name]
    train_test_dummy[skewed_name] = np.log1p(column_values)

Checking for NaN values after dummy encoding

In [1]:
# Per-feature count and percentage of missing values after dummy encoding.
nan = pd.DataFrame(train_test_dummy.isna().sum(), columns=['Nan_sum'])
nan['feat'] = nan.index
# .copy() so the assignments below act on an independent frame.
nan = nan[nan['Nan_sum'] > 0].copy()
# Percentage relative to the rows of the inspected frame, not a constant.
nan['Percentage'] = (nan['Nan_sum'] / len(train_test_dummy)) * 100
nan = nan.sort_values(by=['Nan_sum'])
nan.insert(0, 'Serial No.', range(1, len(nan) + 1))
nan

Checking whether any values became infinite after the log transformation

In [1]:
# Count infinite values per column; only columns with at least one are kept.
inf = np.isinf(train_test_dummy).sum().to_frame('Inf_sum')
inf['feat'] = inf.index
inf = inf.loc[inf['Inf_sum'] > 0].sort_values(by=['Inf_sum'])
inf.insert(0, 'Serial No.', range(1, len(inf) + 1))
inf

In [1]:
import statsmodels.api as sm
# SalePrice before transformation: qq-plot on the left, distribution with a
# fitted normal on the right.
fig, ax = plt.subplots(1, 2, figsize=(15, 5))
fig.suptitle(" qq-plot & distribution SalePrice ", fontsize=15)
qq_panel, dist_panel = ax

# The qq-plot compares against a Student t distribution (distargs=(4,)).
# TODO (original author): research sm
sm.qqplot(target, stats.t, distargs=(4,), fit=True, line="45", ax=qq_panel)
sns.distplot(target, kde=True, hist=True, fit=norm, ax=dist_panel)
plt.show()

Transforming the sale price with log(1 + x)

In [1]:
# SalePrice after transformation: the target is replaced by log(1 + price)
# and the same two diagnostic panels are drawn again.
target_log = np.log1p(target)

fig, ax = plt.subplots(1, 2, figsize=(15, 5))
fig.suptitle("qq-plot & distribution SalePrice ", fontsize=15)
qq_panel, dist_panel = ax

sm.qqplot(target_log, stats.t, distargs=(4,), fit=True, line="45", ax=qq_panel)
sns.distplot(target_log, kde=True, hist=True, fit=norm, ax=dist_panel)
plt.show()

In [1]:
import shap
from xgboost import XGBRegressor
from catboost import Pool
from sklearn.svm import SVR
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeRegressor
from mlxtend.regressor import StackingRegressor
from sklearn.linear_model import LinearRegression, BayesianRidge
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_squared_log_error

In [1]:
# Train-Test separation: the combined frame was built as [train rows, test
# rows], so it is split back by position at the training row count.
n_train_rows = train2.shape[0]
X_train = train_test_dummy[:n_train_rows]
X_test = train_test_dummy[n_train_rows:]
print(X_train.shape)
print(X_test.shape)

# RMSE metric

def rmse(y, y_pred):
    """Return the root mean squared error between `y` and `y_pred`."""
    mse = mean_squared_error(y, y_pred)
    return np.sqrt(mse)

def cv_rmse(model, cv=None):
    """Return the per-fold cross-validated RMSE of `model`.

    The model is scored on the notebook-level `X_train` / `target_log` with
    scikit-learn's "neg_mean_squared_error" scorer; the sign is flipped and
    the square root taken so each entry is an RMSE.

    Parameters
    ----------
    model : estimator passed straight to `cross_val_score`.
    cv : optional cross-validation splitter. When omitted, the notebook-level
        `kf` is used, which is the original behaviour.

    Returns
    -------
    Array with one RMSE value per fold.
    """
    splitter = kf if cv is None else cv
    neg_mse = cross_val_score(model, X_train, target_log,
                              scoring="neg_mean_squared_error", cv=splitter)
    # Named `fold_rmse` (not `rmse`) so the sibling rmse() is not shadowed.
    fold_rmse = np.sqrt(-neg_mse)
    return fold_rmse

Checking for NaN values in the training set

In [1]:
# Per-feature count and percentage of missing values in the training matrix.
nan = pd.DataFrame(X_train.isna().sum(), columns=['Nan_sum'])
nan['feat'] = nan.index
# .copy() so the assignments below act on an independent frame.
nan = nan[nan['Nan_sum'] > 0].copy()
# Percentage relative to the rows of X_train rather than a hard-coded count.
nan['Percentage'] = (nan['Nan_sum'] / len(X_train)) * 100
nan = nan.sort_values(by=['Nan_sum'])
nan.insert(0, 'Serial No.', range(1, len(nan) + 1))
nan

Checking for NaN values in the test set

In [1]:
# Per-feature count and percentage of missing values in the test matrix.
nan = pd.DataFrame(X_test.isna().sum(), columns=['Nan_sum'])
nan['feat'] = nan.index
# .copy() so the assignments below act on an independent frame.
nan = nan[nan['Nan_sum'] > 0].copy()
# 'Percentage': share of the test rows; 'Perc': share of all rows
# (train + test). Both denominators come from the data, not from constants.
nan['Percentage'] = (nan['Nan_sum'] / len(X_test)) * 100
nan['Perc'] = (nan['Nan_sum'] / len(train_test_dummy)) * 100
nan = nan.sort_values(by=['Nan_sum'])
nan.insert(0, 'Serial No.', range(1, len(nan) + 1))
nan

In [1]:
# 11-fold cross-validation (shuffled, fixed seed). This is the splitter that
# cv_rmse uses for every model.
# NOTE(review): the previous comment said "10 Fold" while the code asks for
# 11 splits; the code is kept as-is - confirm which was intended.
kf = KFold(n_splits=11, random_state=42, shuffle=True)

# Mean and standard deviation of the fold RMSEs, appended once per model by
# the model cells that follow.
cv_scores = []
cv_std = []

# Display names of the evaluated models, in the order they are scored.
baseline_models = ['Linear_Reg.','XGB_Reg.',
                   'Grad_Boost_Reg.','Cat_Boost_Reg.','Stacked_Reg.','Stacked_Reg2']

In [1]:
def _record_cv(model):
    """Cross-validate `model`, append its mean/std RMSE to the notebook-level
    `cv_scores` / `cv_std` lists, and return the per-fold scores."""
    fold_scores = cv_rmse(model)
    cv_scores.append(fold_scores.mean())
    cv_std.append(fold_scores.std())
    return fold_scores


# The models are scored in the same order as the names in `baseline_models`.
# Seeds are fixed so repeated runs give the same scores, and CatBoost logging
# is silenced (verbose=0) so the per-iteration output of every fold does not
# flood the cell.

# Linear Regression
lreg = LinearRegression()
score_lreg = _record_cv(lreg)

# XGB Regressor
xgb = XGBRegressor(random_state=42)
score_xgb = _record_cv(xgb)

# Gradient Boost Regressor
gbr = GradientBoostingRegressor(random_state=42)
score_gbr = _record_cv(gbr)

# Cat Boost Regressor
catb = CatBoostRegressor(random_seed=42, verbose=0)
score_catb = _record_cv(catb)

# Stacked Regressor: CatBoost + Bayesian ridge as base learners, CatBoost as
# meta learner; the original features are also fed to the meta learner.
stack_gen = StackingRegressor(regressors=(CatBoostRegressor(random_seed=42, verbose=0),
                                          BayesianRidge()),
                              meta_regressor=CatBoostRegressor(random_seed=42, verbose=0),
                              use_features_in_secondary=True)

In [1]:
# Cross-validate the first stacked model and log its mean / std RMSE.
score_stack_gen = cv_rmse(stack_gen)
stack_mean, stack_spread = score_stack_gen.mean(), score_stack_gen.std()
cv_scores.append(stack_mean)
cv_std.append(stack_spread)


In [1]:
# Second stacked regressor: CatBoost + XGBoost as base learners, CatBoost as
# meta learner; the original features are also fed to the meta learner.
# Seeds are fixed for repeatable scores and CatBoost logging is silenced
# (verbose=0) so the output of every fold does not flood the cell.
stack_gen2 = StackingRegressor(regressors=(CatBoostRegressor(random_seed=42, verbose=0),
                                           XGBRegressor(random_state=42)),
                               meta_regressor=CatBoostRegressor(random_seed=42, verbose=0),
                               use_features_in_secondary=True)

# Cross-validate it and log its mean / std RMSE.
score_stack_gen2 = cv_rmse(stack_gen2)
cv_scores.append(score_stack_gen2.mean())
cv_std.append(score_stack_gen2.std())



In [1]:
# One row per model: display name, mean CV RMSE and its standard deviation.
final_cv_score = pd.DataFrame({
    'Regressors': baseline_models,
    'RMSE_mean': cv_scores,
    'RMSE_std': cv_std,
})

In [1]:
# Display the cross-validation summary table.
final_cv_score

In [1]:
# Bar chart of the mean cross-validated RMSE per model.
plt.figure(figsize=(12, 8))
# x / y are passed by keyword: recent seaborn versions no longer accept the
# data vectors as positional arguments.
sns.barplot(x=final_cv_score['Regressors'], y=final_cv_score['RMSE_mean'])
plt.xlabel('Regressors', fontsize=12)
plt.ylabel('CV_Mean_RMSE', fontsize=12)
plt.xticks(rotation=40)
plt.show()

In [1]:
# Fit a CatBoost model with default hyper-parameters on the full training
# set (log target); text logging off, interactive training plot on.
cat = CatBoostRegressor()
cat_model = cat.fit(
    X_train,
    target_log,
    verbose=0,
    plot=True,
)

In [1]:
# Feature importances of the fitted CatBoost model in tabular ("prettified")
# form; show the first five rows.
feat_imp = cat_model.get_feature_importance(prettified=True)
feat_imp.head(5)

In [1]:
# Plotting the importance of the first 30 rows of the importance table.

plt.figure(figsize=(12, 8))
# x / y are passed by keyword: recent seaborn versions no longer accept the
# data vectors as positional arguments.
sns.barplot(x=feat_imp['Importances'][:30], y=feat_imp['Feature Id'][:30], orient='h')
plt.show()

In [1]:
# Hyper-parameters of the final CatBoost model.
# NOTE(review): 'verbose' is set here (200) and again in fit() (False), and
# 'early_stopping_rounds' is given although fit() receives no eval_set -
# confirm which logging setting wins and whether early stopping is active.
params = dict(
    iterations=6000,
    learning_rate=0.005,
    depth=4,
    l2_leaf_reg=1,
    eval_metric='RMSE',
    early_stopping_rounds=200,
    verbose=200,
    random_seed=42,
)

# Train on the full training set against the log-transformed target.
cat_f = CatBoostRegressor(**params)
cat_model_f = cat_f.fit(
    X_train,
    target_log,
    verbose=False,
    plot=True,
)

In [1]:
# Predict with the tuned CatBoost model. The model was trained on
# log1p(SalePrice), so expm1 maps the predictions back to prices.
test_pred = np.expm1(cat_f.predict(X_test))

submission = pd.DataFrame(test_id, columns=['Id'])
submission['SalePrice'] = test_pred
submission.head()
# NOTE(review): absolute, machine-specific output path.
submission.to_csv(r"C:\Users\Administrator\Desktop\cat.csv", header=True, index=False)

In [1]:
# Refit the first stacked model on the full training set and predict the
# test set. The model is trained on log1p(SalePrice), so expm1 maps the
# predictions back to prices.
stack_f = stack_gen.fit(X_train, target_log)
test_stack = stack_gen.predict(X_test)
test_pre = np.expm1(test_stack)

submission = pd.DataFrame(test_id, columns=['Id'])
submission['SalePrice'] = test_pre

# NOTE(review): absolute, machine-specific output path.
submission.to_csv(r"C:\Users\Administrator\Desktop\stack.csv", header=True, index=False)

In [1]:
# Refit the second stacked model on the full training set and predict the
# test set. The model is trained on log1p(SalePrice), so expm1 maps the
# predictions back to prices.
stack_f2 = stack_gen2.fit(X_train, target_log)
test_stack = stack_gen2.predict(X_test)
submission = pd.DataFrame(test_id, columns=['Id'])
test_pre = np.expm1(test_stack)
submission['SalePrice'] = test_pre

# Raw string is required: in a normal string literal "\U" starts a unicode
# escape, so "C:\Users..." is a SyntaxError and the cell cannot run.
# NOTE(review): absolute, machine-specific output path.
submission.to_csv(r"C:\Users\Administrator\Desktop\stack2.csv", index=False, header=True)