# House Price Kaggle Competition

Importing some libraries and reading train and test datasets.

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, StackingRegressor
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, Ridge, RidgeCV, ElasticNet, ElasticNetCV
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.svm import SVR
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the competition's train and test CSV files, then keep both frames in a
# list so that later cleaning steps can be applied to each of them in one loop.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/house-prices-advanced-regression-techniques/train.csv',
        '../input/house-prices-advanced-regression-techniques/test.csv',
    )
)
total = [train, test]

In [None]:
# (number of rows, number of columns) of the training frame.
train.shape

This dataset has 81 columns with 38 numeric variables and 43 text variables. I won't explain every variable in this notebook, but I'll give some intuitions behind every move I did in order to clean the dataset.

In [None]:
# Count the training columns of each dtype.
# The builtin `object` is used instead of `np.object`: that alias was deprecated
# in NumPy 1.20 and removed in 1.24, where it raises AttributeError.
column_dtypes = train.dtypes
print('Int64 columns are: ' + str(int((column_dtypes == np.int64).sum())))
print('Str columns are: ' + str(int((column_dtypes == object).sum())))
print('Float64 columns are: ' + str(int((column_dtypes == np.float64).sum())))

We can see that columns like PoolQC, MiscFeature, Alley and Fence have a high rate of missing values. We won't delete these columns, because they still carry some information and we can use them to create other features.

In [None]:
# Percentage of missing values per training column, largest first,
# restricted to the columns that have at least one missing value.
missing_counts = train.isnull().sum().sort_values(ascending=False)
missing_counts[missing_counts > 0] / train.shape[0] * 100

In [None]:
# Percentage of missing values per test column, largest first,
# restricted to the columns that have at least one missing value.
missing_counts = test.isnull().sum().sort_values(ascending=False)
missing_counts[missing_counts > 0] / test.shape[0] * 100

Checking SalePrice, our target variable, we can see that it is highly positively skewed and leptokurtic, so its distribution is far from normal. Let's apply a log(1+x) transform to the dependent variable to address the normality issues.

In [None]:
# Overlay the SalePrice distribution with 1000 draws from a normal distribution
# that has the same mean and standard deviation, for a visual comparison.
sale_price = train.SalePrice
sns.distplot(sale_price)
normal_sample = np.random.normal(sale_price.mean(), sale_price.std(), 1000)
sns.distplot(normal_sample)

In [None]:
# Report the skewness and kurtosis of SalePrice as it currently stands.
for label, value in (('Skewness: ', train.SalePrice.skew()),
                     ('Kurtosis: ', train.SalePrice.kurt())):
    print(label, value)

In [None]:
# Replace SalePrice by log(1 + SalePrice), in place on `train`, then compare the
# transformed distribution with a normal sample of matching mean and std.
# NOTE: re-running this cell applies the transform a second time.
log_price = np.log1p(train['SalePrice'])
train['SalePrice'] = log_price
sns.distplot(train.SalePrice)
normal_sample = np.random.normal(train.SalePrice.mean(), train.SalePrice.std(), 1000)
sns.distplot(normal_sample, color='green')

In [None]:
# Skewness and kurtosis of SalePrice after the log(1 + x) transform.
target = train.SalePrice
skewness, kurtosis = target.skew(), target.kurt()
print('Skewness: ', skewness)
print('Kurtosis: ', kurtosis)

These numerical variables are actually categorical, so we convert them to object type.

In [None]:
# Treat these integer-coded columns as categories by casting them to object
# dtype in both train and test, so they are later picked up as categorical.
# The builtin `object` replaces `np.object`, which was removed in NumPy 1.24.
for dataset in total:
    for column in ('MSSubClass', 'MoSold', 'YrSold'):
        dataset[column] = dataset[column].astype(object)

When evaluating model performance, highly skewed variables affected the results negatively, so I applied to them the same transformation used on SalePrice.

In [None]:
# Names of the numeric training columns: the int64 columns first, followed by
# the float64 columns.
train_dtypes = train.dtypes
int_columns = train.columns[train_dtypes == np.int64]
float_columns = train.columns[train_dtypes == np.float64]
numeric_features = int_columns.append(float_columns)

In [None]:
# Numeric features whose skewness on the training set exceeds 0.75.
skew_feats = [feat for feat in numeric_features if train[feat].skew() > 0.75]

# Apply log(1 + x) to those features in both train and test.
for dataset in total:
    for feat in skew_feats:
        dataset[feat] = np.log1p(dataset[feat])

I mapped all the object type variables that had ordinality, for example variables that have an evaluation scale. I filled other missing data with the mode of the variable or 0. This cell is a summary of all the work done to clean the dataset.

In [None]:
# Ordinal encodings for the rating-style categorical columns.
# In every mapping that has an `np.nan` key, a missing value is encoded as 0.
mapping = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, np.nan:0}
mapping_1 = {'Gd': 4, 'Av': 3, 'Mn':2, 'No':1, np.nan: 0}
mapping_2 = {'GLQ': 6, 'ALQ': 5, 'BLQ': 4, 'Rec': 3, 'LwQ': 2, 'Unf': 1, np.nan: 0}
# No np.nan key here: missing 'Functional' values stay NaN and are filled below.
mapping_3 = {'Typ': 7, 'Min1': 6, 'Min2': 5, 'Mod': 4, 'Maj1': 3, 'Maj2': 2, 'Sev': 1, 'Sal': 0}
mapping_4 = {'Fin': 3, 'RFn': 2, 'Unf': 1, np.nan: 0}
mapping_5 = {'GdPrv': 4, 'MnPrv': 3, 'GdWo': 2, 'MnWw': 1, np.nan: 0}

for dataset in total:
    # Quality / condition columns that share the Ex..Po scale.
    for column in ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC', 'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond', 'PoolQC']:
        dataset[column] = dataset[column].map(mapping)
    dataset['BsmtExposure'] = dataset['BsmtExposure'].map(mapping_1)
    dataset['BsmtFinType1'] = dataset['BsmtFinType1'].map(mapping_2)
    dataset['BsmtFinType2'] = dataset['BsmtFinType2'].map(mapping_2)
    dataset['Functional'] = dataset['Functional'].map(mapping_3)
    dataset['GarageFinish'] = dataset['GarageFinish'].map(mapping_4)
    dataset['Fence'] = dataset['Fence'].map(mapping_5)
    # Missing numeric values become 0; missing categorical values get an
    # explicit 'No' / 'None' level.
    dataset[['LotFrontage','GarageYrBlt','MasVnrArea','BsmtFullBath','BsmtHalfBath']] = dataset[['LotFrontage','GarageYrBlt','MasVnrArea','BsmtFullBath','BsmtHalfBath']].fillna(0)
    dataset[['MiscFeature','Alley','GarageType']] = dataset[['MiscFeature','Alley','GarageType']].fillna('No')
    dataset['MasVnrType'] = dataset['MasVnrType'].fillna('None')

train['Electrical'] = train['Electrical'].fillna(train['Electrical'].mode()[0])

# Hand-made corrections for individual test rows (row labels are the default
# index produced by read_csv).
test.at[1150, 'MasVnrType'] = 'BrkFace'
test.at[1116, 'GarageCars'] = 0
test.at[1116, 'GarageArea'] = 0
# Use the same 'No' level that fillna() assigns to a missing GarageType above.
# Writing the integer 0 here mixed types in a string column and produced a
# separate, one-row 'GarageType_0' dummy instead of 'GarageType_No'.
test.at[1116, 'GarageType'] = 'No'

# Remaining gaps in the test set: basement areas become 0 ...
for column in ['BsmtFinSF1', 'BsmtUnfSF', 'BsmtFinSF2', 'TotalBsmtSF']:
    test[column] = test[column].fillna(0)
# ... and categorical gaps take the most frequent value observed in train.
for column in ['MSZoning', 'Utilities', 'Functional', 'SaleType', 'Exterior1st', 'Exterior2nd']:
    test[column] = test[column].fillna(train[column].mode()[0])

I added some new variables:
- HasPool: 1 if house has pool, 0 otherwise
- Has2ndFloor: 1 if house has 2nd floor, 0 otherwise
- HasGarage: 1 if house has garage, 0 otherwise
- HasBsmt: 1 if house has basement, 0 otherwise
- HasFireplace: 1 if house has fireplace, 0 otherwise
- BltSoldYrDiff: time passed between year of built and last sale year in years
- TotalSF: house total surface
- TotalBathr: total bathrooms
- TotalPorchSF: porch total surface

In [None]:
# Binary indicator to create -> column whose positive value switches it on.
indicator_sources = {
    'HasPool': 'PoolArea',
    'Has2ndFloor': '2ndFlrSF',
    'HasGarage': 'GarageArea',
    'HasBsmt': 'TotalBsmtSF',
    'HasFireplace': 'Fireplaces',
}

for dataset in total:
    # 1 when the source column is strictly positive, 0 otherwise (NaN gives 0).
    for indicator, source in indicator_sources.items():
        dataset[indicator] = (dataset[source] > 0).astype(np.int64)

    # Difference between the sale year and the year of construction.
    sold_year = dataset['YrSold'].astype(np.int64)
    dataset['BltSoldYrDiff'] = sold_year - dataset['YearBuilt']

    # Aggregates built from the existing surface and bathroom columns.
    dataset['TotalSF'] = dataset['TotalBsmtSF'] + dataset['1stFlrSF'] + dataset['2ndFlrSF']
    dataset['TotalBathr'] = (
        dataset['FullBath']
        + 0.5 * dataset['HalfBath']
        + dataset['BsmtFullBath']
        + 0.5 * dataset['BsmtHalfBath']
    )
    dataset['TotalPorchSF'] = (
        dataset['OpenPorchSF']
        + dataset['3SsnPorch']
        + dataset['EnclosedPorch']
        + dataset['ScreenPorch']
        + dataset['WoodDeckSF']
    )

In [None]:
# Hand-written list of the numeric columns to plot against the target.
# It rebinds `numeric_features` (previously derived from dtypes) and includes
# 'SalePrice' itself plus the four engineered numeric features at the end.
numeric_features = ['LotArea', 'OverallQual', 'OverallCond', 'YearBuilt',
       'YearRemodAdd', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
       '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath',
       'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr',
       'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF',
       'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
       'MiscVal', 'LotFrontage', 'MasVnrArea', 'GarageYrBlt', 'SalePrice', 'BltSoldYrDiff',
        'TotalSF', 'TotalBathr', 'TotalPorchSF']

Plotting all variables vs target variable to check for some outliers

In [None]:
# Scatter every numeric feature against SalePrice, five features per row of
# plots. Iterating over the list length (instead of eight hard-coded slices)
# keeps every feature covered if `numeric_features` grows or shrinks.
features_per_row = 5
for start in range(0, len(numeric_features), features_per_row):
    sns.pairplot(data=train, y_vars=['SalePrice'],
                 x_vars=numeric_features[start:start + features_per_row])

Removing some outliers

In [None]:
# Columns in which the outliers were spotted; plotted before and after removal.
outlier_columns = ['OverallQual','OverallCond','OpenPorchSF','TotalPorchSF']
sns.pairplot(data=train, y_vars=['SalePrice'], x_vars=outlier_columns)

# A row is dropped when it matches any of the four rules below. SalePrice is
# compared on its log(1 + x) scale, as transformed earlier in the notebook.
is_outlier = (
    ((train['OverallQual'] == 10) & (train['SalePrice'] < 12.5))
    | ((train['OverallCond'] == 2) & (train['SalePrice'] > 12))
    | ((train['OpenPorchSF'] > 3.5) & (train['SalePrice'] < 11))
    | ((train['TotalPorchSF'] > 6) & (train['SalePrice'] < 11))
)
train = train.loc[~is_outlier]

sns.pairplot(data=train, y_vars=['SalePrice'], x_vars=outlier_columns)

Checking missing values

In [None]:
# Training columns that still contain missing values, with their counts.
remaining_missing = train.isnull().sum().sort_values(ascending=False)
remaining_missing[remaining_missing > 0]

In [None]:
# Test columns that still contain missing values, with their counts.
remaining_missing = test.isnull().sum().sort_values(ascending=False)
remaining_missing[remaining_missing > 0]

Getting dummy variables for all categorical features

In [None]:
# Names of the object-dtype (categorical) training columns to one-hot encode.
# The builtin `object` replaces `np.object`, which was removed in NumPy 1.24.
columns = train.columns[train.dtypes == object]

In [None]:
# Stack train on top of test before encoding so that both parts end up with
# exactly the same dummy columns.
combined = pd.concat([train, test])
df_dummies = pd.get_dummies(combined, columns=columns)

In [None]:
# (rows, columns) of the combined train + test frame after one-hot encoding.
df_dummies.shape

In [None]:
# Split the encoded frame back by position: the first len(train) rows are the
# training set, the rest are the test set (which has no SalePrice).
n_train_rows = train.shape[0]
train = df_dummies.iloc[:n_train_rows]
test = df_dummies.iloc[n_train_rows:].drop('SalePrice', axis=1)

Standard scaling data

In [None]:
# Standardise the predictors. The scaler is fitted on the training features only
# and then reused on the test set, so both are expressed on the same scale.
# Fitting a second scaler on the test set (as before) standardises it with
# different means and variances than the ones the models were trained on.
feature_columns = train.columns.drop(['Id', 'SalePrice'])
scaler = StandardScaler().fit(train[feature_columns])

# The original row labels are kept so that X stays aligned with y.
X = pd.DataFrame(scaler.transform(train[feature_columns]),
                 columns=feature_columns, index=train.index)
y = train['SalePrice']
scaled_test = pd.DataFrame(scaler.transform(test[feature_columns]),
                           columns=feature_columns, index=test.index)

Transforming categorical variables into dummies increased the number of columns, so we need to do some feature selection. In this case I did it in three ways, using Ridge, Lasso and ElasticNet models.

Ridge keeps 270 of 272 variables, so we won't use it.

In [None]:
reg = RidgeCV()
reg.fit(X, y)
print("Best alpha using built-in RidgeCV: %f" % reg.alpha_)
print("Best score using built-in RidgeCV: %f" %reg.score(X,y))
coef = pd.Series(reg.coef_, index = X.columns)
print("Ridge picked " + str(sum(coef != 0)) + " variables and eliminated the other " +  str(sum(coef == 0)) + " variables")

ElasticNet keeps 98 variables.

In [None]:
reg = ElasticNetCV()
reg.fit(X, y)
print("Best alpha using built-in ElasticNetCV: %f" % reg.alpha_)
print("Best score using built-in ElasticNetCV: %f" %reg.score(X,y))
coef = pd.Series(reg.coef_, index = X.columns)
print("ElasticNet picked " + str(sum(coef != 0)) + " variables and eliminated the other " +  str(sum(coef == 0)) + " variables")
ridge_coef = coef[coef != 0]

In [None]:
# Features with a non-zero coefficient in `coef` form the ElasticNet selection;
# plot them as horizontal bars, sorted by coefficient value.
enet_coef = coef.loc[coef != 0]
imp_coef = enet_coef.sort_values()
import matplotlib
plt.figure(figsize=(8,18))
imp_coef.plot.barh()
plt.title("Feature importance using ElasticNet Model")

Lasso keeps 96 variables.

In [None]:
# Fit a cross-validated Lasso on all features and report how many coefficients
# it keeps (non-zero) and how many it shrinks to exactly zero.
reg = LassoCV()
reg.fit(X, y)
print("Best alpha using built-in LassoCV: %f" % reg.alpha_)
print("Best score using built-in LassoCV: %f" %reg.score(X,y))
coef = pd.Series(reg.coef_, index = X.columns)
n_kept = sum(coef != 0)
n_eliminated = sum(coef == 0)
print("Lasso picked " + str(n_kept) + " variables and eliminated the other " +  str(n_eliminated) + " variables")

In [None]:
# Features with a non-zero coefficient in `coef` form the Lasso selection;
# plot them as horizontal bars, sorted by coefficient value.
lasso_coef = coef.loc[coef != 0]
imp_coef = lasso_coef.sort_values()
import matplotlib
plt.figure(figsize=(8,18))
imp_coef.plot.barh()
plt.title("Feature importance using Lasso Model")

Let's test all these models using the Ridge, ElasticNet and Lasso feature selection.

In [None]:
# Candidate regressors as (short name, estimator) pairs. All use library
# defaults except the three regularised linear models, whose alpha is fixed.
# NOTE(review): the Lasso / ElasticNet alphas look like values reported by the
# LassoCV / ElasticNetCV cells - confirm against their printed output.
models = [('DTR', DecisionTreeRegressor()),
          ('RFR', RandomForestRegressor()),
          ('KNR', KNeighborsRegressor()),
          ('GBR', GradientBoostingRegressor()),
          ('LR', LinearRegression()),
          ('XGB', XGBRegressor()),
          ('LGBM', LGBMRegressor()),
          ('SVR', SVR()),
          ('Ridge', Ridge(alpha=10)),
          ('Lasso', Lasso(alpha=0.003487)),
          ('ENet', ElasticNet(alpha=0.006974))]


We can see that with Lasso feature selection we generally meet better performances by negative mean squared error. Using features picked by Lasso model, we can exclude some models: Decision Tree, Linear Regression, K-nearest neighbour, Support Vector Regression and Random Forest. XGBoost, LightGBM and Gradient Boosting can be good but they need hyperparameter tuning to reach better performances. I also excluded Ridge because better performances are reached at higher value of the alpha hyperparameter: high values of alpha (like 10 or more) highly reduce the complexity of the model fit, so it's almost averaging the points, giving bad performances on the test set.

In [None]:
# Cross-validate every candidate model on each of the three feature selections
# and collect the per-fold scores (negative mean squared error).
results = []
names = []

# One splitter shared by all runs so every model sees the same folds.
# `random_state` only has an effect together with shuffle=True; recent
# scikit-learn versions raise a ValueError when it is set without shuffling.
kfold = KFold(n_splits=10, shuffle=True, random_state=21)

selections = [('Ridge', ridge_coef.index),
              ('ElasticNet', enet_coef.index),
              ('Lasso', lasso_coef.index)]

for selection_name, coef in selections:
    # Label each group of results so the printed scores can be told apart.
    print('--- %s feature selection ---' % selection_name)
    for name, model in models:
        cv_results = cross_val_score(model, X[coef], y, cv=kfold, scoring='neg_mean_squared_error')
        results.append(cv_results)
        names.append(name)
        msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
        print(msg)

I searched for the best alpha between 1e-10 and 1; the best value turned out to be 0.001.

In [None]:
# Grid search for Lasso on the Lasso-selected features. The grid holds a single
# alpha value, so this mainly produces a 10-fold CV score and a fitted model.
params = dict(alpha=[0.001])

reg = Lasso()
rs = GridSearchCV(reg, params, scoring='neg_mean_squared_error',
                  cv=10, n_jobs=-1, verbose=5)
rs.fit(X[lasso_coef.index], y)
for report in (rs.best_score_, rs.best_estimator_):
    print(report)
lasso = rs.best_estimator_

For Gradient Boosting hyperparameter tuning, I started by finding the best n_estimators for a learning rate of 0.1, in order to limit the computational cost of a high n_estimators, then I tuned in a few steps:
- max_depth and min_samples_split together
- min_samples_split and min_samples_leaf together
- max_features
- subsample

Finally, I reduced the learning rate to 0.01 and increased n_estimators to reach better performance.

Hyperparameters detailed explanation can be found here:
https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html

I found really helpful for hyperparameter tuning detailed explanation this page:
https://www.analyticsvidhya.com/blog/2016/02/complete-guide-parameter-tuning-gradient-boosting-gbm-python/

In [None]:
# Grid search for Gradient Boosting on the Lasso-selected features. Every
# hyperparameter has a single candidate value, so exactly one configuration is
# cross-validated (10 folds) and refitted.
params = dict(
    learning_rate=[0.01],
    n_estimators=[2000],
    max_depth=[11],
    min_samples_split=[200],
    min_samples_leaf=[10],
    max_features=['sqrt'],
    subsample=[0.85],
)

reg = GradientBoostingRegressor()
rs = GridSearchCV(reg, params, scoring='neg_mean_squared_error',
                  cv=10, n_jobs=-1, verbose=5)
rs.fit(X[lasso_coef.index], y)
for report in (rs.best_score_, rs.best_estimator_):
    print(report)
gbr = rs.best_estimator_

For Elastic Net, I used the same technique used for Lasso.

In [None]:
# Grid search for ElasticNet on the Lasso-selected features, with a single
# alpha candidate evaluated by 10-fold cross-validation.
params = dict(alpha=[0.001])

reg = ElasticNet()
rs = GridSearchCV(reg, params, scoring='neg_mean_squared_error',
                  cv=10, n_jobs=-1, verbose=5)
rs.fit(X[lasso_coef.index], y)
for report in (rs.best_score_, rs.best_estimator_):
    print(report)
elasticnet = rs.best_estimator_

For XGBoost and LightGBM I proceeded in the same way of Gradient Boosting for the hyperparameter optimization, some hyperparameters are different.

I found really helpful this page for the tuning procedure:
https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/

Here you can find a deep explanation of XGBoost parameters:
https://xgboost.readthedocs.io/en/latest/parameter.html

In [None]:
# Grid search for XGBoost on the Lasso-selected features. Every hyperparameter
# has a single candidate value, so exactly one configuration is cross-validated
# (10 folds) and refitted.
params = dict(
    learning_rate=[0.01],
    n_estimators=[3000],
    max_depth=[3],
    min_child_weight=[5],
    gamma=[0],
    colsample_bytree=[0.65],
    subsample=[0.6],
    reg_alpha=[1e-6],
)

reg = XGBRegressor()
rs = GridSearchCV(reg, params, scoring='neg_mean_squared_error',
                  cv=10, n_jobs=-1, verbose=5)
rs.fit(X[lasso_coef.index], y)
for report in (rs.best_score_, rs.best_estimator_):
    print(report)
xgboost = rs.best_estimator_

LightGBM is not giving good performances so I won't use it for final predictions

In [None]:
# Grid search for LightGBM on the Lasso-selected features, with one candidate
# value per hyperparameter (10-fold CV). The fitted model is only reported here;
# it is not stored for later use.
params = {
    'learning_rate': [0.01],
    'n_estimators': [3000],
    'max_depth': [3],
    'min_child_weight': [1],
    # LightGBM has no `gamma` parameter (that is XGBoost's name for the minimum
    # loss reduction required to split); its equivalent is `min_split_gain`.
    'min_split_gain': [0],
    'colsample_bytree': [0.8],
    # NOTE(review): per the LightGBM docs, row subsampling is only active when
    # `subsample_freq` is > 0, so this value may have no effect - confirm.
    'subsample': [0.6],
    }

reg = LGBMRegressor()
rs = GridSearchCV(estimator = reg, param_grid = params, 
                               cv = 10, verbose= 5, n_jobs = -1, scoring='neg_mean_squared_error')
rs.fit(X[lasso_coef.index],y)
print(rs.best_score_)
print(rs.best_estimator_)

I submitted the results of every model I tuned and the results were good for the Gradient Boosting and XGBoost, but the best result for me (0.12173) in the Kaggle competition (top 13% of the leaderboard) was given by stacking three models: this is an ensemble technique, ElasticNet and Gradient Boost are trained individually on the training set, then their predictions are stacked to fit a final estimator, that in this case is Lasso.

In [None]:
# Stacked ensemble: the tuned ElasticNet and Gradient Boosting models are the
# base learners, and the tuned Lasso is fitted on their cross-validated
# predictions as the final estimator.
level_0 = [('ENet',elasticnet),('GBR', gbr)]
level_1 = lasso

model = StackingRegressor(estimators=level_0, final_estimator=level_1, cv=10)

# `random_state` only has an effect together with shuffle=True; recent
# scikit-learn versions raise a ValueError when it is set without shuffling.
cv = KFold(n_splits=10, shuffle=True, random_state=21)
# NOTE(review): this cell scores with mean absolute error while the other cells
# use mean squared error - confirm that the difference is intended.
scores = cross_val_score(model, X[lasso_coef.index],y,cv=cv,scoring='neg_mean_absolute_error')
print(scores.mean())

In [None]:
# Fit the stacked model on the full training set, using the Lasso-selected features.
model.fit(X[lasso_coef.index],y)

In [None]:
# Predict on the scaled test features; the model was trained on
# log(1 + SalePrice), so expm1 brings the predictions back to the price scale.
pred_stacked = model.predict(scaled_test[lasso_coef.index])
pred = np.expm1(pred_stacked)

# Build the submission as a new frame. Assigning a column on `test[['Id']]`
# directly writes into a slice of `test` (chained assignment), which pandas
# flags with SettingWithCopyWarning.
sub = test[['Id']].assign(SalePrice=pred)
sub[['Id', 'SalePrice']].to_csv('pred_submission.csv', index=False, encoding='utf-8')

In [None]:
# Preview the first rows of the submission frame.
sub.head()