In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
%matplotlib inline
np.set_printoptions(precision=2)
pd.set_option('precision', 2)
np.set_printoptions(suppress=True)
sns.set(style='whitegrid')

In [2]:
train_path = 'data/train.csv'
test_path = 'data/test.csv'
train_data = pd.read_csv(train_path, index_col='Id')
test_data = pd.read_csv(test_path, index_col='Id')
features = train_data.iloc[:,:-1]
target = train_data.loc[:, ['SalePrice']]
print('Train Set Size : ', train_data.shape)
print('Test Set Size : ', test_data.shape)
print('Train Features Size : ', features.shape)
num_train_rows = train_data.shape[0]
num_test_rows = test_data.shape[0]
print('Train Rows : ', num_train_rows)
print('Test Rows : ', num_test_rows)
all_data = pd.concat((features, test_data)).reset_index(drop=True)

Train Set Size :  (1460, 80)
Test Set Size :  (1459, 79)
Train Features Size :  (1460, 79)
Train Rows :  1460
Test Rows :  1459


In [3]:
# Analyze SalePrice
print('Skewness of SalePrice before Log Transform : %f'% target.skew())
print('Kurtosis of SalePrice before Log Transform : %f'% target.kurt())

'''
Skew = 1.882876 indicates positive skew with tail to the right.
Kurt = 6.536282 indicates heavy tails i.e. more data on tails.
'''

#Apply Log transformation
target['SalePrice'] = np.log(target['SalePrice'])
print('Skewness of SalePrice after Log Transform : %f'% target.skew())
print('Kurtosis of SalePrice after Log Transform : %f'% target.kurt())

Skewness of SalePrice before Log Transform : 1.882876
Kurtosis of SalePrice before Log Transform : 6.536282
Skewness of SalePrice after Log Transform : 0.121335
Kurtosis of SalePrice after Log Transform : 0.809532


In [4]:
# Missing Data
null_features = all_data.columns[all_data.isnull().any()]
missing_ratio = (all_data[null_features].isnull().sum()/len(all_data)) * 100
missing_data = pd.DataFrame({'Missing Ratio' :missing_ratio})
print(missing_data.sort_values(by='Missing Ratio',ascending=False))


for col in ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu']:
    all_data[col] = all_data[col].fillna('None')

for col in ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'MasVnrType']:
    all_data[col] = all_data[col].fillna('None')
    
for col in ['BsmtExposure', 'BsmtFinType2', 'BsmtQual', 'BsmtCond', 'BsmtFinType1']:
    all_data[col] = all_data[col].fillna('None')

'''
No GarageYrBlt means no Garage. We can impute mean/median since it would 
incorrectly convey existence of Garage. same reasoning for MasVnrArea.
'''
for col in ['GarageYrBlt', 'MasVnrArea']:
    all_data[col] = all_data[col].fillna(0)

'''
Group data by neighborhood & imputed null LotFrontage columns with median of
grouped data.
'''
all_data['LotFrontage'] = all_data.groupby(['Neighborhood'])\
                    ['LotFrontage'].transform(lambda x : x.fillna(x.median()))
    
all_data['Electrical'] = \
    all_data['Electrical'].fillna(all_data['Electrical'].mode()[0])
    
all_data['MSZoning'] = \
    all_data['MSZoning'].fillna(all_data['MSZoning'].mode()[0])
    
all_data['Utilities'] = all_data['Utilities'].fillna('ELO')

all_data['Exterior1st'] = all_data['Exterior1st'].fillna('Other')

all_data['Exterior2nd'] = all_data['Exterior2nd'].fillna('Other')

all_data['SaleType'] = all_data['SaleType'].fillna('Oth')

all_data['Functional'] = \
    all_data['Functional'].fillna(all_data['Functional'].mode()[0])

all_data['KitchenQual'] = \
    all_data['KitchenQual'].fillna(all_data['KitchenQual'].mode()[0])

for col in ['BsmtFullBath', 'BsmtHalfBath', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'GarageCars', 'GarageArea']:
    all_data[col] = all_data[col].fillna(0)
    
null_features = all_data.columns[all_data.isnull().any()]
missing_ratio = (all_data[null_features].isnull().sum()/len(all_data)) * 100
missing_data = pd.DataFrame({'Missing Ratio' :missing_ratio})
print(missing_data.sort_values(by='Missing Ratio',ascending=False))

              Missing Ratio
PoolQC                99.66
MiscFeature           96.40
Alley                 93.22
Fence                 80.44
FireplaceQu           48.65
LotFrontage           16.65
GarageFinish           5.45
GarageQual             5.45
GarageCond             5.45
GarageYrBlt            5.45
GarageType             5.38
BsmtExposure           2.81
BsmtCond               2.81
BsmtQual               2.77
BsmtFinType2           2.74
BsmtFinType1           2.71
MasVnrType             0.82
MasVnrArea             0.79
MSZoning               0.14
BsmtFullBath           0.07
BsmtHalfBath           0.07
Functional             0.07
Utilities              0.07
GarageArea             0.03
GarageCars             0.03
Electrical             0.03
KitchenQual            0.03
TotalBsmtSF            0.03
BsmtUnfSF              0.03
BsmtFinSF2             0.03
BsmtFinSF1             0.03
Exterior2nd            0.03
Exterior1st            0.03
SaleType               0.03
Empty DataFrame
Colu

In [5]:
'''
                    Numerical Features - Analysis
Selected Features : 
    'OverallQual', 'GarageCars', 'YearBuilt','FullBath', 'TotalBsmtSF', 
    'YearRemodAdd', 'TotRmsAbvGrd', 'Fireplaces', 'OpenPorchSF', 'LotArea', 
    'MasVnrArea'
'''
print('Skewness of OverallQual before Log Transform : %f'% all_data['OverallQual'].skew())
print('Kurtosis of OverallQual before Log Transform : %f'% all_data['OverallQual'].kurt())

print('Skewness of GarageCars before Log Transform : %f'% all_data['GarageCars'].skew())
print('Kurtosis of GarageCars before Log Transform : %f'% all_data['GarageCars'].kurt())

print('Skewness of YearBuilt before Log Transform : %f'% all_data['YearBuilt'].skew())
print('Kurtosis of YearBuilt before Log Transform : %f'% all_data['YearBuilt'].kurt())

print('Skewness of FullBath before Log Transform : %f'% all_data['FullBath'].skew())
print('Kurtosis of FullBath before Log Transform : %f'% all_data['FullBath'].kurt())

print('Skewness of TotalBsmtSF before Log Transform : %f'% all_data['TotalBsmtSF'].skew())
print('Kurtosis of TotalBsmtSF before Log Transform : %f'% all_data['TotalBsmtSF'].kurt())

print('Skewness of YearRemodAdd before Log Transform : %f'% all_data['YearRemodAdd'].skew())
print('Kurtosis of YearRemodAdd before Log Transform : %f'% all_data['YearRemodAdd'].kurt())

print('Skewness of TotRmsAbvGrd before Log Transform : %f'% all_data['TotRmsAbvGrd'].skew())
print('Kurtosis of TotRmsAbvGrd before Log Transform : %f'% all_data['TotRmsAbvGrd'].kurt())

print('Skewness of Fireplaces before Log Transform : %f'% all_data['Fireplaces'].skew())
print('Kurtosis of Fireplaces before Log Transform : %f'% all_data['Fireplaces'].kurt())

print('Skewness of OpenPorchSF before Log Transform : %f'% all_data['OpenPorchSF'].skew())
print('Kurtosis of OpenPorchSF before Log Transform : %f'% all_data['OpenPorchSF'].kurt())

print('Skewness of LotArea before Log Transform : %f'% all_data['LotArea'].skew())
print('Kurtosis of LotArea before Log Transform : %f'% all_data['LotArea'].kurt())

print('Skewness of MasVnrArea before Log Transform : %f'% all_data['MasVnrArea'].skew())
print('Kurtosis of MasVnrArea before Log Transform : %f'% all_data['MasVnrArea'].kurt())


'''
TotalBsmtSF, OpenPorchSF, LotArea, MasVnrArea
These features has high Skewness & Kurtosis
'''
TotalBsmtSFMean = all_data['TotalBsmtSF'].mean()
all_data.loc[all_data['TotalBsmtSF'] == 0, 'TotalBsmtSF'] = np.round(TotalBsmtSFMean).astype(int)
all_data['TotalBsmtSF'] = np.log(all_data['TotalBsmtSF'])
print('Skewness of TotalBsmtSF after Log Transform : %f'% all_data['TotalBsmtSF'].skew())
print('Kurtosis of TotalBsmtSF after Log Transform : %f'% all_data['TotalBsmtSF'].kurt())

OpenPorchSFMean = all_data['OpenPorchSF'].mean()
all_data.loc[all_data['OpenPorchSF'] == 0, 'OpenPorchSF'] = np.round(OpenPorchSFMean).astype(int)
all_data['TotalBsmtSF'] = np.log(all_data['TotalBsmtSF'])
print('Skewness of TotalBsmtSF after Log Transform : %f'% all_data['TotalBsmtSF'].skew())
print('Kurtosis of TotalBsmtSF after Log Transform : %f'% all_data['TotalBsmtSF'].kurt())

all_data['LotArea'] = np.log(all_data['LotArea'])
print('Skewness of LotArea after Log Transform : %f'% all_data['LotArea'].skew())
print('Kurtosis of LotArea after Log Transform : %f'% all_data['LotArea'].kurt())

MasVnrAreaMean = all_data['MasVnrArea'].mean()
all_data.loc[all_data['MasVnrArea'] == 0, 'MasVnrArea'] = np.round(MasVnrAreaMean).astype(int)
'''
This is to handle 'NA' values in MasVnrArea column
ms_df = all_data['MasVnrArea']
print(np.any(np.isnan(ms_df)))
If True, then find where NaNs exist
print(np.where(np.isnan(ms_df)))
''' 
all_data.loc[all_data['MasVnrArea'].isnull(), 'MasVnrArea'] = np.round(MasVnrAreaMean).astype(int)
all_data['MasVnrArea'] = np.log(all_data['MasVnrArea'])
print('Skewness of MasVnrArea after Log Transform : %f'% all_data['MasVnrArea'].skew())
print('Kurtosis of MasVnrArea after Log Transform : %f'% all_data['MasVnrArea'].kurt())

Skewness of OverallQual before Log Transform : 0.197212
Kurtosis of OverallQual before Log Transform : 0.067219
Skewness of GarageCars before Log Transform : -0.219694
Kurtosis of GarageCars before Log Transform : 0.236592
Skewness of YearBuilt before Log Transform : -0.600114
Kurtosis of YearBuilt before Log Transform : -0.511317
Skewness of FullBath before Log Transform : 0.167692
Kurtosis of FullBath before Log Transform : -0.538129
Skewness of TotalBsmtSF before Log Transform : 1.157489
Kurtosis of TotalBsmtSF before Log Transform : 9.122827
Skewness of YearRemodAdd before Log Transform : -0.451252
Kurtosis of YearRemodAdd before Log Transform : -1.346431
Skewness of TotRmsAbvGrd before Log Transform : 0.758757
Kurtosis of TotRmsAbvGrd before Log Transform : 1.169064
Skewness of Fireplaces before Log Transform : 0.733872
Kurtosis of Fireplaces before Log Transform : 0.076424
Skewness of OpenPorchSF before Log Transform : 2.536417
Kurtosis of OpenPorchSF before Log Transform : 10.93

In [6]:
import statsmodels.formula.api as sm

significance = 0.03
all_data = pd.get_dummies(all_data)
train_data_features = all_data[:num_train_rows]
test_data_features = all_data[num_train_rows:]
train_data_features_ones = np.append(arr=np.ones((1460,1)).astype(int), values=train_data_features, axis=1)
cols = train_data_features.columns.values
cols = np.insert(cols, 0, 'intercept')

def backwardElimination(cols, train_num_features_ones):
    for i in range (0, train_num_features_ones.shape[1]):
        regressor_OLS = sm.OLS(endog=target, exog=train_num_features_ones).fit()
        maxPVal = max(regressor_OLS.pvalues)
        if maxPVal > significance:
            for j in range(0, train_num_features_ones.shape[1]):
                if (regressor_OLS.pvalues[j].astype(float) == maxPVal):
                    train_num_features_ones = np.delete(train_num_features_ones, j, 1)
                    cols = np.delete(cols, j)
    print(regressor_OLS.summary())
    return cols, train_num_features_ones

cols, train_num_features_ones_df = backwardElimination(cols, train_data_features_ones)
print(cols)

                            OLS Regression Results                            
Dep. Variable:              SalePrice   R-squared:                       1.000
Model:                            OLS   Adj. R-squared:                  1.000
Method:                 Least Squares   F-statistic:                 3.066e+05
Date:                Sat, 07 Jul 2018   Prob (F-statistic):               0.00
Time:                        20:38:39   Log-Likelihood:                 1268.8
No. Observations:                1460   AIC:                            -2410.
Df Residuals:                    1396   BIC:                            -2071.
Df Model:                          64                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
x1             0.0737      0.008      9.818      0.0

In [7]:
selected_features = \
all_data.loc[:,['LotArea' ,'OverallQual' ,'OverallCond' ,'YearBuilt' ,
'YearRemodAdd' ,'BsmtFinSF1' ,'BsmtFinSF2' ,'TotalBsmtSF' ,'GrLivArea' ,
'BsmtFullBath' ,'KitchenAbvGr' ,'Fireplaces' ,'GarageCars' ,'GarageArea' ,
'WoodDeckSF' ,'EnclosedPorch' ,'ScreenPorch' ,'PoolArea' ,'YrSold' ,'MSZoning_FV' ,
'MSZoning_RH' ,'MSZoning_RL' ,'MSZoning_RM' ,'LotConfig_CulDSac' ,
'Neighborhood_BrkSide' ,'Neighborhood_Crawfor' ,'Neighborhood_Edwards' ,
'Neighborhood_MeadowV' ,'Neighborhood_Mitchel' ,'Neighborhood_NridgHt' ,
'Neighborhood_StoneBr' ,'Condition1_Norm' ,'Condition2_PosA' ,'Condition2_PosN' ,
'RoofMatl_CompShg' ,'RoofMatl_Membran' ,'RoofMatl_Metal' ,'RoofMatl_Roll' ,
'RoofMatl_Tar&Grv' ,'RoofMatl_WdShake' ,'RoofMatl_WdShngl' ,'Exterior1st_BrkFace' ,
'Foundation_BrkTil' ,'Foundation_CBlock' ,'Foundation_PConc' ,'Foundation_Stone' ,
'BsmtQual_Ex' ,'BsmtExposure_Gd' ,'BsmtFinType2_Unf' ,'Heating_GasA' ,'Heating_GasW' ,
'Heating_Wall' ,'HeatingQC_Ex' ,'CentralAir_Y' ,'KitchenQual_Ex' ,'Functional_Maj1' ,
'Functional_Min1' ,'Functional_Min2' ,'Functional_Mod' ,'Functional_Typ' ,'PoolQC_None' ,
'SaleType_ConLD' ,'SaleType_New' ,'SaleCondition_Normal']]

print(selected_features.shape)

# check whether there are any NaNs in the dataframe
print(np.any(np.isnan(selected_features)))

#If True, then find where NaNs exist
print(np.where(np.isnan(selected_features)))

train_selected_features = selected_features[:num_train_rows]
test_selected_features = selected_features[num_train_rows:]

(2919, 64)
False
(array([], dtype=int64), array([], dtype=int64))


In [8]:
'''
Establish a basline model - Linear Regression
Baseline Accuracy - 82.87
'''
from sklearn.cross_validation import train_test_split
X_train , X_test, y_train, y_test = train_test_split(train_selected_features, target, test_size=0.4, random_state=0)

from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)
y_train_predict = regressor.predict(X_test)
print("Accuracy --> ", regressor.score(X_test, y_test)*100)



Accuracy -->  82.86856255173419


In [9]:
'''
Implement LASSO Regression with K-Fold
Lasso 5-fold Accuracy -->  86.03
'''
from sklearn.linear_model import Lasso, LassoCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold, cross_val_predict, cross_val_score
from sklearn import metrics

#lasso = make_pipeline(RobustScaler(), Lasso(alpha =0.0005, random_state=0))
#scores = cross_val_score(lasso, train_selected_features, target, cv=kf)
#predictions = cross_val_predict(lasso, train_selected_features, target, cv=kf)
#print("Lasso 5-fold Accuracy --> ", metrics.r2_score(y_pred=predictions, y_true=target))
kf = KFold(n_splits=5, shuffle=True, random_state=0)
lasso = make_pipeline(RobustScaler(), LassoCV(alphas=[0.002], random_state=0, cv=kf))
lasso.fit(X_train, y_train)
print("Accuracy --> ", lasso.score(X_test, y_test)*100)

Accuracy -->  86.03848907931788


  y = column_or_1d(y, warn=True)


In [10]:
'''
Implement Ridge Regression with K-Fold
KernelRidge 5-fold Accuracy -->  86.24
'''
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold, cross_val_predict, cross_val_score
from sklearn import metrics

#ridge = make_pipeline(RobustScaler(), Ridge(alpha =0.8, random_state=0))
#kf = KFold(n_splits=5, shuffle=True, random_state=0)
#predictions = cross_val_predict(ridge, train_selected_features, target, cv=kf)
#print("KernelRidge 5-fold Accuracy --> ", metrics.r2_score(y_pred=predictions, y_true=target))

kf = KFold(n_splits=5, shuffle=True, random_state=0)
ridge = make_pipeline(RobustScaler(), RidgeCV(alphas=[2.5], cv=kf))
ridge.fit(X_train, y_train)
print("Accuracy --> ", ridge.score(X_test, y_test)*100)

Accuracy -->  86.2492416142501


In [11]:
'''
Implement ElasticNet Regression with K-Fold
ElasticNet 5-fold Accuracy -->  86.17
'''
from sklearn.linear_model import ElasticNet, ElasticNetCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold, cross_val_predict, cross_val_score
from sklearn import metrics

#ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.3, random_state=0))
#kf = KFold(n_splits=5, shuffle=True, random_state=0)
#predictions = cross_val_predict(ENet, train_selected_features, target, cv=kf)
#print("ElasticNet 5-fold Accuracy --> ", metrics.r2_score(y_pred=predictions, y_true=target))

kf = KFold(n_splits=5, shuffle=True, random_state=0)
ENet = make_pipeline(RobustScaler(), ElasticNetCV(alphas=[0.003], cv=kf, random_state=0))
ENet.fit(X_train, y_train)
print("Accuracy --> ", ENet.score(X_test, y_test)*100)

Accuracy -->  86.16665216650719


  y = column_or_1d(y, warn=True)


In [None]:
'''
Random Forest Accuracy -->  88.74370937589907
'''
from sklearn.ensemble import RandomForestRegressor
forest_regr = RandomForestRegressor(random_state=0, n_estimators=70,
                        min_samples_split=2, min_samples_leaf=1, max_features='sqrt',
                        max_depth=10, bootstrap=False)
forest_regr.fit(X_train, y_train)
print("Random Forest Accuracy --> ", forest_regr.score(X_test, y_test)*100)

In [12]:
'''
Simple Stacking of 4 models
'''
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
class AverageModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, models):
        self.models = models
        
    # we define clones of the original models to fit the data in
    def fit(self, X, y):
        self.models_ = [clone(x) for x in self.models]
        # Train cloned base models
        for model in self.models_:
            model.fit(X, y)
        return self
    
    #Now we do the predictions for cloned models and average them
    def predict(self, X):
        predictions = np.column_stack([
            model.predict(X) for model in self.models_
        ])
        return np.mean(predictions, axis=1)

In [13]:
'''
Averaged 5-fold Accuracy --> 86.38
'''
#average_models = make_pipeline(RobustScaler(), AverageModels(models = (ENet, ridge, lasso)))
#kf = KFold(n_splits=5, shuffle=True, random_state=0)
#predictions = cross_val_predict(average_models, train_selected_features, target, cv=kf)
#print("Averaged 5-fold Accuracy --> ", metrics.r2_score(y_pred=predictions, y_true=target))

average_models = make_pipeline(RobustScaler(), AverageModels(models = (ENet, ridge, lasso, forest_regr)))
average_models.fit(X_train, y_train)
print("Accuracy --> ", average_models.score(X_test, y_test)*100)

Accuracy -->  86.38102459017608


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [14]:
class StackingAveragedModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, base_models, meta_model, n_folds=5):
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_folds = n_folds
   
    # We again fit the data on clones of the original models
    def fit(self, X, y):
        self.base_models_ = [list() for x in self.base_models]
        self.meta_model_ = clone(self.meta_model)
        kfold = KFold(n_splits=self.n_folds, shuffle=True, random_state=0)
        
        # Train cloned base models then create out-of-fold predictions
        # that are needed to train the cloned meta-model
        out_of_fold_predictions = np.zeros((X.shape[0], len(self.base_models)))
        for i, model in enumerate(self.base_models):
            for train_index, holdout_index in kfold.split(X, y):
                instance = clone(model)
                self.base_models_[i].append(instance)
                instance.fit(X[train_index], y.iloc[train_index])
                y_pred = instance.predict(X[holdout_index])
                out_of_fold_predictions[holdout_index, i] = y_pred.flatten()
                
        # Now train the cloned  meta-model using the out-of-fold predictions as new feature
        self.meta_model_.fit(out_of_fold_predictions, y)
        return self
   
    #Do the predictions of all base models on the test data and use the averaged predictions as 
    #meta-features for the final prediction which is done by the meta-model
    def predict(self, X):
        meta_features = np.column_stack([
            np.column_stack([model.predict(X) for model in base_models]).mean(axis=1)
            for base_models in self.base_models_ ])
        return self.meta_model_.predict(meta_features)

In [15]:
stacked_averaged_models = make_pipeline(RobustScaler(), \
                StackingAveragedModels(base_models = (ENet, lasso, forest_regr), meta_model = ridge))
X_train , X_test, y_train, y_test = train_test_split(train_selected_features, target, test_size=0.4, random_state=0)
stacked_averaged_models.fit(X_train, y_train)
stacked_train_pred = stacked_averaged_models.predict(X_test)
print("Stacked 5-fold Accuracy --> ", metrics.r2_score(y_pred=stacked_train_pred, y_true=y_test))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Stacked 5-fold Accuracy -->  0.8629723279049879


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [None]:
'''
LinearRegression : 0.16532
lassocv : 0.12517
ridgecv : 0.12809
elasticnet : 0.12463
average : 0.12445 without Random Forest
average : 0.12504 with Random Forest
stacked : 0.12435
'''