In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
%matplotlib inline
# Print numpy arrays with two decimals and without scientific notation.
np.set_printoptions(precision=2, suppress=True)
# Use the fully-qualified option name: the bare 'precision' pattern is
# reported as ambiguous by newer pandas releases that define more than one
# option ending in "precision".
pd.set_option('display.precision', 2)
# Apply seaborn's "whitegrid" style globally for the figures drawn later.
sns.set(style="whitegrid")

In [2]:
# 1. Load the training and test datasets, indexed by their 'Id' column.
train_path = 'data/train.csv'
test_path = 'data/test.csv'
train_data = pd.read_csv(train_path, index_col='Id')
test_data = pd.read_csv(test_path, index_col='Id')

# 1a. Separate features and target from the training dataset.
# The target column is removed by name (not by position) so the split does
# not depend on 'SalePrice' being the last column of the CSV.
features = train_data.drop('SalePrice', axis=1)
# Explicit copy: later assignments to `target` must never touch `train_data`.
target = train_data.loc[:, ['SalePrice']].copy()
print('Train Set Size : ', train_data.shape)
print('Test Set Size : ', test_data.shape)
print('Train Features Size : ', features.shape)
num_train_rows = train_data.shape[0]
num_test_rows = test_data.shape[0]
print('Train Rows : ', num_train_rows)
print('Test Rows : ', num_test_rows)

# 1b. Merge training and test datasets to cover all
# encodings for categorical features.
# Training rows come first, so all_data[:num_train_rows] is the training part
# and all_data[num_train_rows:] is the test part.
assert set(features.columns) == set(test_data.columns), \
    'train and test feature columns differ'
all_data = pd.concat((features, test_data)).reset_index(drop=True)

Train Set Size :  (1460, 80)
Test Set Size :  (1459, 79)
Train Features Size :  (1460, 79)
Train Rows :  1460
Test Rows :  1459


In [3]:
# 2. In numerical feature analysis, I found that SalePrice is NOT NORMALLY
# DISTRIBUTED. So, apply a LOG TRANSFORMATION to bring SalePrice closer to a
# normal distribution.

# Statistics are taken from the column itself (a Series) so that skew()/kurt()
# return scalars for the '%f' format rather than one-element Series.
raw_sale_price = train_data['SalePrice']
print('Skewness of SalePrice before Log Transform : %f' % raw_sale_price.skew())
print('Kurtosis of SalePrice before Log Transform : %f' % raw_sale_price.kurt())

# Skew = 1.882876 indicates positive skew with tail to the right.
# Kurt = 6.536282 indicates heavy tails i.e. more data on tails.

# Apply the log transformation. `target` is rebuilt from the untouched
# `train_data` column, so re-running this cell never log-transforms twice.
# It stays a single-column DataFrame named 'SalePrice'.
target = np.log(train_data[['SalePrice']])
print('Skewness of SalePrice after Log Transform : %f' % target['SalePrice'].skew())
print('Kurtosis of SalePrice after Log Transform : %f' % target['SalePrice'].kurt())

Skewness of SalePrice before Log Transform : 1.882876
Kurtosis of SalePrice before Log Transform : 6.536282
Skewness of SalePrice after Log Transform : 0.121335
Kurtosis of SalePrice after Log Transform : 0.809532


In [4]:
# 3. Missing Data
def missing_ratio_table(frame):
    """Return the percentage of null entries per column of `frame`.

    Only columns that contain at least one null are listed. The result is a
    DataFrame with a single 'Missing Ratio' column indexed by column name;
    it is empty when `frame` has no nulls.
    """
    null_columns = frame.columns[frame.isnull().any()]
    ratio = (frame[null_columns].isnull().sum() / len(frame)) * 100
    return pd.DataFrame({'Missing Ratio': ratio})


# Report before imputation.
missing_data = missing_ratio_table(all_data)
print(missing_data.sort_values(by='Missing Ratio', ascending=False))

# Categorical features whose nulls are replaced by the literal level 'None'.
none_fill_cols = [
    'PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
    'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'MasVnrType',
    'BsmtExposure', 'BsmtFinType2', 'BsmtQual', 'BsmtCond', 'BsmtFinType1',
]
for col in none_fill_cols:
    all_data[col] = all_data[col].fillna('None')

# Numeric features whose nulls are replaced by 0.
# No GarageYrBlt means no garage: I can't impute the mean/median since that
# would incorrectly convey the existence of a garage. Same reasoning for
# MasVnrArea and for the basement / garage counts and areas.
zero_fill_cols = [
    'GarageYrBlt', 'MasVnrArea',
    'BsmtFullBath', 'BsmtHalfBath', 'BsmtFinSF1', 'BsmtFinSF2',
    'BsmtUnfSF', 'TotalBsmtSF', 'GarageCars', 'GarageArea',
]
for col in zero_fill_cols:
    all_data[col] = all_data[col].fillna(0)

# Group data by neighborhood and impute null LotFrontage values with the
# median of each group.
all_data['LotFrontage'] = (
    all_data.groupby(['Neighborhood'])['LotFrontage']
    .transform(lambda x: x.fillna(x.median()))
)

# Categorical features imputed with their most frequent value.
for col in ['Electrical', 'MSZoning', 'Functional', 'KitchenQual']:
    all_data[col] = all_data[col].fillna(all_data[col].mode()[0])

# Categorical features imputed with a fixed level.
# NOTE(review): these levels become their own dummy columns after one-hot
# encoding -- confirm each level also occurs in the training rows.
constant_fills = {
    'Utilities': 'ELO',
    'Exterior1st': 'Other',
    'Exterior2nd': 'Other',
    'SaleType': 'Oth',
}
for col, fill_value in constant_fills.items():
    all_data[col] = all_data[col].fillna(fill_value)

# Report after imputation.
missing_data = missing_ratio_table(all_data)
print(missing_data.sort_values(by='Missing Ratio', ascending=False))

              Missing Ratio
PoolQC                99.66
MiscFeature           96.40
Alley                 93.22
Fence                 80.44
FireplaceQu           48.65
LotFrontage           16.65
GarageFinish           5.45
GarageQual             5.45
GarageCond             5.45
GarageYrBlt            5.45
GarageType             5.38
BsmtExposure           2.81
BsmtCond               2.81
BsmtQual               2.77
BsmtFinType2           2.74
BsmtFinType1           2.71
MasVnrType             0.82
MasVnrArea             0.79
MSZoning               0.14
BsmtFullBath           0.07
BsmtHalfBath           0.07
Functional             0.07
Utilities              0.07
GarageArea             0.03
GarageCars             0.03
Electrical             0.03
KitchenQual            0.03
TotalBsmtSF            0.03
BsmtUnfSF              0.03
BsmtFinSF2             0.03
BsmtFinSF1             0.03
Exterior2nd            0.03
Exterior1st            0.03
SaleType               0.03
Empty DataFrame
Colu

In [5]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import Ridge

# 4. One Hot Encoding for Categorical Variables
all_data = pd.get_dummies(all_data)

# 5. Standardize Features
# Mean and standard deviation are computed over the combined train + test rows.
all_data_std = (all_data - all_data.mean())/all_data.std()

# Split back by position: the first num_train_rows rows are the training rows.
train_data_features = all_data_std[:num_train_rows]
test_data_features = all_data_std[num_train_rows:]

# 6. Rank the features with recursive feature elimination around a Ridge model.
# NOTE(review): `normalize=True` is kept so the ranking is unchanged, but newer
# scikit-learn releases no longer accept this argument -- confirm the target
# version before upgrading.
regressor = Ridge(alpha=0.05, normalize=True)
# The number of features to keep is passed by keyword; newer scikit-learn
# versions do not accept it positionally.
rfe = RFE(regressor, n_features_to_select=11)
# y is flattened to 1-D: a single-column DataFrame makes scikit-learn emit a
# conversion warning before flattening it itself.
rfe = rfe.fit(train_data_features, target.values.ravel())
# ranking_ lists one rank per column of train_data_features; rank 1 marks the
# selected features.
print(rfe.ranking_)

  y = column_or_1d(y, warn=True)


[ 43  34  13   1   1   1   9 142   2  94 165   1   1   1 193   1  22 247
  32  33 106  20  19  36 249   1   7  29  98  79 109  28  72 208 245 169
   4 113 162 120   3 174 102 250 218  62 159 138 292 270  86 160 130 161
  88   1 181 190  35  97 115  96 281 101  76 216 173  83 189 129 237  12
  31 236  80  30  68  82 255  85  47  25  63 219  84 215  27  26 164 108
  51  91  21 200 170  50 293 204 211 289 135 140  49   6  78 272 223 158
 143 260  23  64 100 212  42 192  99 213 256 214 168 117 294 280 166  77
   1 167  87 145 240 242 290 119 279 285  45  24 225 144  57 262 112   1
 287 243 180 251  58 274 141 226 263 116 146 121 152 271 151 177 153 178
 179 195  71 154 124 246  95 155 253 228 276 241 107 133 199 252  67  81
 234   5 183 134 110  38 188 261 259  92  90 233 244 156 273 269  16 235
  54 118 182 284  44 126 264 224  75 127  89 128 268  74 267 163 191 239
  46  40 291 185  17 232 231 171 111  60  15 203 265 149 157 103  14 222
 221 139 196  37 227 197  56  53  18 283 202 136   

In [6]:
# Based on the ranking, I picked the following features:
#   OverallQual, OverallCond, YearBuilt, TotalBsmtSF,
#   1stFlrSF, 2ndFlrSF, GrLivArea, GarageCars, Utilities_ELO,
#   RoofMatl_ClyTile, Exterior1st_Other.
#
# Since TotalBsmtSF, 1stFlrSF, 2ndFlrSF are similar, I drop 1stFlrSF, 2ndFlrSF.
#
# Final features:
#   OverallQual, OverallCond, YearBuilt, TotalBsmtSF,
#   GrLivArea, GarageCars, Utilities_ELO,
#   RoofMatl_ClyTile, Exterior1st_Other.
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # older scikit-learn that only ships sklearn.cross_validation
    from sklearn.cross_validation import train_test_split
from sklearn.linear_model import Ridge
from sklearn import metrics

selected_features = [
    'OverallQual', 'OverallCond', 'YearBuilt',
    'TotalBsmtSF', 'GrLivArea', 'GarageCars',
    'Utilities_ELO', 'RoofMatl_ClyTile', 'Exterior1st_Other',
]
final_all_data_std = all_data_std.loc[:, selected_features]

# Split back by position: the first num_train_rows rows are the training rows.
train_data_std_features = final_all_data_std[:num_train_rows]
test_data_std_features = final_all_data_std[num_train_rows:]

# Hold out 40% of the labelled rows for evaluation; fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    train_data_std_features, target, test_size=0.4, random_state=0)

regressor = Ridge()
regressor.fit(X_train, y_train)

# Despite its name, y_train_predict holds the predictions for the held-out
# X_test split.
y_train_predict = regressor.predict(X_test)
# `target` was log-transformed earlier, so MAE/MSE are on the log(SalePrice) scale.
print('MAE',metrics.mean_absolute_error(y_test, y_train_predict))
print('MSE',metrics.mean_squared_error(y_test, y_train_predict))
# regressor.score() is the R^2 of the regression, printed here as a percentage.
print("Accuracy --> ", regressor.score(X_test, y_test)*100)

MAE 0.10163147444173537
MSE 0.029411021360469008
Accuracy -->  80.52502621772013


