In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
%matplotlib inline
np.set_printoptions(precision=2)
pd.set_option('precision', 2)
np.set_printoptions(suppress=True)
sns.set(style='whitegrid')

In [2]:
#1. Load training and test datasets
train_path = 'data/train.csv'
test_path = 'data/test.csv'
train_data = pd.read_csv(train_path, index_col='Id')
test_data = pd.read_csv(test_path, index_col='Id')

#1a. Seperate features and target from training dataset.
features = train_data.iloc[:,:-1]
target = train_data.loc[:, ['SalePrice']]
print('Train Set Size : ', train_data.shape)
print('Test Set Size : ', test_data.shape)
print('Train Features Size : ', features.shape)
num_train_rows = train_data.shape[0]
num_test_rows = test_data.shape[0]
print('Train Rows : ', num_train_rows)
print('Test Rows : ', num_test_rows)

#1b. Merge training and test datasets to cover all 
#encodings for categorical features
all_data = pd.concat((features, test_data)).reset_index(drop=True)

Train Set Size :  (1460, 80)
Test Set Size :  (1459, 79)
Train Features Size :  (1460, 79)
Train Rows :  1460
Test Rows :  1459


In [3]:
'''
2. In Numerical feature analysis, I found that SalPrice is NOT NORMALLY DISTRIBUTED.
So, apply LOG TRANSFORMATION to bring SalePrice closer to Normal Distribution.
'''
print('Skewness of SalePrice before Log Transform : %f'% target.skew())
print('Kurtosis of SalePrice before Log Transform : %f'% target.kurt())

'''
Skew = 1.882876 indicates positive skew with tail to the right.
Kurt = 6.536282 indicates heavy tails i.e. more data on tails.
'''

#Apply Log transformation
target['SalePrice'] = np.log(target['SalePrice'])
print('Skewness of SalePrice after Log Transform : %f'% target.skew())
print('Kurtosis of SalePrice after Log Transform : %f'% target.kurt())

Skewness of SalePrice before Log Transform : 1.882876
Kurtosis of SalePrice before Log Transform : 6.536282
Skewness of SalePrice after Log Transform : 0.121335
Kurtosis of SalePrice after Log Transform : 0.809532


In [4]:
#3. Missing Data
null_features = all_data.columns[all_data.isnull().any()]
missing_ratio = (all_data[null_features].isnull().sum()/len(all_data)) * 100
missing_data = pd.DataFrame({'Missing Ratio' :missing_ratio})
print(missing_data.sort_values(by='Missing Ratio',ascending=False))


for col in ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu']:
    all_data[col] = all_data[col].fillna('None')

for col in ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'MasVnrType']:
    all_data[col] = all_data[col].fillna('None')
    
for col in ['BsmtExposure', 'BsmtFinType2', 'BsmtQual', 'BsmtCond', 'BsmtFinType1']:
    all_data[col] = all_data[col].fillna('None')

'''
No GarageYrBlt means no Garage. I can impute mean/median since it would 
incorrectly convey existence of Garage. same reasoning for MasVnrArea.
'''
for col in ['GarageYrBlt', 'MasVnrArea']:
    all_data[col] = all_data[col].fillna(0)

'''
Group data by neighborhood & imputed null LotFrontage columns with median of
grouped data.
'''
all_data['LotFrontage'] = all_data.groupby(['Neighborhood'])\
                    ['LotFrontage'].transform(lambda x : x.fillna(x.median()))
    
all_data['Electrical'] = \
    all_data['Electrical'].fillna(all_data['Electrical'].mode()[0])
    
all_data['MSZoning'] = \
    all_data['MSZoning'].fillna(all_data['MSZoning'].mode()[0])
    
all_data['Utilities'] = all_data['Utilities'].fillna('ELO')

all_data['Exterior1st'] = all_data['Exterior1st'].fillna('Other')

all_data['Exterior2nd'] = all_data['Exterior2nd'].fillna('Other')

all_data['SaleType'] = all_data['SaleType'].fillna('Oth')

all_data['Functional'] = \
    all_data['Functional'].fillna(all_data['Functional'].mode()[0])

all_data['KitchenQual'] = \
    all_data['KitchenQual'].fillna(all_data['KitchenQual'].mode()[0])

for col in ['BsmtFullBath', 'BsmtHalfBath', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'GarageCars', 'GarageArea']:
    all_data[col] = all_data[col].fillna(0)
    
null_features = all_data.columns[all_data.isnull().any()]
missing_ratio = (all_data[null_features].isnull().sum()/len(all_data)) * 100
missing_data = pd.DataFrame({'Missing Ratio' :missing_ratio})
print(missing_data.sort_values(by='Missing Ratio',ascending=False))

              Missing Ratio
PoolQC                99.66
MiscFeature           96.40
Alley                 93.22
Fence                 80.44
FireplaceQu           48.65
LotFrontage           16.65
GarageFinish           5.45
GarageQual             5.45
GarageCond             5.45
GarageYrBlt            5.45
GarageType             5.38
BsmtExposure           2.81
BsmtCond               2.81
BsmtQual               2.77
BsmtFinType2           2.74
BsmtFinType1           2.71
MasVnrType             0.82
MasVnrArea             0.79
MSZoning               0.14
BsmtFullBath           0.07
BsmtHalfBath           0.07
Functional             0.07
Utilities              0.07
GarageArea             0.03
GarageCars             0.03
Electrical             0.03
KitchenQual            0.03
TotalBsmtSF            0.03
BsmtUnfSF              0.03
BsmtFinSF2             0.03
BsmtFinSF1             0.03
Exterior2nd            0.03
Exterior1st            0.03
SaleType               0.03
Empty DataFrame
Colu

In [5]:
'''
                    4.Numerical Features - Analysis
                    
Linear models assume normal distribution of the features. So, lets check if they 
follow normal distribution or apply transformation to correct them.

Selected Features : 
    OverallQual, GarageCars, YearBuilt, FullBath, TotalBsmtSF, 
    YearRemodAdd, TotRmsAbvGrd, Fireplaces, OpenPorchSF, LotArea, 
    MasVnrArea
'''
print('Skewness of OverallQual before Log Transform : %f'% all_data['OverallQual'].skew())
print('Kurtosis of OverallQual before Log Transform : %f'% all_data['OverallQual'].kurt())

print('Skewness of GarageCars before Log Transform : %f'% all_data['GarageCars'].skew())
print('Kurtosis of GarageCars before Log Transform : %f'% all_data['GarageCars'].kurt())

print('Skewness of YearBuilt before Log Transform : %f'% all_data['YearBuilt'].skew())
print('Kurtosis of YearBuilt before Log Transform : %f'% all_data['YearBuilt'].kurt())

print('Skewness of FullBath before Log Transform : %f'% all_data['FullBath'].skew())
print('Kurtosis of FullBath before Log Transform : %f'% all_data['FullBath'].kurt())

print('Skewness of TotalBsmtSF before Log Transform : %f'% all_data['TotalBsmtSF'].skew())
print('Kurtosis of TotalBsmtSF before Log Transform : %f'% all_data['TotalBsmtSF'].kurt())

print('Skewness of YearRemodAdd before Log Transform : %f'% all_data['YearRemodAdd'].skew())
print('Kurtosis of YearRemodAdd before Log Transform : %f'% all_data['YearRemodAdd'].kurt())

print('Skewness of TotRmsAbvGrd before Log Transform : %f'% all_data['TotRmsAbvGrd'].skew())
print('Kurtosis of TotRmsAbvGrd before Log Transform : %f'% all_data['TotRmsAbvGrd'].kurt())

print('Skewness of Fireplaces before Log Transform : %f'% all_data['Fireplaces'].skew())
print('Kurtosis of Fireplaces before Log Transform : %f'% all_data['Fireplaces'].kurt())

print('Skewness of OpenPorchSF before Log Transform : %f'% all_data['OpenPorchSF'].skew())
print('Kurtosis of OpenPorchSF before Log Transform : %f'% all_data['OpenPorchSF'].kurt())

print('Skewness of LotArea before Log Transform : %f'% all_data['LotArea'].skew())
print('Kurtosis of LotArea before Log Transform : %f'% all_data['LotArea'].kurt())

print('Skewness of MasVnrArea before Log Transform : %f'% all_data['MasVnrArea'].skew())
print('Kurtosis of MasVnrArea before Log Transform : %f'% all_data['MasVnrArea'].kurt())


Skewness of OverallQual before Log Transform : 0.197212
Kurtosis of OverallQual before Log Transform : 0.067219
Skewness of GarageCars before Log Transform : -0.219694
Kurtosis of GarageCars before Log Transform : 0.236592
Skewness of YearBuilt before Log Transform : -0.600114
Kurtosis of YearBuilt before Log Transform : -0.511317
Skewness of FullBath before Log Transform : 0.167692
Kurtosis of FullBath before Log Transform : -0.538129
Skewness of TotalBsmtSF before Log Transform : 1.157489
Kurtosis of TotalBsmtSF before Log Transform : 9.122827
Skewness of YearRemodAdd before Log Transform : -0.451252
Kurtosis of YearRemodAdd before Log Transform : -1.346431
Skewness of TotRmsAbvGrd before Log Transform : 0.758757
Kurtosis of TotRmsAbvGrd before Log Transform : 1.169064
Skewness of Fireplaces before Log Transform : 0.733872
Kurtosis of Fireplaces before Log Transform : 0.076424
Skewness of OpenPorchSF before Log Transform : 2.536417
Kurtosis of OpenPorchSF before Log Transform : 10.93

In [6]:
'''
TotalBsmtSF, OpenPorchSF, LotArea, MasVnrArea
These features has high Skewness & Kurtosis
'''
TotalBsmtSFMean = all_data['TotalBsmtSF'].mean()
all_data.loc[all_data['TotalBsmtSF'] == 0, 'TotalBsmtSF'] = np.round(TotalBsmtSFMean).astype(int)
all_data['TotalBsmtSF'] = np.log(all_data['TotalBsmtSF'])
print('Skewness of TotalBsmtSF after Log Transform : %f'% all_data['TotalBsmtSF'].skew())
print('Kurtosis of TotalBsmtSF after Log Transform : %f'% all_data['TotalBsmtSF'].kurt())

OpenPorchSFMean = all_data['OpenPorchSF'].mean()
all_data.loc[all_data['OpenPorchSF'] == 0, 'OpenPorchSF'] = np.round(OpenPorchSFMean).astype(int)
all_data['TotalBsmtSF'] = np.log(all_data['TotalBsmtSF'])
print('Skewness of TotalBsmtSF after Log Transform : %f'% all_data['TotalBsmtSF'].skew())
print('Kurtosis of TotalBsmtSF after Log Transform : %f'% all_data['TotalBsmtSF'].kurt())

all_data['LotArea'] = np.log(all_data['LotArea'])
print('Skewness of LotArea after Log Transform : %f'% all_data['LotArea'].skew())
print('Kurtosis of LotArea after Log Transform : %f'% all_data['LotArea'].kurt())

MasVnrAreaMean = all_data['MasVnrArea'].mean()
all_data.loc[all_data['MasVnrArea'] == 0, 'MasVnrArea'] = np.round(MasVnrAreaMean).astype(int)
'''
This is to handle 'NA' values in MasVnrArea column
ms_df = all_data['MasVnrArea']
print(np.any(np.isnan(ms_df)))
If True, then find where NaNs exist
print(np.where(np.isnan(ms_df)))
''' 
all_data.loc[all_data['MasVnrArea'].isnull(), 'MasVnrArea'] = np.round(MasVnrAreaMean).astype(int)
all_data['MasVnrArea'] = np.log(all_data['MasVnrArea'])
print('Skewness of MasVnrArea after Log Transform : %f'% all_data['MasVnrArea'].skew())
print('Kurtosis of MasVnrArea after Log Transform : %f'% all_data['MasVnrArea'].kurt())

Skewness of TotalBsmtSF after Log Transform : -0.424181
Kurtosis of TotalBsmtSF after Log Transform : 1.712836
Skewness of TotalBsmtSF after Log Transform : -0.742638
Kurtosis of TotalBsmtSF after Log Transform : 2.775776
Skewness of LotArea after Log Transform : -0.505542
Kurtosis of LotArea after Log Transform : 3.754157
Skewness of MasVnrArea after Log Transform : 0.242429
Kurtosis of MasVnrArea after Log Transform : 5.717325


In [7]:
'''
    5. Standardize Numerical Data
    OverallQual, GarageCars, YearBuilt, FullBath, TotalBsmtSF, 
    YearRemodAdd, TotRmsAbvGrd, Fireplaces, OpenPorchSF, LotArea, 
    MasVnrArea
'''
num_features = all_data.loc[:,['OverallQual', 'GarageCars', 'YearBuilt', 'FullBath', 
                              'TotalBsmtSF', 'YearRemodAdd', 'TotRmsAbvGrd',
                              'Fireplaces', 'OpenPorchSF', 'LotArea', 'MasVnrArea']]

num_features_std = (num_features - num_features.mean())/num_features.std()
print(num_features_std.shape)

# check whether there are any NaNs in the dataframe
print(np.any(np.isnan(num_features_std)))

#If True, then find where NaNs exist
print(np.where(np.isnan(num_features_std)))

(2919, 11)
False
(array([], dtype=int64), array([], dtype=int64))


In [8]:
'''
    6. One Hot Encoding for Categorical Features
'''
cat_features = all_data.loc[:,['Neighborhood', 'PoolQC', 'Heating', 'Condition2']]
cat_features = pd.get_dummies(cat_features)
print(cat_features.shape)

(2919, 43)


In [9]:
'''
                            7. LINEAR MODEL
'''

all_data_final_features = pd.concat([num_features_std, cat_features], axis=1)

train_data_final_features = all_data_final_features[:num_train_rows]
test_data_final_features = all_data_final_features[num_train_rows:]

from sklearn.cross_validation import train_test_split
X_train , X_test, y_train, y_test = train_test_split(train_data_final_features, target, test_size=0.4, random_state=0)

from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

from sklearn import metrics
y_train_predict = regressor.predict(X_test)
print('MAE',metrics.mean_absolute_error(y_test, y_train_predict))
print('MSE',metrics.mean_squared_error(y_test, y_train_predict))



MAE 0.11208114594884296
MSE 0.027909990064389233


In [10]:
'''
                            7a. RIDGE OR L2 REGULARIZATION MODEL
'''
from sklearn.linear_model import Ridge
ridge = Ridge(alpha=0.05, normalize=True)
ridge.fit(X_train, y_train)

from sklearn import metrics
y_ridge_train_predict = ridge.predict(X_test)
print('MAE',metrics.mean_absolute_error(y_test, y_ridge_train_predict))
print('MSE',metrics.mean_squared_error(y_test, y_ridge_train_predict))

MAE 0.11058322378090862
MSE 0.026801654823614924
