In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
%matplotlib inline
np.set_printoptions(precision=2)
pd.set_option('precision', 2)
np.set_printoptions(suppress=True)
sns.set(style='whitegrid')

In [2]:
train_path = 'data/train.csv'
test_path = 'data/test.csv'
train_data = pd.read_csv(train_path, index_col='Id')
test_data = pd.read_csv(test_path, index_col='Id')
features = train_data.iloc[:,:-1]
target = train_data.loc[:, ['SalePrice']]
print('Train Set Size : ', train_data.shape)
print('Test Set Size : ', test_data.shape)
print('Train Features Size : ', features.shape)
num_train_rows = train_data.shape[0]
num_test_rows = test_data.shape[0]
print('Train Rows : ', num_train_rows)
print('Test Rows : ', num_test_rows)
all_data = pd.concat((features, test_data)).reset_index(drop=True)

Train Set Size :  (1460, 80)
Test Set Size :  (1459, 79)
Train Features Size :  (1460, 79)
Train Rows :  1460
Test Rows :  1459


In [3]:
# Missing Data
null_features = all_data.columns[all_data.isnull().any()]
missing_ratio = (all_data[null_features].isnull().sum()/len(all_data)) * 100
missing_data = pd.DataFrame({'Missing Ratio' :missing_ratio})
print(missing_data.sort_values(by='Missing Ratio',ascending=False))


for col in ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu']:
    all_data[col] = all_data[col].fillna('None')

for col in ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'MasVnrType']:
    all_data[col] = all_data[col].fillna('None')
    
for col in ['BsmtExposure', 'BsmtFinType2', 'BsmtQual', 'BsmtCond', 'BsmtFinType1']:
    all_data[col] = all_data[col].fillna('None')

'''
No GarageYrBlt means no Garage. We can impute mean/median since it would 
incorrectly convey existence of Garage. same reasoning for MasVnrArea.
'''
for col in ['GarageYrBlt', 'MasVnrArea']:
    all_data[col] = all_data[col].fillna(0)

'''
Group data by neighborhood & imputed null LotFrontage columns with median of
grouped data.
'''
all_data['LotFrontage'] = all_data.groupby(['Neighborhood'])\
                    ['LotFrontage'].transform(lambda x : x.fillna(x.median()))
    
all_data['Electrical'] = \
    all_data['Electrical'].fillna(all_data['Electrical'].mode()[0])
    
all_data['MSZoning'] = \
    all_data['MSZoning'].fillna(all_data['MSZoning'].mode()[0])
    
all_data['Utilities'] = all_data['Utilities'].fillna('ELO')

all_data['Exterior1st'] = all_data['Exterior1st'].fillna('Other')

all_data['Exterior2nd'] = all_data['Exterior2nd'].fillna('Other')

all_data['SaleType'] = all_data['SaleType'].fillna('Oth')

all_data['Functional'] = \
    all_data['Functional'].fillna(all_data['Functional'].mode()[0])

all_data['KitchenQual'] = \
    all_data['KitchenQual'].fillna(all_data['KitchenQual'].mode()[0])

for col in ['BsmtFullBath', 'BsmtHalfBath', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'GarageCars', 'GarageArea']:
    all_data[col] = all_data[col].fillna(0)
    
null_features = all_data.columns[all_data.isnull().any()]
missing_ratio = (all_data[null_features].isnull().sum()/len(all_data)) * 100
missing_data = pd.DataFrame({'Missing Ratio' :missing_ratio})
print(missing_data.sort_values(by='Missing Ratio',ascending=False))

              Missing Ratio
PoolQC                99.66
MiscFeature           96.40
Alley                 93.22
Fence                 80.44
FireplaceQu           48.65
LotFrontage           16.65
GarageFinish           5.45
GarageQual             5.45
GarageCond             5.45
GarageYrBlt            5.45
GarageType             5.38
BsmtExposure           2.81
BsmtCond               2.81
BsmtQual               2.77
BsmtFinType2           2.74
BsmtFinType1           2.71
MasVnrType             0.82
MasVnrArea             0.79
MSZoning               0.14
BsmtFullBath           0.07
BsmtHalfBath           0.07
Functional             0.07
Utilities              0.07
GarageArea             0.03
GarageCars             0.03
Electrical             0.03
KitchenQual            0.03
TotalBsmtSF            0.03
BsmtUnfSF              0.03
BsmtFinSF2             0.03
BsmtFinSF1             0.03
Exterior2nd            0.03
Exterior1st            0.03
SaleType               0.03
Empty DataFrame
Colu

In [4]:
all_data = pd.get_dummies(all_data)
all_data.insert(0, 'intercept', np.ones((2919,1)))
train_fmt_set = all_data[:num_train_rows]
test_fmt_set = all_data[num_train_rows:]

from sklearn.cross_validation import train_test_split
X_train , X_test, y_train, y_test = train_test_split(train_fmt_set, target, test_size=0.4, random_state=0)

print(X_train.shape)
print(X_test.shape)

(876, 305)
(584, 305)




In [5]:
from sklearn.ensemble import RandomForestRegressor
#regressor = RandomForestRegressor(n_estimators=100, random_state=0, max_depth=6, min_samples_split=0.03, min_samples_leaf=0.01, max_features = 8)
regressor = RandomForestRegressor(random_state=0)
regressor.fit(X_train, y_train)

from sklearn import metrics
y_train_predict = regressor.predict(X_test)
print('Baseline MAE',metrics.mean_absolute_error(y_test, y_train_predict))
print('Baseline MSE',metrics.mean_squared_error(y_test, y_train_predict))
print('Baseline RMSE',np.sqrt(metrics.mean_squared_error(y_test, y_train_predict)))
print('Baseline RMSLE',np.sqrt(metrics.mean_squared_log_error(y_test, y_train_predict)))
print("Baseline Accuracy --> ", regressor.score(X_test, y_test)*100)

Baseline MAE 18410.816952054793
Baseline MSE 1161134433.2011473
Baseline RMSE 34075.42271492971
Baseline RMSLE 0.14834845772945354
Baseline Accuracy -->  81.36144924997993


  after removing the cwd from sys.path.


In [6]:
#Gather Feature Importance
importances_list = list(regressor.feature_importances_)
features_nms = list(X_train.columns)

feature_importances = [(feature, imp) for feature, imp \
                      in zip(features_nms, importances_list)]

feature_importances = sorted(feature_importances, key = lambda x: x[1], \
                             reverse = True)

[print('Variable: {:20} Importance: {}'.format(*pair)) \
     for pair in feature_importances]

Variable: OverallQual          Importance: 0.5337920656972814
Variable: GrLivArea            Importance: 0.13513545337420155
Variable: GarageArea           Importance: 0.03154883105208202
Variable: MasVnrArea           Importance: 0.03078319672122231
Variable: TotalBsmtSF          Importance: 0.02954328218357628
Variable: YearBuilt            Importance: 0.0287914118741373
Variable: BsmtFinSF1           Importance: 0.02081466599277871
Variable: MoSold               Importance: 0.01667271798719229
Variable: LotArea              Importance: 0.01569666086840537
Variable: 1stFlrSF             Importance: 0.015380427616524459
Variable: LotFrontage          Importance: 0.01205302195851105
Variable: ExterQual_Gd         Importance: 0.008830701254068835
Variable: Exterior2nd_HdBoard  Importance: 0.007414402302008287
Variable: BsmtUnfSF            Importance: 0.007373985699373133
Variable: YearRemodAdd         Importance: 0.006253397936669208
Variable: GarageYrBlt          Importance: 0.0053754

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

In [7]:
#Plot Cumulative feature importance
# List of features sorted from most to least important
'''sorted_importances = [importance[1] for importance in feature_importances]
sorted_features = [importance[0] for importance in feature_importances]

# Cumulative importances
cumulative_importances = np.cumsum(sorted_importances)

plt.plot(importances_list, cumulative_importances, 'g-')

# Draw line at 95% of importance retained
plt.hlines(y = 0.95, xmin=0, xmax=len(sorted_importances), color = 'r', linestyles = 'dashed')
# Format x ticks and labels
plt.xticks(importances_list, sorted_features, rotation = 'vertical')

# Axis labels and title
plt.xlabel('Variable'); 
plt.ylabel('Cumulative Importance'); 
plt.title('Cumulative Importances');'''

"sorted_importances = [importance[1] for importance in feature_importances]\nsorted_features = [importance[0] for importance in feature_importances]\n\n# Cumulative importances\ncumulative_importances = np.cumsum(sorted_importances)\n\nplt.plot(importances_list, cumulative_importances, 'g-')\n\n# Draw line at 95% of importance retained\nplt.hlines(y = 0.95, xmin=0, xmax=len(sorted_importances), color = 'r', linestyles = 'dashed')\n# Format x ticks and labels\nplt.xticks(importances_list, sorted_features, rotation = 'vertical')\n\n# Axis labels and title\nplt.xlabel('Variable'); \nplt.ylabel('Cumulative Importance'); \nplt.title('Cumulative Importances');"

In [8]:
#Rebuild model with top 11 important parameters

all_data = all_data.loc[:,['OverallQual', 'GrLivArea', 'GarageArea',
                             'MasVnrArea', 'TotalBsmtSF', 'YearBuilt',
                             'BsmtFinSF1', 'MoSold', 'LotArea', 
                              '1stFlrSF', 'LotFrontage', 'ExterQual_Gd',
                          'Exterior2nd_HdBoard', 'BsmtUnfSF', 'YearRemodAdd',
                          'GarageYrBlt', '2ndFlrSF', 'CentralAir_Y', 'Fireplaces',
                          'OverallCond', 'WoodDeckSF', 'OpenPorchSF', 
                          'Neighborhood_Edwards']]

train_fmt_set = all_data[:num_train_rows]
test_fmt_set = all_data[num_train_rows:]

from sklearn.cross_validation import train_test_split
X_train , X_test, y_train, y_test = train_test_split(train_fmt_set, target, test_size=0.4, random_state=0)

# check whether there are any NaNs in the dataframe
#print(np.any(np.isnan(X_train)))

#If True, then find where NaNs exist
#print(np.where(np.isnan(X_train)))

regressor = RandomForestRegressor(random_state=0)
regressor.fit(X_train, y_train)

from sklearn import metrics
y_train_predict = regressor.predict(X_test)
print('Top 23 Feature MAE',metrics.mean_absolute_error(y_test, y_train_predict))
print('Top 23 Feature MSE',metrics.mean_squared_error(y_test, y_train_predict))
print('Top 23 Feature RMSE',np.sqrt(metrics.mean_squared_error(y_test, y_train_predict)))
print('Top 23 Feature RMSLE',np.sqrt(metrics.mean_squared_log_error(y_test, y_train_predict)))
print("Top 23 Feature Accuracy --> ", round(regressor.score(X_test, y_test)*100, 2))

'''
Baseline Accuracy -->  81.36
Top 11 : 77.93
Top 19 : 78.77
Top 23 : 82.56
'''

Top 23 Feature MAE 19023.280479452053
Top 23 Feature MSE 1086762013.1353083
Top 23 Feature RMSE 32966.073668778154
Top 23 Feature RMSLE 0.1511254400767679
Top 23 Feature Accuracy -->  82.56




'\nBaseline Accuracy -->  81.36\nTop 11 : 77.93\nTop 19 : 78.77\nTop 23 : 82.56\n'

In [15]:
num_tree_list = np.linspace(20, 100, 9, endpoint=True, dtype=int)
max_features_list = ['auto', 'sqrt']
max_depth_list = np.linspace(4, 16, 7, endpoint=True, dtype=int)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

random_grid = {'n_estimators': num_tree_list, 
               'max_features': max_features_list, 
               'max_depth': max_depth_list, 
               'min_samples_split': min_samples_split, 
               'min_samples_leaf': min_samples_leaf, 
               'bootstrap': bootstrap}

from sklearn.model_selection import RandomizedSearchCV
random_regr = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
#rf_random = RandomizedSearchCV(estimator = random_regr, 
                param_distributions = random_grid, 
                n_iter = 100, cv = 3, verbose=2, 
                random_state=0, n_jobs = -1)
#rf_random.fit(X_train, y_train)
#print(rf_random.best_params_)

'''
Best Model:
-----------
{'n_estimators': 70, 
'min_samples_split': 2, 
'min_samples_leaf': 1, 
'max_features': 'sqrt', 
'max_depth': 10, 
'bootstrap': False}
'''

In [19]:
regressor = RandomForestRegressor(random_state=0, n_estimators=70,
                        min_samples_split=2, min_samples_leaf=1, max_features='sqrt',
                        max_depth=10, bootstrap=False)
regressor.fit(X_train, y_train)

from sklearn import metrics
y_train_predict = regressor.predict(X_test)
print('Best Random Fit MAE',metrics.mean_absolute_error(y_test, y_train_predict))
print('Best Random Fit MSE',metrics.mean_squared_error(y_test, y_train_predict))
print('Best Random Fit RMSE',np.sqrt(metrics.mean_squared_error(y_test, y_train_predict)))
print('Best Random Fit RMSLE',np.sqrt(metrics.mean_squared_log_error(y_test, y_train_predict)))
print("Best Random Fit Accuracy --> ", round(regressor.score(X_test, y_test)*100, 2))

Best Random Fit MAE 17783.949537480927
Best Random Fit MSE 1033916943.7638636
Best Random Fit RMSE 32154.578892653277
Best Random Fit RMSLE 0.14174874073623517
Best Random Fit Accuracy -->  83.4


  after removing the cwd from sys.path.


In [None]:
param_grid = {'n_estimators': [50,60,80,90], 
              'min_samples_split': [2, 3, 4], 
              'min_samples_leaf': [1, 2, 3], 
              'max_features': [4, 5, 6], 
              'max_depth': [8, 9, 10, 11], 
              'bootstrap': [False]}

# Create a based model
rf = RandomForestRegressor()
# Instantiate the grid search model
#from sklearn.model_selection import GridSearchCV
#grid_regr = GridSearchCV(estimator = rf, param_grid = param_grid, cv = 3, n_jobs = -1, verbose = 2)

#grid_regr.fit(X_train, y_train)
print(grid_regr.best_params_)

'''Best Model:
-----------
{'bootstrap': False, 
'max_depth': 11, 
'max_features': 5, 
'min_samples_leaf': 1, 
'min_samples_split': 4, 
'n_estimators': 90}'''

In [18]:
regressor = RandomForestRegressor(random_state=0, n_estimators=90,
                        min_samples_split=4, min_samples_leaf=1, max_features=5,
                        max_depth=11, bootstrap=False)
regressor.fit(X_train, y_train)

from sklearn import metrics
y_train_predict = regressor.predict(X_test)
print('Best GridSearchCV Fit MAE',metrics.mean_absolute_error(y_test, y_train_predict))
print('Best GridSearchCV Fit MSE',metrics.mean_squared_error(y_test, y_train_predict))
print('Best GridSearchCV Fit RMSE',np.sqrt(metrics.mean_squared_error(y_test, y_train_predict)))
print('Best GridSearchCV Fit RMSLE',np.sqrt(metrics.mean_squared_log_error(y_test, y_train_predict)))
print("Best GridSearchCV Fit Accuracy --> ", round(regressor.score(X_test, y_test)*100, 2))


'''
Best GridSearchCV Fit Accuracy -->  85.83
Best RandomSearchCV Fit Accuracy -->  86.22

So, lets go with Random Search CV Best Fit Params
{'n_estimators': 70, 
'min_samples_split': 2, 
'min_samples_leaf': 1, 
'max_features': 'sqrt', 
'max_depth': 10, 
'bootstrap': False}
'''

Best Random Fit MAE 17174.191466211432
Best Random Fit MSE 882943835.7111392
Best Random Fit RMSE 29714.370861775606
Best Random Fit RMSLE 0.13398974445965578
Best Random Fit Accuracy -->  85.83


  after removing the cwd from sys.path.
