In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
%matplotlib inline
np.set_printoptions(precision=2)
pd.set_option('precision', 2)
np.set_printoptions(suppress=True)
sns.set(style='whitegrid')

In [6]:
train_path = 'data/train.csv'
test_path = 'data/test.csv'
train_data = pd.read_csv(train_path, index_col='Id')
test_data = pd.read_csv(test_path, index_col='Id')
features = train_data.iloc[:,:-1]
target = train_data.loc[:, ['SalePrice']]
print('Train Set Size : ', train_data.shape)
print('Test Set Size : ', test_data.shape)
print('Train Features Size : ', features.shape)
num_train_rows = train_data.shape[0]
num_test_rows = test_data.shape[0]
print('Train Rows : ', num_train_rows)
print('Test Rows : ', num_test_rows)
all_data = pd.concat((features, test_data)).reset_index(drop=True)

Train Set Size :  (1460, 80)
Test Set Size :  (1459, 79)
Train Features Size :  (1460, 79)
Train Rows :  1460
Test Rows :  1459


In [7]:
# Missing Data
null_features = all_data.columns[all_data.isnull().any()]
missing_ratio = (all_data[null_features].isnull().sum()/len(all_data)) * 100
missing_data = pd.DataFrame({'Missing Ratio' :missing_ratio})
print(missing_data.sort_values(by='Missing Ratio',ascending=False))


for col in ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu']:
    all_data[col] = all_data[col].fillna('None')

for col in ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'MasVnrType']:
    all_data[col] = all_data[col].fillna('None')
    
for col in ['BsmtExposure', 'BsmtFinType2', 'BsmtQual', 'BsmtCond', 'BsmtFinType1']:
    all_data[col] = all_data[col].fillna('None')

'''
No GarageYrBlt means no Garage. We can impute mean/median since it would 
incorrectly convey existence of Garage. same reasoning for MasVnrArea.
'''
for col in ['GarageYrBlt', 'MasVnrArea']:
    all_data[col] = all_data[col].fillna(0)

'''
Group data by neighborhood & imputed null LotFrontage columns with median of
grouped data.
'''
all_data['LotFrontage'] = all_data.groupby(['Neighborhood'])\
                    ['LotFrontage'].transform(lambda x : x.fillna(x.median()))
    
all_data['Electrical'] = \
    all_data['Electrical'].fillna(all_data['Electrical'].mode()[0])
    
all_data['MSZoning'] = \
    all_data['MSZoning'].fillna(all_data['MSZoning'].mode()[0])
    
all_data['Utilities'] = all_data['Utilities'].fillna('ELO')

all_data['Exterior1st'] = all_data['Exterior1st'].fillna('Other')

all_data['Exterior2nd'] = all_data['Exterior2nd'].fillna('Other')

all_data['SaleType'] = all_data['SaleType'].fillna('Oth')

all_data['Functional'] = \
    all_data['Functional'].fillna(all_data['Functional'].mode()[0])

all_data['KitchenQual'] = \
    all_data['KitchenQual'].fillna(all_data['KitchenQual'].mode()[0])

for col in ['BsmtFullBath', 'BsmtHalfBath', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'GarageCars', 'GarageArea']:
    all_data[col] = all_data[col].fillna(0)
    
null_features = all_data.columns[all_data.isnull().any()]
missing_ratio = (all_data[null_features].isnull().sum()/len(all_data)) * 100
missing_data = pd.DataFrame({'Missing Ratio' :missing_ratio})
print(missing_data.sort_values(by='Missing Ratio',ascending=False))

              Missing Ratio
PoolQC                99.66
MiscFeature           96.40
Alley                 93.22
Fence                 80.44
FireplaceQu           48.65
LotFrontage           16.65
GarageFinish           5.45
GarageQual             5.45
GarageCond             5.45
GarageYrBlt            5.45
GarageType             5.38
BsmtExposure           2.81
BsmtCond               2.81
BsmtQual               2.77
BsmtFinType2           2.74
BsmtFinType1           2.71
MasVnrType             0.82
MasVnrArea             0.79
MSZoning               0.14
BsmtFullBath           0.07
BsmtHalfBath           0.07
Functional             0.07
Utilities              0.07
GarageArea             0.03
GarageCars             0.03
Electrical             0.03
KitchenQual            0.03
TotalBsmtSF            0.03
BsmtUnfSF              0.03
BsmtFinSF2             0.03
BsmtFinSF1             0.03
Exterior2nd            0.03
Exterior1st            0.03
SaleType               0.03
Empty DataFrame
Colu

In [13]:
final_num_features = all_data.loc[:,['MSSubClass', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 
                               'BsmtFinSF1', 'TotalBsmtSF', 'GrLivArea', 'BsmtFullBath', 'FullBath', 'KitchenAbvGr', 
                               'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'WoodDeckSF', 'EnclosedPorch', 
                               'ScreenPorch', 'PoolArea', 'YrSold']]
final_num_features.insert(0, 'intercept', np.ones((2919,1)))
final_train_num_features = final_num_features[:num_train_rows]
final_test_num_features = final_num_features[num_train_rows:]

from sklearn.cross_validation import train_test_split
X_train , X_test, y_train, y_test = train_test_split(final_train_num_features, target, test_size=0.4, random_state=0)

from sklearn.ensemble import RandomForestRegressor
#regressor = RandomForestRegressor(n_estimators=13, random_state=0, max_depth=6, min_samples_leaf=20)
regressor = RandomForestRegressor(random_state=0)
regressor.fit(X_train, y_train)

from sklearn import metrics
y_train_predict = regressor.predict(X_test)
print('MAE',metrics.mean_absolute_error(y_test, y_train_predict))
print('MSE',metrics.mean_squared_error(y_test, y_train_predict))
print('RMSE',np.sqrt(metrics.mean_squared_error(y_test, y_train_predict)))
print('RMSLE',np.sqrt(metrics.mean_squared_log_error(y_test, y_train_predict)))
print("Accuracy --> ", regressor.score(X_test, y_test)*100)
print('Params : ', regressor.get_params())

MAE 18890.767579908676
MSE 973806726.6878387
RMSE 31205.876476840684
RMSLE 0.1464780907278054
Accuracy -->  84.36843695519109
Params :  {'bootstrap': True, 'criterion': 'mse', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 10, 'n_jobs': 1, 'oob_score': False, 'random_state': 0, 'verbose': 0, 'warm_start': False}


  from ipykernel import kernelapp as app


In [18]:
max_features = list(range(1,X_train.shape[1]))
mx_featutes_mse_list = []
mx_featutes_acc_list = []
for num_features in max_features:
    regressor = RandomForestRegressor(random_state=0, max_features=num_features)
    regressor.fit(X_train, y_train)
    y_train_predict = regressor.predict(X_test)
    mse = metrics.mean_squared_error(y_test, y_train_predict)
    acc = regressor.score(X_test, y_test)*100
    mx_featutes_mse_list.append(mse)
    mx_featutes_acc_list.append(acc)
    print("MSE for num_features - {0} : {1}".format(num_features, mse))
    print("Accuracy for num_features - {0} : {1}".format(num_features, acc))

#Best Accuracy is 88.36 with num_features = 9

  
  
  
  
  
  


Accuracy for num_features - 1 : 82.25031407226545
Accuracy for num_features - 2 : 81.05287267396078
Accuracy for num_features - 3 : 82.29283026627066
Accuracy for num_features - 4 : 85.23941682170273
Accuracy for num_features - 5 : 84.75846055275021
Accuracy for num_features - 6 : 83.19324756206163
Accuracy for num_features - 7 : 85.15896993699216
Accuracy for num_features - 8 : 85.42957423814332
Accuracy for num_features - 9 : 88.36331162251011


  
  
  
  


Accuracy for num_features - 10 : 85.4699690536576
Accuracy for num_features - 11 : 81.37225618399535
Accuracy for num_features - 12 : 85.87313637805615


  
  
  


Accuracy for num_features - 13 : 85.09511361842314
Accuracy for num_features - 14 : 83.74342362947156
Accuracy for num_features - 15 : 84.86946786982385


  
  
  


Accuracy for num_features - 16 : 81.66905599148855
Accuracy for num_features - 17 : 85.51622807035616
Accuracy for num_features - 18 : 82.45300786510273


  
  


Accuracy for num_features - 19 : 82.97737790311182
Accuracy for num_features - 20 : 85.15637477624728


  
  


In [26]:
num_estimators = np.linspace(10, 100, 20, endpoint=True, dtype=int)
num_estimators_mse_list = []
num_estimators_acc_list = []
for num_trees in num_estimators:
    regressor = RandomForestRegressor(random_state=0, n_estimators=num_trees)
    regressor.fit(X_train, y_train)
    y_train_predict = regressor.predict(X_test)
    mse = metrics.mean_squared_error(y_test, y_train_predict)
    acc = regressor.score(X_test, y_test)*100
    num_estimators_mse_list.append(mse)
    num_estimators_acc_list.append(acc)
    #print("MSE for num_features - {0} : {1}".format(num_features, mse))
    print("Accuracy for num_features - {0} : {1}".format(num_trees, acc))
    
#plt.figure(figsize=(15,15))
#plt.plot(num_estimators, num_estimators_acc_list, color='blue')
#Best Accuracy is 86.31 with num_features = 38

  
  


Accuracy for num_features - 10 : 84.36843695519109
Accuracy for num_features - 14 : 84.61824627624995


  


Accuracy for num_features - 19 : 85.59328106969201


  


Accuracy for num_features - 24 : 85.43102176007169


  


Accuracy for num_features - 28 : 86.07282961597463


  


Accuracy for num_features - 33 : 86.01493274811769


  


Accuracy for num_features - 38 : 86.31648136857655


  


Accuracy for num_features - 43 : 86.19900252558296


  


Accuracy for num_features - 47 : 86.17520422327689


  


Accuracy for num_features - 52 : 86.21539937427474


  


Accuracy for num_features - 57 : 86.28117315316256


  


Accuracy for num_features - 62 : 86.19076987069897


  


Accuracy for num_features - 66 : 86.04410648231973


  


Accuracy for num_features - 71 : 86.1400865531722


  


Accuracy for num_features - 76 : 86.3401445404753


  


Accuracy for num_features - 81 : 86.1904117977034


  


Accuracy for num_features - 85 : 86.15138825492504


  


Accuracy for num_features - 90 : 86.2313501290791


  


Accuracy for num_features - 95 : 86.35381166973842


  


Accuracy for num_features - 100 : 86.32882335975314


In [31]:
min_samples_leafs = np.linspace(0.001, 0.01, 10, endpoint=True)
sa_lf_mse_list = []
sa_lf_acc_list = []
for min_sm_lf in min_samples_leafs:
    regressor = RandomForestRegressor(random_state=0, min_samples_leaf=min_sm_lf)
    regressor.fit(X_train, y_train)
    y_train_predict = regressor.predict(X_test)
    mse = metrics.mean_squared_error(y_test, y_train_predict)
    acc = regressor.score(X_test, y_test)*100
    sa_lf_mse_list.append(mse)
    sa_lf_acc_list.append(acc)
    #print("MSE for min_leaf - {0} : {1}".format(min_sm_lf, mse))
    print("Accuracy for min_leaf - {0} : {1}".format(min_sm_lf, acc))
    
#Best Accuracy is 84.46 with min_samples_leaf = 0.003

  
  


Accuracy for min_leaf - 0.001 : 84.36843695519109
Accuracy for min_leaf - 0.002 : 83.50598064854965
Accuracy for min_leaf - 0.003 : 84.46107406565002


  
  
  


Accuracy for min_leaf - 0.004 : 83.28279044195197
Accuracy for min_leaf - 0.005 : 83.29203838030953
Accuracy for min_leaf - 0.006 : 82.86436207071601


  
  
  


Accuracy for min_leaf - 0.007 : 81.7278421567179
Accuracy for min_leaf - 0.008 : 82.10273426540564
Accuracy for min_leaf - 0.009000000000000001 : 82.10273426540564
Accuracy for min_leaf - 0.01 : 82.11727376503414


  
  


In [39]:
regressor = RandomForestRegressor(random_state=0, min_samples_leaf=0.003,
                        n_estimators=38, max_features=9, max_depth=10)
regressor.fit(X_train, y_train)

from sklearn import metrics
y_train_predict = regressor.predict(X_test)
print('MAE',metrics.mean_absolute_error(y_test, y_train_predict))
print('MSE',metrics.mean_squared_error(y_test, y_train_predict))
print('RMSE',np.sqrt(metrics.mean_squared_error(y_test, y_train_predict)))
print('RMSLE',np.sqrt(metrics.mean_squared_log_error(y_test, y_train_predict)))
print("Accuracy --> ", regressor.score(X_test, y_test)*100)

  This is separate from the ipykernel package so we can avoid doing imports until


MAE 17011.71549863652
MSE 862522558.5085111
RMSE 29368.73437021948
RMSLE 0.13600474133513366
Accuracy -->  86.15477241900626
