In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
%matplotlib inline
np.set_printoptions(precision=2)
pd.set_option('precision', 2)
np.set_printoptions(suppress=True)
sns.set(style='whitegrid')

In [2]:
train_path = 'data/train.csv'
test_path = 'data/test.csv'
train_data = pd.read_csv(train_path, index_col='Id')
test_data = pd.read_csv(test_path, index_col='Id')
features = train_data.iloc[:,:-1]
target = train_data.loc[:, ['SalePrice']]
print('Train Set Size : ', train_data.shape)
print('Test Set Size : ', test_data.shape)
print('Train Features Size : ', features.shape)
num_train_rows = train_data.shape[0]
num_test_rows = test_data.shape[0]
print('Train Rows : ', num_train_rows)
print('Test Rows : ', num_test_rows)
all_data = pd.concat((features, test_data)).reset_index(drop=True)

Train Set Size :  (1460, 80)
Test Set Size :  (1459, 79)
Train Features Size :  (1460, 79)
Train Rows :  1460
Test Rows :  1459


In [3]:
# Analyze SalePrice
print('Skewness of SalePrice before Log Transform : %f'% target.skew())
print('Kurtosis of SalePrice before Log Transform : %f'% target.kurt())

'''
Skew = 1.882876 indicates positive skew with tail to the right.
Kurt = 6.536282 indicates heavy tails i.e. more data on tails.
'''

#Apply Log transformation
target['SalePrice'] = np.log(target['SalePrice'])
print('Skewness of SalePrice after Log Transform : %f'% target.skew())
print('Kurtosis of SalePrice after Log Transform : %f'% target.kurt())

Skewness of SalePrice before Log Transform : 1.882876
Kurtosis of SalePrice before Log Transform : 6.536282
Skewness of SalePrice after Log Transform : 0.121335
Kurtosis of SalePrice after Log Transform : 0.809532


In [4]:
# Missing Data
null_features = all_data.columns[all_data.isnull().any()]
missing_ratio = (all_data[null_features].isnull().sum()/len(all_data)) * 100
missing_data = pd.DataFrame({'Missing Ratio' :missing_ratio})
print(missing_data.sort_values(by='Missing Ratio',ascending=False))


for col in ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu']:
    all_data[col] = all_data[col].fillna('None')

for col in ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'MasVnrType']:
    all_data[col] = all_data[col].fillna('None')
    
for col in ['BsmtExposure', 'BsmtFinType2', 'BsmtQual', 'BsmtCond', 'BsmtFinType1']:
    all_data[col] = all_data[col].fillna('None')

'''
No GarageYrBlt means no Garage. We can impute mean/median since it would 
incorrectly convey existence of Garage. same reasoning for MasVnrArea.
'''
for col in ['GarageYrBlt', 'MasVnrArea']:
    all_data[col] = all_data[col].fillna(0)

'''
Group data by neighborhood & imputed null LotFrontage columns with median of
grouped data.
'''
all_data['LotFrontage'] = all_data.groupby(['Neighborhood'])\
                    ['LotFrontage'].transform(lambda x : x.fillna(x.median()))
    
all_data['Electrical'] = \
    all_data['Electrical'].fillna(all_data['Electrical'].mode()[0])
    
all_data['MSZoning'] = \
    all_data['MSZoning'].fillna(all_data['MSZoning'].mode()[0])
    
all_data['Utilities'] = all_data['Utilities'].fillna('ELO')

all_data['Exterior1st'] = all_data['Exterior1st'].fillna('Other')

all_data['Exterior2nd'] = all_data['Exterior2nd'].fillna('Other')

all_data['SaleType'] = all_data['SaleType'].fillna('Oth')

all_data['Functional'] = \
    all_data['Functional'].fillna(all_data['Functional'].mode()[0])

all_data['KitchenQual'] = \
    all_data['KitchenQual'].fillna(all_data['KitchenQual'].mode()[0])

for col in ['BsmtFullBath', 'BsmtHalfBath', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'GarageCars', 'GarageArea']:
    all_data[col] = all_data[col].fillna(0)
    
null_features = all_data.columns[all_data.isnull().any()]
missing_ratio = (all_data[null_features].isnull().sum()/len(all_data)) * 100
missing_data = pd.DataFrame({'Missing Ratio' :missing_ratio})
print(missing_data.sort_values(by='Missing Ratio',ascending=False))

              Missing Ratio
PoolQC                99.66
MiscFeature           96.40
Alley                 93.22
Fence                 80.44
FireplaceQu           48.65
LotFrontage           16.65
GarageFinish           5.45
GarageQual             5.45
GarageCond             5.45
GarageYrBlt            5.45
GarageType             5.38
BsmtExposure           2.81
BsmtCond               2.81
BsmtQual               2.77
BsmtFinType2           2.74
BsmtFinType1           2.71
MasVnrType             0.82
MasVnrArea             0.79
MSZoning               0.14
BsmtFullBath           0.07
BsmtHalfBath           0.07
Functional             0.07
Utilities              0.07
GarageArea             0.03
GarageCars             0.03
Electrical             0.03
KitchenQual            0.03
TotalBsmtSF            0.03
BsmtUnfSF              0.03
BsmtFinSF2             0.03
BsmtFinSF1             0.03
Exterior2nd            0.03
Exterior1st            0.03
SaleType               0.03
Empty DataFrame
Colu

In [5]:
final_num_features = all_data.loc[:,['MSSubClass', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 
                               'BsmtFinSF1', 'TotalBsmtSF', 'GrLivArea', 'BsmtFullBath', 'FullBath', 'KitchenAbvGr', 
                               'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'WoodDeckSF', 'EnclosedPorch', 
                               'ScreenPorch', 'PoolArea', 'YrSold']]
final_num_features.insert(0, 'intercept', np.ones((2919,1)))
final_train_num_features = final_num_features[:num_train_rows]
final_test_num_features = final_num_features[num_train_rows:]

from sklearn.cross_validation import train_test_split
X_train , X_test, y_train, y_test = train_test_split(final_train_num_features, target, test_size=0.4, random_state=0)

from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor(random_state=0)
regressor.fit(X_train, y_train)

from sklearn import metrics
y_train_predict = regressor.predict(X_test)
print('MAE',metrics.mean_absolute_error(y_test, y_train_predict))
print('MSE',metrics.mean_squared_error(y_test, y_train_predict))
print('RMSE',np.sqrt(metrics.mean_squared_error(y_test, y_train_predict)))
print('RMSLE',np.sqrt(metrics.mean_squared_log_error(y_test, y_train_predict)))
print("Accuracy --> ", regressor.score(X_test, y_test)*100)

from sklearn.externals.six import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus
dot_data = StringIO()
export_graphviz(regressor, out_file=dot_data,
               filled=True, rounded=True,
               special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_jpeg('defualt_tree.jpeg')
print('Max Depth : ', regressor.tree_.max_depth)
print('Params : ', regressor.get_params)
#Image(graph.create_png())



MAE 0.14003544040136828
MSE 0.039965685192865696
RMSE 0.19991419457573717
RMSLE 0.01560632439593081
Accuracy -->  73.5360883329249
dot: graph is too large for cairo-renderer bitmaps. Scaling by 0.600689 to fit

Max Depth :  21
Params :  <bound method BaseEstimator.get_params of DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=0, splitter='best')>


In [6]:
depth_list = np.linspace(1, 10, 10, endpoint=False, dtype=int)
mse_list = []
acc_list = []
for depth in depth_list:
    regressor = DecisionTreeRegressor(random_state=0, max_depth=depth)
    regressor.fit(X_train, y_train)
    y_train_predict = regressor.predict(X_test)
    mse = metrics.mean_squared_error(y_test, y_train_predict)
    acc = regressor.score(X_test, y_test)*100
    mse_list.append(mse)
    acc_list.append(acc)
    print("MSE for depth - {0} : {1}".format(depth, mse))
    print("Accuracy for depth - {0} : {1}".format(depth, acc))
    
#Best Accuracy 76.95 is for depth 5

MSE for depth - 1 : 0.08141374237956288
Accuracy for depth - 1 : 46.09060056342118
MSE for depth - 1 : 0.08141374237956288
Accuracy for depth - 1 : 46.09060056342118
MSE for depth - 2 : 0.05872852126863835
Accuracy for depth - 2 : 61.11197914683506
MSE for depth - 3 : 0.048506652183856094
Accuracy for depth - 3 : 67.88055171669487
MSE for depth - 4 : 0.042759146234198506
Accuracy for depth - 4 : 71.68635384478847
MSE for depth - 5 : 0.03480594053781478
Accuracy for depth - 5 : 76.95269500730029
MSE for depth - 6 : 0.03554391954888272
Accuracy for depth - 6 : 76.46403051257658
MSE for depth - 7 : 0.039293635713687394
Accuracy for depth - 7 : 73.98109654351987
MSE for depth - 8 : 0.03965974569413037
Accuracy for depth - 8 : 73.73867101932024
MSE for depth - 9 : 0.03718865973070503
Accuracy for depth - 9 : 75.37493974190707


In [7]:
min_samples_splits = np.linspace(0.01, 0.1, 9, endpoint=False)
sa_sp_mse_list = []
sa_sp_acc_list = []
for min_split in min_samples_splits:
    regressor = DecisionTreeRegressor(random_state=0, min_samples_split=min_split)
    regressor.fit(X_train, y_train)
    y_train_predict = regressor.predict(X_test)
    mse = metrics.mean_squared_error(y_test, y_train_predict)
    acc = regressor.score(X_test, y_test)*100
    sa_sp_mse_list.append(mse)
    sa_sp_acc_list.append(acc)
    print("MSE for min_split - {0} : {1}".format(min_split, mse))
    print("Accuracy for min_split - {0} : {1}".format(min_split, acc))
    
#Best Accuracy is 77.99 for min_samples_splits=0.03

MSE for min_split - 0.01 : 0.03704588037594213
Accuracy for min_split - 0.01 : 75.46948335386048
MSE for min_split - 0.020000000000000004 : 0.035515332557884256
Accuracy for min_split - 0.020000000000000004 : 76.48295984159874
MSE for min_split - 0.030000000000000006 : 0.03323592753350655
Accuracy for min_split - 0.030000000000000006 : 77.99230399340091
MSE for min_split - 0.04000000000000001 : 0.03351457253254428
Accuracy for min_split - 0.04000000000000001 : 77.8077947924346
MSE for min_split - 0.05000000000000001 : 0.03485559478461908
Accuracy for min_split - 0.05000000000000001 : 76.91981566105656
MSE for min_split - 0.06000000000000001 : 0.036314941662004205
Accuracy for min_split - 0.06000000000000001 : 75.95348600429303
MSE for min_split - 0.07 : 0.03640096004763912
Accuracy for min_split - 0.07 : 75.89652756736889
MSE for min_split - 0.08 : 0.03848509365529783
Accuracy for min_split - 0.08 : 74.51648547802901
MSE for min_split - 0.09000000000000001 : 0.03848509365529783
Accurac

In [8]:
min_samples_leafs = np.linspace(0.01, 0.1, 5, endpoint=True)
sa_lf_mse_list = []
sa_lf_acc_list = []
for min_sm_lf in min_samples_leafs:
    regressor = DecisionTreeRegressor(random_state=0, min_samples_leaf=min_sm_lf)
    regressor.fit(X_train, y_train)
    y_train_predict = regressor.predict(X_test)
    mse = metrics.mean_squared_error(y_test, y_train_predict)
    acc = regressor.score(X_test, y_test)*100
    sa_lf_mse_list.append(mse)
    sa_lf_acc_list.append(acc)
    print("MSE for min_leaf - {0} : {1}".format(min_sm_lf, mse))
    print("Accuracy for min_leaf - {0} : {1}".format(min_sm_lf, acc))
    
#Best Accuracy 77.79 is for min_leaf - 0.01

MSE for min_leaf - 0.01 : 0.033536973047382936
Accuracy for min_leaf - 0.01 : 77.79296193662027
MSE for min_leaf - 0.0325 : 0.037653900203379345
Accuracy for min_leaf - 0.0325 : 75.06687339165218
MSE for min_leaf - 0.05500000000000001 : 0.04199163535635264
Accuracy for min_leaf - 0.05500000000000001 : 72.19457333300228
MSE for min_leaf - 0.0775 : 0.047286174235894644
Accuracy for min_leaf - 0.0775 : 68.6887105271994
MSE for min_leaf - 0.1 : 0.0493300007076024
Accuracy for min_leaf - 0.1 : 67.335358446556


In [9]:
max_features = list(range(1,X_train.shape[1]))
mx_featutes_mse_list = []
mx_featutes_acc_list = []
for num_features in max_features:
    regressor = DecisionTreeRegressor(random_state=0, max_features=num_features)
    regressor.fit(X_train, y_train)
    y_train_predict = regressor.predict(X_test)
    mse = metrics.mean_squared_error(y_test, y_train_predict)
    acc = regressor.score(X_test, y_test)*100
    mx_featutes_mse_list.append(mse)
    mx_featutes_acc_list.append(acc)
    print("MSE for num_features - {0} : {1}".format(num_features, mse))
    print("Accuracy for num_features - {0} : {1}".format(num_features, acc))
    
# Best Accuracy is 75.24 is for max_features = 12

MSE for num_features - 1 : 0.06693872758511475
Accuracy for num_features - 1 : 55.67546095180911
MSE for num_features - 2 : 0.06567284889721449
Accuracy for num_features - 2 : 56.51368258159379
MSE for num_features - 3 : 0.05580876338678482
Accuracy for num_features - 3 : 63.045343088970384
MSE for num_features - 4 : 0.04612254855565273
Accuracy for num_features - 4 : 69.45922370786568
MSE for num_features - 5 : 0.04551556911395369
Accuracy for num_features - 5 : 69.86114476217358
MSE for num_features - 6 : 0.04163013916097941
Accuracy for num_features - 6 : 72.43394376631713
MSE for num_features - 7 : 0.04219935550644966
Accuracy for num_features - 7 : 72.0570281444959
MSE for num_features - 8 : 0.038190518307684655
Accuracy for num_features - 8 : 74.7115432117051
MSE for num_features - 9 : 0.04687198969472789
Accuracy for num_features - 9 : 68.96296938347598
MSE for num_features - 10 : 0.04569003367668504
Accuracy for num_features - 10 : 69.74562028774317
MSE for num_features - 11 : 

In [10]:
regressor = DecisionTreeRegressor(random_state=0, max_depth=6, 
                        min_samples_split=0.03, min_samples_leaf=0.01, max_features = 8)
regressor.fit(X_train, y_train)

from sklearn import metrics
y_train_predict = regressor.predict(X_test)
print('MAE',metrics.mean_absolute_error(y_test, y_train_predict))
print('MSE',metrics.mean_squared_error(y_test, y_train_predict))
print('RMSE',np.sqrt(metrics.mean_squared_error(y_test, y_train_predict)))
print('RMSLE',np.sqrt(metrics.mean_squared_log_error(y_test, y_train_predict)))
print("Accuracy --> ", regressor.score(X_test, y_test)*100)


dot_data = StringIO()
export_graphviz(regressor, out_file=dot_data,
               filled=True, rounded=True,
               special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_jpeg('opt_tree.jpeg')
print('Max Depth : ', regressor.tree_.max_depth)
print('Params : ', regressor.get_params)

MAE 0.14160306644071782
MSE 0.036727531093044136
RMSE 0.19164428270377423
RMSLE 0.014749777788991692
Accuracy -->  75.68028337545985
Max Depth :  6
Params :  <bound method BaseEstimator.get_params of DecisionTreeRegressor(criterion='mse', max_depth=6, max_features=8,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=0.01,
           min_samples_split=0.03, min_weight_fraction_leaf=0.0,
           presort=False, random_state=0, splitter='best')>
