In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
%matplotlib inline
np.set_printoptions(precision=2)
pd.set_option('precision', 2)
np.set_printoptions(suppress=True)
sns.set(style='whitegrid')

In [2]:
#1. Load training and test datasets
train_path = 'data/train.csv'
test_path = 'data/test.csv'
train_data = pd.read_csv(train_path, index_col='Id')
test_data = pd.read_csv(test_path, index_col='Id')

#1a. Seperate features and target from training dataset.
features = train_data.iloc[:,:-1]
target = train_data.loc[:, ['SalePrice']]
print('Train Set Size : ', train_data.shape)
print('Test Set Size : ', test_data.shape)
print('Train Features Size : ', features.shape)
num_train_rows = train_data.shape[0]
num_test_rows = test_data.shape[0]
print('Train Rows : ', num_train_rows)
print('Test Rows : ', num_test_rows)

#1b. Merge training and test datasets to cover all 
#encodings for categorical features
all_data = pd.concat((features, test_data)).reset_index(drop=True)

Train Set Size :  (1460, 80)
Test Set Size :  (1459, 79)
Train Features Size :  (1460, 79)
Train Rows :  1460
Test Rows :  1459


In [3]:
#3. Missing Data
null_features = all_data.columns[all_data.isnull().any()]
missing_ratio = (all_data[null_features].isnull().sum()/len(all_data)) * 100
missing_data = pd.DataFrame({'Missing Ratio' :missing_ratio})
print(missing_data.sort_values(by='Missing Ratio',ascending=False))


for col in ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu']:
    all_data[col] = all_data[col].fillna('None')

for col in ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'MasVnrType']:
    all_data[col] = all_data[col].fillna('None')
    
for col in ['BsmtExposure', 'BsmtFinType2', 'BsmtQual', 'BsmtCond', 'BsmtFinType1']:
    all_data[col] = all_data[col].fillna('None')

'''
No GarageYrBlt means no Garage. I can impute mean/median since it would 
incorrectly convey existence of Garage. same reasoning for MasVnrArea.
'''
for col in ['GarageYrBlt', 'MasVnrArea']:
    all_data[col] = all_data[col].fillna(0)

'''
Group data by neighborhood & imputed null LotFrontage columns with median of
grouped data.
'''
all_data['LotFrontage'] = all_data.groupby(['Neighborhood'])\
                    ['LotFrontage'].transform(lambda x : x.fillna(x.median()))
    
all_data['Electrical'] = \
    all_data['Electrical'].fillna(all_data['Electrical'].mode()[0])
    
all_data['MSZoning'] = \
    all_data['MSZoning'].fillna(all_data['MSZoning'].mode()[0])
    
all_data['Utilities'] = all_data['Utilities'].fillna('ELO')

all_data['Exterior1st'] = all_data['Exterior1st'].fillna('Other')

all_data['Exterior2nd'] = all_data['Exterior2nd'].fillna('Other')

all_data['SaleType'] = all_data['SaleType'].fillna('Oth')

all_data['Functional'] = \
    all_data['Functional'].fillna(all_data['Functional'].mode()[0])

all_data['KitchenQual'] = \
    all_data['KitchenQual'].fillna(all_data['KitchenQual'].mode()[0])

for col in ['BsmtFullBath', 'BsmtHalfBath', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'GarageCars', 'GarageArea']:
    all_data[col] = all_data[col].fillna(0)
    
null_features = all_data.columns[all_data.isnull().any()]
missing_ratio = (all_data[null_features].isnull().sum()/len(all_data)) * 100
missing_data = pd.DataFrame({'Missing Ratio' :missing_ratio})
print(missing_data.sort_values(by='Missing Ratio',ascending=False))

              Missing Ratio
PoolQC                99.66
MiscFeature           96.40
Alley                 93.22
Fence                 80.44
FireplaceQu           48.65
LotFrontage           16.65
GarageFinish           5.45
GarageQual             5.45
GarageCond             5.45
GarageYrBlt            5.45
GarageType             5.38
BsmtExposure           2.81
BsmtCond               2.81
BsmtQual               2.77
BsmtFinType2           2.74
BsmtFinType1           2.71
MasVnrType             0.82
MasVnrArea             0.79
MSZoning               0.14
BsmtFullBath           0.07
BsmtHalfBath           0.07
Functional             0.07
Utilities              0.07
GarageArea             0.03
GarageCars             0.03
Electrical             0.03
KitchenQual            0.03
TotalBsmtSF            0.03
BsmtUnfSF              0.03
BsmtFinSF2             0.03
BsmtFinSF1             0.03
Exterior2nd            0.03
Exterior1st            0.03
SaleType               0.03
Empty DataFrame
Colu

In [4]:
'''
            4. Try different parameter values for Decision Tree
               and find optimal values
               
               Selecting following features based on analysis 
               from BackwardElimination_NumericFeatureOnly.ipynb
'''

final_num_features = all_data.loc[:,['MSSubClass', 'LotArea', 'OverallQual', 
                                     'OverallCond', 'YearBuilt', 'YearRemodAdd', 
                                       'BsmtFinSF1', 'TotalBsmtSF', 'GrLivArea', 
                                     'BsmtFullBath', 'FullBath', 'KitchenAbvGr', 
                                       'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 
                                     'WoodDeckSF', 'EnclosedPorch', 
                               'ScreenPorch', 'PoolArea', 'YrSold']]

final_num_features.insert(0, 'intercept', np.ones((2919,1)))
final_train_num_features = final_num_features[:num_train_rows]
final_test_num_features = final_num_features[num_train_rows:]

from sklearn.cross_validation import train_test_split
X_train , X_test, y_train, y_test = \
    train_test_split(final_train_num_features, target, test_size=0.4, random_state=0)

from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor(random_state=0)
regressor.fit(X_train, y_train)

from sklearn import metrics
y_train_predict = regressor.predict(X_test)
print('Default Model MAE',metrics.mean_absolute_error(y_test, y_train_predict))
print('Default Model MSE',metrics.mean_squared_error(y_test, y_train_predict))
print("R-squared --> ", regressor.score(X_test, y_test)*100)


Default Model MAE 24112.539383561645
Default Model MSE 1431596711.3784246
R-squared -->  77.01998390916185




In [5]:
'''
            4a. Tune Decision Tree Depth
'''
depth_list = np.linspace(1, 10, 10, endpoint=False, dtype=int)
mse_list = []
acc_list = []
for depth in depth_list:
    regressor = DecisionTreeRegressor(random_state=0, max_depth=depth)
    regressor.fit(X_train, y_train)
    y_train_predict = regressor.predict(X_test)
    mse = metrics.mean_squared_error(y_test, y_train_predict)
    acc = regressor.score(X_test, y_test)*100
    mse_list.append(mse)
    acc_list.append(acc)
    print("MSE for depth - {0} : {1}".format(depth, mse))
    print("R-squared for depth - {0} : {1}".format(depth, acc))
    
# Best R-squared 76.82 is for depth 8

MSE for depth - 1 : 3394892163.9511614
R-squared for depth - 1 : 45.505130087131185
MSE for depth - 1 : 3394892163.9511614
R-squared for depth - 1 : 45.505130087131185
MSE for depth - 2 : 2357586901.2285194
R-squared for depth - 2 : 62.15597277139908
MSE for depth - 3 : 1849515473.9930742
R-squared for depth - 3 : 70.31154443510023
MSE for depth - 4 : 1454780810.443314
R-squared for depth - 4 : 76.64783233509895
MSE for depth - 5 : 1565750329.400996
R-squared for depth - 5 : 74.86654762623375
MSE for depth - 6 : 1699758536.1783252
R-squared for depth - 6 : 72.71544548722264
MSE for depth - 7 : 1536772051.8607423
R-squared for depth - 7 : 75.3317074571184
MSE for depth - 8 : 1444028317.618597
R-squared for depth - 8 : 76.82043154279809
MSE for depth - 9 : 1590962412.1827257
R-squared for depth - 9 : 74.46184282117044


In [6]:
'''
            4b. Tune Decision Tree min_samples_splits
'''
min_samples_splits = np.linspace(0.01, 0.1, 9, endpoint=False)
sa_sp_mse_list = []
sa_sp_acc_list = []
for min_split in min_samples_splits:
    regressor = DecisionTreeRegressor(random_state=0, min_samples_split=min_split)
    regressor.fit(X_train, y_train)
    y_train_predict = regressor.predict(X_test)
    mse = metrics.mean_squared_error(y_test, y_train_predict)
    acc = regressor.score(X_test, y_test)*100
    sa_sp_mse_list.append(mse)
    sa_sp_acc_list.append(acc)
    print("MSE for min_split - {0} : {1}".format(min_split, mse))
    print("R-squared for min_split - {0} : {1}".format(min_split, acc))
    
# Best R-squared is 78.31 for min_samples_splits=0.01

MSE for min_split - 0.01 : 1351162461.3300498
R-squared for min_split - 0.01 : 78.31111593375726
MSE for min_split - 0.020000000000000004 : 1446253086.823402
R-squared for min_split - 0.020000000000000004 : 76.78471950761495
MSE for min_split - 0.030000000000000006 : 1432206558.3096113
R-squared for min_split - 0.030000000000000006 : 77.01019463528309
MSE for min_split - 0.04000000000000001 : 1586058081.4139438
R-squared for min_split - 0.04000000000000001 : 74.54056722664416
MSE for min_split - 0.05000000000000001 : 1559960722.1241367
R-squared for min_split - 0.05000000000000001 : 74.95948250609509
MSE for min_split - 0.06000000000000001 : 1619647565.868082
R-squared for min_split - 0.06000000000000001 : 74.00138821966262
MSE for min_split - 0.07 : 1655441720.211563
R-squared for min_split - 0.07 : 73.42681981207028
MSE for min_split - 0.08 : 1695116098.933233
R-squared for min_split - 0.08 : 72.78996597315628
MSE for min_split - 0.09000000000000001 : 1693540777.133373
R-squared for 

In [7]:
'''
            4c. Tune Decision Tree min_samples_leafs
'''
min_samples_leafs = np.linspace(0.01, 0.1, 5, endpoint=True)
sa_lf_mse_list = []
sa_lf_acc_list = []
for min_sm_lf in min_samples_leafs:
    regressor = DecisionTreeRegressor(random_state=0, min_samples_leaf=min_sm_lf)
    regressor.fit(X_train, y_train)
    y_train_predict = regressor.predict(X_test)
    mse = metrics.mean_squared_error(y_test, y_train_predict)
    acc = regressor.score(X_test, y_test)*100
    sa_lf_mse_list.append(mse)
    sa_lf_acc_list.append(acc)
    print("MSE for min_leaf - {0} : {1}".format(min_sm_lf, mse))
    print("R-squared for min_leaf - {0} : {1}".format(min_sm_lf, acc))
    
# Best R-squared 73.51 is for min_leaf - 0.01

MSE for min_leaf - 0.01 : 1649644762.6273823
R-squared for min_leaf - 0.01 : 73.5198726791966
MSE for min_leaf - 0.0325 : 1698408104.704687
R-squared for min_leaf - 0.0325 : 72.73712263746135
MSE for min_leaf - 0.05500000000000001 : 1972150104.5449157
R-squared for min_leaf - 0.05500000000000001 : 68.34301114567836
MSE for min_leaf - 0.0775 : 2045981050.45495
R-squared for min_leaf - 0.0775 : 67.15787547756078
MSE for min_leaf - 0.1 : 2329906714.11265
R-squared for min_leaf - 0.1 : 62.60029563150653


In [8]:
'''
            4d. Tune Decision Tree max_features
'''
max_features = list(range(1,X_train.shape[1]))
mx_featutes_mse_list = []
mx_featutes_acc_list = []
for num_features in max_features:
    regressor = DecisionTreeRegressor(random_state=0, max_features=num_features)
    regressor.fit(X_train, y_train)
    y_train_predict = regressor.predict(X_test)
    mse = metrics.mean_squared_error(y_test, y_train_predict)
    acc = regressor.score(X_test, y_test)*100
    mx_featutes_mse_list.append(mse)
    mx_featutes_acc_list.append(acc)
    print("MSE for num_features - {0} : {1}".format(num_features, mse))
    print("R-squared for num_features - {0} : {1}".format(num_features, acc))
    
# Best R-squared is 79.18 is for max_features = 18

MSE for num_features - 1 : 2317143663.5753427
R-squared for num_features - 1 : 62.8051683476733
MSE for num_features - 2 : 1954134998.859589
R-squared for num_features - 2 : 68.63218994529178
MSE for num_features - 3 : 1975858021.5650685
R-squared for num_features - 3 : 68.28349159515874
MSE for num_features - 4 : 2137328767.515411
R-squared for num_features - 4 : 65.69156028472305
MSE for num_features - 5 : 2066363146.2363014
R-squared for num_features - 5 : 66.83070171046722
MSE for num_features - 6 : 1939005695.1506848
R-squared for num_features - 6 : 68.87504579981456
MSE for num_features - 7 : 1969003763.6386986
R-squared for num_features - 7 : 68.3935162663436
MSE for num_features - 8 : 1537799205.1472602
R-squared for num_features - 8 : 75.31521957413827
MSE for num_features - 9 : 2133284842.0171232
R-squared for num_features - 9 : 65.75647344936097
MSE for num_features - 10 : 1518675432.3321917
R-squared for num_features - 10 : 75.62219471840544
MSE for num_features - 11 : 1673

In [9]:
'''
            5. Decision Tree Model by selecting optimal values for parameters
'''
regressor = DecisionTreeRegressor(random_state=0, max_depth=8, 
                        min_samples_split=0.01, min_samples_leaf=0.01, max_features = 18)
regressor.fit(X_train, y_train)

from sklearn import metrics
y_train_predict = regressor.predict(X_test)
print('Optimal params MAE',metrics.mean_absolute_error(y_test, y_train_predict))
print('Optimal params MSE',metrics.mean_squared_error(y_test, y_train_predict))
print("Optimal params R-squared --> ", regressor.score(X_test, y_test)*100)


'''
Error with optimal parameters is higher than that with default parameters.
So, after some trail and error, I decreased error by changing 
    max_features = 8
    max_depth=6
'''
regressor = DecisionTreeRegressor(random_state=0, max_depth=6, 
                        min_samples_split=0.03, min_samples_leaf=0.01, max_features = 8)
regressor.fit(X_train, y_train)
y_train_predict = regressor.predict(X_test)
print('Adjusted Optimal params MAE',metrics.mean_absolute_error(y_test, y_train_predict))
print('Adjusted Optimal params MSE',metrics.mean_squared_error(y_test, y_train_predict))
print("Adjusted Optimal params R-squared --> ", regressor.score(X_test, y_test)*100)

Optimal params MAE 24634.73387314005
Optimal params MSE 1634243258.518491
Optimal params R-squared -->  73.76709789942265
Adjusted Optimal params MAE 25017.204371393396
Adjusted Optimal params MSE 1348170192.1797292
Adjusted Optimal params R-squared -->  78.35914789183313


In [10]:
'''
            6. Visualize Decision Tree
'''
from sklearn.externals.six import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus
dot_data = StringIO()
export_graphviz(regressor, out_file=dot_data,
               filled=True, rounded=True,
               special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_jpeg('DecisionTreeVisualization.jpeg')
print('Max Depth : ', regressor.tree_.max_depth)
print('Params : ', regressor.get_params)

Max Depth :  6
Params :  <bound method BaseEstimator.get_params of DecisionTreeRegressor(criterion='mse', max_depth=6, max_features=8,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=0.01,
           min_samples_split=0.03, min_weight_fraction_leaf=0.0,
           presort=False, random_state=0, splitter='best')>
