# Load Files

In [1]:
import re
import pandas as pd
import os
import seaborn as sns
import numpy as np

In [2]:
os.chdir('C:/Users/moyke/Desktop/NYCDSA/Housing_Working')
house_train = pd.read_csv('train.csv')
house_test = pd.read_csv('test.csv')

# Find Category Levels From Text File

In [3]:
# FNDING FACTOR LEVELS 
factorLevel = {}
with open('data_description.txt', 'r') as f:
    lines = f.readlines()
    for line in lines:
        if line.find(':') != -1 and line.find(' ') != 0:
            col_name = re.sub(':.*', '', line).strip()
            factorLevel[col_name] = []
        else:
            if len(re.findall('[a-zA-Z0-9]', line)) > 0 :
                level = re.sub('\t.*', '', line).strip()
                if level !='':
                    factorLevel[col_name].append(level)
                    
# outputs factorLevel as a dict without any blank spaces
factorLevel = {k:v for k,v in factorLevel.items() if len(v) > 0}

# Pre Processing

In [4]:
# Understanding Missingness 
house_train.isnull().sum()
house_test.isnull().sum()

Id                  0
MSSubClass          0
MSZoning            4
LotFrontage       227
LotArea             0
Street              0
Alley            1352
LotShape            0
LandContour         0
Utilities           2
LotConfig           0
LandSlope           0
Neighborhood        0
Condition1          0
Condition2          0
BldgType            0
HouseStyle          0
OverallQual         0
OverallCond         0
YearBuilt           0
YearRemodAdd        0
RoofStyle           0
RoofMatl            0
Exterior1st         1
Exterior2nd         1
MasVnrType         16
MasVnrArea         15
ExterQual           0
ExterCond           0
Foundation          0
                 ... 
HalfBath            0
BedroomAbvGr        0
KitchenAbvGr        0
KitchenQual         1
TotRmsAbvGrd        0
Functional          2
Fireplaces          0
FireplaceQu       730
GarageType         76
GarageYrBlt        78
GarageFinish       78
GarageCars          1
GarageArea          1
GarageQual         78
GarageCond

In [5]:
# Combining both house_train and house_test to clean data
house_train # 80 columns
house_test # 79 columns. need SalePrice placeholder
house_test['SalePrice'] = np.nan

# house_test starts on index row 1460 
house_full = pd.concat([house_train, house_test], axis = 0)

### Imputing Garage Features

In [6]:
# Create new feature: hasGarage
house_full['hasGarage'] = [0 if x == True else 1 for x in house_full.GarageYrBlt.isnull()]

# replacing garageyrblt value 2207 to 2007
house_full.loc[house_full.GarageYrBlt == 2207, 'GarageYrBlt']  = 2007

# Create new feature GarageBlt (how many years after was the garage built after the house)
house_full['GarageBlt'] = house_full.GarageYrBlt - house_full.YearBuilt

# NA values will equal to zero assuming that these garages were built the same time as the house  according to Shu's graph
house_full.loc[house_full.GarageBlt.isnull(), 'GarageBlt'] = 0

# Delete the GarageYrBlt variable. No longer needed 
del house_full['GarageYrBlt']

In [7]:
# Plot year built against garageblt
sns.lmplot(x = 'YearBuilt', y = 'GarageBlt', fit_reg = False, hue = 'hasGarage', data = house_full)

<seaborn.axisgrid.FacetGrid at 0x20937a59860>

### Imputing YearRemodAdd Features

In [8]:
# New var isRemod: 0 if year remodelled is the same as year built, 1 otherwise 
house_full['isRemod'] = [0 if x == True else 1 for x in house_full.YearRemodAdd == house_full.YearBuilt]

# New var RemodAdd: YearRemodAdd - YearBuilt: How many years later was hosue remodeled?
house_full['RemodAdd'] = house_full.YearRemodAdd - house_full.YearBuilt

# Delete YearRemodAdd variable
del house_full['YearRemodAdd'] 

In [9]:
# Plot YearBuilt against RemodAdd
sns.lmplot(x = 'YearBuilt', y = 'RemodAdd', data = house_full, fit_reg = False, hue = 'isRemod')

<seaborn.axisgrid.FacetGrid at 0x20933de2b00>

### Deleting zero to nonzero variance features

In [10]:
del house_full["Street"]
del house_full["Utilities"]
del house_full["Condition2"]
del house_full["RoofMatl"]
del house_full["LowQualFinSF"]
del house_full["3SsnPorch"]
del house_full["PoolArea"]
del house_full["PoolQC"]
del house_full["MiscVal"]

# Feature Engineering


### Bath Capacity

In [11]:
# Total Number of bathrooms
house_full['TotBath'] = house_full.FullBath + .5*house_full.HalfBath

# Replace zeros with median
house_full.TotBath.replace(0, house_full.TotBath.median(), inplace=True)

# Replace zero bedrooms with median bedroom #. This is so we can engineer a feature
house_full.BedroomAbvGr.replace(0, house_full.BedroomAbvGr.median(), inplace=True)
house_full.BedroomAbvGr.unique()

# Bath Capacity. The higher the ratio the better
house_full['Bath_Capacity'] = house_full.TotBath / house_full.BedroomAbvGr
house_full.Bath_Capacity

0       0.833333
1       0.666667
2       0.833333
3       0.333333
4       0.625000
5       1.500000
6       0.666667
7       0.833333
8       1.000000
9       0.500000
10      0.333333
11      0.750000
12      0.500000
13      0.666667
14      0.750000
15      0.500000
16      0.500000
17      1.000000
18      0.500000
19      0.333333
20      0.875000
21      0.333333
22      0.666667
23      0.333333
24      0.333333
25      0.666667
26      0.333333
27      0.666667
28      0.500000
29      1.000000
          ...   
1429    0.500000
1430    0.500000
1431    0.500000
1432    0.500000
1433    0.500000
1434    1.000000
1435    1.000000
1436    0.666667
1437    0.500000
1438    0.666667
1439    0.666667
1440    0.500000
1441    1.000000
1442    0.666667
1443    0.666667
1444    0.500000
1445    1.000000
1446    0.500000
1447    0.333333
1448    0.500000
1449    1.000000
1450    0.500000
1451    0.333333
1452    0.500000
1453    0.500000
1454    0.500000
1455    0.500000
1456    0.2500

### Parking Capacity

In [12]:
# Parking Capacity. The higher the better
house_full['Parking_Capacity'] = house_full.GarageCars / house_full.BedroomAbvGr
house_full.Parking_Capacity.unique()

array([ 0.66666667,  1.        ,  0.75      ,  2.        ,  0.5       ,
        0.33333333,  0.        ,  0.25      ,  0.6       ,  1.5       ,
        3.        ,  0.4       ,  0.16666667,  0.2       ,  1.33333333,
        2.5       ,         nan])

# Imputing Rest of Missing Features

### Split back to Training/Test Data

In [13]:
# split full data back to training and test before imputation
house_train = house_full.iloc[0:1460,:]
house_test = house_full.iloc[1460:,]

### Impute for training

In [14]:
# Impute 'None' for categorical feautures in training
for feature in factorLevel.keys():
    if feature in house_train.columns.values:
        if house_train.loc[:,feature].isnull().any() == True:
            house_train.loc[house_train[feature].isnull(),feature] = 'None'
            
# Impute the mean for numerical features in training
for feature in house_train.columns.values:
    if house_train.loc[:,feature].dtype.name == 'float64' or house_train.loc[:,feature].dtype.name == 'int64':
        if house_train.loc[:,feature].isnull().any() == True:
            house_train.loc[house_train[feature].isnull(),feature] = house_train.loc[:,feature].median()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [15]:
# Check for  null values in training data
house_train.isnull().any().sum() # none

0

### Impute for testing

In [16]:
# Impute 'None' for categorical feautures in testing
for feature in factorLevel.keys():
    if feature in house_test.columns.values:
        if house_test.loc[:,feature].isnull().any() == True:
            house_test.loc[house_test[feature].isnull(),feature] = 'None'

# Impute the mean for numerical features in training
for feature in house_test.columns.values:
    if house_test.loc[:,feature].dtype.name == 'float64' or house_test.loc[:,feature].dtype.name == 'int64':
        if house_test.loc[:,feature].isnull().any() == True:
            house_test.loc[house_test[feature].isnull(),feature] = house_test.loc[:,feature].median()
  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [17]:
# Check for  null values in training data
house_test.isnull().any().sum() # Only SalePrice is still Null

1

# Converting to category variables

In [18]:
# convert current numerical variables to object so it can be converted to category
house_train.MSSubClass =  house_train.MSSubClass.astype('str')
house_train.OverallQual = house_train.OverallQual.astype('str')
house_train.OverallCond = house_train.OverallCond.astype('str')

# Doing the same for the test set
house_test.MSSubClass =  house_test.MSSubClass.astype('str')
house_test.OverallQual = house_test.OverallQual.astype('str')
house_test.OverallCond = house_test.OverallCond.astype('str')


# Converting Column data types in data sets into category and setting levels from the factorlevel dictionary 
for varname in factorLevel.keys():
    if varname in house_train.columns.values:
        house_train.loc[:,varname] = house_train.loc[:,varname].astype('category')
    if varname in house_test.columns.values:
        house_test.loc[:,varname] = house_test.loc[:,varname].astype('category')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


### Finishing touch, delete ID Column

In [19]:
# Nulling id column
del house_train['Id']
del house_test['Id']

# Normalize SalePrice variable

In [20]:
sns.distplot(house_train.SalePrice, bins = 20)
sns.distplot([np.log(house_train.SalePrice + 1)], bins = 20)

# Converting SalePrice to Log
house_train.SalePrice = np.log(house_train.SalePrice)
house_test.SalePrice = np.log(house_test.SalePrice)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


# Dummify and Seperate Training Set to Another Training and Test Set

In [21]:
from copy import deepcopy

from copy import deepcopy
import sklearn.model_selection as ms

# Creating a deep copy of training set 
house_train_copy = deepcopy(house_train)

house_train_x = pd.get_dummies(house_train_copy.drop('SalePrice', axis = 1))
house_train_y = house_train_copy.SalePrice.values.reshape(-1,1)

# Create sperate out original training set to the new training and test set
x_train, x_test, y_train, y_test = ms.train_test_split(house_train_x, \
                                                       house_train_y, \
                                                       test_size = 1/4, \
                                                       random_state = 0)


# Fitting Decision Tree

In [22]:
from sklearn import tree

# Train decision tree
tree_model = tree.DecisionTreeRegressor(max_depth = 5)
tree_model.fit(x_train, y_train)

DecisionTreeRegressor(criterion='mse', max_depth=5, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

### First try, got 84% R sq on training, and 79% R sq on test

In [23]:
print(tree_model.score(x_train, y_train))
# 84.1% variance explained on training data on first try... 

print(tree_model.score(x_test, y_test))
# but only 79% variance explained on test data...

0.84108721728
0.789191120616


### These were the top 10 variables

Both Bath engineered features ranked higher than original bath features in this first model fit

In [24]:
# Top 10 Important Variables
result = pd.concat([ pd.DataFrame(x_train.columns.values),  pd.DataFrame(tree_model.feature_importances_)], axis = 1)
result.columns = ['feature', 'importance']
result.sort_values(by = 'importance', ascending = False).head(10)

Unnamed: 0,feature,importance
10,GrLivArea,0.46868
183,ExterQual_TA,0.135213
2,YearBuilt,0.097325
19,GarageCars,0.096405
8,1stFlrSF,0.032239
32,Bath_Capacity,0.031962
1,LotArea,0.030426
236,CentralAir_Y,0.028934
7,TotalBsmtSF,0.017185
31,TotBath,0.012613


# Tuning Decision Tree

In [25]:
from sklearn.grid_search import GridSearchCV

tree_model = tree.DecisionTreeRegressor()

param_grid = [{'max_depth': np.arange(1,21), \
              'min_samples_leaf': np.arange(1,10)}]

tree_model.set_params(random_state = 0)

grid_search = GridSearchCV(tree_model, param_grid=param_grid, cv = 5, verbose = 2)

grid_search.fit(x_train, y_train)

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s


Fitting 5 folds for each of 180 candidates, totalling 900 fits
[CV] max_depth=1, min_samples_leaf=1 .................................
[CV] ........................ max_depth=1, min_samples_leaf=1 -   0.0s
[CV] max_depth=1, min_samples_leaf=1 .................................
[CV] ........................ max_depth=1, min_samples_leaf=1 -   0.0s
[CV] max_depth=1, min_samples_leaf=1 .................................
[CV] ........................ max_depth=1, min_samples_leaf=1 -   0.0s
[CV] max_depth=1, min_samples_leaf=1 .................................
[CV] ........................ max_depth=1, min_samples_leaf=1 -   0.0s
[CV] max_depth=1, min_samples_leaf=1 .................................
[CV] ........................ max_depth=1, min_samples_leaf=1 -   0.0s
[CV] max_depth=1, min_samples_leaf=2 .................................
[CV] ........................ max_depth=1, min_samples_leaf=2 -   0.0s
[CV] max_depth=1, min_samples_leaf=2 .................................
[CV] .........

[CV] ........................ max_depth=2, min_samples_leaf=3 -   0.0s
[CV] max_depth=2, min_samples_leaf=3 .................................
[CV] ........................ max_depth=2, min_samples_leaf=3 -   0.0s
[CV] max_depth=2, min_samples_leaf=4 .................................
[CV] ........................ max_depth=2, min_samples_leaf=4 -   0.0s
[CV] max_depth=2, min_samples_leaf=4 .................................
[CV] ........................ max_depth=2, min_samples_leaf=4 -   0.0s
[CV] max_depth=2, min_samples_leaf=4 .................................
[CV] ........................ max_depth=2, min_samples_leaf=4 -   0.0s
[CV] max_depth=2, min_samples_leaf=4 .................................
[CV] ........................ max_depth=2, min_samples_leaf=4 -   0.0s
[CV] max_depth=2, min_samples_leaf=4 .................................
[CV] ........................ max_depth=2, min_samples_leaf=4 -   0.0s
[CV] max_depth=2, min_samples_leaf=5 .................................
[CV] .

[CV] ........................ max_depth=3, min_samples_leaf=8 -   0.0s
[CV] max_depth=3, min_samples_leaf=8 .................................
[CV] ........................ max_depth=3, min_samples_leaf=8 -   0.0s
[CV] max_depth=3, min_samples_leaf=8 .................................
[CV] ........................ max_depth=3, min_samples_leaf=8 -   0.0s
[CV] max_depth=3, min_samples_leaf=8 .................................
[CV] ........................ max_depth=3, min_samples_leaf=8 -   0.0s
[CV] max_depth=3, min_samples_leaf=8 .................................
[CV] ........................ max_depth=3, min_samples_leaf=8 -   0.0s
[CV] max_depth=3, min_samples_leaf=9 .................................
[CV] ........................ max_depth=3, min_samples_leaf=9 -   0.0s
[CV] max_depth=3, min_samples_leaf=9 .................................
[CV] ........................ max_depth=3, min_samples_leaf=9 -   0.0s
[CV] max_depth=3, min_samples_leaf=9 .................................
[CV] .

[CV] ........................ max_depth=5, min_samples_leaf=2 -   0.0s
[CV] max_depth=5, min_samples_leaf=2 .................................
[CV] ........................ max_depth=5, min_samples_leaf=2 -   0.0s
[CV] max_depth=5, min_samples_leaf=2 .................................
[CV] ........................ max_depth=5, min_samples_leaf=2 -   0.0s
[CV] max_depth=5, min_samples_leaf=2 .................................
[CV] ........................ max_depth=5, min_samples_leaf=2 -   0.0s
[CV] max_depth=5, min_samples_leaf=3 .................................
[CV] ........................ max_depth=5, min_samples_leaf=3 -   0.0s
[CV] max_depth=5, min_samples_leaf=3 .................................
[CV] ........................ max_depth=5, min_samples_leaf=3 -   0.0s
[CV] max_depth=5, min_samples_leaf=3 .................................
[CV] ........................ max_depth=5, min_samples_leaf=3 -   0.0s
[CV] max_depth=5, min_samples_leaf=3 .................................
[CV] .

[CV] ........................ max_depth=6, min_samples_leaf=5 -   0.0s
[CV] max_depth=6, min_samples_leaf=5 .................................
[CV] ........................ max_depth=6, min_samples_leaf=5 -   0.0s
[CV] max_depth=6, min_samples_leaf=5 .................................
[CV] ........................ max_depth=6, min_samples_leaf=5 -   0.0s
[CV] max_depth=6, min_samples_leaf=5 .................................
[CV] ........................ max_depth=6, min_samples_leaf=5 -   0.0s
[CV] max_depth=6, min_samples_leaf=6 .................................
[CV] ........................ max_depth=6, min_samples_leaf=6 -   0.0s
[CV] max_depth=6, min_samples_leaf=6 .................................
[CV] ........................ max_depth=6, min_samples_leaf=6 -   0.0s
[CV] max_depth=6, min_samples_leaf=6 .................................
[CV] ........................ max_depth=6, min_samples_leaf=6 -   0.0s
[CV] max_depth=6, min_samples_leaf=6 .................................
[CV] .

[CV] ........................ max_depth=7, min_samples_leaf=7 -   0.0s
[CV] max_depth=7, min_samples_leaf=8 .................................
[CV] ........................ max_depth=7, min_samples_leaf=8 -   0.0s
[CV] max_depth=7, min_samples_leaf=8 .................................
[CV] ........................ max_depth=7, min_samples_leaf=8 -   0.0s
[CV] max_depth=7, min_samples_leaf=8 .................................
[CV] ........................ max_depth=7, min_samples_leaf=8 -   0.0s
[CV] max_depth=7, min_samples_leaf=8 .................................
[CV] ........................ max_depth=7, min_samples_leaf=8 -   0.0s
[CV] max_depth=7, min_samples_leaf=8 .................................
[CV] ........................ max_depth=7, min_samples_leaf=8 -   0.0s
[CV] max_depth=7, min_samples_leaf=9 .................................
[CV] ........................ max_depth=7, min_samples_leaf=9 -   0.0s
[CV] max_depth=7, min_samples_leaf=9 .................................
[CV] .

[CV] ........................ max_depth=9, min_samples_leaf=2 -   0.0s
[CV] max_depth=9, min_samples_leaf=2 .................................
[CV] ........................ max_depth=9, min_samples_leaf=2 -   0.0s
[CV] max_depth=9, min_samples_leaf=2 .................................
[CV] ........................ max_depth=9, min_samples_leaf=2 -   0.0s
[CV] max_depth=9, min_samples_leaf=3 .................................
[CV] ........................ max_depth=9, min_samples_leaf=3 -   0.0s
[CV] max_depth=9, min_samples_leaf=3 .................................
[CV] ........................ max_depth=9, min_samples_leaf=3 -   0.0s
[CV] max_depth=9, min_samples_leaf=3 .................................
[CV] ........................ max_depth=9, min_samples_leaf=3 -   0.0s
[CV] max_depth=9, min_samples_leaf=3 .................................
[CV] ........................ max_depth=9, min_samples_leaf=3 -   0.0s
[CV] max_depth=9, min_samples_leaf=3 .................................
[CV] .

[CV] ....................... max_depth=10, min_samples_leaf=6 -   0.0s
[CV] max_depth=10, min_samples_leaf=6 ................................
[CV] ....................... max_depth=10, min_samples_leaf=6 -   0.0s
[CV] max_depth=10, min_samples_leaf=6 ................................
[CV] ....................... max_depth=10, min_samples_leaf=6 -   0.0s
[CV] max_depth=10, min_samples_leaf=6 ................................
[CV] ....................... max_depth=10, min_samples_leaf=6 -   0.0s
[CV] max_depth=10, min_samples_leaf=6 ................................
[CV] ....................... max_depth=10, min_samples_leaf=6 -   0.0s
[CV] max_depth=10, min_samples_leaf=7 ................................
[CV] ....................... max_depth=10, min_samples_leaf=7 -   0.0s
[CV] max_depth=10, min_samples_leaf=7 ................................
[CV] ....................... max_depth=10, min_samples_leaf=7 -   0.0s
[CV] max_depth=10, min_samples_leaf=7 ................................
[CV] .

[CV] max_depth=11, min_samples_leaf=9 ................................
[CV] ....................... max_depth=11, min_samples_leaf=9 -   0.0s
[CV] max_depth=11, min_samples_leaf=9 ................................
[CV] ....................... max_depth=11, min_samples_leaf=9 -   0.0s
[CV] max_depth=11, min_samples_leaf=9 ................................
[CV] ....................... max_depth=11, min_samples_leaf=9 -   0.0s
[CV] max_depth=11, min_samples_leaf=9 ................................
[CV] ....................... max_depth=11, min_samples_leaf=9 -   0.0s
[CV] max_depth=12, min_samples_leaf=1 ................................
[CV] ....................... max_depth=12, min_samples_leaf=1 -   0.0s
[CV] max_depth=12, min_samples_leaf=1 ................................
[CV] ....................... max_depth=12, min_samples_leaf=1 -   0.0s
[CV] max_depth=12, min_samples_leaf=1 ................................
[CV] ....................... max_depth=12, min_samples_leaf=1 -   0.0s
[CV] m

[CV] ....................... max_depth=13, min_samples_leaf=3 -   0.0s
[CV] max_depth=13, min_samples_leaf=3 ................................
[CV] ....................... max_depth=13, min_samples_leaf=3 -   0.0s
[CV] max_depth=13, min_samples_leaf=3 ................................
[CV] ....................... max_depth=13, min_samples_leaf=3 -   0.0s
[CV] max_depth=13, min_samples_leaf=3 ................................
[CV] ....................... max_depth=13, min_samples_leaf=3 -   0.0s
[CV] max_depth=13, min_samples_leaf=4 ................................
[CV] ....................... max_depth=13, min_samples_leaf=4 -   0.0s
[CV] max_depth=13, min_samples_leaf=4 ................................
[CV] ....................... max_depth=13, min_samples_leaf=4 -   0.0s
[CV] max_depth=13, min_samples_leaf=4 ................................
[CV] ....................... max_depth=13, min_samples_leaf=4 -   0.0s
[CV] max_depth=13, min_samples_leaf=4 ................................
[CV] .

[CV] ....................... max_depth=14, min_samples_leaf=7 -   0.0s
[CV] max_depth=14, min_samples_leaf=7 ................................
[CV] ....................... max_depth=14, min_samples_leaf=7 -   0.0s
[CV] max_depth=14, min_samples_leaf=7 ................................
[CV] ....................... max_depth=14, min_samples_leaf=7 -   0.0s
[CV] max_depth=14, min_samples_leaf=7 ................................
[CV] ....................... max_depth=14, min_samples_leaf=7 -   0.0s
[CV] max_depth=14, min_samples_leaf=7 ................................
[CV] ....................... max_depth=14, min_samples_leaf=7 -   0.0s
[CV] max_depth=14, min_samples_leaf=8 ................................
[CV] ....................... max_depth=14, min_samples_leaf=8 -   0.0s
[CV] max_depth=14, min_samples_leaf=8 ................................
[CV] ....................... max_depth=14, min_samples_leaf=8 -   0.0s
[CV] max_depth=14, min_samples_leaf=8 ................................
[CV] .

[CV] ....................... max_depth=16, min_samples_leaf=1 -   0.0s
[CV] max_depth=16, min_samples_leaf=1 ................................
[CV] ....................... max_depth=16, min_samples_leaf=1 -   0.0s
[CV] max_depth=16, min_samples_leaf=1 ................................
[CV] ....................... max_depth=16, min_samples_leaf=1 -   0.0s
[CV] max_depth=16, min_samples_leaf=2 ................................
[CV] ....................... max_depth=16, min_samples_leaf=2 -   0.0s
[CV] max_depth=16, min_samples_leaf=2 ................................
[CV] ....................... max_depth=16, min_samples_leaf=2 -   0.0s
[CV] max_depth=16, min_samples_leaf=2 ................................
[CV] ....................... max_depth=16, min_samples_leaf=2 -   0.0s
[CV] max_depth=16, min_samples_leaf=2 ................................
[CV] ....................... max_depth=16, min_samples_leaf=2 -   0.0s
[CV] max_depth=16, min_samples_leaf=2 ................................
[CV] .

[CV] ....................... max_depth=17, min_samples_leaf=4 -   0.0s
[CV] max_depth=17, min_samples_leaf=4 ................................
[CV] ....................... max_depth=17, min_samples_leaf=4 -   0.0s
[CV] max_depth=17, min_samples_leaf=4 ................................
[CV] ....................... max_depth=17, min_samples_leaf=4 -   0.0s
[CV] max_depth=17, min_samples_leaf=4 ................................
[CV] ....................... max_depth=17, min_samples_leaf=4 -   0.0s
[CV] max_depth=17, min_samples_leaf=4 ................................
[CV] ....................... max_depth=17, min_samples_leaf=4 -   0.0s
[CV] max_depth=17, min_samples_leaf=5 ................................
[CV] ....................... max_depth=17, min_samples_leaf=5 -   0.0s
[CV] max_depth=17, min_samples_leaf=5 ................................
[CV] ....................... max_depth=17, min_samples_leaf=5 -   0.0s
[CV] max_depth=17, min_samples_leaf=5 ................................
[CV] .

[CV] ....................... max_depth=18, min_samples_leaf=7 -   0.0s
[CV] max_depth=18, min_samples_leaf=7 ................................
[CV] ....................... max_depth=18, min_samples_leaf=7 -   0.0s
[CV] max_depth=18, min_samples_leaf=7 ................................
[CV] ....................... max_depth=18, min_samples_leaf=7 -   0.0s
[CV] max_depth=18, min_samples_leaf=7 ................................
[CV] ....................... max_depth=18, min_samples_leaf=7 -   0.0s
[CV] max_depth=18, min_samples_leaf=8 ................................
[CV] ....................... max_depth=18, min_samples_leaf=8 -   0.0s
[CV] max_depth=18, min_samples_leaf=8 ................................
[CV] ....................... max_depth=18, min_samples_leaf=8 -   0.0s
[CV] max_depth=18, min_samples_leaf=8 ................................
[CV] ....................... max_depth=18, min_samples_leaf=8 -   0.0s
[CV] max_depth=18, min_samples_leaf=8 ................................
[CV] .

[CV] ....................... max_depth=20, min_samples_leaf=1 -   0.0s
[CV] max_depth=20, min_samples_leaf=1 ................................
[CV] ....................... max_depth=20, min_samples_leaf=1 -   0.0s
[CV] max_depth=20, min_samples_leaf=1 ................................
[CV] ....................... max_depth=20, min_samples_leaf=1 -   0.0s
[CV] max_depth=20, min_samples_leaf=2 ................................
[CV] ....................... max_depth=20, min_samples_leaf=2 -   0.0s
[CV] max_depth=20, min_samples_leaf=2 ................................
[CV] ....................... max_depth=20, min_samples_leaf=2 -   0.0s
[CV] max_depth=20, min_samples_leaf=2 ................................
[CV] ....................... max_depth=20, min_samples_leaf=2 -   0.0s
[CV] max_depth=20, min_samples_leaf=2 ................................
[CV] ....................... max_depth=20, min_samples_leaf=2 -   0.0s
[CV] max_depth=20, min_samples_leaf=2 ................................
[CV] .

[Parallel(n_jobs=1)]: Done 900 out of 900 | elapsed:   34.7s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=0, splitter='best'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'max_depth': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20]), 'min_samples_leaf': array([1, 2, 3, 4, 5, 6, 7, 8, 9])}],
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=2)

In [26]:
grid_search.best_params_

{'max_depth': 7, 'min_samples_leaf': 7}

Best hyperparamters are max depth = 7, min_samples_leaf = 7

### Refit Decision Tree with Best Parameters

In [27]:
tree_model = tree.DecisionTreeRegressor(max_depth = 7, 
                                        min_samples_leaf = 7)

tree_model.fit(x_train, y_train)

DecisionTreeRegressor(criterion='mse', max_depth=7, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=7,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

### R sq is now 90% on training and 79% on test

Though training score increased, test score remained unchanged

In [30]:
print(tree_model.score(x_train, y_train))
print(tree_model.score(x_test, y_test))

0.900569576276
0.785170739179


### Top 10 Important Variables for this fitted tree

Both of the bath engineerd features continue to rank higher in importance compared to the original

In [31]:
result = pd.concat([pd.DataFrame(x_train.columns.values),pd.DataFrame(tree_model.feature_importances_)], axis = 1)
result.columns = ['feature','importance']
result.sort_values(by = 'importance', ascending = False).head(10)


Unnamed: 0,feature,importance
10,GrLivArea,0.452972
183,ExterQual_TA,0.126282
2,YearBuilt,0.098428
19,GarageCars,0.090038
8,1stFlrSF,0.031055
1,LotArea,0.030206
32,Bath_Capacity,0.029851
235,CentralAir_N,0.027022
7,TotalBsmtSF,0.020751
31,TotBath,0.01178


# Thoughts so far...

I think we can do better on the test score...our model is only fit on one set of observations, so there is high variance in our model.

However, if we fit the model on MULTIPLE sets of observations then we can lower this variance by averaging a bunch of weak models together

Lets try attacking with Bagged Trees to decrease the variance!

# Fitting Bagged Trees

In [33]:
from sklearn.ensemble import BaggingRegressor

bag_tree = BaggingRegressor(
        tree.DecisionTreeRegressor(),
        n_estimators = 500, 
        max_samples = round(len(x_train)*.67), # bag two thirds of training set
        bootstrap = True,      
        random_state = 0,
        oob_score = True
        )

bag_tree.fit(x_train, y_train)

  y = column_or_1d(y, warn=True)


BaggingRegressor(base_estimator=DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best'),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=734, n_estimators=500, n_jobs=1, oob_score=True,
         random_state=0, verbose=0, warm_start=False)

### R sq for Training is 96%, OOB 86% and Test 88%. Test score Exceeded our expectations!

Bagged Tree model gave us a much higher Test score, goign from 79% to 88% R sqaured!

In [37]:
print('Training Score:', bag_tree.score(x_train, y_train))
 
print('OOB Score:', bag_tree.oob_score_)

print('Test Score:', bag_tree.score(x_test, y_test))

Training Score: 0.962722034031
OOB Score: 0.85838827501
Test Score: 0.876984669309


### Error Rate: MSE: 27.5%

In [45]:
np.mean((y_test - bag_tree.predict(x_test))**2)

0.27465053651564059

## Thoughts so far:

Much improved training and test scores!
But, perhaps these trees are WAY too correlated, still increasing too much variance in our model.
Same predictors will be chosen at the top of all our trees
Lets try attacking Random Forest to decorrelate! Go!

# Fitting Random Forest: First Attempt

In [53]:
from sklearn.ensemble import RandomForestRegressor
import math

rf_tree_1 = RandomForestRegressor(n_estimators = 500,
                                max_features = round(math.sqrt(len(x_train.columns))),
                                bootstrap = True,
                                oob_score = True,
                                random_state = 0)

rf_tree_1.fit(x_train, y_train)

  # Remove the CWD from sys.path while we load stuff.


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=18, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=500, n_jobs=1, oob_score=True, random_state=0,
           verbose=0, warm_start=False)

## R sq for Training: 98%, OOB: 85%, Test: 87%. No improvement in Test Score...

Training score went up 2 points, but Test score slightly lowered. .2%. 

In [54]:
print('Training Score:' ,rf_tree_1.score(x_train, y_train))

print('OOB Score:', rf_tree_1.oob_score_)

print('Test Score:' ,rf_tree_1.score(x_test, y_test))

Training Score: 0.980014193856
OOB Score: 0.854726657908
Test Score: 0.874614961694


### Error Rate: MSE = .2629

In [55]:
np.mean((y_test - rf_tree_1.predict(x_test))**2)

0.26291239489685486

GRLiveArea is still the top variable. But FullBath (original bath feature) is now ranked higher than the engineered bath features...

In [56]:
result = pd.concat([pd.DataFrame(x_train.columns.values),pd.DataFrame(rf_tree_1.feature_importances_)], axis = 1)
result.columns = ['feature','importance']
result.sort_values(by = 'importance', ascending = False).head(10)

Unnamed: 0,feature,importance
10,GrLivArea,0.070697
20,GarageArea,0.047409
2,YearBuilt,0.04393
19,GarageCars,0.042184
7,TotalBsmtSF,0.039968
8,1stFlrSF,0.036597
13,FullBath,0.032637
31,TotBath,0.032377
183,ExterQual_TA,0.031841
32,Bath_Capacity,0.029885


# Grid Search CV to Tune Random Forest

In [44]:

rf_tree_cv = RandomForestRegressor(
                                bootstrap = True,
                                oob_score = True,
                                random_state = 0)

rf_param_grid = [{
        'n_estimators' : [100,250,500,750,1000], #Test[100,250,500,750,1000]
        'max_features' : np.arange(17,40),
        #'min_samples_leaf' : np.arange(2,9),
        #'max_depth' : np.arange(5,11)
        }]

rf_grid_search = GridSearchCV(rf_tree_cv, param_grid = rf_param_grid, cv = 5)

rf_grid_search.fit(x_train, y_train)

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  best_estimator.fit(X, y, **self.fit_params)


GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=True, random_state=0, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'n_estimators': [100, 250, 500, 750, 1000], 'max_features': array([17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39])}],
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

## Best parameters: Max_features = 37, n_estimators = 1000

In [46]:
rf_grid_search.best_params_

{'max_features': 37, 'n_estimators': 1000}

# Refit Random Forest with Tuned Hyperparameters

In [47]:
rf_tree = RandomForestRegressor(n_estimators = 1000,
                                max_features = 37,
                                bootstrap = True,
                                oob_score = True,
                                random_state = 0)


rf_tree.fit(x_train, y_train)

  


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=37, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=1000, n_jobs=1, oob_score=True, random_state=0,
           verbose=0, warm_start=False)

## R sq for Training: 98%, OOB: 86%, Test: 88%

Test score was <1% higher in the tuned Random Forest than the untuned Random Forest model

In [51]:
print('Training Score:', rf_tree.score(x_train, y_train))

print('OOB Score:', rf_tree.oob_score_)
     
print('Test Score:', rf_tree.score(x_test, y_test))

Training Score: 0.981293812888
OOB Score: 0.862658677856
Test Score: 0.880436491252


### MSE: .2687

Wait...why is this .005 higher than the untuned Random Forest model? Untuned Model had MSE of .2629

In [None]:
np.mean((y_test - rf_tree.predict(x_test))**2)

### Top 10 Important Variables

In [52]:
result = pd.concat([pd.DataFrame(x_train.columns.values),pd.DataFrame(rf_tree.feature_importances_)], axis = 1)
result.columns = ['feature','importance']
result.sort_values(by = 'importance', ascending = False).head(10)

Unnamed: 0,feature,importance
10,GrLivArea,0.115088
2,YearBuilt,0.063623
20,GarageArea,0.054118
19,GarageCars,0.053883
7,TotalBsmtSF,0.046121
183,ExterQual_TA,0.04118
8,1stFlrSF,0.040219
13,FullBath,0.037612
31,TotBath,0.036657
32,Bath_Capacity,0.033746


## Thoughts so far:

The random forest models had better training and test scores as well as lower MSE than the decision tree. 

The tuned Random Forest model had slightly higher R^2 than the untuned Random Forest, BUT it also had a slightly higher 