In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

In [2]:
# Relative path to the training data CSV.
iowa_file_path = 'train.csv'
# Load the whole file into a DataFrame; the target and the features are
# selected from it in later cells.
home_data = pd.read_csv(iowa_file_path)

In [3]:
# Show the column names available in the dataset.
home_data.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [4]:
# Prediction target: the SalePrice column of the loaded data.
y = home_data['SalePrice']
# Sanity-check the target: its shape, then its first five values.
print(y.shape)
print(y.head(5))

(1460,)
0    208500
1    181500
2    223500
3    140000
4    250000
Name: SalePrice, dtype: int64


In [5]:
# Columns used as model inputs (one per line so additions diff cleanly).
feature_columns = [
    'LotArea',
    'YearBuilt',
    '1stFlrSF',
    '2ndFlrSF',
    'FullBath',
    'BedroomAbvGr',
    'TotRmsAbvGrd',
]

# Feature matrix: only the selected columns, in the order listed above.
X = home_data[feature_columns]

In [6]:
# Summary statistics for each selected feature (count, mean, spread, quartiles).
X.describe()

Unnamed: 0,LotArea,YearBuilt,1stFlrSF,2ndFlrSF,FullBath,BedroomAbvGr,TotRmsAbvGrd
count,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,10516.828082,1971.267808,1162.626712,346.992466,1.565068,2.866438,6.517808
std,9981.264932,30.202904,386.587738,436.528436,0.550916,0.815778,1.625393
min,1300.0,1872.0,334.0,0.0,0.0,0.0,2.0
25%,7553.5,1954.0,882.0,0.0,1.0,2.0,5.0
50%,9478.5,1973.0,1087.0,0.0,2.0,3.0,6.0
75%,11601.5,2000.0,1391.25,728.0,2.0,3.0,7.0
max,215245.0,2010.0,4692.0,2065.0,3.0,8.0,14.0


In [7]:
# Peek at the first rows of the feature matrix.
X.head()

Unnamed: 0,LotArea,YearBuilt,1stFlrSF,2ndFlrSF,FullBath,BedroomAbvGr,TotRmsAbvGrd
0,8450,2003,856,854,2,3,8
1,9600,1976,1262,0,2,3,6
2,11250,2001,920,866,2,3,6
3,9550,1915,961,756,1,3,7
4,14260,2000,1145,1053,2,4,9


In [8]:
# (rows, columns) of the feature matrix.
X.shape

(1460, 7)

In [9]:
# Specify the model; random_state is fixed so repeated runs build the same tree.
iowa_model = DecisionTreeRegressor(random_state = 1)
# Fit on ALL rows of X and y (no hold-out yet). The fitted estimator is the
# cell's last expression, so its repr is displayed as the cell output.
iowa_model.fit(X, y)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=1, splitter='best')

In [10]:
# Predict on the first five rows of X -- rows the model was fitted on -- and
# print them next to the true prices for comparison.
print(f'first in-sample predictions: {iowa_model.predict(X.head())}')
print(f'actual target values: {y.head().tolist()}')
# Show the type returned by y.head().
print(type(y.head()))

first in-sample predictions: [208500. 181500. 223500. 140000. 250000.]
actual target values: [208500, 181500, 223500, 140000, 250000]
<class 'pandas.core.series.Series'>


In [11]:
# Now do it properly: split the data so the model can be scored on rows it
# was not fitted on. random_state is fixed so the split is reproducible.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 1)

In [12]:
# Specify a fresh model; this rebinds iowa_model, replacing the tree that was
# fitted on the full data set.
iowa_model = DecisionTreeRegressor(random_state = 1)
# Fit on the training split only, keeping val_X / val_y unseen.
iowa_model.fit(train_X, train_y)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=1, splitter='best')

In [13]:
# Predict prices for the held-out validation features.
val_predictions = iowa_model.predict(val_X)

In [14]:
# Compare the first ten validation predictions with the true prices.
# val_predictions come from val_X, which the model was NOT fitted on, so they
# are out-of-sample and must not be labelled "in-sample".
print(f'first validation predictions: {val_predictions[:10]}')
print(f'actual target values: {val_y[:10].tolist()}')

first in-sample predictions: [186500. 184000. 130000.  92000. 164500. 220000. 335000. 144152. 215000.
 262000.]
actual target values: [231500, 179500, 122000, 84500, 142000, 325624, 285000, 151000, 195000, 275000]


In [15]:
# Mean absolute error between the true validation prices and the predictions.
val_mae = mean_absolute_error(val_y, val_predictions)
print(val_mae)

29652.931506849316


In [16]:
# make the function to estimate various tree sizes 
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Return the validation MAE of a tree limited to ``max_leaf_nodes`` leaves.

    A DecisionTreeRegressor (random_state=0) is fitted on ``train_X`` /
    ``train_y``; its predictions for ``val_X`` are scored against ``val_y``
    with mean_absolute_error.
    """
    tree = DecisionTreeRegressor(random_state=0, max_leaf_nodes=max_leaf_nodes)
    tree.fit(train_X, train_y)
    return mean_absolute_error(val_y, tree.predict(val_X))

In [17]:
# Loop over candidate tree sizes to find the best one: for each leaf-node
# limit, fit on the training split and report the validation MAE.
candidate_max_leaf_nodes = [5, 25, 50, 100, 250, 500]

for max_leaf_nodes in candidate_max_leaf_nodes:
    mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    print(f'for the max leaf nodes of {max_leaf_nodes} the mae equals: {mae}')

for the max leaf nodes of 5 the mae equals: 35044.51299744237
for the max leaf nodes of 25 the mae equals: 29016.41319191076
for the max leaf nodes of 50 the mae equals: 27405.930473214907
for the max leaf nodes of 100 the mae equals: 27282.50803885739
for the max leaf nodes of 250 the mae equals: 27893.822225701646
for the max leaf nodes of 500 the mae equals: 29454.18598068598


In [18]:
# Fit the final model on the whole sample using the best tree size.
# The size is derived from the validation MAE rather than hard-coded, so the
# choice stays correct if the data or the candidate list changes.
best_tree_size = min(
    candidate_max_leaf_nodes,
    key=lambda n: get_mae(n, train_X, val_X, train_y, val_y),
)
final_model = DecisionTreeRegressor(max_leaf_nodes=best_tree_size, random_state=0)
# With the size chosen, train on ALL rows (X, y) to use every observation.
final_model.fit(X, y)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=100, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=0, splitter='best')

In [19]:
# Function that checks the mean absolute error of a tree of a given size.
# NOTE: this repeats the get_mae already defined in an earlier cell with the
# same behaviour; the later definition silently shadows the earlier one.
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Fit a size-limited decision tree and return its validation MAE.

    The tree is a DecisionTreeRegressor with the given ``max_leaf_nodes`` and
    random_state=0, trained on ``train_X`` / ``train_y`` and scored on
    ``val_X`` against ``val_y`` with mean_absolute_error.
    """
    regressor = DecisionTreeRegressor(random_state=0, max_leaf_nodes=max_leaf_nodes)
    regressor.fit(train_X, train_y)
    val_preds = regressor.predict(val_X)
    return mean_absolute_error(val_y, val_preds)

In [29]:
# Compare the validation MAE across different leaf-node limits
# (same search as before, extended with 1000).
candidate_max_leaf_nodes = [5, 25, 50, 100, 250, 500, 1000]
for max_leaf_nodes in candidate_max_leaf_nodes:
    mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    print(f'for the max leaf nodes of {max_leaf_nodes} the mean absolute error equals {mae}')

for the max leaf nodes of 5 the mean absolute error equals 35044.51299744237
for the max leaf nodes of 25 the mean absolute error equals 29016.41319191076
for the max leaf nodes of 50 the mean absolute error equals 27405.930473214907
for the max leaf nodes of 100 the mean absolute error equals 27282.50803885739
for the max leaf nodes of 250 the mean absolute error equals 27893.822225701646
for the max leaf nodes of 500 the mean absolute error equals 29454.18598068598
for the max leaf nodes of 1000 the mean absolute error equals 30136.05205479452


In [30]:
# Now use all the data with the best leaf-node limit.
# The limit is taken from the validation MAE rather than hard-coded, so it
# follows the candidate list and the data instead of a remembered number.
best_tree_size = min(
    candidate_max_leaf_nodes,
    key=lambda n: get_mae(n, train_X, val_X, train_y, val_y),
)
final_model = DecisionTreeRegressor(max_leaf_nodes=best_tree_size, random_state=0)
# Train on every row (X, y) now that the tree size has been selected.
final_model.fit(X, y)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=100, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=0, splitter='best')