In [1]:
# Import pandas for csv
import pandas as pd

In [2]:
# Relative location of the training CSV (only the training file is read here)
data_path = "home-data/train.csv"
# Parse the CSV into a pandas DataFrame
data = pd.read_csv(filepath_or_buffer=data_path)

In [3]:
# Print summary statistics (count, mean, std, min, quartiles, max) per column of
# `data`. A count below the row total signals missing values in that column
# (e.g. LotFrontage reports 1201 of 1460).
print(data.describe())

                Id   MSSubClass  LotFrontage        LotArea  OverallQual  \
count  1460.000000  1460.000000  1201.000000    1460.000000  1460.000000   
mean    730.500000    56.897260    70.049958   10516.828082     6.099315   
std     421.610009    42.300571    24.284752    9981.264932     1.382997   
min       1.000000    20.000000    21.000000    1300.000000     1.000000   
25%     365.750000    20.000000    59.000000    7553.500000     5.000000   
50%     730.500000    50.000000    69.000000    9478.500000     6.000000   
75%    1095.250000    70.000000    80.000000   11601.500000     7.000000   
max    1460.000000   190.000000   313.000000  215245.000000    10.000000   

       OverallCond    YearBuilt  YearRemodAdd   MasVnrArea   BsmtFinSF1  \
count  1460.000000  1460.000000   1460.000000  1452.000000  1460.000000   
mean      5.575342  1971.267808   1984.865753   103.685262   443.639726   
std       1.112799    30.202904     20.645407   181.066207   456.098091   
min       1.000

In [4]:
# Preview the first five rows of `data`, then list all of its column labels
print(data.head(), data.columns, sep="\n")

   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities    ...     PoolArea PoolQC Fence MiscFeature MiscVal  \
0         Lvl    AllPub    ...            0    NaN   NaN         NaN       0   
1         Lvl    AllPub    ...            0    NaN   NaN         NaN       0   
2         Lvl    AllPub    ...            0    NaN   NaN         NaN       0   
3         Lvl    AllPub    ...            0    NaN   NaN         NaN       0   
4         Lvl    AllPub    ...            0    NaN   NaN         NaN       0   

  MoSold YrSold  SaleType  SaleCondition  SalePrice  
0      2   2008     

In [5]:
# Prediction target: the SalePrice column of the dataset
y = data["SalePrice"]

In [6]:
# Names of the columns used as model inputs
features = [
    'LotArea',
    'YearBuilt',
    '1stFlrSF',
    '2ndFlrSF',
    'FullBath',
    'BedroomAbvGr',
    'TotRmsAbvGrd',
]

In [7]:
# Keep only the chosen predictor columns
X = data[features]
# Show summary statistics followed by the first five rows of the feature matrix
print(X.describe(), X.head(), sep="\n")

             LotArea    YearBuilt     1stFlrSF     2ndFlrSF     FullBath  \
count    1460.000000  1460.000000  1460.000000  1460.000000  1460.000000   
mean    10516.828082  1971.267808  1162.626712   346.992466     1.565068   
std      9981.264932    30.202904   386.587738   436.528436     0.550916   
min      1300.000000  1872.000000   334.000000     0.000000     0.000000   
25%      7553.500000  1954.000000   882.000000     0.000000     1.000000   
50%      9478.500000  1973.000000  1087.000000     0.000000     2.000000   
75%     11601.500000  2000.000000  1391.250000   728.000000     2.000000   
max    215245.000000  2010.000000  4692.000000  2065.000000     3.000000   

       BedroomAbvGr  TotRmsAbvGrd  
count   1460.000000   1460.000000  
mean       2.866438      6.517808  
std        0.815778      1.625393  
min        0.000000      2.000000  
25%        2.000000      5.000000  
50%        3.000000      6.000000  
75%        3.000000      7.000000  
max        8.000000     14.

In [8]:
# Import scikit-learn's DecisionTreeRegressor (the scikit-learn documentation example fits one to a sine curve with added noise). The deeper the tree, the higher the chance that it learns from noise, i.e. overfits.

from sklearn.tree import DecisionTreeRegressor
import numpy as np

In [9]:
# Fit the regression model on every row of X and y (no hold-out at this point).
# random_state=1 is passed to the estimator. The bare fit() call is the cell's
# last expression, so the notebook displays the fitted estimator's repr.
regr_model=DecisionTreeRegressor(random_state=1)
regr_model.fit(X,y)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=1, splitter='best')

In [10]:
# Predict on X, the same rows the model was fitted on (in-sample predictions),
# and print the first six predicted values.
predictions=regr_model.predict(X)
print(predictions[:6])

[208500. 181500. 223500. 140000. 250000. 143000.]


In [11]:
# Mean absolute error between the true targets `y` and the in-sample `predictions`.
# NOTE: the predictions were made on the rows used for fitting, so this figure
# says little about how the model performs on unseen data.
from sklearn.metrics import mean_absolute_error

print(mean_absolute_error(y,predictions))

62.35433789954339


In [12]:
# An in-sample mean absolute error is not a meaningful measure of model quality, because the model is scored on the same data it was trained on. We need a held-out validation set, which sklearn.model_selection.train_test_split can create.

from sklearn.model_selection import train_test_split

In [13]:
# Split features and target into training and validation parts (seeded for repeatability)
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)
# Refit the existing estimator on the training part only, then predict the held-out rows
val_predictions = regr_model.fit(train_X, train_y).predict(val_X)
# Out-of-sample mean absolute error
print(mean_absolute_error(val_y, val_predictions))

29652.931506849316


In [14]:
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Return the validation mean absolute error of a size-limited decision tree.

    A ``DecisionTreeRegressor`` capped at ``max_leaf_nodes`` leaves (with
    ``random_state=0``) is fitted on ``train_X``/``train_y`` and scored on
    ``val_X`` against ``val_y``.

    Parameters
    ----------
    max_leaf_nodes : leaf-node limit forwarded to ``DecisionTreeRegressor``.
    train_X, train_y : features and targets used for fitting.
    val_X, val_y : features and targets used for scoring.

    Returns
    -------
    The value of ``mean_absolute_error(val_y, predictions on val_X)``.
    """
    tree = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    val_preds = tree.fit(train_X, train_y).predict(val_X)
    return mean_absolute_error(val_y, val_preds)

In [15]:
# Leaf-node limits to compare, from a very small tree (5) to a large one (300)
candidate_max_leaf_nodes = [5, 10, 40, 70, 71, 73, 80, 300]
# Validation MAE for each candidate, keyed by its leaf-node limit
scores = {
    leaf_limit: get_mae(leaf_limit, train_X, val_X, train_y, val_y)
    for leaf_limit in candidate_max_leaf_nodes
}

In [16]:
# Pick the leaf-node limit with the lowest validation MAE
best_tree_size = min(scores, key=lambda leaf_limit: scores[leaf_limit])
# Show every candidate's score, then the winning size
print(scores, best_tree_size, sep="\n")

{5: 35044.51299744237, 10: 31585.432831537662, 40: 28106.18249209847, 70: 26763.340028993345, 71: 26704.033546536175, 73: 27044.086130049753, 80: 27389.89243288238, 300: 28350.54685142823}
71


In [17]:
# Rebuild the tree with the leaf-node limit selected by the search
# (`best_tree_size`) instead of a hard-coded 71, so this cell stays correct if
# the candidate list or the data changes.
# NOTE(review): get_mae builds its trees with random_state=0 while this cell uses
# random_state=1; the seed is kept as in the original. Confirm whether they should match.
regr_model = DecisionTreeRegressor(max_leaf_nodes=best_tree_size, random_state=1)
regr_model.fit(train_X, train_y)
# Predict the held-out validation rows with the size-limited tree
prediction = regr_model.predict(val_X)
# Validation mean absolute error of the selected tree
mae = mean_absolute_error(val_y, prediction)

In [18]:
# Report the validation MAE of the tree built with the selected leaf-node limit
print(mae)

26704.033546536175
