# Decision Tree Regressor

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, validation_curve
from sklearn.tree import DecisionTreeRegressor, export_graphviz

## Boston housing data

In [2]:
# Load the Boston housing CSV; its first column becomes the row index.
boston = pd.read_csv('boston_housing_data.csv', index_col=0)
# preview the first record
boston.head(n=1)

Unnamed: 0_level_0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,lstat,target_medv
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,4.98,24.0


In [3]:
# validation_curve cross-validates over the rows, so put them in a random
# (but seeded) order first.
# NOTE: this overwrites `boston` with a sample of itself, so re-running the
# cell shuffles the already-shuffled frame again.
boston = boston.sample(frac=1, random_state=0)
boston.head(n=1)

Unnamed: 0_level_0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,lstat,target_medv
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
84,0.03551,25.0,4.86,0,0.426,6.167,46.7,5.4007,4,281,19.0,7.51,22.9


In [4]:
# features: every column except the last; target: the target_medv column
X_boston = boston.iloc[:, :-1]
y_boston = boston['target_medv']
# sanity-check both pieces
display(X_boston.head(2), y_boston.head(2))

Unnamed: 0_level_0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,lstat
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
84,0.03551,25.0,4.86,0,0.426,6.167,46.7,5.4007,4,281,19.0,7.51
81,0.04113,25.0,4.86,0,0.426,6.727,33.5,5.4007,4,281,19.0,5.29


ID
84    22.9
81    28.0
Name: target_medv, dtype: float64

In [5]:
# hold out 15% of the rows for testing; the remaining 85% is for training
X_train, X_test, y_train, y_test = train_test_split(
    X_boston, y_boston, test_size=0.15, random_state=0
)

### use validation_curve to find best param value

In [6]:
# 5-fold cross-validated train/validation scores for max_depth in 2..5
train_sc, test_sc = validation_curve(
    estimator=DecisionTreeRegressor(random_state=0),
    X=X_train,
    y=y_train,
    param_name='max_depth',
    param_range=[2, 3, 4, 5],
    cv=5,
)

In [8]:
# mean training score per max_depth value (averaged over the CV folds)
train_sc.mean(axis=1).round(decimals=4)

array([0.7359, 0.8339, 0.905 , 0.9376])

In [9]:
# mean validation score per max_depth value (averaged over the CV folds)
test_sc.mean(axis=1).round(decimals=4)
# max_depth=4 has the highest mean validation score

array([0.5821, 0.6262, 0.687 , 0.6701])

### develop a DT with the best param value

In [10]:
# build the regressor with the depth picked from the validation curve,
# then fit it on the training split
dt_reg1 = DecisionTreeRegressor(max_depth=4, random_state=0)
dt_reg1.fit(X=X_train, y=y_train)

DecisionTreeRegressor(max_depth=4, random_state=0)

In [12]:
# report R^2 on the training and held-out splits as percentages
print(f'dt_reg1 R2 on train: {dt_reg1.score(X_train, y_train):.2%}')
print(f'dt_reg1 R2 on test: {dt_reg1.score(X_test, y_test):.2%}')

dt_reg1 R2 on train: 89.77%
dt_reg1 R2 on test: 87.81%


In [15]:
# number of leaf nodes in the fitted tree
dt_reg1.get_n_leaves()

15

### feature importance

In [16]:
# raw feature importances of the fitted tree, one value per training column
dt_reg1.feature_importances_

array([0.04457087, 0.        , 0.        , 0.00112931, 0.        ,
       0.64923062, 0.00161161, 0.06438004, 0.        , 0.        ,
       0.02078456, 0.218293  ])

In [17]:
# training column names, to line up with the importances array
X_train.columns

Index(['crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax',
       'ptratio', 'lstat'],
      dtype='object')

In [19]:
# label each (rounded) importance with its feature name
feat_imp = pd.DataFrame(
    {'importance': dt_reg1.feature_importances_.round(3)},
    index=X_train.columns,
)
# show the most important features first (feat_imp itself stays unsorted)
feat_imp.sort_values(by='importance', ascending=False)

Unnamed: 0,importance
rm,0.649
lstat,0.218
dis,0.064
crim,0.045
ptratio,0.021
age,0.002
chas,0.001
zn,0.0
indus,0.0
nox,0.0


### visualizing dt_reg1

In [20]:
# Write the fitted tree to a Graphviz .dot file, with nodes filled by value.
# feature_names is passed as a plain list: some scikit-learn releases
# validate this parameter strictly and reject a pandas Index.
export_graphviz(dt_reg1, out_file='dt_reg1_vis.dot',
                feature_names=list(X_train.columns), filled=True)

## WestRoxbury_categorical data

In [22]:
# load the West Roxbury data set from CSV
west = pd.read_csv('WestRoxbury_categorical.csv')
# preview the first two records
west.head(n=2)

Unnamed: 0,TOTAL VALUE,LOT SQFT,YR BUILT,GROSS AREA,LIVING AREA,FLOORS,ROOMS,BEDROOMS,FULL BATH,HALF BATH,KITCHEN,FIREPLACE,REMODEL
0,344.2,9965,1880,2436,1352,2.0,6,3,1,1,1,No,No
1,412.6,6590,1945,3108,1976,2.0,10,4,2,1,1,No,Yes_Recently


In [23]:
# put the rows in a random (but seeded) order
# NOTE: this overwrites `west` with a sample of itself, so re-running the
# cell shuffles the already-shuffled frame again.
west = west.sample(frac=1, random_state=0)

In [None]:
# create features and target sets (TOTAL VALUE is the target, the rest are features)
# change the FLOORS type to string
# encode categorical features
# split to train and test (85/15), make sure to use X_dummies
# use validation_curve to find the best value of max_leaf_nodes
### use this param_range=[12,13,14,15,16,17], cv=5
# build a DT regressor with the best param value
# evaluate its performance
# check the feature importance
# visualize the tree

In [24]:
# target: the first column (TOTAL VALUE); features: every remaining column
y_west = west.iloc[:, 0]
X_west = west.iloc[:, 1:]
# sanity-check both pieces
display(X_west.head(1), y_west.head(1))

Unnamed: 0,LOT SQFT,YR BUILT,GROSS AREA,LIVING AREA,FLOORS,ROOMS,BEDROOMS,FULL BATH,HALF BATH,KITCHEN,FIREPLACE,REMODEL
1519,4026,1940,2520,1047,1.0,6,3,1,1,1,Yes,No


1519    266.0
Name: TOTAL VALUE, dtype: float64

In [25]:
# Cast FLOORS to string so that get_dummies one-hot encodes it like the
# other non-numeric columns instead of leaving it as a number.
# X_west was sliced out of `west`; assigning a column into it in place
# (X_west['FLOORS'] = ...) can raise SettingWithCopyWarning on pandas
# versions without copy-on-write. .assign() builds a new frame instead and
# keeps the column in its original position.
X_west = X_west.assign(FLOORS=X_west['FLOORS'].astype(str))
# encode categorical features as dummy (indicator) columns
X_dummies = pd.get_dummies(X_west)
X_dummies.head(2)

Unnamed: 0,LOT SQFT,YR BUILT,GROSS AREA,LIVING AREA,ROOMS,BEDROOMS,FULL BATH,HALF BATH,KITCHEN,FLOORS_1.0,FLOORS_1.5,FLOORS_2.0,FLOORS_2.5,FLOORS_3.0,FIREPLACE_No,FIREPLACE_Yes,REMODEL_No,REMODEL_Yes,REMODEL_Yes_Recently
1519,4026,1940,2520,1047,6,3,1,1,1,1,0,0,0,0,0,1,1,0,0
3457,7000,1848,6235,3446,12,5,3,0,1,0,0,1,0,0,0,1,1,0,0


In [26]:
# hold out 15% of the encoded rows for testing; the remaining 85% is for training
# NOTE: this rebinds X_train / X_test / y_train / y_test, replacing the
# Boston split created earlier in the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X_dummies, y_west, test_size=0.15, random_state=0
)

### validation_curve

In [27]:
# 5-fold cross-validated train/validation scores for max_leaf_nodes in 12..17
train_sc1, test_sc1 = validation_curve(
    estimator=DecisionTreeRegressor(random_state=0),
    X=X_train,
    y=y_train,
    param_name='max_leaf_nodes',
    param_range=[12, 13, 14, 15, 16, 17],
    cv=5,
)

In [28]:
# mean training score per max_leaf_nodes value (averaged over the CV folds)
train_sc1.mean(axis=1).round(decimals=4)

array([0.7297, 0.7356, 0.741 , 0.7457, 0.7502, 0.7543])

In [29]:
# mean validation score per max_leaf_nodes value (averaged over the CV folds)
test_sc1.mean(axis=1).round(decimals=4)
# max_leaf_nodes=16 has the highest mean validation score

array([0.6867, 0.6935, 0.699 , 0.7015, 0.7055, 0.6963])

### build a DT regressor using best param value

In [30]:
# build the regressor with the leaf budget picked from the validation curve,
# then fit it on the training split
dt_reg2 = DecisionTreeRegressor(max_leaf_nodes=16, random_state=0)
dt_reg2.fit(X=X_train, y=y_train)

DecisionTreeRegressor(max_leaf_nodes=16, random_state=0)

In [32]:
# report R^2 on the training and held-out splits as percentages
print(f'dt_reg2 R2 on train: {dt_reg2.score(X_train, y_train):.2%}')
print(f'dt_reg2 R2 on test: {dt_reg2.score(X_test, y_test):.2%}')

dt_reg2 R2 on train: 74.53%
dt_reg2 R2 on test: 70.23%


### feature importance

In [35]:
# label each (rounded) importance with its feature name
feat_imp2 = pd.DataFrame(
    {'importance': dt_reg2.feature_importances_.round(3)},
    index=X_train.columns,
)
# show the most important features first (feat_imp2 itself stays unsorted)
feat_imp2.sort_values(by='importance', ascending=False)

Unnamed: 0,importance
LIVING AREA,0.91
LOT SQFT,0.054
GROSS AREA,0.024
FIREPLACE_Yes,0.012
FLOORS_2.0,0.0
REMODEL_Yes,0.0
REMODEL_No,0.0
FIREPLACE_No,0.0
FLOORS_3.0,0.0
FLOORS_2.5,0.0


### visualize dt_reg2

In [36]:
# Write the fitted tree to a Graphviz .dot file, with nodes filled by value.
# feature_names is passed as a plain list: some scikit-learn releases
# validate this parameter strictly and reject a pandas Index.
export_graphviz(dt_reg2, out_file='dt_reg2_vis.dot',
                feature_names=list(X_train.columns), filled=True)