# Decision Tree Classification

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, validation_curve
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

## diabetes data

In [2]:
# Read the diabetes dataset (CSV expected in the working directory) and preview it.
diabetes = pd.read_csv('diabetes-1.csv')
diabetes.head(3)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1


In [3]:
# Target is the 'Outcome' column; the features are every column except the last one.
y_diab = diabetes['Outcome']
X_diab = diabetes.iloc[:, :-1]
X_diab.head(2)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31


In [4]:
# Quick look at the first two target values (the 'Outcome' column).
y_diab.head(2)

0    1
1    0
Name: Outcome, dtype: int64

In [5]:
# Hold out a test set (train_test_split's default test_size applies); the seed is
# fixed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_diab,
    y_diab,
    random_state=0,
)

In [6]:
# 5-fold cross-validation on the training split only, trying each candidate value
# of max_leaf_nodes. Both results hold one row per candidate and one column per fold.
train_scores, test_scores = validation_curve(
    DecisionTreeClassifier(random_state=0),
    X_train,
    y_train,
    param_name='max_leaf_nodes',
    param_range=[4, 5, 6, 7, 8],
    cv=5,
)

In [7]:
# Average the per-fold scores for each max_leaf_nodes candidate.
mean_train_scores = train_scores.mean(axis=1)
mean_cv_scores = test_scores.mean(axis=1)
print(mean_train_scores)
print(mean_cv_scores)
# max_leaf_nodes 5 has the highest mean cross-validation score

[0.7677893  0.77777987 0.79166462 0.79687447 0.79904555]
[0.724003   0.74473763 0.73607196 0.73607196 0.73607196]


### Fit a decision tree with max_leaf_nodes=5

In [8]:
# Train the classifier with the leaf budget chosen above; fit() returns the
# estimator itself, so the fitted model can be bound directly and then displayed.
dt1 = DecisionTreeClassifier(max_leaf_nodes=5, random_state=0).fit(X_train, y_train)
dt1

DecisionTreeClassifier(max_leaf_nodes=5, random_state=0)

In [9]:
# Accuracy of dt1 on the data it was fit on versus the held-out test split.
dt1_train_acc = dt1.score(X_train, y_train)
dt1_test_acc = dt1.score(X_test, y_test)
print('dt1 acc on train: {:.2%}'.format(dt1_train_acc))
print('dt1 acc on test: {:.2%}'.format(dt1_test_acc))

dt1 acc on train: 74.83%
dt1 acc on test: 75.00%


In [10]:
# Label each importance with its feature name, then list the most important first.
feat_imp = pd.DataFrame(
    {'importance': dt1.feature_importances_},
    index=X_diab.columns,
)
feat_imp.sort_values('importance', ascending=False)

Unnamed: 0,importance
Glucose,0.666825
Age,0.20533
BMI,0.127845
Pregnancies,0.0
BloodPressure,0.0
SkinThickness,0.0
Insulin,0.0
DiabetesPedigreeFunction,0.0


# Random Forest-Diabetes Data

In [11]:
from sklearn.ensemble import RandomForestClassifier

In [12]:
# Build a 200-tree random forest (seeded for reproducibility) and train it on the
# diabetes training split; the final line displays the fitted estimator.
rf1 = RandomForestClassifier(random_state=0, n_estimators=200).fit(X_train, y_train)
rf1

RandomForestClassifier(n_estimators=200, random_state=0)

In [13]:
# Accuracy of rf1 on the training split versus the held-out test split.
rf1_train_acc = rf1.score(X_train, y_train)
rf1_test_acc = rf1.score(X_test, y_test)
print('rf1 on train: {:.2%}'.format(rf1_train_acc))
print('rf1 on test: {:.2%}'.format(rf1_test_acc))

rf1 on train: 100.00%
rf1 on test: 77.60%


## feature importance

In [14]:
# Raw importance array from the fitted forest: one value per training feature
# (the next cell pairs these values with X_diab.columns).
rf1.feature_importances_

array([0.08318334, 0.24244063, 0.08714946, 0.07071538, 0.07832782,
       0.16791695, 0.12520402, 0.14506241])

In [17]:
# Attach feature names to the forest's importances and rank them, highest first.
rf_feat_imp = pd.DataFrame(
    {'importance': rf1.feature_importances_},
    index=X_diab.columns,
)
rf_feat_imp.sort_values('importance', ascending=False)

Unnamed: 0,importance
Glucose,0.242441
BMI,0.167917
Age,0.145062
DiabetesPedigreeFunction,0.125204
BloodPressure,0.087149
Pregnancies,0.083183
Insulin,0.078328
SkinThickness,0.070715


# Decision Tree Regression

## westroxbury_categorical data

In [18]:
# Read the West Roxbury housing dataset (CSV expected in the working directory)
# and preview the first two rows.
house = pd.read_csv('WestRoxbury_categorical.csv')
house.head(2)

Unnamed: 0,TOTAL VALUE,LOT SQFT,YR BUILT,GROSS AREA,LIVING AREA,FLOORS,ROOMS,BEDROOMS,FULL BATH,HALF BATH,KITCHEN,FIREPLACE,REMODEL
0,344.2,9965,1880,2436,1352,2.0,6,3,1,1,1,No,No
1,412.6,6590,1945,3108,1976,2.0,10,4,2,1,1,No,Yes_Recently


In [30]:
# Shuffle the rows reproducibly before running validation_curve.
# sort_index() first puts the rows back in their original file order, so this
# cell is idempotent: re-running it always produces the same permutation instead
# of shuffling an already-shuffled frame (which would silently change every
# downstream split and score).
house = house.sort_index().sample(frac=1, random_state=0)

In [31]:
# Target is 'TOTAL VALUE'; the features are every column after the first one.
y_house = house['TOTAL VALUE']
X_house = house.iloc[:, 1:]
X_house.head(1)

Unnamed: 0,LOT SQFT,YR BUILT,GROSS AREA,LIVING AREA,FLOORS,ROOMS,BEDROOMS,FULL BATH,HALF BATH,KITCHEN,FIREPLACE,REMODEL
1403,9408,1961,2222,1056,1.0,6,3,1,0,1,Yes,No


In [32]:
# Cast FLOORS to string so it is treated as a categorical column when the
# features are dummy-encoded.
# assign() returns a new frame instead of writing into X_house, which is a slice
# of `house`; the in-place column assignment triggers pandas'
# SettingWithCopyWarning. The column keeps its original position.
X_house = X_house.assign(FLOORS=X_house['FLOORS'].astype(str))

In [33]:
# One-hot encode the string-valued columns; numeric columns pass through unchanged.
X_dummies = pd.get_dummies(data=X_house)
X_dummies.head(1)

Unnamed: 0,LOT SQFT,YR BUILT,GROSS AREA,LIVING AREA,ROOMS,BEDROOMS,FULL BATH,HALF BATH,KITCHEN,FLOORS_1.0,FLOORS_1.5,FLOORS_2.0,FLOORS_2.5,FLOORS_3.0,FIREPLACE_No,FIREPLACE_Yes,REMODEL_No,REMODEL_Yes,REMODEL_Yes_Recently
1403,9408,1961,2222,1056,6,3,1,0,1,1,0,0,0,0,0,1,1,0,0


In [34]:
# 85/15 train/test split of the encoded features (X_dummies), seeded for
# reproducibility. Note: this rebinds the X_train/X_test/y_train/y_test names
# that previously held the diabetes split.
X_train, X_test, y_train, y_test = train_test_split(
    X_dummies,
    y_house,
    test_size=0.15,
    random_state=0,
)

In [35]:
# 5-fold cross-validation on the training split to compare max_leaf_nodes values
# 12 through 17 for a regression tree. Both results hold one row per candidate
# and one column per fold.
train_scores, test_scores = validation_curve(
    DecisionTreeRegressor(random_state=0),
    X_train,
    y_train,
    param_name='max_leaf_nodes',
    param_range=[12, 13, 14, 15, 16, 17],
    cv=5,
)

In [36]:
# Mean score per max_leaf_nodes candidate, rounded to 4 decimals:
# first line = training folds, second line = validation folds.
print(train_scores.mean(axis=1).round(4))
print(test_scores.mean(axis=1).round(4))
# NOTE(review): in the recorded output the validation mean is highest at the last
# candidate, 17 (0.7133 vs 0.7125 for 16), and is still rising at the edge of the
# range, so 16 is not the top scorer. The notebook nevertheless proceeds with
# max_leaf_nodes=16 below; consider widening param_range before settling on a value.

[0.7295 0.7348 0.7399 0.7446 0.7488 0.7527]
[0.6974 0.7013 0.7063 0.7103 0.7125 0.7133]


### Fit a decision tree with max_leaf_nodes=16

In [37]:
# Train the regression tree with a 16-leaf budget; fit() returns the estimator
# itself, so the fitted model can be bound directly and then displayed.
dt_reg2 = DecisionTreeRegressor(max_leaf_nodes=16, random_state=0).fit(X_train, y_train)
dt_reg2

DecisionTreeRegressor(max_leaf_nodes=16, random_state=0)

In [38]:
# For a regressor, score() is the R^2 coefficient of determination (not accuracy);
# it is printed here in percent form for the train and test splits.
dt_reg2_train_r2 = dt_reg2.score(X_train, y_train)
dt_reg2_test_r2 = dt_reg2.score(X_test, y_test)
print('dt reg2 on train: {:.2%}'.format(dt_reg2_train_r2))
print('dt reg2 on test: {:.2%}'.format(dt_reg2_test_r2))

dt reg2 on train: 74.29%
dt reg2 on test: 70.95%


In [39]:
# Label each importance from the regression tree with its encoded feature name,
# then list the most important first.
feat_imp2 = pd.DataFrame(
    {'importance': dt_reg2.feature_importances_},
    index=X_dummies.columns,
)
feat_imp2.sort_values('importance', ascending=False)

Unnamed: 0,importance
LIVING AREA,0.911689
LOT SQFT,0.052752
FIREPLACE_Yes,0.019651
GROSS AREA,0.008303
FLOORS_2.0,0.007606
REMODEL_Yes,0.0
REMODEL_No,0.0
FIREPLACE_No,0.0
FLOORS_3.0,0.0
FLOORS_2.5,0.0


# Random Forest-WestRoxbury

In [40]:
from sklearn.ensemble import RandomForestRegressor

In [41]:
# Build a 300-tree random forest regressor (seeded for reproducibility) and train
# it on the housing training split; the final line displays the fitted estimator.
rf2 = RandomForestRegressor(random_state=0, n_estimators=300).fit(X_train, y_train)
rf2

RandomForestRegressor(n_estimators=300, random_state=0)

In [42]:
# For a regressor, score() is the R^2 coefficient of determination (not accuracy);
# it is printed here in percent form for the train and test splits.
rf2_train_r2 = rf2.score(X_train, y_train)
rf2_test_r2 = rf2.score(X_test, y_test)
print('rf2 on train: {:.2%}'.format(rf2_train_r2))
print('rf2 on test: {:.2%}'.format(rf2_test_r2))

rf2 on train: 97.60%
rf2 on test: 81.88%


## feature importance

In [43]:
# Raw importance array from the fitted forest: one value per encoded feature
# (the next cell pairs these values with X_dummies.columns).
rf2.feature_importances_

array([9.99627922e-02, 3.48074154e-02, 8.40287022e-02, 6.93318274e-01,
       1.21207346e-02, 6.59963514e-03, 1.06308254e-02, 1.04747676e-02,
       5.11590416e-04, 4.55992428e-03, 1.74563828e-03, 1.36218964e-02,
       6.74691370e-04, 4.67387917e-05, 6.68004883e-03, 7.22909100e-03,
       4.48969595e-03, 1.76872908e-03, 6.72880902e-03])

In [44]:
# Attach the encoded feature names to the forest's importances and rank them,
# highest first.
rf_feat_imp2 = pd.DataFrame(
    {'importance': rf2.feature_importances_},
    index=X_dummies.columns,
)
rf_feat_imp2.sort_values('importance', ascending=False)

Unnamed: 0,importance
LIVING AREA,0.693318
LOT SQFT,0.099963
GROSS AREA,0.084029
YR BUILT,0.034807
FLOORS_2.0,0.013622
ROOMS,0.012121
FULL BATH,0.010631
HALF BATH,0.010475
FIREPLACE_Yes,0.007229
REMODEL_Yes_Recently,0.006729
