# Decision Tree Classification

## Diabetes 2 Features Data

In [1]:
import pandas as pd

In [2]:
# load the two-feature diabetes dataset from its CSV file
diab2 = pd.read_csv('diabetes_2_features.csv')
# preview the first three rows
diab2.iloc[:3]

Unnamed: 0,Glucose,BloodPressure,Outcome
0,131,109,diabetic
1,114,52,non-diabetic
2,108,50,non-diabetic


In [3]:
# features: every column except the last one; target: the 'Outcome' column
x_diab2 = diab2.iloc[:, :-1]
y_diab2 = diab2['Outcome']
# sanity check: show the first three rows of each piece
display(x_diab2.head(3), y_diab2.head(3))

Unnamed: 0,Glucose,BloodPressure
0,131,109
1,114,52
2,108,50


0        diabetic
1    non-diabetic
2    non-diabetic
Name: Outcome, dtype: object

In [4]:
from sklearn.model_selection import train_test_split, validation_curve
# make sure to import decision tree classifier!!
from sklearn.tree import DecisionTreeClassifier

In [5]:
# hold out 20% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible
x_train, x_test, y_train, y_test = train_test_split(
    x_diab2,
    y_diab2,
    test_size=0.2,
    random_state=0,
)

In [8]:
# build an unconstrained decision tree (seeded for reproducibility) and train it
# on the training split; fit() returns the estimator itself
dt1 = DecisionTreeClassifier(random_state=0).fit(x_train, y_train)
dt1

DecisionTreeClassifier(random_state=0)

In [10]:
# evaluate the model's accuracy on the training and held-out test splits
dt1_train_acc = dt1.score(x_train, y_train)
dt1_test_acc = dt1.score(x_test, y_test)
print('dt1 acc on train: {:.2%}'.format(dt1_train_acc))
print('dt1 acc on test: {:.2%}'.format(dt1_test_acc))

dt1 acc on train: 100.00%
dt1 acc on test: 100.00%


In [12]:
# predictions for two hypothetical patients, each given as [Glucose, BloodPressure]
p1 = [170, 140]
p2 = [88, 75]

# dt1 was fitted on a DataFrame, so hand it a DataFrame carrying the same column
# labels: this makes the feature order explicit and avoids the "X does not have
# valid feature names" warning that scikit-learn >= 1.0 emits for unnamed input.
new_patients = pd.DataFrame([p1, p2], columns=x_train.columns)
dt1.predict(new_patients)



array(['diabetic', 'non-diabetic'], dtype=object)

### visualizing our decision tree

In [None]:
# webgraphviz.com
# NOTE(review): presumably the site used to render the exported .dot files
# (paste the file's contents there) — confirm.

In [13]:
from sklearn.tree import export_graphviz

In [15]:
# Write the fitted tree to a Graphviz .dot file; filled=True colours the nodes.
# feature_names / class_names are passed as plain lists rather than a pandas
# Index / numpy array, because some scikit-learn releases validate these
# parameters strictly and reject non-list inputs.
export_graphviz(
    dt1,
    out_file='dt1_vis.dot',
    feature_names=list(x_diab2.columns),
    class_names=list(dt1.classes_),
    filled=True,
)

In [16]:
# feature labels, in the order they were passed to export_graphviz as feature_names
x_diab2.columns

Index(['Glucose', 'BloodPressure'], dtype='object')

In [17]:
# class labels learned by the tree, in the order used for class_names in the export
dt1.classes_

array(['diabetic', 'non-diabetic'], dtype=object)

## Diabetes Data

In [18]:
# load the full diabetes dataset from its CSV file
diabetes = pd.read_csv('diabetes-1.csv')
# preview the first three rows
diabetes.iloc[:3]

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1


In [34]:
# features: every column except the last one; target: the 'Outcome' column
x_diabetes = diabetes.iloc[:, :-1]
y_diabetes = diabetes['Outcome']
# sanity check: show the first three rows of each piece
display(x_diabetes.head(3), y_diabetes.head(3))

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32


0    1
1    0
2    1
Name: Outcome, dtype: int64

In [20]:
from sklearn.model_selection import train_test_split, validation_curve
# make sure to import decision tree classifier!!
from sklearn.tree import DecisionTreeClassifier

In [24]:
# hold out 20% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible
x_train1, x_test1, y_train1, y_test1 = train_test_split(
    x_diabetes,
    y_diabetes,
    test_size=0.2,
    random_state=0,
)

In [25]:
# build an unconstrained decision tree (seeded for reproducibility) and train it
# on the full-feature training split; fit() returns the estimator itself
dt2 = DecisionTreeClassifier(random_state=0).fit(x_train1, y_train1)
dt2

DecisionTreeClassifier(random_state=0)

In [36]:
# evaluate the performance of the model (accuracy on the train and test splits)
# The labels name dt2 because that is the model being scored here; the earlier
# 'dt1' labels were a copy-paste leftover from the two-feature section.
print('dt2 acc on train: {:.2%}'.format(dt2.score(x_train1, y_train1)))
print('dt2 acc on test: {:.2%}'.format(dt2.score(x_test1, y_test1)))

dt1 acc on train: 100.00%
dt1 acc on test: 76.62%


In [35]:
# predictions for two hypothetical patients; values follow the training column
# order: Pregnancies, Glucose, BloodPressure, SkinThickness, Insulin, BMI,
# DiabetesPedigreeFunction, Age
p1 = [3, 140, 120, 33, 0, 31, 0.452, 45]
p2 = [1, 80, 70, 22, 0, 23, 0.320, 30]

# dt2 was fitted on a DataFrame, so hand it a DataFrame carrying the same column
# labels: this makes the feature order explicit and avoids the "X does not have
# valid feature names" warning that scikit-learn >= 1.0 emits for unnamed input.
new_patients1 = pd.DataFrame([p1, p2], columns=x_train1.columns)
dt2.predict(new_patients1)



array([1, 0])

### visualizing our decision tree

In [29]:
# webgraphviz.com
from sklearn.tree import export_graphviz

In [38]:
# Write the fitted tree to a Graphviz .dot file; filled=True colours the nodes.
# The integer class labels are converted to strings for display, and both
# feature_names and class_names are passed as plain lists because some
# scikit-learn releases validate these parameters strictly and reject
# pandas Index / numpy array inputs.
export_graphviz(
    dt2,
    out_file='dt2_vis.dot',
    feature_names=list(x_diabetes.columns),
    class_names=dt2.classes_.astype(str).tolist(),
    filled=True,
)

In [31]:
# feature labels, in the order they were passed to export_graphviz as feature_names
x_diabetes.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age'],
      dtype='object')

In [32]:
# class labels learned by the tree (integers here, hence the astype(str) in the export)
dt2.classes_

array([0, 1])

### validation curve to find the best DT param value

In [40]:
# 5-fold cross-validation on the training split for several max_leaf_nodes values;
# returns one row of per-fold scores for each candidate value
train_scores, test_scores = validation_curve(
    DecisionTreeClassifier(random_state=0),
    x_train1,
    y_train1,
    param_name='max_leaf_nodes',
    param_range=[2, 6, 8, 10, 15, 25, 50],
    cv=5,
)

In [43]:
# average the training-fold scores across the CV folds: one value per candidate
train_scores.mean(axis=1).round(decimals=3)

array([0.74 , 0.767, 0.785, 0.803, 0.831, 0.874, 0.937])

In [44]:
# average the validation-fold scores across the CV folds: one value per candidate
# max_leaf_nodes 10 has the highest mean validation score
test_scores.mean(axis=1).round(decimals=3)

array([0.7  , 0.704, 0.71 , 0.73 , 0.715, 0.708, 0.707])

### Build a decision tree with the best param value

In [47]:
# train a tree capped at 10 leaf nodes (the value picked from the validation
# curve); fit() returns the estimator itself
dt3 = DecisionTreeClassifier(random_state=0, max_leaf_nodes=10).fit(x_train1, y_train1)
dt3

DecisionTreeClassifier(max_leaf_nodes=10, random_state=0)

In [49]:
# accuracy of the pruned tree on the training and held-out test splits
dt3_train_acc = dt3.score(x_train1, y_train1)
dt3_test_acc = dt3.score(x_test1, y_test1)
print('dt3 acc on train: {:.2%}'.format(dt3_train_acc))
print('dt3 acc on test: {:.2%}'.format(dt3_test_acc))

dt3 acc on train: 77.36%
dt3 acc on test: 77.92%


In [50]:
# Write the pruned tree to a Graphviz .dot file; filled=True colours the nodes.
# The integer class labels are converted to strings for display, and both
# feature_names and class_names are passed as plain lists because some
# scikit-learn releases validate these parameters strictly and reject
# pandas Index / numpy array inputs.
export_graphviz(
    dt3,
    out_file='dt3_vis.dot',
    feature_names=list(x_diabetes.columns),
    class_names=dt3.classes_.astype(str).tolist(),
    filled=True,
)

### Feature Importance

In [53]:
# one importance value per feature column: larger means the feature mattered
# more to the tree's splits
# summation of all of these is always equal to 1
dt3.feature_importances_.round(decimals=3)

array([0.   , 0.583, 0.   , 0.   , 0.   , 0.21 , 0.039, 0.167])

In [54]:
# column labels, shown so the importance values above can be matched to features by position
x_diabetes.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age'],
      dtype='object')

In [59]:
# pair each feature name with its rounded importance in a one-column table
feat_imp = pd.DataFrame(
    {"Importance": dt3.feature_importances_.round(3)},
    index=x_diabetes.columns,
)
# show the most important features first
feat_imp.sort_values(by="Importance", ascending=False)

Unnamed: 0,Importance
Glucose,0.583
BMI,0.21
Age,0.167
DiabetesPedigreeFunction,0.039
Pregnancies,0.0
BloodPressure,0.0
SkinThickness,0.0
Insulin,0.0
