# Decision Trees

In [50]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

In [51]:
# Load the drug dataset from the CSV file (path is relative to the working
# directory) and show the first rows.
df = pd.read_csv('drug200.csv', sep=',')
df.head()

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,drugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,drugY


In [52]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

(200, 6)

### Pre-processing

In [87]:
# Feature matrix: every column except the last one (the target), taken
# positionally and converted to a NumPy array.
X = df.iloc[:, :-1].values
X[:5]

array([[23, 'F', 'HIGH', 'HIGH', 25.355],
       [47, 'M', 'LOW', 'HIGH', 13.093],
       [47, 'M', 'LOW', 'HIGH', 10.113999999999999],
       [28, 'F', 'NORMAL', 'HIGH', 7.797999999999999],
       [61, 'F', 'LOW', 'HIGH', 18.043]], dtype=object)

In [88]:
# Target vector: the prescribed drug for each row, as a NumPy array.
y = df.loc[:, 'Drug'].values
y[:5]

array(['drugY', 'drugC', 'drugC', 'drugX', 'drugY'], dtype=object)

In [89]:
from sklearn import preprocessing

Some of the features are categorical, and scikit-learn's `DecisionTreeClassifier` cannot work with string values directly, so we convert these features to numerical values.

In [91]:
# Encode the categorical feature columns of X as integers.
#
# Each encoder transforms the original strings held in `df`, not the current
# contents of `X`. Reading back from `X` made this cell fail on a second run:
# the column already held integer codes, which LabelEncoder.transform rejects
# as unseen labels (ValueError).
#
# LabelEncoder sorts the classes it is fitted on, so the codes are alphabetical:
#   Sex:         F=0, M=1
#   BP:          HIGH=0, LOW=1, NORMAL=2
#   Cholesterol: HIGH=0, NORMAL=1
# NOTE(review): the BP codes therefore do not follow the LOW < NORMAL < HIGH order.
num_sex = preprocessing.LabelEncoder()
num_sex.fit(['F', 'M'])
X[:, 1] = num_sex.transform(df['Sex'])

num_BP = preprocessing.LabelEncoder()
num_BP.fit(['LOW', 'NORMAL', 'HIGH'])
X[:, 2] = num_BP.transform(df['BP'])

num_Chol = preprocessing.LabelEncoder()
num_Chol.fit(['NORMAL', 'HIGH'])
X[:, 3] = num_Chol.transform(df['Cholesterol'])

In [92]:
# First five rows of the feature matrix after encoding.
X[:5]

array([[23, 0, 0, 0, 25.355],
       [47, 1, 1, 0, 13.093],
       [47, 1, 1, 0, 10.113999999999999],
       [28, 0, 2, 0, 7.797999999999999],
       [61, 0, 1, 0, 18.043]], dtype=object)

### Decision Tree

In [93]:
from sklearn.model_selection import train_test_split

In [94]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=3,
)

In [99]:
# Report the shape of each piece of the train/test split.
for label, split in (
    ("X train set size: ", X_train),
    ("Y train set size: ", y_train),
    ("X test set size: ", X_test),
    ("Y test set size: ", y_test),
):
    print(label, split.shape)

X train set size:  (140, 5)
Y train set size:  (140,)
X test set size:  (60, 5)
Y test set size:  (60,)


In [100]:
# Decision tree that splits on information gain (entropy), limited to depth 4.
# random_state is fixed because scikit-learn permutes the features at every
# split, even with max_features=None; without a seed, ties between equally good
# splits can be broken differently from run to run.
drugTree = DecisionTreeClassifier(criterion="entropy", max_depth=4, random_state=3)
drugTree

DecisionTreeClassifier(criterion='entropy', max_depth=4)

In [101]:
# Train the tree on the training split; the fitted estimator is the cell output.
drugTree.fit(X=X_train, y=y_train)

DecisionTreeClassifier(criterion='entropy', max_depth=4)

In [102]:
# Predict the test set, then print the first five predictions followed by the
# first five true labels for a quick visual comparison.
y_predicted = drugTree.predict(X_test)
for labels in (y_predicted, y_test):
    print(labels[:5])

['drugY' 'drugX' 'drugX' 'drugX' 'drugX']
['drugY' 'drugX' 'drugX' 'drugX' 'drugX']


### Evaluation

In [103]:
from sklearn import metrics
import matplotlib.pyplot as plt

In [104]:
# Fraction of test rows whose predicted drug matches the true one.
acc = metrics.accuracy_score(y_true=y_test, y_pred=y_predicted)
print("Accuracy: ", acc)

Accuracy:  0.9833333333333333
