In [1]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Optional: uncomment the two lines below to install wget and download the
# dataset to drug200.csv, the file name that pd.read_csv loads in the next cell.
#!conda install wget
#!wget -O drug200.csv https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/ML0101ENv3/labs/drug200.csv

In [3]:
# Read the comma-separated drug dataset from the working directory and
# preview its first five rows.
myData = pd.read_csv('drug200.csv', sep=',')
myData.head(5)

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,drugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,drugY


In [4]:
# DataFrame.size is the total number of elements (rows x columns), not the row count.
myData.size

1200

In [5]:
# Extract the predictor columns as a NumPy array and show the first five rows.
featureColumns = ['Age', 'Sex', 'BP', 'Cholesterol', 'Na_to_K']
x = myData[featureColumns].values
x[:5]

array([[23, 'F', 'HIGH', 'HIGH', 25.355],
       [47, 'M', 'LOW', 'HIGH', 13.093],
       [47, 'M', 'LOW', 'HIGH', 10.113999999999999],
       [28, 'F', 'NORMAL', 'HIGH', 7.797999999999999],
       [61, 'F', 'LOW', 'HIGH', 18.043]], dtype=object)

The Sex, BP, and Cholesterol columns contain categorical values, but sklearn decision trees do not handle categorical values directly. These values must be converted to numerical (dummy/indicator) variables.

In [6]:
from sklearn import preprocessing

# Rebuild the raw feature matrix here so this cell is safe to re-run.
# Encoding x in place and then running the cell again would pass the
# already-encoded integers to LabelEncoder.transform, which raises
# ValueError for labels it was not fitted on.
x = myData[['Age', 'Sex', 'BP', 'Cholesterol', 'Na_to_K']].values

# One encoder per categorical column. LabelEncoder sorts its classes, so the
# integer codes are alphabetical (e.g. HIGH=0, LOW=1, NORMAL=2) regardless of
# the order in which the labels are listed in fit().
sex = preprocessing.LabelEncoder().fit(['F', 'M'])
bp = preprocessing.LabelEncoder().fit(['LOW', 'NORMAL', 'HIGH'])
chol = preprocessing.LabelEncoder().fit(['NORMAL', 'HIGH'])

# (column position in x, encoder) pairs: replace each string column with its codes.
for columnIndex, encoder in ((1, sex), (2, bp), (3, chol)):
    x[:, columnIndex] = encoder.transform(x[:, columnIndex])

x[0:5]

array([[23, 0, 0, 0, 25.355],
       [47, 1, 1, 0, 13.093],
       [47, 1, 1, 0, 10.113999999999999],
       [28, 0, 2, 0, 7.797999999999999],
       [61, 0, 1, 0, 18.043]], dtype=object)

In [7]:
# Target labels: the Drug column of the dataset; preview the first five.
y = myData.loc[:, 'Drug']
y.head(5)

0    drugY
1    drugC
2    drugC
3    drugX
4    drugY
Name: Drug, dtype: object

#### Setting up the decision tree

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
xTrain, xTest, yTrain, yTest = train_test_split(x, y, test_size=0.3, random_state=3)

# Report the shape of each piece: train features/labels, then test features/labels.
for part in (xTrain, yTrain, xTest, yTest):
    print(part.shape)

(140, 5)
(140,)
(60, 5)
(60,)


Create an instance of the DecisionTreeClassifier called drugTree

Specify criterion = 'entropy' so that splits are chosen by information gain (the reduction in entropy) at each node.

In [9]:
# criterion='entropy' selects splits by information gain; max_depth=4 caps the tree depth.
# random_state is fixed because scikit-learn permutes the features at every split
# and breaks ties between equally good splits at random, so an unseeded tree (and
# the accuracy reported for it) can change from run to run. The value 3 matches
# the seed already used for train_test_split in this notebook.
drugTree = DecisionTreeClassifier(criterion = 'entropy', max_depth = 4, random_state = 3)
drugTree

DecisionTreeClassifier(criterion='entropy', max_depth=4)

In [10]:
# Train the classifier on the training split.
drugTree.fit(X = xTrain, y = yTrain)

DecisionTreeClassifier(criterion='entropy', max_depth=4)

In [11]:
# Predict a drug label for every row of the held-out test split.
predictionTree = drugTree.predict(X = xTest)

In [12]:
# Show the first five predictions next to the first five true test labels.
print(predictionTree[:5])
print(yTest.head(5))

['drugY' 'drugX' 'drugX' 'drugX' 'drugX']
40     drugY
51     drugX
139    drugX
197    drugX
170    drugX
Name: Drug, dtype: object


Evaluation

In [13]:
from sklearn import metrics
import matplotlib.pyplot as plt
%matplotlib inline

# Drug is a single-label multiclass target, so accuracy_score returns the
# fraction of test samples whose predicted label equals the true label.
# (The all-or-nothing "subset accuracy" rule applies per sample in multilabel
# problems only; it does not force the overall score to be 0.0 or 1.0.)
print('Decision Tree Accuracy : ', metrics.accuracy_score(yTest, predictionTree))

Decision Tree Accuracy :  0.9833333333333333
