# Decision Trees

In [50]:
# importing required modules.

import numpy as np
import pandas as pd
from sklearn import metrics, preprocessing
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

<div id="about_dataset">
    <h2>About the dataset</h2>
    Imagine that you are a medical researcher compiling data for a study. You have collected data about a set of patients, all of whom suffered from the same illness. During their course of treatment, each patient responded to one of 5 medications: Drug A, Drug B, Drug C, Drug X, and Drug Y.
    <br>
    <br>
    Part of your job is to build a model to find out which drug might be appropriate for a future patient with the same illness. The features of this dataset are the Age, Sex, Blood Pressure, Cholesterol, and Na_to_K (sodium-to-potassium ratio) of the patients, and the target is the drug that each patient responded to.
    <br>
    <br>
    This is an example of a multiclass classification problem: you can use the training part of the dataset 
    to build a decision tree, and then use it to predict the class of an unknown patient, or to prescribe a drug to a new patient.
</div>


In [3]:
# downloading the dataset.

!wget -O drug200.csv https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-ML0101EN-SkillsNetwork/labs/Module%203/data/drug200.csv

--2022-01-26 21:38:54--  https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-ML0101EN-SkillsNetwork/labs/Module%203/data/drug200.csv
Resolving cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud (cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud)... 169.45.118.108
Connecting to cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud (cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud)|169.45.118.108|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5827 (5.7K) [text/csv]
Saving to: ‘drug200.csv’


2022-01-26 21:38:57 (903 MB/s) - ‘drug200.csv’ saved [5827/5827]



In [51]:
# Load the dataset into a DataFrame.
#
# The path is the file written by the download cell above (wget saves it as
# drug200.csv in the working directory), so the notebook runs top to bottom
# on a fresh kernel without moving the file by hand.

df = pd.read_csv('drug200.csv')

# Preview the first five rows.
df.head()

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,drugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,drugY


In [52]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

Unnamed: 0,Age,Na_to_K
count,200.0,200.0
mean,44.315,16.084485
std,16.544315,7.223956
min,15.0,6.269
25%,31.0,10.4455
50%,45.0,13.9365
75%,58.0,19.38
max,74.0,38.247


In [53]:
# List the column names of the DataFrame.

df.columns

Index(['Age', 'Sex', 'BP', 'Cholesterol', 'Na_to_K', 'Drug'], dtype='object')

In [54]:
# Show the distinct labels of each categorical feature, one column per line.
for column in ('Sex', 'BP', 'Cholesterol'):
    print(df[column].unique())

['F' 'M']
['HIGH' 'LOW' 'NORMAL']
['HIGH' 'NORMAL']


In [55]:
# Build the feature matrix: every column except the target, as a NumPy array.
# Column positions in X follow the order of this list.

feature_columns = ['Age', 'Sex', 'BP', 'Cholesterol', 'Na_to_K']
X = df[feature_columns].to_numpy()

# Peek at the first five rows.
X[:5]

array([[23, 'F', 'HIGH', 'HIGH', 25.355],
       [47, 'M', 'LOW', 'HIGH', 13.093],
       [47, 'M', 'LOW', 'HIGH', 10.114],
       [28, 'F', 'NORMAL', 'HIGH', 7.798],
       [61, 'F', 'LOW', 'HIGH', 18.043]], dtype=object)

In [56]:
# Preprocessing.
#
# The categorical columns Sex, BP and Cholesterol are replaced by integer
# codes so the tree receives numeric inputs. Each LabelEncoder assigns codes
# in sorted label order (e.g. 'F' -> 0, 'M' -> 1).
#
# The labels are read from `df`, not from `X`: X is overwritten in place, so
# reading the labels back from X would make this cell raise on a second run
# (the integer codes already stored in X are not labels the encoder knows).

le_sex = preprocessing.LabelEncoder()
le_BP = preprocessing.LabelEncoder()
le_cholestrol = preprocessing.LabelEncoder()

# (column position in X, source column in df, encoder for that column)
for position, column, encoder in [
    (1, 'Sex', le_sex),
    (2, 'BP', le_BP),
    (3, 'Cholesterol', le_cholestrol),
]:
    X[:, position] = encoder.fit_transform(df[column])

In [57]:
# Show the first five rows of X to inspect the values in the categorical columns.
print(X[:5])

[[23 0 0 0 25.355]
 [47 1 1 0 13.093]
 [47 1 1 0 10.114]
 [28 0 2 0 7.798]
 [61 0 1 0 18.043]]


In [58]:
# Target variable: the 'Drug' column of df.

y = df['Drug']

In [59]:
# Train/test split: hold out 30% of the rows for evaluation.
# random_state is fixed so the same rows land in each split on every run.

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=3)

In [60]:
# Print the shape of each split, in the order: X_train, X_test, y_train, y_test.
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

(140, 5)
(60, 5)
(140,)
(60,)


In [61]:
# Modeling.
#
# Entropy-based splits, with the tree depth capped at 4. random_state is
# pinned because scikit-learn permutes the candidate features at every split,
# so an unseeded tree can differ between runs when several splits are equally
# good; the seed matches the one used for the train/test split.

drugTree = DecisionTreeClassifier(criterion='entropy', max_depth=4, random_state=3)

drugTree

DecisionTreeClassifier(criterion='entropy', max_depth=4)

In [62]:
# Fit the tree on the training split.
drugTree.fit(X_train, y_train)

DecisionTreeClassifier(criterion='entropy', max_depth=4)

In [63]:
# Predict the drug for every row of the test split.

predictions = drugTree.predict(X_test)

# Compare the first five predictions (first line) with the true labels (second line).
print(predictions[:5], y_test.head(5).to_numpy(), sep='\n')

['drugY' 'drugX' 'drugX' 'drugX' 'drugX']
['drugY' 'drugX' 'drugX' 'drugX' 'drugX']


In [64]:
# Model evaluation: fraction of test rows whose predicted drug equals the true drug.

accuracy = metrics.accuracy_score(y_test, predictions)

print("Accuracy of the Decision Tree : ", accuracy)

Accuracy of the Decision Tree :  0.9833333333333333
