In [None]:
#Importing our needed packages and functions
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn 


from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score



---


# Let's import our data set and set the corresponding column names.


---



In [None]:

# Descriptive labels applied to the columns of the CSV when it is loaded.
# NOTE: the last label ends with a stray "]"; it is kept as-is because the
# later cells index the target column by this exact string.
col_names = [
    "Number of times pregnant",
    "Plasma glucose concentration a 2 hours in an oral glucose tolerance test",
    "Diastolic blood pressure (mm Hg)",
    "Triceps skin fold thickness (mm)",
    "2-Hour serum insulin (mu U/ml)",
    "Body mass index (weight in kg/(height in m)^2)",
    "Diabetes pedigree function",
    "Age (years)",
    "Class variable (0 or 1)]",
]
csv_path = '../input/pima-indians-diabetes-database/diabetes.csv'
# skiprows=1 discards the first line of the file (presumably its own header
# row) so that `names` supplies the column labels instead.
dataframe = pd.read_csv(csv_path, names=col_names, skiprows=1)

In [None]:
# Preview the first five rows to confirm the custom column labels were applied.
dataframe.head(n=5)



---


# We need to sort through our data to make sure that there are no unknown values or outliers.



---



In [None]:
# Show the dtype pandas inferred for every column.
dataframe.dtypes

In [None]:
# Distinct values recorded in the pregnancy-count column.
dataframe.loc[:, 'Number of times pregnant'].unique()

In [None]:
# Distinct values recorded in the age column.
dataframe.loc[:, 'Age (years)'].unique()

In [None]:
# Distinct values recorded in the serum-insulin column.
dataframe.loc[:, '2-Hour serum insulin (mu U/ml)'].unique()



---


# We see that the first value is 0, so we need to take note and account for this later. Let's see how many entries have the value 0.


---



In [None]:
# Frequency of each serum-insulin reading, used to count the 0 entries.
dataframe.loc[:, '2-Hour serum insulin (mu U/ml)'].value_counts()



---


# There are a whopping 374 instances of a 0 value.


---



In [None]:
# Distinct values recorded in the body-mass-index column.
dataframe.loc[:, 'Body mass index (weight in kg/(height in m)^2)'].unique()

In [None]:
# Frequency of each body-mass-index reading, used to count the 0 entries.
dataframe.loc[:, 'Body mass index (weight in kg/(height in m)^2)'].value_counts()



---


# Another 0, 11 instances.


---



In [None]:
# Distinct labels present in the target column.
dataframe.loc[:, 'Class variable (0 or 1)]'].unique()

In [None]:
# Number of rows per target label, i.e. the class balance of the raw data.
dataframe.loc[:, 'Class variable (0 or 1)]'].value_counts()



---


# So overall we have 500 people who are non diabetic and 268 who are.


---



In [None]:
# Distinct values recorded in the diabetes-pedigree-function column.
dataframe.loc[:, 'Diabetes pedigree function'].unique()

In [None]:
# Distinct values recorded in the diastolic-blood-pressure column.
dataframe.loc[:, 'Diastolic blood pressure (mm Hg)'].unique()

In [None]:
# Frequency of each blood-pressure reading, used to count the 0 entries.
dataframe.loc[:, 'Diastolic blood pressure (mm Hg)'].value_counts()



---


# Again another 35 instances of a value of 0. We will account for these later.


---



In [None]:
# Distinct values recorded in the plasma-glucose column.
dataframe.loc[:, 'Plasma glucose concentration a 2 hours in an oral glucose tolerance test'].unique()

In [None]:
# Frequency of each plasma-glucose reading, used to count the 0 entries.
dataframe.loc[:, 'Plasma glucose concentration a 2 hours in an oral glucose tolerance test'].value_counts()



---


# Another 0 value.


---



In [None]:
# Distinct values recorded in the triceps-skin-fold column.
dataframe.loc[:, 'Triceps skin fold thickness (mm)'].unique()

In [None]:
# Frequency of each skin-fold reading, used to count the 0 entries.
dataframe.loc[:, 'Triceps skin fold thickness (mm)'].value_counts()



---


# And one more 0 with 227 instances. Let's remove the rows that contain 0's, but only for the columns where doing so will not dramatically decrease our sample size.


---



In [None]:
# Columns in which this notebook treats a reading of 0 as a missing
# measurement AND where few enough rows are affected that dropping them is
# affordable (insulin and skin-fold zeros are deliberately left in place).
zero_as_missing_cols = [
    'Plasma glucose concentration a 2 hours in an oral glucose tolerance test',
    'Body mass index (weight in kg/(height in m)^2)',
    'Diastolic blood pressure (mm Hg)',
]

# Build ONE mask covering all three columns. Filtering the raw frame three
# separate times and re-assigning the result each time would keep only the
# last filter (blood pressure) and leave the glucose/BMI zeros in the data.
has_missing_reading = (dataframe[zero_as_missing_cols] == 0).any(axis=1)

# .copy() yields an independent frame; the original row index is preserved.
df_complete = dataframe.loc[~has_missing_reading].copy()

# NOTE: row counts quoted in the surrounding narrative were produced when only
# the blood-pressure filter took effect; re-run the notebook to refresh them.
len(df_complete)

In [None]:
# Class balance that remains after the zero-valued rows were removed.
df_complete.loc[:, 'Class variable (0 or 1)]'].value_counts()

---


# So, we removed 35 data points (4% of the data set) and are left with 481 non diabetic and 252 diabetic.


---


In [None]:
# Preview the first five rows of the cleaned frame.
df_complete.head(n=5)



---


# Now we have cleaned our data set from *most* missing values and may begin to construct our decision tree. Let's split our data frame into the columns used for classification and our parameter of interest, labeled as the class variable in our current dataframe.


---



In [None]:
# Feature matrix: every column except the class label. `drop` returns a new
# frame, so `df_complete` itself is left untouched.
X = df_complete.drop(columns='Class variable (0 or 1)]')

# Only the last expression of a cell is rendered, so the row count is printed
# explicitly and the preview is left as the cell's rich output.
print(len(X))
X.head()

In [None]:
# Target vector: the class label column, copied so that later changes to Y
# cannot write through to `df_complete`.
Y = df_complete['Class variable (0 or 1)]'].copy()

# Only the last expression of a cell is rendered, so the row count is printed
# explicitly and the preview is left as the cell's output.
print(len(Y))
Y.head()



---
# Alright, so let's actually build the damn tree already! Let's get our test and training data separated and feed it into scikit-learn's DecisionTreeClassifier. Note that the default criterion for DecisionTreeClassifier is 'gini' but we want to use 'entropy' instead. We pass it absolutely no other arguments, since later on we will have other methods to completely determine the optimal parameters.
---





In [None]:
# Outer split: 10% of the cleaned data is set aside as the final test set.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=.1, random_state=20
)
# Inner split: 10% of the training portion is carved out as a "fake" test set
# for judging the first tree and the pruning steps.
X_train2, X_fakeTest, Y_train2, Y_fakeTest = train_test_split(
    X_train, Y_train, test_size=.1, random_state=20
)

---

# Notice that we created two test sets.  The first one labeled *X_test, Y_test* will be used at the end of the model with the 5-fold validation. The second one which is a subset of our training data labelled *X_fakeTest, Y_fakeTest*, will be used to test the initial decision tree and the pruning steps.

---

In [None]:
# Display the inner training features (pandas truncates long frames on render).
X_train2

In [None]:
# Unpruned baseline tree using the entropy criterion, fitted on the inner
# training split; `fit` returns the estimator itself, so the call is chained.
first_dt = DecisionTreeClassifier(random_state=20, criterion='entropy').fit(X_train2, Y_train2)
# Number of samples the tree was trained on.
len(Y_train2)

In [None]:
# Show both lengths as one tuple: a cell renders only its last expression, so
# two separate `len(...)` statements would display just the second one.
len(Y_fakeTest), len(X_fakeTest)

In [None]:
# Number of rows per class label in the inner ("fake") test split.
Y_fakeTest.value_counts()



---


# These are the true counts of diabetic and non-diabetic patients in our training test set (the held-out subset of the training data); these will be important.

---



In [None]:
# Draw the unpruned tree. `feature_names` is passed as a plain list because
# plot_tree documents it as a list of strings and some scikit-learn releases
# reject a pandas Index during parameter validation.
plt.figure(figsize=(20,8),dpi=100)
plot_tree(
    first_dt,
    filled=True,
    rounded=True,
    class_names=["Non-Diabetic","Diabetic"],
    feature_names=list(X.columns),
);



---


# This tree is huge, which suggests overfitting. Let's plot a confusion matrix to see how our tree would respond to the training test set.


---



In [None]:
# Confusion matrix of the unpruned tree on the inner ("fake") test split.
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2; newer
# versions need ConfusionMatrixDisplay.from_estimator instead.
fig, ax = plt.subplots(dpi=300, figsize=(5, 5))
plot_confusion_matrix(
    first_dt,
    X_fakeTest,
    Y_fakeTest,
    ax=ax,
    cmap=plt.cm.Blues,
    display_labels=["Non-Diabetic","Diabetic"],
);



---


# Our first tree did not perform as well as we would have hoped.  Let's try to improve this by pruning our tree and optimizing the parameters that we have. We're going to accomplish this with a technique called cost complexity pruning, which will help improve our accuracy. We will define a list of 'alpha' values that will have a one-to-one correspondence with every subtree generated by cost_complexity_pruning_path.

---



In [None]:
# Compute the cost-complexity pruning path on the inner training split; the
# result exposes the candidate alphas and the matching impurities.
tree_path = first_dt.cost_complexity_pruning_path(X_train2, Y_train2)
impurities = tree_path.impurities
# Discard the last alpha: per the notebook's narrative it is the maximum value
# and would prune the tree down to the root node alone.
ccp_alphas = tree_path.ccp_alphas[:-1]
len(ccp_alphas)



---

# Scikit-learn has a really nice function that gives us the sequence of subtrees obtained by repeatedly pruning the weakest branch off of the parent tree (one alpha value per subtree), which will help us find our optimal tree. We disregard the maximum value of ccp_alphas since that would only leave us the root node.


---



In [None]:
# Fit one pruned tree per candidate alpha, in the same order as `ccp_alphas`,
# so that first_dts[i] corresponds to ccp_alphas[i].
# A comprehension is used so that the notebook-level name `first_dt` keeps
# pointing at the unpruned baseline tree instead of being overwritten by the
# last pruned tree; `fit` returns the estimator, so it can be collected directly.
first_dts = [
    DecisionTreeClassifier(
        criterion='entropy', random_state=20, ccp_alpha=alpha
    ).fit(X_train2, Y_train2)
    for alpha in ccp_alphas
]



---

# We create a list to store one decision tree for each alpha value, i.e. one for each pruned subtree of the original tree. I like to think of this as a pseudo-random forest, since it is a forest of subtrees. Let's see how the data is classified based upon which tree it is being evaluated on.


---



In [None]:
# Accuracy of every pruned tree on the data it was fitted on and on the
# held-out inner ("fake") test split; both lists are ordered like ccp_alphas.
train_scores = [tree.score(X_train2, Y_train2) for tree in first_dts]
training_test_scores = [tree.score(X_fakeTest, Y_fakeTest) for tree in first_dts]

# Step plot: each accuracy is held until the next alpha along the x-axis.
step_style = dict(marker='.', drawstyle="steps-post")

fig, ax = plt.subplots(figsize=(6,4),dpi=300)
ax.set(
    xlabel="Alpha",
    ylabel="Accuracy",
    title="Accuracy vs Alpha Value for Training and Testing Set",
)
ax.plot(ccp_alphas, train_scores, label="Train Set", **step_style)
ax.plot(ccp_alphas, training_test_scores, label="Training Test Set", **step_style)
ax.legend()
plt.show()

In [None]:
# Best accuracy on the inner test split, then the position of the first tree
# that achieves it.
best_holdout_score = max(training_test_scores)
print(best_holdout_score)
print(training_test_scores.index(best_holdout_score))

In [None]:
# Position of the first pruned tree with the highest accuracy on the inner
# ("fake") test split.
best_holdout_idx = training_test_scores.index(max(training_test_scores))

# `feature_names` is passed as a plain list because plot_tree documents it as
# a list of strings and some scikit-learn releases reject a pandas Index.
plt.figure(figsize=(20,8),dpi=500)
plot_tree(
    first_dts[best_holdout_idx],
    filled=True,
    rounded=True,
    class_names=["Non-Diabetic","Diabetic"],
    feature_names=list(X.columns),
);

In [None]:
# Confusion matrix, on the inner ("fake") test split, of the pruned tree that
# scored best on that same split.
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2; newer
# versions need ConfusionMatrixDisplay.from_estimator instead.
best_holdout_idx = training_test_scores.index(max(training_test_scores))

fig, ax = plt.subplots(dpi=300, figsize=(5, 5))
plot_confusion_matrix(
    first_dts[best_holdout_idx],
    X_fakeTest,
    Y_fakeTest,
    ax=ax,
    cmap=plt.cm.Blues,
    values_format='',
    display_labels=["Non-Diabetic","Diabetic"],
);



---

# This gives us the optimal tree for THIS training and training test set. But why should we assume that this is going to be the optimal tree? Let's use 5-fold cross validation to run through every single alpha and really find what value is going to work best.

---



In [None]:
# Mean 5-fold cross-validated accuracy on the full training set for every
# candidate alpha; mean_scores[i] corresponds to ccp_alphas[i].
# The estimator is created inside the comprehension so that the notebook-level
# name `first_dt` is not overwritten with an unfitted classifier.
mean_scores = [
    np.mean(
        cross_val_score(
            DecisionTreeClassifier(criterion='entropy', random_state=20, ccp_alpha=alpha),
            X_train,
            Y_train,
            cv=5,
        )
    )
    for alpha in ccp_alphas
]

In [None]:
# Position of the first alpha with the best mean cross-validated accuracy,
# followed by that accuracy.
best_cv_score = max(mean_scores)
print(mean_scores.index(best_cv_score))
print(best_cv_score)

--- 

# We are going to plot the decision tree with the maximum mean value from the 5-fold validation.

---

In [None]:
# Position of the alpha with the best mean cross-validated accuracy.
best_cv_idx = mean_scores.index(max(mean_scores))

# NOTE(review): first_dts[best_cv_idx] uses the CV-selected alpha but was
# fitted on X_train2 only, whereas the cross-validation ran on X_train;
# consider refitting on X_train with that alpha for the final model.
# `feature_names` is passed as a plain list because plot_tree documents it as
# a list of strings and some scikit-learn releases reject a pandas Index.
plt.figure(figsize=(20,8),dpi=500)
plot_tree(
    first_dts[best_cv_idx],
    filled=True,
    rounded=True,
    class_names=["Non-Diabetic","Diabetic"],
    feature_names=list(X.columns),
);

In [None]:
# Confusion matrix, on the inner ("fake") test split, of the pruned tree whose
# alpha achieved the best mean cross-validated accuracy.
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2; newer
# versions need ConfusionMatrixDisplay.from_estimator instead.
best_cv_idx = mean_scores.index(max(mean_scores))

fig, ax = plt.subplots(dpi=300, figsize=(5, 5))
plot_confusion_matrix(
    first_dts[best_cv_idx],
    X_fakeTest,
    Y_fakeTest,
    ax=ax,
    cmap=plt.cm.Blues,
    values_format='',
    display_labels=["Non-Diabetic","Diabetic"],
);

---

# Now let's test this against our true test set.

---

In [None]:
# Final check: confusion matrix on the outer test split for the pruned tree
# whose alpha achieved the best mean cross-validated accuracy.
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2; newer
# versions need ConfusionMatrixDisplay.from_estimator instead.
best_cv_idx = mean_scores.index(max(mean_scores))

fig, ax = plt.subplots(dpi=300, figsize=(5, 5))
plot_confusion_matrix(
    first_dts[best_cv_idx],
    X_test,
    Y_test,
    ax=ax,
    cmap=plt.cm.Blues,
    values_format='',
    display_labels=["Non-Diabetic","Diabetic"],
);