# Intro to Machine Learning with Decision Trees

In [None]:
"""
Sources:
- https://www.datacamp.com/tutorial/decision-tree-classification-python
- https://www.kaggle.com/datasets/uciml/pima-indians-diabetes-database?resource=download

"This dataset is originally from the National Institute of Diabetes and Digestive and Kidney Diseases.
The objective of the dataset is to diagnostically predict whether or not a patient has diabetes, based on certain diagnostic measurements included in the dataset.
Several constraints were placed on the selection of these instances from a larger database.
In particular, all patients here are females at least 21 years old of Pima Indian heritage." [kaggle.com]
"""

# Load libraries
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
import pydotplus

from IPython.display import Image
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.tree import export_graphviz
from io import StringIO


In [None]:
# Read the diabetes CSV (expected next to this notebook) into a DataFrame
pima = pd.read_csv(filepath_or_buffer="diabetes.csv")

In [None]:
# Preview the first five rows of the loaded data
pima.head(n=5)

In [None]:
# Separate the dataset into the predictor columns and the target column
feature_cols = [
    "Pregnancies",
    "Glucose",
    "BloodPressure",
    "SkinThickness",
    "Insulin",
    "BMI",
    "DiabetesPedigreeFunction",
    "Age",
]

X = pima[feature_cols]  # feature matrix: one column per entry in feature_cols
y = pima["Outcome"]  # target variable

In [None]:
# Hold out 30% of the rows for testing and train on the remaining 70%.
# random_state pins the shuffle so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

In [None]:
# Create the decision tree classifier object.
# max_depth=2 limits the tree to two levels of splits.
# random_state is fixed because scikit-learn randomly permutes the features at
# every split, so equally good candidate splits could otherwise be resolved
# differently from one run to the next.
clf = DecisionTreeClassifier(criterion="entropy", max_depth=2, random_state=1)

# Train the decision tree classifier (fit() trains the estimator in place)
clf.fit(X_train, y_train)

# Predict the response for the test dataset
y_pred = clf.predict(X_test)

In [None]:
# The built-in round() is used instead of the .round() method: the metric
# functions are documented to return a float, and a plain Python float has no
# .round() method. round() works for both Python and NumPy floats.

# Accuracy: how often is the classifier correct?
print("Accuracy:", round(metrics.accuracy_score(y_test, y_pred), 3))

# Precision: correctly predicted positives / all predicted positives
print("Precision:", round(metrics.precision_score(y_test, y_pred), 3))

# Recall: correctly predicted positives / all actual positives
print("Recall:", round(metrics.recall_score(y_test, y_pred), 3))

# Confusion matrix: rows are the true labels, columns the predicted labels
c_matrix = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)
print(c_matrix)

In [None]:
# Draw the heatmap for the confusion matrix.
# fmt="d" prints every cell as a whole number; seaborn's default annotation
# format (".2g") would show counts of 100 or more in scientific notation.
fig, ax = plt.subplots()
sns.heatmap(c_matrix, annot=True, fmt="d", square=True, ax=ax)
ax.set_ylabel('True Label')
ax.set_xlabel('Predicted Label')
plt.show()

In [None]:
# Plot the fitted decision tree with Graphviz.
# export_graphviz writes the DOT description of the tree into the in-memory
# text buffer, which pydotplus then parses into a graph object.
dot_data = StringIO()
export_graphviz(
    clf,
    out_file=dot_data,
    feature_names=feature_cols,
    class_names=['0','1'],
    filled=True,
    rounded=True,
    special_characters=True,
)

dot_source = dot_data.getvalue()
graph = pydotplus.graph_from_dot_data(dot_source)

# Save the rendering to disk, then show it inline as the cell output
graph.write_png('diabetes.png')
Image(graph.create_png())

In [None]:
# Inspect the training rows that fall on one branch of the decision tree.
# assign() returns a copy of the training features with the labels attached as
# an "Outcome" column, so X_train itself is left untouched.
X_train_eval = X_train.assign(Outcome=y_train)

# NOTE(review): the thresholds are presumably copied from the plotted tree;
# re-check them whenever the model is refit.
X_train_eval.query("Glucose > 127.5 and BMI > 28.15 and Outcome == 1")