# Solution Guide

In [None]:
# Load necessary Python modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [None]:
# Location of the diabetes CSV used throughout the notebook
DATA_URL = "https://raw.githubusercontent.com/4GeeksAcademy/decision-tree-project-tutorial/main/diabetes.csv"

# Read the CSV into a DataFrame and preview its first rows
df = pd.read_csv(DATA_URL)
df.head()

## Guide on inspection of the dataset


- Verify if there are null / NaN values

- Is it a Balanced or Imbalanced Dataset?

- Convert non numerical data to numerical

- Find correlation - feature importance

- Get some statistics column-wise: count, mean, median, min, max, and the 25%, 50%, 75% percentiles

- Eliminate outliers with IQR Method

In [None]:
# Is it balanced or imbalanced?
# Count how many rows fall into each value of the target column.
outcome_counts = df["Outcome"].value_counts()
outcome_counts

In [None]:
#Eliminating Outliers with IQR Method

def remove_iqr_outliers(frame, column, whisker=1.5):
    """Return the rows of `frame` whose `column` value lies strictly inside the IQR fences.

    The fences are ``q1 - whisker * IQR`` and ``q3 + whisker * IQR``, where q1 and q3
    are the 25th and 75th percentiles of `column` computed on `frame` itself.
    Rows exactly on a fence are dropped (strict comparison).
    """
    q1 = frame[column].quantile(0.25)
    q3 = frame[column].quantile(0.75)
    iqr = q3 - q1
    lower_limit = q1 - whisker * iqr
    upper_limit = q3 + whisker * iqr
    return frame[(frame[column] > lower_limit) & (frame[column] < upper_limit)]


# Columns are filtered one after another, in this order, so each column's
# quartiles are computed on the rows that survived the previous filters.
OUTLIER_COLUMNS = [
    'Insulin',
    'Age',
    'Glucose',
    'BMI',
    'Pregnancies',
    'DiabetesPedigreeFunction',
]

for column in OUTLIER_COLUMNS:
    df = remove_iqr_outliers(df, column)

# Horizontal box plot of every remaining column to inspect the filtered data
plt.figure(figsize=(13,5))
sns.boxplot(data=df,orient='h')
plt.show()


In [None]:
#Find correlations

# Compute the pairwise correlation matrix once and reuse it for both outputs.
corr_matrix = df.corr()
print(corr_matrix)

fig, ax = plt.subplots(figsize=(16, 12))
# Draw on the axes created above instead of relying on pyplot's implicit current axes.
sns.heatmap(corr_matrix, annot=True, ax=ax)
ax.set_title("Correlation matrix")
plt.show()

In [None]:
# Resample the data to make a more balanced dataset

from sklearn.utils import resample

df_majority = df[(df["Outcome"]==0)]
df_minority = df[(df["Outcome"]==1)]

# Upsample the minority class (sampling with replacement) to exactly the size of
# the majority class. The target size is read from the data instead of being
# hard-coded, because earlier filtering can change how many majority rows remain.
df_minority_upsampled = resample(
    df_minority,
    replace=True,
    n_samples=len(df_majority),
    random_state=42,
)

# NOTE(review): upsampling happens before the train/test split, so copies of the
# same row can end up in both sets and inflate test scores. Consider upsampling
# only the training portion.
df = pd.concat([df_minority_upsampled,df_majority])
df["Outcome"].value_counts()

In [None]:
# Get some statistics
# Summary statistics for each column of the current DataFrame
summary_stats = df.describe()
summary_stats

In [None]:
# Separate features from target
target_column = "Outcome"

X = df.drop(columns=[target_column])  # every column except the target
Y = df[target_column]                 # the target column on its own

In [None]:
#Scale the features

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# fit_transform returns a new array and leaves X untouched, so the result has to
# be assigned back. It is wrapped in a DataFrame again so that the column names
# and the row index (which Y shares) are preserved.
# NOTE(review): the scaler is fitted on all rows before the train/test split;
# fitting it on the training rows only would avoid leaking test statistics.
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)
X.head()

In [None]:
#Split the data

# random_state makes the 70/30 split reproducible across runs;
# stratify=Y keeps the class proportions of Y the same in both parts.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.30,
    random_state=42,
    stratify=Y,
)

## Modeling with Decision Tree

In [None]:
#Fit the data

# Default (gini) decision tree; a fixed random_state keeps the fitted tree
# identical from one run to the next.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train,Y_train)

In [None]:
# Get the score on the training data, just to verify it's 1.
train_accuracy = model.score(X_train, Y_train)
train_accuracy

In [None]:
# Get the score of the model's predictions on the test data
test_accuracy = model.score(X_test, Y_test)
test_accuracy

In [None]:
#Get the confusion matrix

# Predict once and build the matrix once; reuse it for the text and plot outputs.
test_predictions = model.predict(X_test)
cm = confusion_matrix(Y_test, test_predictions)
print(cm)

# fmt="d" annotates the cells with the plain integer counts; the default
# annotation format abbreviates counts of 100 or more to scientific notation.
sns.heatmap(cm, annot=True, fmt="d")
plt.show()

In [None]:
# Print the classification report for the test data
test_predictions = model.predict(X_test)
report = classification_report(Y_test, test_predictions)
print(report)

In [None]:
# Print feature importance

# Pair each importance with its feature name and sort, so the values can be read
# without counting column positions in a bare array.
feature_importances = pd.Series(model.feature_importances_, index=X_train.columns)
print(feature_importances.sort_values(ascending=False))

In [None]:
# Get the number of leaves of the fitted tree
n_leaves = model.get_n_leaves()
print(n_leaves)

In [None]:
# Get your tree params
tree_params = model.get_params()
print(tree_params)

In [None]:
# Get your tree depth

# Use `model`, the tree fitted above. `dt_model` is only created in a later cell,
# so referencing it here raises NameError on a fresh Restart & Run All.
print(model.get_depth())

**Train Decision tree with entropy criterion**

In [None]:
# Same tree, but splitting on entropy instead of the default criterion.
# A fixed random_state keeps the fitted tree identical from one run to the next.
dt_model = DecisionTreeClassifier(criterion="entropy", random_state=42)
dt_model.fit(X_train,Y_train)

In [None]:
# Score of the entropy tree on the training data
entropy_train_accuracy = dt_model.score(X_train, Y_train)
entropy_train_accuracy

In [None]:
# Confusion matrix of the entropy tree on the test data
entropy_predictions = dt_model.predict(X_test)
entropy_cm = confusion_matrix(Y_test, entropy_predictions)
print(entropy_cm)

In [None]:
# Classification report of the entropy tree on the test data
entropy_predictions = dt_model.predict(X_test)
entropy_report = classification_report(Y_test, entropy_predictions)
print(entropy_report)

In [None]:
# Accuracy of the entropy tree on the test data
entropy_predictions = dt_model.predict(X_test)
entropy_test_accuracy = accuracy_score(Y_test, entropy_predictions)
print(entropy_test_accuracy)

In [None]:
# Pair each importance of the entropy tree with its feature name and sort, so the
# values can be read without counting column positions in a bare array.
entropy_feature_importances = pd.Series(dt_model.feature_importances_, index=X_train.columns)
print(entropy_feature_importances.sort_values(ascending=False))

In [None]:
# Number of leaves of the entropy tree
entropy_n_leaves = dt_model.get_n_leaves()
print(entropy_n_leaves)

In [None]:
# Parameters of the entropy tree
entropy_params = dt_model.get_params()
print(entropy_params)

In [None]:
# Depth of the entropy tree
entropy_depth = dt_model.get_depth()
print(entropy_depth)

**Tune your decision tree hyperparameters using GridSearch**

In [None]:
#Using Grid Search to get best hyperparameters

from sklearn.model_selection import GridSearchCV

# Every combination of these values is evaluated with 5-fold cross-validation.
tree_para = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [4,5,6,7,8,9,10,11,12,15,20,30,40,50,70,90,120,150],
    'min_samples_split': [2, 3, 4],
}

# A fixed random_state on the base tree makes the search select the same
# parameters on every run.
clf = GridSearchCV(DecisionTreeClassifier(random_state=42), tree_para, cv=5)
clf.fit(X_train,Y_train)
print(clf.best_params_)
print(clf.best_estimator_)

**Once you have trained your model with the tuned hyperparameters, use the ROC curve and its AUC (`roc_curve`, `auc`) to measure your results**

In [None]:
#Train with the best hyperparameters

# GridSearchCV refits the best parameter combination on the whole training set
# by default (refit=True), so best_estimator_ is already the trained, tuned tree.
best_model = clf.best_estimator_
best_model.score(X_test, Y_test)

In [None]:
#Measure your results

from sklearn.metrics import roc_curve,auc

# Evaluate the tuned tree selected by the grid search, not the untuned entropy
# tree, since this step is meant to measure the model with tuned hyperparameters.
tuned_model = clf.best_estimator_

# Column 1 of predict_proba is used as the score for the positive class.
positive_scores = tuned_model.predict_proba(X_test)[:,1]
fpr, tpr, thresholds = roc_curve(Y_test, positive_scores)

plt.figure(figsize=(8,8))
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
# Diagonal reference line: the ROC of a model that guesses at random
plt.plot([0,1],[0,1],color="navy",lw=2,label="Random-Model")
plt.plot(fpr,tpr,color="darkorange",lw=2, label="Decision Tree- Model")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver operating characteristic :ROC-AUC")
plt.legend()
plt.show()
print("Computed Area Under the Curve (AUC)",auc(fpr, tpr))