In [None]:
# Import pandas and numpy for dataset manipulation, matplotlib and seaborn for visualization, and graphviz for tree visualization
import pandas as pd  
import matplotlib.pyplot as plt 
import seaborn as sns 
import numpy as np
import graphviz

In [None]:
#Import functions for Model, Dataset Splitting and Evaluation
from sklearn import metrics
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Load the diabetes CSV from the Kaggle input directory; every later cell works on this `df`
df=pd.read_csv("/kaggle/input/pima-indians-diabetes-database/diabetes.csv") # Read the dataset CSV file into a DataFrame object

In [None]:
df.shape # (rows, columns) of the loaded dataset; expected to be (768, 9)

In [None]:
# Preview the first rows to check the column names and value ranges
df.head()

In [None]:
df.info() # Summary of the DataFrame: column names, non-null counts and dtypes

In [None]:
df.describe() # Per-column summary statistics (count, mean, std, min, quartiles, max)

In [None]:
# Scatter BloodPressure (x-axis) against BMI (y-axis) on the raw data;
# points sitting on an axis expose the rows recorded as 0.
df.plot.scatter(x="BloodPressure", y="BMI")
plt.show()
print("From df.describe and the plot we can see that few rows have 0 as value for some columns")

In [None]:
# Replace the 0 placeholders with the column mean.
# - 'Pregnancies' is intentionally NOT in this list: zero pregnancies is a
#   legitimate value, not a missing measurement, so it must be left untouched.
# - The mean is taken over the non-zero readings only; averaging with the
#   placeholder zeros included would drag the imputed value towards 0.
# NOTE(review): the mean is computed on the full dataset before the train/test
# split, so test rows influence the imputed value — consider fitting it on the
# training rows only.
zerocols=['Glucose','BloodPressure','SkinThickness','Insulin','BMI']
for col in zerocols:
    observed_mean=df.loc[df[col]!=0,col].mean()  # mean of the genuinely recorded values
    df[col]=df[col].replace(0,observed_mean)

In [None]:
# Re-plot BloodPressure against BMI to confirm that the 0 values were replaced
df.plot(kind="scatter", x="BloodPressure", y="BMI") # Plot the data points (x-BloodPressure and y-BMI)
plt.show()
# The message copied from the earlier cell claimed zeros were still present, which is no longer true here
print("After replacing the 0 values with the column mean, no points remain at 0 on either axis")

In [None]:
# Preview the first rows again, now that the 0 values have been replaced
df.head()

In [None]:
# BloodPressure vs. BMI scatter, one colour per Outcome class, with a legend
outcome_grid = sns.FacetGrid(df, hue="Outcome", height=5)
outcome_grid.map(plt.scatter, "BloodPressure", "BMI")
outcome_grid.add_legend()
plt.show()

In [None]:
# Display distribution of data points of each class in each attribute.
# One strip plot per feature, laid out on a 3x3 grid (the ninth slot stays empty).
strip_columns = [
    'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
    'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age',
]
plt.figure(figsize=(15,10))
for position, column in enumerate(strip_columns, start=1):
    plt.subplot(3, 3, position)  # subplot indices are 1-based
    sns.stripplot(x='Outcome', y=column, data=df, jitter=True)
plt.tight_layout()  # keep axis labels of neighbouring panels from overlapping
plt.show()

In [None]:
corr=df.corr() # Pairwise correlation matrix of all DataFrame columns (features and Outcome)

In [None]:
# Draw the correlation matrix as a square heatmap on a diverging palette,
# with the colour scale fixed to [-1, 1] and centred on 0.
diverging_cmap = sns.diverging_palette(20, 220, n=200)
ax = sns.heatmap(corr, vmin=-1, vmax=1, center=0, cmap=diverging_cmap, square=True)
# Tilt the x tick labels so the long column names do not overlap
x_labels = ax.get_xticklabels()
ax.set_xticklabels(x_labels, rotation=45, horizontalalignment='right');
print("As you can see each of the attributes contribute reasonably towards the outcome")

In [None]:
# Exactly –1. A perfect downhill (negative) linear relationship

# –0.70. A strong downhill (negative) linear relationship

# –0.50. A moderate downhill (negative) linear relationship

# –0.25. A weak downhill (negative) linear relationship

# 0. No linear relationship


# +0.25. A weak uphill (positive) linear relationship

# +0.50. A moderate uphill (positive) linear relationship

# +0.70. A strong uphill (positive) linear relationship

# Exactly +1. A perfect uphill (positive) linear relationship

In [None]:
# Feature matrix as a plain NumPy array (the eight predictor columns, in this order)
X = df[[
    'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
    'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age',
]].values
# Target labels, kept as a pandas Series
y = df['Outcome']

In [None]:
# Hold out 20% of the rows for testing and train on the remaining 80%;
# the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Create the Decision Tree classifier object and fit it on the training split.
# random_state is fixed (matching the seed used for the train/test split) because
# the tree builder permutes features at each split; without a seed the fitted
# tree, and therefore every metric below, can change from run to run.
clf = DecisionTreeClassifier(random_state=0)
clf = clf.fit(X_train,y_train)  # fit() returns the estimator itself

In [None]:
# Predict the class label of every row in the held-out test features
y_pred = clf.predict(X_test)

In [None]:
# Fraction of test rows whose predicted label matches the true label
test_accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy on Test Set:", test_accuracy)

In [None]:
# Confusion matrix on the test split: rows are the true labels, columns the predicted ones
cnf_matrix = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)
cnf_matrix

In [None]:
# Plot the confusion matrix as an annotated heatmap.
# The previous version set tick positions with plt.xticks/plt.yticks *before*
# calling sns.heatmap, which then replaced them, so class_names never took
# effect. Labelling the DataFrame instead lets seaborn pick the names up as
# tick labels, and passing ax= makes the target axes explicit.
class_names=[0,1] # Name  of classes
cnf_frame = pd.DataFrame(cnf_matrix, index=class_names, columns=class_names)
fig, ax = plt.subplots()
sns.heatmap(cnf_frame, annot=True, cmap="YlGnBu", fmt='g', ax=ax)
ax.xaxis.set_label_position("top")  # put the "Predicted label" caption above the matrix
ax.set_title('Confusion matrix', y=1.1)  # lift the title clear of the top axis label
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label')
fig.tight_layout()
plt.show()

In [None]:
# Precision / recall / F1 report, computed on the held-out test split only.
# Scoring the whole dataset would include the rows the tree was fitted on,
# which overstates how well the model generalises.
print(metrics.classification_report(y_test, clf.predict(X_test)))

In [None]:
# Two possible ways to visualize the Decision Tree; the first is a plain-text rule listing.
# The model was fitted on a bare NumPy array, so the column names are passed
# explicitly; otherwise the rules would be printed with generic feature_<i> names.
text_representation = tree.export_text(
    clf,
    feature_names=[
        'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
        'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age',
    ],
)
print(text_representation)

In [None]:
# Second visualisation: render the fitted tree as a Graphviz diagram
features = [
    'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
    'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age',
]
labels = ['Outcome']
# out_file=None makes export_graphviz return the DOT source as a string
dot_data = tree.export_graphviz(
    clf,
    out_file=None,
    feature_names=features,
    filled=True,
)

# Wrap the DOT source; as the last expression of the cell it is displayed by the notebook
graph = graphviz.Source(dot_data, format="png")
graph