In [None]:
#Import Libs
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
from statistics import stdev
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the diabetes CSV from the relative ../input directory into a DataFrame
# and preview its first rows.
data = pd.read_csv("../input/pima-indians-diabetes-database/diabetes.csv")
data.head()

In [None]:
# Preview the last rows of the raw frame as well.
data.tail()

# **Data Processing**

In [None]:
# Per column: True if the column contains at least one null value.
data.isnull().any()

> No Null Values

In [None]:
# True if any row is an exact repeat of an earlier row.
data.duplicated().any()

> No duplicated rows

In [None]:
# Print the column names, non-null counts, dtypes and memory usage.
data.info()

In [None]:
# Work on a copy: RemoveOutliers drops rows from dataClean in place, and the
# raw `data` frame must stay untouched.
dataClean = data.copy()

In [None]:
#Remove Outliers Function
def RemoveOutliers(df, frame=None, n_std=3):
    """Drop rows whose value in ``df`` is more than ``n_std`` standard deviations from its mean.

    Parameters
    ----------
    df : pandas.Series
        Column used to decide which rows are outliers (despite the name, a
        Series rather than a DataFrame). Its index labels identify the rows.
    frame : pandas.DataFrame, optional
        Frame the rows are removed from. Defaults to the notebook-level
        ``dataClean`` so existing one-argument calls behave as before.
    n_std : float, optional
        Half-width of the accepted band, in sample standard deviations
        (default 3).

    Returns
    -------
    pandas.DataFrame
        The same ``frame`` object, modified in place.

    Notes
    -----
    The drop happens in place, so re-running a calling cell recomputes the
    band on the already-trimmed data and may remove further rows.
    """
    target = dataClean if frame is None else frame
    # Series.std() uses ddof=1 like statistics.stdev, but is vectorised and skips NaN.
    spread = df.std() * n_std
    center = df.mean()
    is_outlier = (df > center + spread) | (df < center - spread)
    # Align on the target's index so a column taken from a differently-filtered
    # frame cannot mis-select rows; labels absent from ``df`` are kept.
    is_outlier = is_outlier.reindex(target.index, fill_value=False)
    target.drop(index=target.index[is_outlier.to_numpy(dtype=bool)], inplace=True)
    return target

# **Data Visualization**

In [None]:
# One histogram per column of dataClean, laid out on a 15x12 inch figure.
dataClean.hist(figsize=(15,12))
plt.show()

In [None]:
# Trim Pregnancies outliers from dataClean (in place), then plot what remains.
fig, ax = plt.subplots(figsize=(12, 6))
ax.set_title("Distribution of Pregnancies", fontsize=20)
pregnancies_trimmed = RemoveOutliers(dataClean["Pregnancies"])["Pregnancies"]
sns.distplot(pregnancies_trimmed, ax=ax)
plt.show()

In [None]:
# Count of rows per Pregnancies value, with each bar labelled by its height.
fig, ax = plt.subplots(figsize=(12, 7))
ax.set_title("No. of Pregnancies", fontsize=20)
ax = sns.countplot(data=dataClean, x="Pregnancies", palette="hls", ax=ax)
for patch in ax.patches:
    height = patch.get_height()
    # Label sits centred on top of the bar, drawn in the bar's own colour.
    ax.text(
        patch.get_x() + patch.get_width() / 2,
        patch.get_y() + height,
        f'{height:,}',
        ha='center',
        va='bottom',
        color=patch.get_facecolor(),
        size=12,
    )

In [None]:
# Trim Glucose outliers from dataClean (in place), then plot what remains.
fig, ax = plt.subplots(figsize=(12, 6))
ax.set_title("Distribution of Glucose", fontsize=20)
glucose_trimmed = RemoveOutliers(dataClean["Glucose"])["Glucose"]
sns.distplot(glucose_trimmed, ax=ax)
plt.show()

In [None]:
# Trim BloodPressure outliers from dataClean (in place), then plot what remains.
fig, ax = plt.subplots(figsize=(12, 6))
ax.set_title("Distribution of BloodPressure", fontsize=20)
blood_pressure_trimmed = RemoveOutliers(dataClean["BloodPressure"])["BloodPressure"]
sns.distplot(blood_pressure_trimmed, ax=ax)
plt.show()

In [None]:
# Distribution of SkinThickness as it currently stands in dataClean
# (no outlier pass is applied to this column).
fig, ax = plt.subplots(figsize=(12, 6))
ax.set_title("Distribution of SkinThickness", fontsize=20)
sns.distplot(dataClean["SkinThickness"], ax=ax)
plt.show()

In [None]:
# Distribution of Insulin as it currently stands in dataClean
# (no outlier pass is applied to this column).
fig, ax = plt.subplots(figsize=(12, 6))
ax.set_title("Distribution of Insulin", fontsize=20)
sns.distplot(dataClean["Insulin"], ax=ax)
plt.show()

In [None]:
# Trim BMI outliers from dataClean (in place), then plot what remains.
plt.figure(figsize=(12,6))
plt.title("Distribution of Body Mass Index (BMI)", fontsize=20)
# Use the dataClean column, like the other outlier passes: the raw `data.BMI`
# would compute the 3-sigma band from rows that earlier passes already removed.
sns.distplot(RemoveOutliers(dataClean.BMI).BMI)
plt.show()

In [None]:
# Distribution of DiabetesPedigreeFunction as it currently stands in dataClean
# (no outlier pass is applied to this column).
fig, ax = plt.subplots(figsize=(12, 6))
ax.set_title("Distribution of DiabetesPedigreeFunction", fontsize=20)
sns.distplot(dataClean["DiabetesPedigreeFunction"], ax=ax)
plt.show()

In [None]:
# Distribution of Age as it currently stands in dataClean
# (no outlier pass is applied to this column).
fig, ax = plt.subplots(figsize=(12, 6))
ax.set_title("Distribution of Age", fontsize=20)
sns.distplot(dataClean["Age"], ax=ax)
plt.show()

In [None]:
# Count of rows per Age value, with each bar labelled by its height.
fig, ax = plt.subplots(figsize=(12, 7))
ax.set_title("No. of Ages", fontsize=20)
ax = sns.countplot(data=dataClean, x="Age", palette="hls", ax=ax)
for patch in ax.patches:
    height = patch.get_height()
    # Label sits centred on top of the bar, drawn in the bar's own colour.
    ax.text(
        patch.get_x() + patch.get_width() / 2,
        patch.get_y() + height,
        f'{height:,}',
        ha='center',
        va='bottom',
        color=patch.get_facecolor(),
        size=12,
    )

In [None]:
# Count of rows per Outcome value, with each bar labelled by its height.
fig, ax = plt.subplots(figsize=(12, 7))
ax.set_title("No. of Women has Diabetes or Not", fontsize=20)
ax = sns.countplot(data=dataClean, x="Outcome", palette="hls", ax=ax)
for patch in ax.patches:
    height = patch.get_height()
    # Label sits centred on top of the bar, drawn in the bar's own colour.
    ax.text(
        patch.get_x() + patch.get_width() / 2,
        patch.get_y() + height,
        f'{height:,}',
        ha='center',
        va='bottom',
        color=patch.get_facecolor(),
        size=12,
    )

In [None]:
# Box plot of Pregnancies, one box per Outcome value.
fig, ax = plt.subplots(figsize=(12, 7))
ax.set_title("Avg. Woman has Diabetes or Not with Pregnancy", fontsize=20)
ax = sns.boxplot(data=dataClean, x="Outcome", y="Pregnancies", palette="hls", ax=ax)

In [None]:
# Box plot of Age, one box per Outcome value.
fig, ax = plt.subplots(figsize=(12, 7))
ax.set_title("Avg. Woman has Diabetes or Not with Ages", fontsize=20)
ax = sns.boxplot(data=dataClean, x="Outcome", y="Age", palette="hls", ax=ax)

In [None]:
# Pairwise correlations between the columns of dataClean.
corr = dataClean.corr()
# Non-zero entries of the upper triangle (diagonal included) serve as the mask,
# so the heatmap shows each pair once, below the diagonal.
matrix = np.triu(corr)
plt.figure(figsize=(12, 7))
sns.heatmap(
    corr,
    mask=matrix,
    annot=True,
    fmt=".1g",
    cmap='jet',
    linewidths=1,
    linecolor='black',
)

# **Training Data**

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

# **Splitting Data**

In [None]:
# Separate the predictor columns from the label column.
features = dataClean.drop(["Outcome"],axis=1)
targets = dataClean.Outcome
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(features, targets, test_size=0.2, random_state=42)
# Keep the labels as 1-D arrays: scikit-learn classifiers expect y of shape
# (n_samples,), and a (n_samples, 1) column vector triggers a
# DataConversionWarning on every fit/score call.
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()

> The data was split into an 80% train set and a 20% test set

In [None]:
# Sanity check of the split: shapes of the train/test features and labels.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

In [None]:
# Score collectors keyed by a display label; the model cells add one entry
# each and the visualization cells turn them into tables and bar charts.
res_dfTrain = {}
res_dfTest = {}

# **KNN Model With GridSearch**

In [None]:
# Hyper-parameter grid explored for k-nearest-neighbours.
GS = dict(
    n_neighbors=np.arange(1, 20),
    weights=['distance', 'uniform'],
    p=np.arange(1, 5),
    algorithm=['ball_tree', 'kd_tree', 'auto'],
)

# Exhaustive search over the grid with 5-fold cross-validation on the train set.
knn = KNeighborsClassifier()
knn_GS = GridSearchCV(estimator=knn, param_grid=GS, cv=5)
knn_GS.fit(X_train, y_train)

# best_score_ is the mean cross-validated score of the best candidate; it is
# what this notebook records and prints under the "Train Score" label.
knn_cv_score = knn_GS.best_score_
knn_test_score = knn_GS.score(X_test, y_test)
res_dfTrain["KNN Train Score"] = knn_cv_score
res_dfTest["KNN Test Score"] = knn_test_score

print("HP: ", knn_GS.best_params_, sep="")
print("Train Score: ", knn_cv_score, sep="")
print("Test Score: ", knn_test_score, sep="")

# **RF Model With GridSearch**

In [None]:
# Hyper-parameter grid explored for the random forest.
GS = {
    "criterion": ["gini", "entropy"],
    # Start at 1: a forest needs at least one tree, and scikit-learn rejects
    # n_estimators=0, so every fit for that candidate would fail.
    "n_estimators": np.arange(1,50)
}

# Fixed seed so the selected hyper-parameters and scores are reproducible
# between runs (same seed value as the train/test split).
rf = RandomForestClassifier(random_state=42)
rf_GS = GridSearchCV(rf, GS, cv=5)
rf_GS.fit(X_train, y_train)

# best_score_ is the mean cross-validated score of the best candidate; it is
# what this notebook records and prints under the "Train Score" label.
rf_test_score = rf_GS.score(X_test, y_test)
res_dfTrain["RF Train Score"] = rf_GS.best_score_
res_dfTest["RF Test Score"] = rf_test_score

print("HP: " + str(rf_GS.best_params_))
print("Train Score: " + str(rf_GS.best_score_))
print("Test Score: " + str(rf_test_score))

# **Models Visualization**

In [None]:
# One-column table (label -> Score) of the recorded train-side scores.
# The sorted view is only displayed; df_res_train keeps insertion order.
df_res_train = pd.DataFrame.from_dict(data=res_dfTrain, orient="index", columns=["Score"])
df_res_train.sort_values("Score", ascending=False)

In [None]:
# Bar chart of the recorded train-side scores, each bar labelled with its value.
fig, ax = plt.subplots(figsize=(12, 7))
ax.set_title("Train Score Result", fontsize=20)
ax = sns.barplot(x=df_res_train.index, y=df_res_train["Score"], ax=ax)
for patch in ax.patches:
    height = patch.get_height()
    # Label sits centred on top of the bar, drawn in the bar's own colour.
    ax.text(
        patch.get_x() + patch.get_width() / 2,
        patch.get_y() + height,
        f'{height:,}',
        ha='center',
        va='bottom',
        color=patch.get_facecolor(),
        size=12,
    )

In [None]:
# One-column table (label -> Score) of the recorded test-set scores.
# The sorted view is only displayed; df_res_test keeps insertion order.
df_res_test = pd.DataFrame.from_dict(data=res_dfTest, orient="index", columns=["Score"])
df_res_test.sort_values("Score", ascending=False)

In [None]:
# Bar chart of the recorded test-set scores, each bar labelled with its value.
fig, ax = plt.subplots(figsize=(12, 7))
ax.set_title("Test Score Result", fontsize=20)
ax = sns.barplot(x=df_res_test.index, y=df_res_test["Score"], ax=ax)
for patch in ax.patches:
    height = patch.get_height()
    # Label sits centred on top of the bar, drawn in the bar's own colour.
    ax.text(
        patch.get_x() + patch.get_width() / 2,
        patch.get_y() + height,
        f'{height:,}',
        ha='center',
        va='bottom',
        color=patch.get_facecolor(),
        size=12,
    )

**The Best Model for This problem is Random Forest (RF)**