In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler,RobustScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_score, cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
%matplotlib inline

In [None]:
# Load the diabetes CSV from the Kaggle input directory (path is relative to the notebook).
df=pd.read_csv(r"../input/pima-indians-diabetes-database/diabetes.csv")

## Data Investigation

In [None]:
# Per-column overview of the loaded frame (verbose form lists every column).
df.info(verbose=True)

In [None]:
# Summary statistics per column; the minimum values are discussed in the notes that follow.
df.describe()

## Several values have a minimum that doesn't make sense such as:
-Glucose can't reach zero while the person is still alive.<br>
-Skin thickness is measured in mm, so it cannot be 0.<br>
-Insulin production level could reach zero, which would be a strong indication that the person is diabetic.<br>
-BMI (body mass index) is calculated from weight and height and cannot reach zero.<br>

In [None]:
# Box plot of Insulin per Outcome class, with each class median written on its box.
sns.set_style("whitegrid")
box_plot = sns.boxplot(data=df, x="Outcome", y="Insulin")
medians = df.groupby(["Outcome"])["Insulin"].median()
# Lift each label slightly above its median line (5% of the overall median).
vertical_offset = 0.05 * df["Insulin"].median()

label_style = dict(
    horizontalalignment='center',
    size='x-small',
    color='w',
    weight='semibold',
)
for xtick in box_plot.get_xticks():
    class_median = medians[xtick]
    box_plot.text(xtick, class_median + vertical_offset, class_median, **label_style)

print("Since 0 values appear in both outcomes ,its not an indicator of the insulin level and should be considered null value")

In [None]:
# Columns whose zero entries are counted below:
# Glucose, BloodPressure, SkinThickness, Insulin, BMI
zero_attributes=["Glucose","BloodPressure","SkinThickness","Insulin","BMI"]
def zero_values(df,zero_attributes):
    """Print how many entries equal zero in each of the given columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    zero_attributes : iterable of str
        Names of the columns to check.

    Returns
    -------
    None
        One line per column is written to stdout.
    """
    for column in zero_attributes:
        # A boolean mask sums to the number of True entries, i.e. the zero count.
        zero_count = (df[column] == 0).sum()
        print(f'The Number of zero values in column {column} is {zero_count}')
# Report the zero counts for the columns listed in zero_attributes.
zero_values(df,zero_attributes)

## Visualizing Data Distribution Through Histplot

In [None]:
# Histogram of every column, one subplot per column.
# pandas builds the subplot grid itself here. Handing it a single pre-made Axes
# for a multi-column frame makes pandas clear that figure and emit a UserWarning.
df.hist(figsize=(20, 12))
plt.show()
print("we can see that most of the columns are skewed")

## Pairplot with target

In [None]:
# Pairwise plots of all columns, colored by Outcome.
# The previous replace({0.0: 0, 1.0: 1}) mapped each label to itself, so the
# frame is passed to seaborn unchanged.
sns.pairplot(df, hue="Outcome")
plt.show()

## Boxplot for each column with target

In [None]:
# One box plot per feature (every column except the last), split by Outcome,
# laid out on a 2x4 grid.
cols = df.columns[:-1]
fig, axs = plt.subplots(2, 4, figsize=(20, 12))
# zip pairs each axis with a column and stops at the shorter of the two, so a
# frame with fewer than 8 feature columns leaves spare axes empty instead of
# raising IndexError. The unused median/offset computations were removed.
for ax, col in zip(axs.flat, cols):
    sns.boxplot(x="Outcome", y=col, data=df, ax=ax)
plt.show()

## Preprocessing
-Zero values are handled by replacing each zero with the mean of its column.

In [None]:
# Copy the loaded frame; the splitting cells below read from df_copy rather than df.
df_copy=df.copy()

## Splitting the Data

In [None]:
# Features: every column except the last. Target: the last column.
X=df_copy.iloc[:,:-1]
y=df_copy.iloc[:,-1]

In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2,random_state=42)

In [None]:
# Arguments for preprocessing(): zeros in `missing_values` columns are imputed,
# and `drop_columns` are removed from the frame.
missing_values=['Glucose', 'BloodPressure','BMI']
drop_columns=['Insulin','SkinThickness']

In [None]:
def preprocessing(df,missing_values,drop_columns):
    """Drop unwanted columns and impute zero placeholders with the column mean.

    A zero in a ``missing_values`` column is treated as a missing reading, so
    the replacement mean is computed from the non-zero entries only. Including
    the zeros would pull the imputed value towards zero.

    ``df`` is modified in place and also returned, so the caller's frame loses
    ``drop_columns`` and has its zeros replaced.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean; mutated in place.
    missing_values : iterable of str
        Columns in which every 0 is replaced by the mean of that column's
        non-zero entries. A column made up entirely of zeros becomes NaN.
    drop_columns : iterable of str
        Columns to remove. Columns that are already absent are skipped, so
        calling the function again on the same frame does not raise.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in as ``df``.
    """
    # errors="ignore" keeps a repeated call on the same frame from raising KeyError.
    df.drop(columns=drop_columns, inplace=True, errors="ignore")
    for col in missing_values:
        # Mean over real readings only; zeros are the placeholders being replaced.
        nonzero_mean = df.loc[df[col] != 0, col].mean()
        df[col] = df[col].replace(0, nonzero_mean)
    return df

In [None]:

# preprocessing() edits the frame it receives in place and returns it, so
# X_train_processed / X_test_processed are the same objects as X_train / X_test.
# NOTE(review): each split is imputed with its own column mean; consider
# reusing the training-set means for the test set to avoid leakage.
X_train_processed=preprocessing(X_train,missing_values,drop_columns)
X_test_processed=preprocessing(X_test,missing_values,drop_columns)

## Scaling Data Standard Scaler

In [None]:
# Standardize the features: fit the scaler on the training split only, then
# apply the same transform to both splits.
# The scaler is fitted on X_train_processed explicitly (not X_train), so this
# cell no longer depends on preprocessing() having modified X_train in place.
scaler=StandardScaler()
X_train_Scaled=scaler.fit_transform(X_train_processed)
X_test_Scaled=scaler.transform(X_test_processed)

## K Nearest Neighbor

In [None]:
# Tune the number of neighbors (1 through 49) by 10-fold cross-validated accuracy.
knn = KNeighborsClassifier()
param_grid = {"n_neighbors": np.arange(1, 50)}
grid = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    scoring='accuracy',
    cv=10,
)
grid.fit(X_train_Scaled, y_train)

In [None]:
# Grid-search outcome: best cross-validated score, the parameters that
# produced it, and the corresponding estimator.
print(grid.best_score_)
print(grid.best_params_)
print(grid.best_estimator_)

In [None]:
# Rebind `knn` to the best estimator found by the grid search, then predict on both splits.
knn=grid.best_estimator_
y_pred_train_knn=knn.predict(X_train_Scaled)
y_pred_test_knn=knn.predict(X_test_Scaled)

In [None]:
# Accuracy of the tuned KNN model on the training and test splits.
knn_train_accuracy = accuracy_score(y_train, y_pred_train_knn)
knn_test_accuracy = accuracy_score(y_test, y_pred_test_knn)
print(f"the train accuracy is for KNN {knn_train_accuracy}")
print(f"the test accuracy is for KNN {knn_test_accuracy}")

## Logistic Regression

In [None]:
# Logistic regression with default settings, fitted on the scaled training data.
logreg=LogisticRegression()
logreg.fit(X_train_Scaled,y_train)

In [None]:
# LR is a second name for the fitted logreg model; predict on both splits.
LR=logreg
y_pred_train_lr=LR.predict(X_train_Scaled)
y_pred_test_lr=LR.predict(X_test_Scaled)

In [None]:
# Accuracy of the logistic regression model on the training and test splits.
lr_train_accuracy = accuracy_score(y_train, y_pred_train_lr)
lr_test_accuracy = accuracy_score(y_test, y_pred_test_lr)
print(f"the train accuracy is for LR {lr_train_accuracy}")
print(f"the test accuracy is for LR {lr_test_accuracy}")

## SVC

In [None]:
# RBF-kernel support vector classifier. probability=True is required for the
# predict_proba call in the ROC section below.
svc=SVC(kernel="rbf",probability=True)
svc.fit(X_train_Scaled,y_train)

In [None]:
# Predict with the SVC fitted in the previous cell. The former
# `svc_grid.best_estimator_` lookup referenced a grid search that is not
# defined anywhere in the visible notebook and raised NameError on a fresh kernel.
y_pred_train_svc=svc.predict(X_train_Scaled)
y_pred_test_svc=svc.predict(X_test_Scaled)

In [None]:
# Accuracy of the SVC model on the training and test splits.
svc_train_accuracy = accuracy_score(y_train, y_pred_train_svc)
svc_test_accuracy = accuracy_score(y_test, y_pred_test_svc)
print(f"the train accuracy is for SVC {svc_train_accuracy}")
print(f"the test accuracy is for SVC {svc_test_accuracy}")

## Confusion Matrix

In [None]:
# Confusion matrix for each model on the test set, printed as a
# cross-tabulation of true vs. predicted labels with row/column totals.
# The bare confusion_matrix(...) calls were removed: their return values were
# discarded, and the crosstab already presents the same counts.
confusion_inputs = {
    "KNN": y_pred_test_knn,
    "LR": y_pred_test_lr,
    "SVC": y_pred_test_svc,
}
for model_name, y_pred in confusion_inputs.items():
    print(f"{model_name} Confusion Matrix \n")
    print(pd.crosstab(y_test, y_pred, rownames=['True'], colnames=['Predicted'], margins=True))
    print("________________________________\n")

## Precision, Recall and Accuracy

In [None]:
# Precision / recall / F1 report for each model on the test set.
report_inputs = (
    ("KNN", y_pred_test_knn),
    ("LR", y_pred_test_lr),
    ("SVC", y_pred_test_svc),
)
for model_name, y_pred in report_inputs:
    print(f"{model_name} Classification Report\n")
    print(classification_report(y_test, y_pred))
    print("________________________________\n")

## ROC Curve

In [None]:
# ROC curves on the test set, one panel per model.
# Column 1 of predict_proba is the probability of the second class, which
# roc_curve uses as the score.

# SVC
y_pred_proba_svc = svc.predict_proba(X_test_Scaled)[:,1]
fpr_svc, tpr_svc, thresholds_svc = roc_curve(y_test, y_pred_proba_svc)
# Logistic Regression
y_pred_proba_lr = LR.predict_proba(X_test_Scaled)[:,1]
fpr_lr, tpr_lr, thresholds_lr = roc_curve(y_test, y_pred_proba_lr)
# KNN
y_pred_proba_knn = knn.predict_proba(X_test_Scaled)[:,1]
fpr_knn, tpr_knn, thresholds_knn = roc_curve(y_test, y_pred_proba_knn)

roc_panels = [
    ("SVC", fpr_svc, tpr_svc),
    ("LR", fpr_lr, tpr_lr),
    ("KNN", fpr_knn, tpr_knn),  # was mislabelled 'LR' in the plot call
]
fig,axs=plt.subplots(1,3,figsize=(15,8))
for ax, (model_name, fpr, tpr) in zip(axs, roc_panels):
    ax.plot([0,1],[0,1],'k--')  # chance-level diagonal for reference
    ax.plot(fpr, tpr, label=model_name)
    ax.set_title(f'{model_name} ROC curve')
    ax.set_xlabel('False positive rate')
    ax.set_ylabel('True positive rate')
    # The curves were given labels but no legend was ever drawn.
    ax.legend(loc='lower right')
plt.show()