In [None]:
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
# Load the raw CSV from the Kaggle input mount; later cells work on copies of this frame.
diabetes_df=pd.read_csv("/kaggle/input/pima-indians-diabetes-database/diabetes.csv")

In [None]:
# Preview the first rows of the raw data.
diabetes_df.head()

In [None]:
# Column dtypes and non-null counts of the raw data.
diabetes_df.info()

In [None]:
# Summary statistics, transposed so that each data column becomes a row.
diabetes_df.describe().T

In [None]:
# Percentage of rows that hold a literal 0 in each measurement column.
for col in ["BMI", "Insulin", "SkinThickness", "BloodPressure", "Glucose"]:
    zero_rows = diabetes_df[diabetes_df[col] == 0]
    zero_pct = round(len(zero_rows) / len(diabetes_df) * 100, 3)
    print("Number of Zeros for " + col + ": ", "%", zero_pct)

Zero values look uncommon. If we search for these features on the internet, we learn that they can't be zero, so we can treat the zeros as missing values.

In [None]:
# Work on a copy so the raw frame stays untouched.
diabetes_df2=diabetes_df.copy()
# A literal 0 in these columns is treated as a missing measurement.
# np.nan is the canonical spelling; the np.NaN alias no longer exists in NumPy 2.x.
zero_as_missing_cols=["Glucose","BloodPressure","BMI","Insulin","SkinThickness"]
diabetes_df2[zero_as_missing_cols] = diabetes_df2[zero_as_missing_cols].replace(0,np.nan)
# Missing-value count per column after the replacement.
diabetes_df2.isnull().sum()

I replaced the zero values with NaN values.

In [None]:
# Discard the rows that lack a Glucose, BloodPressure or BMI value.
diabetes_df2 = diabetes_df2.dropna(subset=["Glucose","BloodPressure","BMI"])
# Remaining missing-value count per column.
diabetes_df2.isnull().sum()

Removed the rows with NaN values in the columns whose missing ratio is less than 5%. SkinThickness and Insulin still have lots of missing values. I will handle them in the model creation section, because I don't want any data leakage.

# Exploratory Data Analysis

In [None]:
# One histogram per column; the trailing semicolon hides the returned array's repr.
diabetes_df2.hist(color='blue',figsize = (14,8));

SkinThickness and Insulin have right-skewed distributions, so we can use the median for missing-value imputation, or predict the missing values using more complex models such as KNN or tree-based algorithms.

In [None]:
# The column must be passed as x=: seaborn >= 0.12 accepts only `data` positionally,
# so a bare "Outcome" would be taken for the dataset itself.
sns.countplot(x="Outcome",data=diabetes_df2);

There seems to be a slight imbalance in target values.

In [None]:
fig, axes = plt.subplots(nrows = 2,ncols = 4,figsize = (20,6))
boxplot_cols = ["Pregnancies","Glucose","BloodPressure","SkinThickness",
                "Insulin","BMI","DiabetesPedigreeFunction","Age"]
# axes.flat walks the 2x4 grid row by row, matching the column order above.
for feature, panel in zip(boxplot_cols, axes.flat):
    ax = sns.boxplot(y=feature,data=diabetes_df2,ax=panel)

In [None]:
def kdeplot_Diabetes(col,ax=None):
    """Overlay the KDE of `col` for each Outcome class on a single axis.

    Reads the notebook-level `diabetes_df2`. Rows with Outcome == 1 are drawn
    in red and rows with Outcome == 0 in blue; the axis then receives a
    legend, axis labels and a title. `ax` is forwarded to seaborn as the
    target axis. Nothing is returned.
    """
    has_diabetes = diabetes_df2["Outcome"] == 1
    no_diabetes = diabetes_df2["Outcome"] == 0
    ax = sns.kdeplot(diabetes_df2.loc[has_diabetes, col],color="Red",ax=ax)
    ax = sns.kdeplot(diabetes_df2.loc[no_diabetes, col],color="Blue",ax=ax)
    # Legend labels are given in drawing order: positive class first.
    ax.legend(["Diabetes","Not Diabetes"],loc='upper right')
    ax.set_xlabel(col)
    ax.set_ylabel('Density')
    ax.set_title("Distribution of "+ col +" by Diabetes")

In [None]:
fig, axes = plt.subplots(nrows = 2,ncols = 4,figsize = (22,12))
kde_cols = ["Pregnancies","Glucose","BloodPressure","SkinThickness",
            "Insulin","BMI","DiabetesPedigreeFunction","Age"]
# One panel per feature, filled row by row across the 2x4 grid.
for feature, panel in zip(kde_cols, axes.flat):
    kdeplot_Diabetes(feature,ax=panel)

# Remove Outliers

In [None]:
def remove_outliers(data,col,iqr_factor=1.5):
    """Return a copy of `data` without the rows that are IQR outliers in `col`.

    The quartiles are computed from the non-missing values of `col`. A row is
    removed when its value lies below Q1 - iqr_factor * IQR or above
    Q3 + iqr_factor * IQR. Rows whose value in `col` is missing are kept.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is not modified.
    col : str
        Name of the numeric column to screen.
    iqr_factor : float, default 1.5
        Multiple of the inter-quartile range used for the cut-off.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the retained rows with their original index labels.
    """
    observed = data[col].dropna().to_numpy()
    if observed.size == 0:
        # Nothing to compute quartiles from, so no row can be an outlier.
        return data.copy()

    q25, q75 = np.percentile(observed, [25, 75])
    cut_off = (q75 - q25) * iqr_factor
    lower, upper = q25 - cut_off, q75 + cut_off

    # Filter with a boolean mask rather than drop(index): dropping by label
    # would also remove non-outlier rows that share an index label with an outlier.
    # Comparisons against NaN are False, so rows with a missing value survive.
    is_outlier = (data[col] > upper) | (data[col] < lower)
    return data.loc[~is_outlier].copy()

In [None]:
diabetes_df3=diabetes_df2.copy()
# Filter one column at a time; each pass only sees the rows kept by the previous one.
for outlier_col in ("Pregnancies","SkinThickness","BloodPressure"):
    diabetes_df3=remove_outliers(diabetes_df3,outlier_col)
diabetes_df3.shape

# Feature Engineering

In [None]:
# Bin Age into four quantile-based intervals.
diabetes_df3["Age_cat"]=pd.qcut(diabetes_df3["Age"],4)

In [None]:
# Create the column first so the .loc assignment below writes into an existing column.
diabetes_df3["BMI_cat"]=1
bmi = diabetes_df3["BMI"]
bmi_conditions = [
    bmi <= 18.5,
    (bmi <= 25) & (bmi > 18.5),
    (bmi <= 30) & (bmi > 25),
    bmi > 30,
]
# Codes 0-3 follow the four BMI ranges above; 9 marks rows that match none of them.
diabetes_df3.loc[:,'BMI_cat'] = np.select(bmi_conditions, [0, 1, 2, 3], default=9)

In [None]:
# The raw Age and BMI columns are replaced by their categorical versions.
diabetes_df3 = diabetes_df3.drop(columns=["Age","BMI"])

Age and BMI were converted from numeric to categorical variables.

In [None]:
# One-hot encode the categorical columns; numeric columns pass through unchanged.
diabetes_df3_d= pd.get_dummies(diabetes_df3)
diabetes_df3_d.head()

In [None]:
fig, ax = plt.subplots(figsize=(8,8))
fig.suptitle('Correlation between Diabetes and features',fontsize=20)
# Keep only each column's correlation with Outcome, sorted ascending.
outcome_corr = diabetes_df3_d.corr()[["Outcome"]].sort_values("Outcome")
ax=sns.heatmap(outcome_corr,vmax=1, vmin=-1, cmap="YlGnBu", annot=True, ax=ax);
# Flip the y axis so the row order of the sorted frame is reversed on screen.
ax.invert_yaxis()

In [None]:
# Column dtypes and non-null counts of the encoded frame.
diabetes_df3_d.info()

# Support Vector Machine Algorithm

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import classification_report
from sklearn.svm import LinearSVC
from sklearn.metrics import roc_auc_score,roc_curve

In [None]:
# Outcome is the target; every other column is a feature.
y=diabetes_df3_d["Outcome"]
X=diabetes_df3_d.drop(columns="Outcome")

In [None]:
# 80/20 split, stratified on the target so both parts keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=y,
)

# Parameter Tuning

In [None]:
# Imputation and scaling are pipeline steps, so during cross-validation they
# are fitted on the training folds only.
linsvc_pipe = Pipeline([
    ("imp", IterativeImputer(estimator= KNeighborsRegressor())),
    ("scaler", StandardScaler()),
    ("linsvc", LinearSVC(random_state=42,class_weight="balanced"))
])

# LinearSVC does not support every penalty/loss/dual combination:
# penalty="l1" with loss="hinge" is never valid, penalty="l1" with
# loss="squared_hinge" needs dual=False, and penalty="l2" with loss="hinge"
# needs dual=True. A plain cross product would include fits that fail and
# are silently scored as NaN, so only the valid combinations are listed.
C_values=[3,4,5,6]
params=[
    {
        "linsvc__C":C_values,
        "linsvc__penalty":["l2"],
        "linsvc__loss":["hinge","squared_hinge"],
        "linsvc__dual":[True],
    },
    {
        "linsvc__C":C_values,
        "linsvc__penalty":["l1"],
        "linsvc__loss":["squared_hinge"],
        "linsvc__dual":[False],
    },
]

linsvc_grid = GridSearchCV(linsvc_pipe, params, cv=3,n_jobs=-1)
linsvc_grid.fit(X_train, y_train)
print("Best Parameters for Model:  ",linsvc_grid.best_params_)
# These predictions are made on the data the model was fitted on, so the
# report below is a training-set report.
y_pred=linsvc_grid.predict(X_train)
print("\n")
print(classification_report(y_train, y_pred))

In [None]:
# A ROC curve needs continuous scores. Hard 0/1 predictions yield a single
# operating point and an AUC that does not reflect the model's ranking, so
# the signed distance to the decision boundary is used instead.
y_score=linsvc_grid.decision_function(X_train)
fpr, tpr, thresholds =roc_curve(y_train, y_score, pos_label=1)
roc_auc=roc_auc_score(y_train, y_score)
plt.figure( figsize=(14,6))
plt.plot(fpr, tpr, color='darkorange', label='ROC curve (AUC = %0.2f)' % roc_auc)
# Diagonal reference line: the ROC of a random classifier.
plt.plot([0, 1], [0, 1], color='navy',linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC curve')
plt.legend(loc="lower right")
plt.show()

# Test Score

In [None]:
# Final classifier with fixed hyper-parameters.
final_svc = LinearSVC(C= 5,loss="squared_hinge",penalty="l2",
                      random_state=42,class_weight="balanced")
linsvc_pipe = Pipeline(steps=[
    ("imp", IterativeImputer(estimator=KNeighborsRegressor())),
    ("scaler", StandardScaler()),
    ("linsvc", final_svc),
])

# Fit on the whole training split, then report on the held-out test split.
linsvc_pipe.fit(X_train, y_train)
y_pred=linsvc_pipe.predict(X_test)
print(classification_report(y_test, y_pred))