In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the diabetes CSV from the notebook's ../input directory and preview
# the first five rows.
DATA_PATH = '../input/pima-indians-diabetes-database/diabetes.csv'
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Print column dtypes, non-null counts and memory usage of the loaded frame.
df.info()

In [None]:
# Summary statistics, transposed so each column of df becomes one row.
df.describe().transpose()

## **Exploratory Data Analysis** 

In [None]:
# Pregnancies: overall distribution (left) and spread per Outcome class (right).
fig, (ax_1, ax_2) = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))

sns.histplot(data=df, x='Pregnancies', ax=ax_1)
sns.boxplot(x='Outcome', y='Pregnancies', data=df, ax=ax_2);

In [None]:
# Glucose: overall distribution (left) and spread per Outcome class (right).
fig, (ax_1, ax_2) = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))

sns.histplot(data=df, x='Glucose', ax=ax_1)
sns.boxplot(x='Outcome', y='Glucose', data=df, ax=ax_2);

In [None]:
# BloodPressure: overall distribution (left) and spread per Outcome class (right).
fig, (ax_1, ax_2) = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))

sns.histplot(data=df, x='BloodPressure', ax=ax_1)
sns.boxplot(x='Outcome', y='BloodPressure', data=df, ax=ax_2);

In [None]:
# Insulin: overall distribution (left) and spread per Outcome class (right).
fig, (ax_1, ax_2) = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))

sns.histplot(data=df, x='Insulin', ax=ax_1)
sns.boxplot(x='Outcome', y='Insulin', data=df, ax=ax_2);

In [None]:
# BMI: overall distribution (left) and spread per Outcome class (right).
fig, (ax_1, ax_2) = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))

sns.histplot(data=df, x='BMI', ax=ax_1)
sns.boxplot(x='Outcome', y='BMI', data=df, ax=ax_2);

In [None]:
# DiabetesPedigreeFunction: overall distribution (left) and spread per
# Outcome class (right).
fig, (ax_1, ax_2) = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))

sns.histplot(data=df, x='DiabetesPedigreeFunction', ax=ax_1)
sns.boxplot(x='Outcome', y='DiabetesPedigreeFunction', data=df, ax=ax_2);

In [None]:
# Age: overall distribution (left) and spread per Outcome class (right).
fig, (ax_1, ax_2) = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))

sns.histplot(data=df, x='Age', ax=ax_1)
sns.boxplot(x='Outcome', y='Age', data=df, ax=ax_2);

### Outcome

In [None]:
# Class balance of the target.
# Bar labels and bar heights are both taken from the same Series, sorted by
# class label. Pairing sorted(unique()) with value_counts() only lines up when
# the most frequent class also happens to have the smallest label, because
# value_counts() orders its result by frequency, not by label.
outcome_counts = df['Outcome'].value_counts().sort_index()
ax = sns.barplot(x=outcome_counts.index, y=outcome_counts.values)
ax.set(xlabel='Outcome', ylabel='Count')
# The target classes are imbalanced.
plt.title("Outcomes", fontsize=16);

**Note:**
* **The target classes are imbalanced.**

### **Mutual information**

In [None]:
from sklearn.feature_selection import mutual_info_regression

def mutual_info_scores(X, y, discrete_features, random_state=None):
    """Rank the columns of ``X`` by estimated mutual information with ``y``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; its column names label the returned scores.
    y : array-like
        Target values, one per row of ``X``.
    discrete_features : bool, array-like or 'auto'
        Forwarded unchanged to ``mutual_info_regression``.
    random_state : int, RandomState instance or None, default=None
        Forwarded to ``mutual_info_regression``. Pass an int to get the same
        scores on every run.

    Returns
    -------
    pandas.Series
        Series named "MI Scores", indexed by column name and sorted from the
        highest score to the lowest.
    """
    # Everything after (X, y) is passed by keyword: those options are
    # keyword-only in current scikit-learn, so a positional
    # `discrete_features` raises TypeError there.
    # NOTE(review): mutual_info_regression treats `y` as continuous. If `y`
    # holds class labels, mutual_info_classif is the matching estimator --
    # confirm which one is intended.
    raw_scores = mutual_info_regression(
        X,
        y,
        discrete_features=discrete_features,
        random_state=random_state,
    )
    ranked = pd.Series(raw_scores, name="MI Scores", index=X.columns)
    return ranked.sort_values(ascending=False)

def plot_mi_scores(scores):
    """Draw a horizontal bar chart of mutual-information scores.

    ``scores`` is a pandas Series whose index supplies the bar labels. Bars
    are drawn on the current matplotlib axes in ascending score order, so the
    largest score ends up at the top of the chart.
    """
    ordered = scores.sort_values(ascending=True)
    positions = np.arange(len(ordered))
    plt.barh(positions, ordered)
    plt.yticks(positions, list(ordered.index))
    plt.title("Mutual Information Scores")

In [None]:
# Score every predictor column against Outcome, treating all predictors as
# continuous (discrete_features=False), then chart the ranking.
mi_features = df.drop(columns=['Outcome'])
mi_scores = mutual_info_scores(mi_features, df['Outcome'], discrete_features=False)

plt.figure(figsize=(10, 5), dpi=100)
plot_mi_scores(mi_scores)

In [None]:
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.preprocessing import MinMaxScaler,PowerTransformer
# Min-max scale every predictor except BloodPressure (Outcome is the target).
# NOTE(review): the scaler is fitted on all rows before any cross-validation
# split, so each validation fold has already influenced the scaling of its
# training folds. Fitting it inside a Pipeline passed to GridSearchCV would
# avoid that.
model_features = df.drop(columns=['Outcome', 'BloodPressure'])
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(model_features)

# Shared 10-fold splitter, shuffled with a fixed seed so folds are repeatable.
ten_fold = KFold(n_splits=10, shuffle=True, random_state=42)

## **Dealing with Imbalanced Data**

In [None]:
from sklearn.metrics import classification_report,plot_roc_curve
from imblearn.over_sampling import SMOTE 
# Oversample the minority class of the min-max-scaled data with SMOTE.
# random_state is fixed (same seed as the KFold splitter) because SMOTE draws
# its synthetic samples at random; without a seed the balanced data -- and
# every score computed from it -- changes on each run.
# NOTE(review): resampling happens before the cross-validation split, so
# synthetic rows interpolated from a validation fold's rows can land in that
# fold's training data. Running SMOTE inside an imblearn Pipeline would keep
# resampling within the training folds.
smote = SMOTE(sampling_strategy='minority', random_state=42)
X_balanced, y_balanced = smote.fit_resample(scaled_data, df['Outcome'])

In [None]:
# Make every predictor more Gaussian-like, then balance the classes.
# Yeo-Johnson is used because it accepts zero and negative values directly.
# Box-Cox requires strictly positive input, and shifting the data by 1e-8 to
# satisfy it maps every zero to an extreme value on the transformed scale,
# which distorts the fitted transform.
# NOTE(review): unlike the min-max-scaled set, this feature set still contains
# BloodPressure -- confirm that is intended.
pt = PowerTransformer(method='yeo-johnson')
gaussian_data = pt.fit_transform(df.drop(columns='Outcome'))
Xg_balanced, yg_balanced = smote.fit_resample(gaussian_data, df['Outcome'])

# Modeling

### **KNeighbors**

In [None]:
from sklearn.model_selection import cross_val_predict
from sklearn.neighbors import KNeighborsClassifier

# Tune the number of neighbours (3..8) with the shared 10-fold splitter on the
# min-max-scaled, SMOTE-balanced data.
param_grid = {'n_neighbors': np.arange(3, 9)}
knn = KNeighborsClassifier()
knn_cv = GridSearchCV(knn, param_grid, cv=ten_fold)
knn_cv.fit(X_balanced, y_balanced)

# Report on out-of-fold predictions, so every row is predicted by a model that
# was not fitted on it. Calling knn_cv.predict(X_balanced) would score the
# refitted model on its own training rows and overstate its quality (for KNN
# each training point is one of its own nearest neighbours).
knn_oof_pred = cross_val_predict(knn_cv.best_estimator_, X_balanced, y_balanced, cv=ten_fold)
print(classification_report(y_balanced, knn_oof_pred))

### **Logistic Regression**

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict

# Tune the inverse regularisation strength C over 0.1, 0.2, ... (below 5) with
# the shared 10-fold splitter on the power-transformed, SMOTE-balanced data.
param_grid = {'C': np.arange(0.1, 5, 0.1)}
lr = LogisticRegression()
lr_cv = GridSearchCV(lr, param_grid, cv=ten_fold)
lr_cv.fit(Xg_balanced, yg_balanced)

# Report on out-of-fold predictions, so every row is predicted by a model that
# was not fitted on it. Calling lr_cv.predict(Xg_balanced) would score the
# refitted model on its own training rows and overstate its quality.
lr_oof_pred = cross_val_predict(lr_cv.best_estimator_, Xg_balanced, yg_balanced, cv=ten_fold)
print(classification_report(yg_balanced, lr_oof_pred))

### **Support Vector Machine**

In [None]:
from sklearn.model_selection import cross_val_predict
from sklearn.svm import SVC

# `degree` only affects the polynomial kernel, so it is searched for 'poly'
# alone. Crossing it with 'rbf' and 'sigmoid' would refit identical models
# three times each.
c_values = np.arange(1, 8)
param_grid = [
    {'kernel': ['poly'], 'degree': [2, 3, 4], 'C': c_values, 'gamma': ['scale']},
    {'kernel': ['rbf', 'sigmoid'], 'C': c_values, 'gamma': ['scale']},
]
svc = SVC()
svc_cv = GridSearchCV(svc, param_grid, cv=ten_fold)
svc_cv.fit(Xg_balanced, yg_balanced)

# Report on out-of-fold predictions, so every row is predicted by a model that
# was not fitted on it. Calling svc_cv.predict(Xg_balanced) would score the
# refitted model on its own training rows and overstate its quality.
svc_oof_pred = cross_val_predict(svc_cv.best_estimator_, Xg_balanced, yg_balanced, cv=ten_fold)
print(classification_report(yg_balanced, svc_oof_pred))

### **ROC Curve**

In [None]:
# One ROC panel per tuned model, side by side.
# NOTE(review): each curve is drawn from the same balanced data the model was
# fitted on, so it shows training performance rather than held-out performance.
fig, (ax_1, ax_2, ax_3) = plt.subplots(1, 3, figsize=(14, 5))

# (axes, fitted search, features, labels, line colour, panel title)
roc_panels = [
    (ax_1, knn_cv, X_balanced, y_balanced, 'red', "KNeighborsClassifier"),
    (ax_2, lr_cv, Xg_balanced, yg_balanced, 'blue', "LogisticRegression"),
    (ax_3, svc_cv, Xg_balanced, yg_balanced, 'green', "Support Vector Classifier"),
]

for panel_ax, fitted_search, X_eval, y_eval, line_colour, panel_title in roc_panels:
    plot_roc_curve(fitted_search, X_eval, y_eval, color=line_colour, ax=panel_ax)
    panel_ax.set_title(panel_title)