In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8*' and
# 3.8 removed the old names, so prefer the new name when this Matplotlib
# provides it and fall back to the legacy name on older releases.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

import warnings
# Silences every warning category for the rest of the session, which also
# hides deprecation and convergence warnings raised by later cells.
warnings.simplefilter(action='ignore')

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV, cross_val_score, cross_val_predict
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, roc_auc_score, roc_curve

In [None]:
# Location of the Pima Indians diabetes CSV, relative to the notebook.
DATA_PATH = '../input/pima-indians-diabetes-database/diabetes.csv'

df = pd.read_csv(DATA_PATH)
print(df.shape)
df.head()

In [None]:
# Column dtypes and non-null counts of the raw frame.
df.info()

In [None]:
# Summary statistics (count, mean, std, quartiles) of the raw frame.
df.describe()

In [None]:
# Columns in which the code treats a recorded 0 as a missing measurement.
non_zero_features = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

# Blank out the zero entries of those columns so they become NaN.
df[non_zero_features] = df[non_zero_features].mask(df[non_zero_features] == 0)

In [None]:
# Hold out 20% of the rows as a test set; random_state pins the shuffle so
# the split is reproducible.
# NOTE(review): the split is not stratified on 'Outcome' — consider whether
# stratify=df['Outcome'] is wanted here.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=0)

In [None]:
# Non-null counts on the training split show how many values are missing
# per column now that zeros have been replaced by NaN.
df_train.info()

In [None]:
# Pairwise correlations computed on the training split only, drawn as an
# annotated heatmap.
train_corr = df_train.corr()
sns.heatmap(train_corr, annot=True)

In [None]:
# Summary statistics of the training split (NaN entries are excluded).
df_train.describe()

## Exploratory Data Analysis (EDA)

In [None]:
# For every predictor, draw its distribution (left) next to a box plot of
# the predictor split by Outcome class (right).
features = df_train.drop('Outcome', axis=1).columns
for c in features:
    f, axis = plt.subplots(1,2, figsize=(20, 5))
    # NOTE(review): distplot is deprecated in recent seaborn releases;
    # histplot(..., kde=True) is the suggested replacement.
    sns.distplot(df_train[c], ax=axis[0], kde=True)
    # x and y are passed by keyword: seaborn >= 0.12 no longer accepts the
    # data vectors positionally and raises a TypeError otherwise.
    sns.boxplot(x='Outcome', y=c, data=df_train, ax=axis[1])
    plt.show()

In [None]:
# Bar chart of how many training rows fall in each Outcome class.
df_train['Outcome'].value_counts().plot.bar()

In [None]:
# For each feature that can be missing, chart the Outcome class counts of
# the training rows where that feature is NaN.
for c in non_zero_features:
    rows_missing = df_train[c].isnull()
    outcome_counts = df_train.loc[rows_missing, 'Outcome'].value_counts()
    outcome_counts.plot.bar()
    plt.title(c)
    plt.show()


# Model Building

In [None]:
# Separate the target from the predictors for both partitions.
y_train = df_train['Outcome']
y_test = df_test['Outcome']
X_train = df_train.drop(columns='Outcome')
X_test = df_test.drop(columns='Outcome')

# Median imputation followed by standardisation; both transformers are
# fitted on the training partition and only applied to the test partition.
imputer = SimpleImputer(strategy='median')
scale = StandardScaler()
X_train = scale.fit_transform(imputer.fit_transform(X_train))
X_test = scale.transform(imputer.transform(X_test))

# Append one 0/1 column per listed feature marking rows where the original
# value was missing. These flags are added after scaling, so they stay 0/1.
missing_flag_cols = ['BloodPressure', 'Insulin']
train_flags = df_train[missing_flag_cols].isnull().astype(int).values
test_flags = df_test[missing_flag_cols].isnull().astype(int).values
X_train = np.hstack([X_train, train_flags])
X_test = np.hstack([X_test, test_flags])

# Baseline comparison of three untuned classifiers using out-of-fold
# predictions on the training partition.
models = [('LogisticRegression', LogisticRegression()), ('Knn', KNeighborsClassifier()), ('SVC', SVC())]

for name, model in models:
    # Hard class labels feed the threshold-based metrics.
    y_pred = cross_val_predict(model, X_train, y_train)

    # ROC AUC is a ranking metric and needs continuous scores; feeding it
    # hard labels collapses the curve to a single operating point. Use the
    # decision function when the estimator has one (SVC has no predict_proba
    # by default), otherwise the positive-class probability.
    if hasattr(model, 'decision_function'):
        y_score = cross_val_predict(model, X_train, y_train, method='decision_function')
    else:
        y_score = cross_val_predict(model, X_train, y_train, method='predict_proba')[:, 1]

    print(name + '\n')
    print('accuracy_score : ', accuracy_score(y_train, y_pred))
    print('recall_score : ', recall_score(y_train, y_pred))
    print('precision : ', precision_score(y_train, y_pred))
    print('f1_score : ', f1_score(y_train, y_pred))
    print('roc_auc_score : ', roc_auc_score(y_train, y_score))
    print('\n')

In [None]:
# Exhaustive 5-fold grid search over logistic-regression settings, ranked
# by ROC AUC.
lr = LogisticRegression(random_state=0)

param_grid = dict(
    C=[0.001, 0.01, 0.1, 1, 10, 100, 1000],
    max_iter=[100 * step for step in range(1, 8)],
    solver=['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
)

lr_search = GridSearchCV(estimator=lr, param_grid=param_grid, scoring='roc_auc', cv=5)
lr_search.fit(X_train, y_train)

# Report the winning configuration and its mean cross-validated score.
print('Config: %s' % lr_search.best_params_)
print('Best Score: %s' % lr_search.best_score_)

In [None]:
# Predictions of the tuned model on both partitions: hard labels for the
# threshold metrics, positive-class probabilities for ROC AUC.
y_train_pred = lr_search.predict(X_train)
y_test_pred = lr_search.predict(X_test)
y_train_prob = lr_search.predict_proba(X_train)[:, 1]
y_test_prob = lr_search.predict_proba(X_test)[:, 1]

# (printed label, metric function, train-side input, test-side input)
metric_table = [
    ('accuracy_score', accuracy_score, y_train_pred, y_test_pred),
    ('recall_score', recall_score, y_train_pred, y_test_pred),
    ('precision', precision_score, y_train_pred, y_test_pred),
    ('f1_score', f1_score, y_train_pred, y_test_pred),
    ('roc_auc_score', roc_auc_score, y_train_prob, y_test_prob),
]

for position, (label, metric, train_out, test_out) in enumerate(metric_table):
    # Blank separator between metric groups, but not before the first one.
    if position > 0:
        print('\n')
    print('Train ' + label + ' : ', metric(y_train, train_out))
    print('Test ' + label + ' : ', metric(y_test, test_out))

In [None]:
def plot_roc_curve(false_positive_rate, true_positive_rate, ax=None):
    """Draw a ROC curve together with the chance diagonal.

    Parameters
    ----------
    false_positive_rate : array-like
        X coordinates of the curve.
    true_positive_rate : array-like
        Y coordinates of the curve, same length as ``false_positive_rate``.
    ax : matplotlib.axes.Axes, optional
        Axes to draw on. When omitted, the current pyplot axes are used,
        which matches the behaviour of calling the pyplot functions directly.
    """
    if ax is None:
        ax = plt.gca()
    ax.plot(false_positive_rate, true_positive_rate, linewidth=2)
    # Both rates live in [0, 1]; pin the view to the unit square.
    ax.axis([0, 1, 0, 1])
    ax.set_xlabel('False Positive Rate (FPR)', fontsize=16)
    ax.set_ylabel('True Positive Rate (TPR)', fontsize=16)
    # Dashed red diagonal marks the line where TPR equals FPR.
    ax.plot([0, 1], [0, 1], 'r--')

# ROC points of the tuned model on the held-out test partition; the
# thresholds returned by roc_curve are not needed for the plot.
false_positive_rate, true_positive_rate, _ = roc_curve(y_true=y_test, y_score=y_test_prob)

plot_roc_curve(
    false_positive_rate=false_positive_rate,
    true_positive_rate=true_positive_rate,
)