In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8*' and
# 3.8 removed the old names, so use whichever name this Matplotlib provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, roc_auc_score, roc_curve, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV, cross_val_score, cross_val_predict
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

<center> <h1> Data Wrangling </h1> </center>

In [None]:
# Read the diabetes CSV into a DataFrame and preview its first rows.
data = pd.read_csv(
    filepath_or_buffer="../input/pima-indians-diabetes-database/diabetes.csv"
)
data.head()

In [None]:
# Display the column labels of the loaded frame.
data.columns.array

In [None]:
# (number of rows, number of columns)
data.shape

<center> <h1> EDA </h1> </center>

In [None]:
# Data type of each column.
data.dtypes

- No need to change dtypes.

In [None]:
# Number of NaN entries in each column.
data.isna().sum()

- The data has no explicit missing values (`NaN`).

In [None]:
# Print the frequency table of every column, one column after another.
for column_name, column_values in data.items():
    print(column_values.value_counts())

- Inspecting value counts for each variable.

In [None]:
# Print a concise summary of the frame (column dtypes and non-null counts).
data.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) per column.
data.describe()

- The `Glucose`, `BloodPressure`, `SkinThickness`, `Insulin` and `BMI` columns contain illogical 0 values, which we will treat as missing.

In [None]:
# Mark the 0 readings in these measurement columns as missing values.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
zero_as_missing_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
data[zero_as_missing_cols] = data[zero_as_missing_cols].replace(0, np.nan)

In [None]:
# Number of NaN entries per column after the zeros have been marked as missing.
data.isna().sum()

In [None]:
# Percentage of missing entries in each column.
data.isna().sum().mul(100).div(data.shape[0])

- Now we have to impute these values.

In [None]:
# Histogram of every column; the trailing ';' suppresses the text repr of the axes.
data.hist(figsize = (20,20));

- I will fill `BloodPressure` and `Glucose` with the mean, since they are approximately normally distributed.
- I will fill `SkinThickness`, `Insulin` and `BMI` with the median.

In [None]:
# Impute the missing readings: mean for Glucose and BloodPressure, median for
# SkinThickness, Insulin and BMI.
# The fill is a single reassignment of `data`. Calling
# `data[col].fillna(..., inplace=True)` is chained in-place assignment: it emits a
# FutureWarning on pandas 2.x and leaves `data` unchanged once Copy-on-Write is
# active (the default from pandas 3.0), so the NaNs would silently survive.
impute_values = {
    'Glucose': data['Glucose'].mean(),
    'BloodPressure': data['BloodPressure'].mean(),
    'SkinThickness': data['SkinThickness'].median(),
    'Insulin': data['Insulin'].median(),
    'BMI': data['BMI'].median(),
}
data = data.fillna(value=impute_values)

In [None]:
# Number of NaN entries per column after imputation.
data.isna().sum()

In [None]:
# Histogram of every column after imputation, for comparison with the earlier plot.
data.hist(figsize = (20,20));

- The distribution of each variable did not change much after imputation.

In [None]:
# Pairwise relationships between the columns, coloured by the Outcome class.
sns.pairplot(data=data, hue='Outcome')

---

- It is advisable to bring all features to the same scale before applying distance-based algorithms such as KNN.

In [None]:
# Standardise every feature column (all columns except the Outcome target).
# Column labels and the row index are taken from the feature frame itself rather
# than a hand-typed list, so the scaled frame stays aligned with `data` even if the
# schema or the index changes.
# NOTE(review): the scaler is fitted on all rows, including rows that are later
# held out for testing.
features = data.drop(columns=["Outcome"])
scaler = StandardScaler()
data_scaled = pd.DataFrame(
    scaler.fit_transform(features),
    columns=features.columns,
    index=features.index,
)

---

<center> <h1> Model Building </h1> </center>

In [None]:
# Target vector: the Outcome column.
y = data['Outcome']

In [None]:
# Split first, then fit the scaler on the training rows only. Splitting the
# already-standardised `data_scaled` lets the test rows influence the mean and
# variance used for scaling (data leakage), which biases the reported test metrics.
# The split arguments are unchanged, so the same rows land in train and test.
# NOTE(review): the imputation statistics are still computed on all rows earlier
# in the notebook.
X_unscaled = data.drop(columns=['Outcome'])
X_train_unscaled, X_test_unscaled, y_train, y_test = train_test_split(
    X_unscaled, y, test_size=0.3, random_state=42, stratify=y
)
train_scaler = StandardScaler().fit(X_train_unscaled)
X_train = pd.DataFrame(
    train_scaler.transform(X_train_unscaled),
    columns=X_train_unscaled.columns,
    index=X_train_unscaled.index,
)
X_test = pd.DataFrame(
    train_scaler.transform(X_test_unscaled),
    columns=X_test_unscaled.columns,
    index=X_test_unscaled.index,
)

#### KNN:

In [None]:
# Tune the number of neighbours (1 through 49) for KNN with a 5-fold
# cross-validated grid search, then report the winning configuration.
param_grid = {'n_neighbors': np.arange(1, 50)}

knn = KNeighborsClassifier()
knn_cv = GridSearchCV(estimator=knn, param_grid=param_grid, cv=5)
knn_cv.fit(X_train, y_train)

print("Best Score:", knn_cv.best_score_, sep="")
print("Best Parameters: ", knn_cv.best_params_, sep="")

In [None]:
# Predict the held-out rows with the best KNN model and draw its confusion matrix.
y_pred = knn_cv.best_estimator_.predict(X_test)
cnf_matrix = confusion_matrix(y_test, y_pred)
cnf_frame = pd.DataFrame(cnf_matrix)
p = sns.heatmap(cnf_frame, annot=True, fmt='g', cmap="YlGnBu")
p.set_title('Confusion matrix', y=1.1)
p.set_ylabel('Actual label')
p.set_xlabel('Predicted label')

In [None]:
# Per-class precision, recall and F1 for the current predictions.
print(classification_report(y_true=y_test, y_pred=y_pred))

----

#### LR:

In [None]:
# Logistic regression tuned with a 5-fold grid search over the regularisation
# strength C, the iteration cap and the solver; candidates are ranked by ROC AUC.
lr = LogisticRegression(random_state=0)

param_grid = dict(
    C=[0.001, 0.01, 0.1, 1, 10, 100, 1000],
    max_iter=[100, 200, 300, 400, 500, 600, 700],
    solver=['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
)

lr_cv = GridSearchCV(estimator=lr, param_grid=param_grid, scoring='roc_auc', cv=5)
lr_cv.fit(X_train, y_train)

print('Config: %s' % (lr_cv.best_params_,))
print('Best Score: %s' % (lr_cv.best_score_,))

In [None]:
# Predict the held-out rows with the best logistic-regression model and draw its
# confusion matrix.
y_pred = lr_cv.best_estimator_.predict(X_test)
cnf_matrix = confusion_matrix(y_test, y_pred)
cnf_frame = pd.DataFrame(cnf_matrix)
p = sns.heatmap(cnf_frame, annot=True, fmt='g', cmap="YlGnBu")
p.set_title('Confusion matrix', y=1.1)
p.set_ylabel('Actual label')
p.set_xlabel('Predicted label')

In [None]:
# Per-class precision, recall and F1 for the current predictions.
print(classification_report(y_true=y_test, y_pred=y_pred))

---

#### SVC

In [None]:
# Fit an RBF-kernel support vector classifier on the training split and predict
# the held-out rows.
svc = SVC(kernel='rbf').fit(X_train, y_train)
y_pred = svc.predict(X_test)

In [None]:
# Predict the held-out rows with the fitted SVC and draw its confusion matrix.
y_pred = svc.predict(X_test)
cnf_matrix = confusion_matrix(y_test, y_pred)
cnf_frame = pd.DataFrame(cnf_matrix)
p = sns.heatmap(cnf_frame, annot=True, fmt='g', cmap="YlGnBu")
p.set_title('Confusion matrix', y=1.1)
p.set_ylabel('Actual label')
p.set_xlabel('Predicted label')

In [None]:
# Per-class precision, recall and F1 for the current predictions.
print(classification_report(y_true=y_test, y_pred=y_pred))