#### First, we import the libraries.

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use("fivethirtyeight")

In [None]:
# Load the diabetes table from the Kaggle input directory into a DataFrame.
data = pd.read_csv(
    filepath_or_buffer="/kaggle/input/pima-indians-diabetes-database/diabetes.csv",
)

In [None]:
data.shape

In [None]:
data.head()

## ABOUT DATA

### Context
This dataset is originally from the National Institute of Diabetes and Digestive and Kidney Diseases. The objective of the dataset is to diagnostically predict whether or not a patient has diabetes, based on certain diagnostic measurements included in the dataset. Several constraints were placed on the selection of these instances from a larger database. In particular, all patients here are females at least 21 years old of Pima Indian heritage.

### Content
The dataset consists of several medical predictor variables and one target variable, Outcome. Predictor variables include the number of pregnancies the patient has had, their BMI, insulin level, age, and so on.

### Acknowledgements
Smith, J.W., Everhart, J.E., Dickson, W.C., Knowler, W.C., & Johannes, R.S. (1988). Using the ADAP learning algorithm to forecast the onset of diabetes mellitus. In Proceedings of the Symposium on Computer Applications and Medical Care (pp. 261--265). IEEE Computer Society Press.

### Inspiration
Can you build a machine learning model to accurately predict whether or not the patients in the dataset have diabetes or not?

In [None]:
data.describe().T

Looks like we don't have null values. How lovely!
But let's check it again.

In [None]:
data.isna().sum()

#### Something is off. As you can see above, more than one column has 0 as its minimum value, for example 'Insulin' and 'SkinThickness'. Is that possible?

We should first determine which variables cannot have '0' as their values.

For 'BloodPressure' Wikipedia says : If the heart is stopped, blood pressure falls, but it does not fall to zero.
#### So, it is quite impossible.

For 'SkinThickness' : Skin thickness of a human being cannot be less than 10 mm.
#### Being 0 is impossible too.

For 'BMI' : Body Mass Index cannot be 0 either. This is beyond unhealthy.
#### It is impossible too.

For 'Glucose' : Glucose can't be zero either.
#### Impossible.

For 'Insulin' : A person can live with zero insulin in rare situation.
#### Not that impossible.

### Let's check how many zeros we have.

In [None]:
# Per column, count the rows whose recorded value is exactly 0.
# A boolean comparison is summed (True counts as 1) and cast back to a plain int.
bp_zeros = int((data['BloodPressure'] == 0).sum())
st_zeros = int((data['SkinThickness'] == 0).sum())
glucos_zeros = int((data['Glucose'] == 0).sum())
bmi_zeros = int((data['BMI'] == 0).sum())
insulin_zeros = int((data['Insulin'] == 0).sum())

In [None]:
# Report the zero counts, one line per column, in the same order as they were computed.
print("\n".join(
    f"Number of zeros '{name}' column have : {n_zero}"
    for name, n_zero in (
        ("BloodPressure", bp_zeros),
        ("SkinThickness", st_zeros),
        ("Glucos", glucos_zeros),
        ("BMI", bmi_zeros),
        ("Insulin", insulin_zeros),
    )
))

* Slightly more than half of the insulin column has zero as value.

In [None]:
data.hist(color='red', figsize=(20,15));

* SkinThickness and Insulin are right skewed.
* BMI and BloodPressure are normally distributed.
* Glucose is left skewed.

In [None]:
from IPython.display import Image 

In [None]:
Image(url="https://upload.wikimedia.org/wikipedia/commons/9/9b/Measures_of_Central_Tendency.png")

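* (a) is normally distributed, like BMI and BloodPressure. Mean and median nearly coincide for such data, so filling the zeros with the mean of those columns does not disrupt the data.
* For left- and right-skewed data, we fill the zeros with the median of those columns, because the median is less affected by the long tail.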

In [None]:
data_2 = data.copy()

In [None]:
# Replace the impossible 0 readings with a typical value of the same column.
#
# Two deliberate choices:
#   * The fill value is computed from the non-zero readings only; otherwise the
#     placeholder zeros themselves pull the mean/median down.
#   * The result is assigned back to the column instead of calling
#     `data_2[col].replace(..., inplace=True)`: that chained in-place form works
#     on a temporary Series and does not update the frame under pandas
#     Copy-on-Write.
#
# Skewed columns use the median, the roughly symmetric ones use the mean.
for column, statistic in (
    ('Insulin', 'median'),
    ('SkinThickness', 'median'),
    ('BMI', 'mean'),
    ('Glucose', 'median'),
    ('BloodPressure', 'mean'),
):
    observed = data_2[column].mask(data_2[column] == 0)   # zeros become NaN
    data_2[column] = observed.fillna(observed.agg(statistic))   # NaN is skipped by mean/median

In [None]:
data_2.head()

In [None]:
data_2.describe().T

Now, we can check the correlations.

In [None]:
## You can find the source code here : 
# https://github.com/manukalia/handy_data_viz_functions/blob/master/handy_data_visualization_functions.ipynb
def half_corr_heatmap(data, title=None):
    """Show the upper triangle of ``data.corr()`` as an annotated heatmap.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose pairwise column correlations are plotted.
    title : str, optional
        Figure title; omitted when falsy.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    # Compute the correlation matrix once and reuse it for the mask and the plot.
    corr = data.corr()

    # Boolean mask: True hides a cell. The lower triangle and the diagonal are
    # hidden, so every pair of columns appears exactly once.
    mask = np.zeros(corr.shape, dtype=bool)
    mask[np.tril_indices_from(mask)] = True

    plt.figure(figsize=(9, 9))
    sns.set(font_scale=1)

    with sns.axes_style("white"):
        sns.heatmap(corr, mask=mask, annot=True, cmap="coolwarm")

    if title:
        plt.title(f"\n{title}\n", fontsize=18)
    plt.show()

In [None]:
half_corr_heatmap(data_2, 'Correlation Between Variables')

#### There is no high correlation between variables.

In [None]:
def corr_to_target(dataframe, target, title=None, file=None):
    """Plot each column's correlation with ``target`` as a one-column heatmap.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing ``target`` and the columns to compare against it.
    target : str
        Name of the column the correlations are taken against.
    title : str, optional
        Figure title; omitted when falsy.
    file : str, optional
        When given, the figure is also saved to this path.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    # Remove the target's own row by label. Dropping "the first row after
    # sorting" is only right when the self-correlation sorts first, which fails
    # on ties at 1.0 or when the self-correlation is NaN.
    target_corr = (
        dataframe.corr()[[target]]
        .drop(index=target)
        .sort_values(target, ascending=False)
    )

    plt.figure(figsize=(4, 6))
    sns.set(font_scale=1)

    sns.heatmap(target_corr, annot=True, cmap='coolwarm')

    if title:
        plt.title(f'\n{title}\n', fontsize=18)
    plt.xlabel('')    # optional in case you want an x-axis label
    plt.ylabel('')    # optional in case you want a  y-axis label
    if file:
        plt.savefig(file, bbox_inches='tight')
    plt.show()

In [None]:
corr_to_target(data_2, 'Outcome', 'Outcome');

In [None]:
def gen_boxplots(dataframe, cols=1, file=None):
    """Draw one boxplot per column of ``dataframe`` on a grid ``cols`` wide.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Every column is drawn in its own subplot.
    cols : int, default 1
        Number of subplot columns; rows are added as needed.
    file : str, optional
        When given, the figure is also saved to this path.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    n_plots = len(dataframe.columns)
    # Integer ceiling division, so the function does not depend on `math`
    # having been imported elsewhere in the notebook. At least one row is kept.
    rows = max(1, -(-n_plots // cols))
    figwidth = 5 * cols
    figheight = 4 * rows

    # squeeze=False always returns a 2-D array of Axes, so .ravel() also works
    # for a 1x1 grid (where plt.subplots would otherwise return a bare Axes).
    fig, ax = plt.subplots(nrows=rows,
                           ncols=cols,
                           figsize=(figwidth, figheight),
                           squeeze=False)

    plt.subplots_adjust(wspace=0.3, hspace=0.3)
    ax = ax.ravel()         # flatten the grid so it can be walked in order

    for axis, column in zip(ax, dataframe.columns):
        axis.boxplot(dataframe[column])

        axis.set_title(f'{column}', fontsize=18)
        axis.set_ylabel('', fontsize=14)
        axis.set_xlabel('', fontsize=14)
        axis.tick_params(labelbottom=False)

    # Hide grid cells that have no column to show (column count not a multiple of cols).
    for axis in ax[n_plots:]:
        axis.set_visible(False)

    fig.suptitle('\nBoxplots for All Variables in Dataframe', size=24)
    fig.tight_layout()
    fig.subplots_adjust(bottom=0, top=0.88)
    if file:
        plt.savefig(file, bbox_inches='tight')
    plt.show()

In [None]:
import math
gen_boxplots(data_2, 3);

#### Insulin, DiabetesPedigreeFunction, BMI, and SkinThickness have outliers.

In [None]:
data_2.groupby('Outcome')[['BMI', 'Age', 'Insulin', 'Pregnancies']].agg(['min', 'max', 'mean'])

In [None]:
Image(url="https://www.cdc.gov/healthyweight/images/assessing/bmi-adult-fb-600x315.jpg")

* A BMI between 30 and 34.9 counts as obese, and a BMI of 35 or more as extremely obese. As we see above, the mean BMI values suggest that excess weight can lead to diabetes.
* The mean age of about 37 is quite young, which means that overweight, relatively young patients with diabetes are in the majority in this data.
* Some patients have had 13 or even 17 pregnancies, which is remarkable. Since we do not know how many patients have had that many, we cannot decide whether this is decisive for developing diabetes. However, the average number of pregnancies in both outcome groups is low.

#### DiabetesPedigreeFunction:  It provides information about diabetes history in relatives and genetic relationship of those relatives with patients. Higher Pedigree Function means patient is more likely to have diabetes.
https://github.com/niharikagulati/diabetesprediction

In [None]:
sns.scatterplot(data = data_2, x = 'DiabetesPedigreeFunction', y = 'Pregnancies', hue = 'Outcome');

In [None]:
data_2.groupby('Pregnancies').Pregnancies.count()

* Number of people with 17 pregnancies is 1.

In [None]:
data_2.groupby('Pregnancies').size().plot(kind = 'line', color = 'red', linewidth = 1.2);

In [None]:
# Class balance of the target. The column is passed by keyword: newer seaborn
# treats the first positional argument as `data`, not as the x variable.
sns.countplot(x='Outcome', data=data_2);

#### It's time for detecting, and removing outliers.

In [None]:
# Per-column quartiles. Despite its name, Q2 holds the 75th percentile
# (the upper quartile); the name is kept because later cells refer to it.
Q1, Q2 = data_2.quantile(0.25), data_2.quantile(0.75)
# Interquartile range of every column.
IQR = Q2 - Q1
IQR

In [None]:
# Keep only the rows in which no column lies more than 1.5 * IQR outside its quartiles.
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q2 + 1.5 * IQR
is_outlier = (data_2 < lower_fence) | (data_2 > upper_fence)   # cell-wise flags
data2_out = data_2[~is_outlier.any(axis=1)]

In [None]:
data2_out.shape

In [None]:
gen_boxplots(data2_out,3)

#### Looks good!

In [None]:
# Glucose against insulin, one panel per Outcome value; marker colour and
# marker size both encode the number of pregnancies.
g = sns.FacetGrid(data2_out, col="Outcome", height=3.5, aspect=1.6)
scatter_options = dict(x="Glucose", y="Insulin", hue="Pregnancies", size='Pregnancies', sizes=(20, 200))
(
    g.map_dataframe(sns.scatterplot, **scatter_options)
     .set_axis_labels("Glucose Level", "Insulin Level")
     .add_legend()
);

In [None]:
# Pregnancy counts per Outcome value.
g2 = sns.FacetGrid(data2_out, col="Outcome", height=3.5, aspect=1.6)
# Use one explicit category order for every facet. Without it each panel
# derives its own categories from its subset, so the same bar position can
# stand for different pregnancy counts in the two panels.
pregnancy_levels = sorted(data2_out["Pregnancies"].unique())
# map_dataframe passes the column by keyword, which works whether or not the
# installed seaborn accepts positional data vectors.
g2.map_dataframe(sns.countplot, x="Pregnancies", order=pregnancy_levels, color='red')
g2.set_axis_labels("Pregnancies", "count");

In [None]:
# BMI distribution, one panel per Outcome value.
# FacetGrid.map returns the grid itself, so g3 is still the FacetGrid.
g3 = sns.FacetGrid(data2_out, col="Outcome", height=3.5, aspect=1.6).map(
    sns.distplot, "BMI", color='green'
)

In [None]:
# SkinThickness distribution, one panel per Outcome value.
# FacetGrid.map returns the grid itself, so g4 is still the FacetGrid.
g4 = sns.FacetGrid(data2_out, col="Outcome", height=3.5, aspect=1.6).map(
    sns.distplot, "SkinThickness", color='black'
)

In [None]:
sns.pairplot(data = data2_out, hue = 'Outcome');

## Model Building

### Model Performance Analysis

In [None]:
# Target vector first, then the feature matrix (every column except the target).
y = data2_out['Outcome']
X = data2_out.drop(columns=['Outcome'])

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 33% of the rows for testing; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=0,
)

In [None]:
# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both splits.
sc = StandardScaler().fit(X_train)
X_train_sc = sc.transform(X_train)
X_test_sc = sc.transform(X_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, roc_curve, roc_auc_score

### KNeighborsClassifier

In [None]:
# Accuracy of k-nearest-neighbours for k = 1..19.
# The sweep uses the standardised features: KNN is distance based, and the
# final KNN model in this notebook is trained on X_train_sc, so k has to be
# judged on the same representation.
# NOTE(review): k is compared on the test split here; a validation split or
# cross-validation on the training data would avoid tuning against the test set.
TestScores = []
TrainScores = []

for k in range(1, 20):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_sc, y_train)

    TrainScores.append(knn.score(X_train_sc, y_train))
    TestScores.append(knn.score(X_test_sc, y_test))

In [None]:
# Train and test accuracy against k. x and y are passed by keyword because
# newer seaborn no longer accepts them positionally.
plt.figure(figsize=(12,6))

k_values = list(range(1, 20))
sns.lineplot(x=k_values, y=TrainScores, marker = 'o', color = 'red', label = 'Train Score')
sns.lineplot(x=k_values, y=TestScores, marker = '+', color = 'blue', label = 'Test Score')
plt.xlabel('k (n_neighbors)')
plt.ylabel('accuracy');

After k = 15 model score goes down. k = 12 looks fine.

#### Model Tuning

In [None]:
# Fit the 12-neighbour model on the standardised training data and predict both splits.
knn = KNeighborsClassifier(n_neighbors=12).fit(X_train_sc, y_train)
knn_y_pred, knn_y_pred_train = map(knn.predict, (X_test_sc, X_train_sc))

In [None]:
# Accuracy on both splits, written as (y_true, y_pred); accuracy is the
# fraction of matching labels, so the argument order does not change the value.
knn_as_train = accuracy_score(y_train, knn_y_pred_train)
knn_as = accuracy_score(y_test, knn_y_pred)
knn_as_train

In [None]:
# classification_report expects (y_true, y_pred); with the arguments swapped
# precision and recall are exchanged and support counts the predictions.
print(classification_report(y_test, knn_y_pred))

In [None]:
# confusion_matrix expects (y_true, y_pred): rows are the true classes and
# columns the predicted ones. Swapping the arguments transposes the matrix.
knn_cm = confusion_matrix(y_test, knn_y_pred)
knn_cm

In [None]:
# Second column of predict_proba: presumably the probability of Outcome == 1
# (columns follow knn.classes_; verify the label order).
y_pred_proba = knn.predict_proba(X_test_sc)[:, 1]
# False-positive rate, true-positive rate and the thresholds that produce them.
fpr, tpr, threshold = roc_curve(y_true=y_test, y_score=y_pred_proba)

In [None]:
# ROC curve of the KNN model against the chance diagonal.
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr, tpr, label = 'KNN')
plt.xlabel('fpr')
plt.ylabel('tpr')
plt.legend()   # without a legend the 'KNN' label above is never displayed
plt.title('Knn(n_neighbors=12) ROC curve');

In [None]:
roc_auc_score(y_test, y_pred_proba)

### Logistic Regression

In [None]:
# Tune the regularisation strength C with 5-fold cross-validation.
# The search runs on the standardised training split only: fitting it on the
# full X/y would let the held-out test rows influence the chosen C and would
# tune on unscaled features while the final model is trained on scaled ones.
# `multi_class='auto'` is omitted; with the liblinear solver it selects
# one-vs-rest, which is also what the default does.
gscv = GridSearchCV(LogisticRegression(solver='liblinear'),
                    {'C' : [1, 10, 20]},
                    cv = 5, return_train_score=False)
gscv.fit(X_train_sc, y_train)

In [None]:
gscv.best_params_

In [None]:
gscv.best_score_

#### Model Tuning

In [None]:
# Build the final model from the configuration the grid search selected
# (solver and tuned C) instead of a hard-coded C with a different solver,
# so the fitted model is the one that was actually tuned.
log = LogisticRegression(**gscv.best_estimator_.get_params())
log.fit(X_train_sc, y_train)
log_y_pred = log.predict(X_test_sc)
log_y_pred_train = log.predict(X_train_sc)

In [None]:
# Accuracy on both splits, written as (y_true, y_pred); accuracy is the
# fraction of matching labels, so the argument order does not change the value.
log_as_train = accuracy_score(y_train, log_y_pred_train)
log_as = accuracy_score(y_test, log_y_pred)
log_as_train

In [None]:
log_as

In [None]:
# classification_report expects (y_true, y_pred); with the arguments swapped
# precision and recall are exchanged and support counts the predictions.
print(classification_report(y_test, log_y_pred))

In [None]:
# confusion_matrix expects (y_true, y_pred): rows are the true classes and
# columns the predicted ones. Swapping the arguments transposes the matrix.
log_cm = confusion_matrix(y_test, log_y_pred)

### SVC

In [None]:
# Grid search over two SVC families: a linear kernel (C only) and an RBF
# kernel (C and gamma), scored by accuracy with 10-fold cross-validation.
svc = SVC()

c_values = [1, 10, 100]
gamma_values = [step / 10 for step in range(1, 10)]   # 0.1, 0.2, ..., 0.9
parameters = [
    {'C': c_values, 'kernel': ['linear']},
    {'C': c_values, 'kernel': ['rbf'], 'gamma': gamma_values},
]

grid_search = GridSearchCV(
    estimator=svc,
    param_grid=parameters,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,   # use every available core
)
grid_search.fit(X_train_sc, y_train)

for caption, value in (("parameters", grid_search.best_params_),
                       ("score", grid_search.best_score_)):
    print(f"Best {caption} for SVC : {value}")

In [None]:
svc_best = grid_search.best_params_

In [None]:
# Refit an SVC with the parameters chosen by the grid search and predict both splits.
svc = SVC(**svc_best)
svc.fit(X_train_sc, y_train)
svc_y_pred = svc.predict(X_test_sc)
svc_y_pred_train = svc.predict(X_train_sc)
# confusion_matrix expects (y_true, y_pred): rows are the true classes and
# columns the predicted ones. Swapping the arguments transposes the matrix.
svc_cm = confusion_matrix(y_test, svc_y_pred)

In [None]:
# Test accuracy of the tuned SVC, written as (y_true, y_pred); the value does
# not depend on the argument order.
svc_as = accuracy_score(y_true=y_test, y_pred=svc_y_pred)

In [None]:
# Keep the SVC train accuracy in a variable, as is done for every other model,
# so it can be reported later; the value is still displayed as the cell output.
svc_as_train = accuracy_score(y_train, svc_y_pred_train)
svc_as_train

In [None]:
svc_as

### Gradient Boosting Classifier

In [None]:
gradient_boosting = GradientBoostingClassifier(n_estimators=100, random_state=0)

In [None]:
gradient_boosting.fit(X_train_sc, y_train)

In [None]:
# Predictions of the fitted gradient-boosting model on the test and training splits.
gb_y_pred, gb_y_pred_train = map(gradient_boosting.predict, (X_test_sc, X_train_sc))

In [None]:
# Accuracy on both splits, then the test confusion matrix.
# confusion_matrix expects (y_true, y_pred): rows are the true classes and
# columns the predicted ones. Swapping the arguments transposes the matrix.
gb_as = accuracy_score(y_test, gb_y_pred)
gb_as_train = accuracy_score(y_train, gb_y_pred_train)
confusion_matrix(y_test, gb_y_pred)

In [None]:
# Training confusion matrix in (y_true, y_pred) order: rows are the true
# classes, columns the predicted ones.
confusion_matrix(y_train, gb_y_pred_train)

#### As you can see, model is overfitting.

In [None]:
# Train and test accuracy of the gradient-boosting model, written as
# (y_true, y_pred); the printed values do not depend on the argument order.
gb_train_accuracy = accuracy_score(y_train, gb_y_pred_train)
gb_test_accuracy = accuracy_score(y_test, gb_y_pred)
print(f"Accuracy score of train data : {gb_train_accuracy}")
print(f"Accuracy score of test data : {gb_test_accuracy}")

### Extra Tree Classification

In [None]:
extra = ExtraTreesClassifier(n_estimators=1000, max_depth = 7, random_state = 0)

In [None]:
# Fit the extra-trees model on the standardised training data, then predict both splits.
extra.fit(X_train_sc, y_train)
extra_y_pred, extra_y_pred_train = map(extra.predict, (X_test_sc, X_train_sc))

In [None]:
# The two labels were attached to the wrong splits: the "train" line printed
# the test accuracy and the "test" line the train accuracy. Each label now
# reports its own split.
print(f"Accuracy score of train data : {accuracy_score(y_train, extra_y_pred_train)}")
print(f"Accuracy score of test data : {accuracy_score(y_test, extra_y_pred)}")

In [None]:
# Test accuracy of the extra-trees model, written as (y_true, y_pred); the
# value does not depend on the argument order.
extra_as = accuracy_score(y_true=y_test, y_pred=extra_y_pred)

In [None]:
# confusion_matrix expects (y_true, y_pred): rows are the true classes and
# columns the predicted ones. Swapping the arguments transposes the matrix.
extra_cm = confusion_matrix(y_test, extra_y_pred)
extra_cm

In [None]:
# Train accuracy and the training confusion matrix, both in (y_true, y_pred)
# order so that rows are the true classes and columns the predicted ones.
extra_as_train = accuracy_score(y_train, extra_y_pred_train)
confusion_matrix(y_train, extra_y_pred_train)

### Adaboost Classifier

In [None]:
# Fix the seed, as is done for the other ensemble models in this notebook, so
# repeated runs give the same model and the same reported scores.
ada = AdaBoostClassifier(n_estimators=40, random_state=0)

In [None]:
ada.fit(X_train_sc, y_train)

In [None]:
# Predictions of the fitted AdaBoost model on the test and training splits.
ada_y_pred, ada_y_pred_train = map(ada.predict, (X_test_sc, X_train_sc))

In [None]:
# Accuracy on both splits, written as (y_true, y_pred); accuracy is the
# fraction of matching labels, so the argument order does not change the value.
ada_as = accuracy_score(y_test, ada_y_pred)
ada_as_train = accuracy_score(y_train, ada_y_pred_train)
ada_as_train

In [None]:
accuracy_score(ada_y_pred, y_test)

In [None]:
# Side-by-side accuracy summary of every model.
# The SVC lines previously printed the logistic-regression scores (log_as /
# log_as_train). They now report the SVC's own scores; the train accuracy is
# computed here from the SVC's training predictions.
svc_as_train = accuracy_score(y_train, svc_y_pred_train)

print(f"KNN model accuracy score for test data : {knn_as}")
print(f"KNN model accuracy score for train data : {knn_as_train}\n")
print(f"Logistic Regression model accuracy score for test data : {log_as}")
print(f"Logistic Regression model accuracy score for train data : {log_as_train}\n")
print(f"SVC model accuracy score for test data : {svc_as}")
print(f"SVC model accuracy score for train data : {svc_as_train}\n")
print(f"Gradient Boosting Classifier model accuracy score for test data : {gb_as}")
print(f"Gradient Boosting Classifier model accuracy score for train data : {gb_as_train}\n")
print(f"Extra Tree Classifier model accuracy score for test data : {extra_as}")
print(f"Extra Tree Classifier model accuracy score for train data : {extra_as_train}\n")
print(f"Adaboost Classifier model accuracy score for test data : {ada_as}")
print(f"Adaboost Classifier model accuracy score for train data : {ada_as_train}")