# Diabetes Prediction

## Logistic regression fit with classification performance analysis, e.g. ROC curve

*with some explanations*

### Importing Libraries

In [None]:
import numpy as np
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt

import seaborn as sns

import statsmodels.api as sm

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve, plot_roc_curve

### Loading Data

In [None]:
# Read the diabetes CSV into a DataFrame.
csv_path = "../input/diabetes-dataset/diabetes2.csv"
df = pd.read_csv(csv_path)

In [None]:
# Show the first rows of the loaded frame.
df.head()

* If Outcome is 1, then the person has diabetes.
* If Outcome is 0, then the person does not have diabetes.

In [None]:
# Print a summary of the frame (columns, non-null counts, dtypes).
df.info()

### Data Preparation and pre-processing

  - define endogenous (target) and exogenous (feature) data
  - split the data to clearly separate the train and test sub-populations
  - scale the data for prediction


In [None]:
# Features are all columns except the last one; the target is the last column.
n_features = df.shape[1] - 1
X = df.iloc[:, :n_features]
y = df.iloc[:, n_features]

# Remember the feature labels and the target label.
x_names, y_name = X.columns, y.name

In [None]:
# Hold out 25 % of the rows for testing; the fixed seed makes the split
# reproducible between runs.
df_X_train, df_X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

In [None]:
# Fit the scaler on the training features only, then apply that same
# transformation to both splits.
sc = StandardScaler().fit(df_X_train)
X_train = sc.transform(df_X_train)
X_test = sc.transform(df_X_test)

# What does scaling do?  Compare the summary statistics before and after:
# each scaled column ends up with mean ~0 and standard deviation ~1.
scaled_train_frame = pd.DataFrame(
    X_train,
    index=df_X_train.index,
    columns=df_X_train.columns,
)
display(df_X_train.describe(), scaled_train_frame.describe())

In [None]:
# Row labels of each split, so rows of the original frame can be selected later.
train_index, test_index = y_train.index, y_test.index

### Plotting Data - Visualization

In [None]:
# The pair plot takes some time to render, so it is switched off by default;
# set the flag to True to draw it for the training rows.
show_pairplot = False
if show_pairplot:
    _ = sns.pairplot(df.loc[train_index], hue=y_name)

In [None]:
# Lower-triangle correlation heat maps of the features: training split on
# the left, test split on the right.
_, axs = plt.subplots(1, 2, figsize=(14, 6))

# The loop variable is deliberately NOT called `X`: that name holds the full
# feature frame defined earlier and must not be overwritten here.
for ax, (split_name, features) in zip(axs, [("train", df_X_train), ("test", df_X_test)]):
    with sns.axes_style("white"):
        corr = features.corr()
        # Boolean mask hiding the upper triangle (diagonal included); the
        # correlation matrix is symmetric, so that half is redundant.
        mask = np.triu(np.ones_like(corr, dtype=bool))
        sns.heatmap(corr, robust=True, cmap='viridis', mask=mask, ax=ax)
    ax.set_title(split_name)

### Model: fit and predict

#### using `statsmodels`

In [None]:
# statsmodels does not add an intercept on its own, so prepend a constant
# column.  Without it the model is forced to predict p = 0.5 at the feature
# means (the features are standardized, i.e. centred on zero).
exog_train = sm.add_constant(X_train)
logit_model = sm.Logit(y_train, exog_train)

result = logit_model.fit()

# The scaled design matrix is a bare array, so pass the coefficient labels
# explicitly to get a readable summary table.
result.summary(xname=['const', *df_X_train.columns])

In [None]:
# Predicted probabilities for the data the model was fitted on.
y_pred_proba_sm = result.predict()
# Hard class labels: 1 where the probability exceeds 0.5, otherwise 0.
y_pred_sm = np.where(y_pred_proba_sm > 0.5, 1, 0)

In [None]:
# Feature column labels, used below to draw one plot panel per feature.
cols = df_X_train.columns

In [None]:
# One panel per feature: observed training outcomes (scatter), predicted
# probabilities from statsmodels (cyan) and the thresholded labels (red).
# Round the column count up so an odd number of features still fits, and
# keep `axs` two-dimensional even when only one grid column is needed.
n_panel_cols = (len(cols) + 1) // 2
_, axs = plt.subplots(2, n_panel_cols, figsize=(14, 6), tight_layout=True, squeeze=False)

# Features plus target in one frame; built once because it does not depend
# on the loop variable.
train_frame = df_X_train.join(y_train)

for i, col in enumerate(cols):
    ax = axs[i % 2][i // 2]  # fill the grid column by column
    train_frame.plot.scatter(x=col, y=y_train.name, ax=ax)
    ax.plot(df_X_train[col], y_pred_proba_sm, '.c')
    ax.plot(df_X_train[col], y_pred_sm, '.r')
    ax.set_title(f'x{i+1}')

# With an odd feature count the bottom-right panel stays empty; hide it.
if len(cols) % 2:
    axs[1][-1].set_visible(False)

#### using `sklearn`

and make use of the train/test split

In [None]:
# Fit scikit-learn's logistic regression (default settings) on the scaled
# training data; `fit` returns the estimator itself.
logreg = LogisticRegression().fit(X_train, y_train)
# Display the fitted coefficients.
logreg.coef_

In [None]:
# Per-class probabilities on the test split; keep the second column
# (presumably the positive class "1" — verify against `logreg.classes_`).
class_probabilities = logreg.predict_proba(X_test)
y_pred_proba = class_probabilities[:, 1]
# Hard class labels as decided by the estimator.
y_pred = logreg.predict(X_test)

In [None]:
# One panel per feature: observed test outcomes (scatter), predicted
# probabilities from scikit-learn (cyan) and the predicted labels (red).
# Round the column count up so an odd number of features still fits, and
# keep `axs` two-dimensional even when only one grid column is needed.
n_panel_cols = (len(cols) + 1) // 2
_, axs = plt.subplots(2, n_panel_cols, figsize=(14, 6), tight_layout=True, squeeze=False)

# Features plus target in one frame; built once because it does not depend
# on the loop variable.
test_frame = df_X_test.join(y_test)

for i, col in enumerate(cols):
    ax = axs[i % 2][i // 2]  # fill the grid column by column
    test_frame.plot.scatter(x=col, y=y_test.name, ax=ax)
    ax.plot(df_X_test[col], y_pred_proba, '.c')
    ax.plot(df_X_test[col], y_pred, '.r')
    ax.set_title(f'coef #{i+1}')

# With an odd feature count the bottom-right panel stays empty; hide it.
if len(cols) % 2:
    axs[1][-1].set_visible(False)

#### Confusion Matrix, accuracy score and classification report

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Hard-label metrics on the held-out test split.
confusion_matrix_result = confusion_matrix(y_test, y_pred)
accuracy_score_result = accuracy_score(y_test, y_pred)
classification_report_result = classification_report(y_test, y_pred)

# The matrix and the accuracy are rendered as rich output; the report is a
# pre-formatted text table, so it is printed instead.
display(confusion_matrix_result, accuracy_score_result)
print(classification_report_result)

#### understanding the classification performance

In [None]:
# Row sums of the confusion matrix give the number of true members of each
# class (support); column sums give how often each class was predicted.
support_0, support_1 = confusion_matrix_result.sum(axis=1)[:2]
cnt_pred_0, cnt_pred_1 = confusion_matrix_result.sum(axis=0)[:2]

# Individual cells, indexed as [true label, predicted label].
tn, fp = confusion_matrix_result[0, 0], confusion_matrix_result[0, 1]
fn, tp = confusion_matrix_result[1, 0], confusion_matrix_result[1, 1]

# Operating point of the fitted classifier, reused for the ROC plot.
fit_FPR = fp / support_0
fit_TPR = tp / support_1

display(
    f'support of "0" -> {support_0}',
    f'support of "1" -> {support_1}',
    # Negative Predictive Value
    f'precision of "0" = NPV -> {tn / cnt_pred_0:.2f}',
    # Positive Predictive Value
    f'precision of "1" = PPV -> {tp / cnt_pred_1:.2f}',
    # True Negative Rate, specificity
    f'recall of "0" = TNR -> {tn / support_0:.2f}',
    # True Positive Rate, sensitivity
    f'recall of "1" = TPR -> {fit_TPR:.2f}',
    # False Negative Rate, misses
    f'FNR = 1 - TPR -> {fn / support_1:.2f}',
    # False Positive Rate, fall-out
    f'FPR = 1 - TNR -> {fit_FPR:.2f}',
)

### ROC curve (variant 1)

In [None]:
# Threshold-independent performance: area under the ROC curve and the curve itself.
logit_roc_auc = roc_auc_score(y_test, y_pred_proba)
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)

# First position at which the (decreasing) thresholds are at or below 0.5.
idx_thres_05 = np.flatnonzero(thresholds <= 0.5)[0]

fig, axs = plt.subplots(2, 1, sharex=True, figsize=(6, 8), tight_layout=True)

# Upper panel: ROC curve, the chance diagonal, and (red dot) the operating
# point computed from the confusion matrix.
ax = axs[0]
ax.plot(fpr, tpr, label=f'Logistic Regression (area = {logit_roc_auc:0.2f})')
ax.plot([0, 1], [0, 1], 'k:')
ax.plot(fit_FPR, fit_TPR, 'ro')
ax.set(
    xlim=(0.0, 1.0),
    ylim=(0.0, 1.05),
    ylabel='True Positive Rate',
    title='Receiver Operating Characteristic',
)
ax.legend(loc="lower right")

# Lower panel: decision threshold against the false positive rate, with the
# position of the 0.5 threshold marked by a dotted red line.
ax = axs[1]
ax.plot(fpr, thresholds)
ax.axvline(fpr[idx_thres_05], color='r', ls=':')
ax.set(ylim=(0, 1), title='Thresholds', xlabel='False Positive Rate')

fig.savefig('Log_ROC')

### ROC curve (variant 2)

using `plot_roc_curve`

In [None]:
# `plot_roc_curve` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `RocCurveDisplay.from_estimator` is its replacement.
# NOTE(review): on scikit-learn >= 1.2 the `plot_roc_curve` name in the
# import cell at the top of the notebook has to be dropped as well.
from sklearn.metrics import RocCurveDisplay

ax = plt.figure().gca()
if hasattr(RocCurveDisplay, "from_estimator"):
    _ = RocCurveDisplay.from_estimator(logreg, X_test, y_test, ax=ax)
else:
    # scikit-learn < 1.0: only the legacy helper is available.
    _ = plot_roc_curve(logreg, X_test, y_test, ax=ax)

##### ToDo
possibly extend with cross-validation, as in [scikit-learn.org/stable/auto_examples/model_selection/plot_roc_crossval.html](https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc_crossval.html)