# **Imports**

In [None]:
import os

# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for path in full_paths:
        print(path)

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.figure_factory as ff
from mlxtend.plotting import plot_confusion_matrix

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_curve, auc, roc_auc_score, precision_recall_curve, average_precision_score



In [None]:
# Load the early-stage diabetes risk CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv('../input/early-stage-diabetes-risk-prediction-dataset/diabetes_data_upload.csv')

# **Data Exploration**

In [None]:
# Preview the first rows of the raw data.
df.head()

In [None]:
# Summary statistics, transposed so that each described column becomes a row.
df.describe().T

In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
# Number of missing values per column.
df.isnull().sum()

The target variable is the diagnosis, stored in the "class" column; it is binary, with values Positive or Negative.<br>
Except for Age, all the other features are categorical with binary values (Yes or No; Gender is Male or Female).<br>
There are no NULL values.

# Age factor Analysis

In [None]:
# Age distribution (plotly distplot of the Age column)

group_labels = ['Age']
hist_data = [df["Age"].values]

fig = ff.create_distplot(hist_data, group_labels)
fig.update_layout(title={'text': 'Age Distribution plot'})

fig.show()

In [None]:
# Horizontal box plot of Age for all patients, showing every point plus mean and standard deviation.
age_box = go.Box(x=df['Age'], name="All patients", boxpoints='all', boxmean='sd')
fig = go.Figure(data=[age_box])
fig.update_layout(title_text="Box plot and distribution by Age")
fig.show()

With the boxplot we can see the interquartile range, which tells us that 50% of the patients are between 39 and 57 years old; the mean age is 48, with a standard deviation of 12. There are only a few patients younger than 25 or older than 72.

In [None]:
# Left panel: Age box plots split by diagnosis. Right panel: number of patients per diagnosis.

fig = make_subplots(rows=1, cols=2)

# One box trace per diagnosis; rows of the other class become NaN through .where().
for diagnosis, trace_name in (('Negative', "Negatives by Age"), ('Positive', "Positives by Age")):
    ages = df['Age'].where(df['class'] == diagnosis)
    fig.add_trace(
        go.Box(y=ages, name=trace_name, boxpoints='all', boxmean='sd'),
        row=1, col=1,
    )

# Non-NaN count per diagnosis, in the same order as the bar labels below.
y_list = [df['Age'].where(df['class'] == label).count() for label in ('Negative', 'Positive')]

count_bar = go.Bar(
    x=['Negative', 'Positive'],
    y=y_list,
    name="Diagnosis Counts",
    text=y_list,
    textposition='auto',
    opacity=0.6,
    marker=dict(
        color='rgb(158,202,225)',
        line=dict(color='rgb(8,48,107)', width=1.5),
    ),
)
fig.add_trace(count_bar, row=1, col=2)

fig.update_layout(height=600, width=1000, title_text="Diagnosis distribution by Age total count by Diagnosis")
fig.show()

# Gender factor Analysis

In [None]:
# Age by Gender, coloured by diagnosis; the legend is anchored inside the top-right corner of the plot.
legend_style = dict(title='Diagnosis:', yanchor="top", y=0.98, xanchor="right", x=0.98)

fig = px.box(
    df,
    x='Gender',
    y='Age',
    color="class",
    points="all",
    width=800,
    height=600,
    color_discrete_sequence=['#F04B50', '#2C85C1'],
)
fig.update_layout(legend=legend_style)

fig.show()

In [None]:
# fig = px.violin(df, x='Gender', y='Age', color="class", points="all", box=True, width=800, height=600, color_discrete_sequence=['#F04B50','#2C85C1'])
# fig.update_layout(legend=dict(
#     title='Diagnosis:',
#     yanchor="top",
#     y=0.98,
#     xanchor="right",
#     x=0.98
# ))

# fig.show()

The dataset is fairly balanced between the overall positive and negative diagnoses; however, there is an imbalance among the female patients' diagnoses, which should not create any issue. We will use a stratified split to minimize any problem that may occur.

# Feature Engineering

## Encoding the categorical features
Tree-based algorithms can handle label-encoded categorical features, so the label encoder is enough; there is no need for one-hot encoding.

In [None]:
# Keep an untouched copy of the raw frame; the encoding cell restores `df` from it, so it can be re-run safely.
df1=df.copy()

In [None]:
# Start again from the raw snapshot so this cell gives the same result every time it is run.
df = df1.copy()

# Label-encode every column except the first one; the same encoder object is re-fitted per column.
le = LabelEncoder()
columns_to_encode = df.columns[1:]
for column in columns_to_encode:
    encoded_values = le.fit_transform(df[column])
    df[column] = encoded_values

df.head()

# Feature Ranking with Mutual Information
Because the target and the majority of the features are categorical (nominal variables), the heatmap of the correlation matrix is not the right tool to rank the features ([source](https://machinelearningmastery.com/feature-selection-with-real-and-categorical-data/)).<br>
We're going to use mutual information instead (have a look at the [2nd lesson of the Kaggle Feature Engineering course](https://www.kaggle.com/ryanholbrook/mutual-information)).

In [None]:
# Feature matrix (everything but the target) and target vector, both independent of `df`.
y = df['class'].copy()
X = df.drop(columns=['class'])

def plot_mi_scores(scores):
    """Draw a horizontal bar chart of mutual-information scores.

    The bars are drawn with pyplot on the current figure, sorted so that the
    largest score ends up at the top.

    Parameters
    ----------
    scores : pandas.Series
        Scores indexed by feature name.
    """
    ordered = scores.sort_values(ascending=True)
    positions = np.arange(len(ordered))
    bar_colors = np.array(["C0"] * ordered.shape[0])
    plt.barh(positions, ordered, color=bar_colors)
    # Label each bar with its feature name.
    plt.yticks(positions, list(ordered.index))
    plt.title("Mutual Information Scores")
    
def make_mi_scores(X, y, discrete_features="auto", random_state=None):
    """Rank features by their mutual information with the target.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; its column names become the index of the result.
    y : array-like
        Class labels.
    discrete_features : {"auto", bool, array-like}, default "auto"
        Forwarded to ``mutual_info_classif``. With "auto", a dense ``X`` is
        treated as entirely continuous, so pass a boolean mask when some
        columns are categorical.
    random_state : int or None, default None
        Forwarded to ``mutual_info_classif``; seeds the noise it adds to
        continuous features, making the scores reproducible.

    Returns
    -------
    pandas.Series
        Scores named "MI Scores", sorted from highest to lowest.
    """
    raw_scores = mutual_info_classif(
        X, y, discrete_features=discrete_features, random_state=random_state
    )
    mi_scores = pd.Series(raw_scores, name="MI Scores", index=X.columns)
    return mi_scores.sort_values(ascending=False)

# Every feature except Age is a label-encoded category, so mark those columns as
# discrete; otherwise they would be scored with the estimator meant for continuous data.
discrete_mask = X.columns != "Age"
mi_scores = make_mi_scores(X, y, discrete_features=discrete_mask, random_state=1)
print(mi_scores)  # show the features with their MI scores
plt.figure(dpi=100, figsize=(8, 5))
plot_mi_scores(mi_scores.head(20))

### Data Split
Splitting the data with a stratified `train_test_split`, to ensure that the train and test sets have the same distribution of the target variable.

In [None]:
X = df.copy()
y = X.pop('class')

# stratify=y keeps the Positive/Negative ratio identical in the train and test sets
# (train_test_split does not stratify by default); the fixed seed makes the split,
# and therefore every metric computed from it, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=1
)

In [None]:
# Decision tree: fit on the training split, then score on the held-out test split.

dt = DecisionTreeClassifier(
    max_depth=15,
    max_features=0.5,
    random_state=1,
).fit(X_train, y_train)

dt_pred_proba = dt.predict_proba(X_test)  # per-class probabilities, used later for the ROC / PR curves
dt_pred = dt.predict(X_test)
dt_acc = accuracy_score(y_true=y_test, y_pred=dt_pred)
print(dt_acc)

In [None]:
# Confusion matrix of the decision tree on the test split.
cm = confusion_matrix(y_test, dt_pred)
# plot_confusion_matrix creates its own figure, so no plt.figure() call is needed
# beforehand (it would only leave an extra, empty figure in the cell output).
fig, ax = plot_confusion_matrix(cm, figsize=(4,4), hide_ticks=True, cmap=plt.cm.Blues)
ax.set_title("Decision Tree Model - Confusion Matrix")
# hide_ticks removed the default ticks; put back one labelled tick per class.
ax.set_xticks(range(2))
ax.set_xticklabels(["No Diabetes","Diabetes"], fontsize=16)
ax.set_yticks(range(2))
ax.set_yticklabels(["No Diabetes","Diabetes"], fontsize=16)
plt.show()

In [None]:
# Random forest: same depth and feature-subsampling settings as the decision tree.

rf = RandomForestClassifier(
    max_depth=15,
    max_features=0.5,
    random_state=1,
).fit(X_train, y_train)

rf_pred_proba = rf.predict_proba(X_test)  # per-class probabilities, used later for the ROC / PR curves
rf_pred = rf.predict(X_test)
rf_acc = accuracy_score(y_true=y_test, y_pred=rf_pred)
print(rf_acc)

In [None]:
# Confusion matrix of the random forest on the test split.
cm = confusion_matrix(y_test, rf_pred)
# plot_confusion_matrix creates its own figure, so no plt.figure() call is needed
# beforehand (it would only leave an extra, empty figure in the cell output).
fig, ax = plot_confusion_matrix(cm, figsize=(4,4), hide_ticks=True, cmap=plt.cm.Blues)
ax.set_title("Random Forest Model - Confusion Matrix")
# hide_ticks removed the default ticks; put back one labelled tick per class.
ax.set_xticks(range(2))
ax.set_xticklabels(["No Diabetes","Diabetes"], fontsize=16)
ax.set_yticks(range(2))
ax.set_yticklabels(["No Diabetes","Diabetes"], fontsize=16)
plt.show()

In [None]:
# XGBoost with default hyper-parameters, evaluated with log-loss during training.
xgb = XGBClassifier(
    eval_metric='logloss',
    use_label_encoder=False,
).fit(X_train, y_train)

xgb_pred_proba = xgb.predict_proba(X_test)  # per-class probabilities, used later for the ROC / PR curves
xgb_pred = xgb.predict(X_test)
xgb_acc = accuracy_score(y_true=y_test, y_pred=xgb_pred)
print(xgb_acc)

In [None]:
# Confusion matrix of the XGBoost model on the test split.
cm = confusion_matrix(y_test, xgb_pred)
# plot_confusion_matrix creates its own figure, so no plt.figure() call is needed
# beforehand (it would only leave an extra, empty figure in the cell output).
fig, ax = plot_confusion_matrix(cm, figsize=(4,4), hide_ticks=True, cmap=plt.cm.Blues)
ax.set_title("XGBoost Model - Confusion Matrix")
# hide_ticks removed the default ticks; put back one labelled tick per class.
ax.set_xticks(range(2))
ax.set_xticklabels(["No Diabetes","Diabetes"], fontsize=16)
ax.set_yticks(range(2))
ax.set_yticklabels(["No Diabetes","Diabetes"], fontsize=16)
plt.show()

# ROC CURVES
"ROC curves describe the trade-off between the true positive rate (TPR) and false positive (FPR) rate along different probability thresholds for a classifier. True positive rate is also known as recall and sometimes Sensitivity — it’s a measure of how well you can find the needle in a haystack." [(source)](https://medium.com/cascade-bio-blog/making-sense-of-real-world-data-roc-curves-and-when-to-use-them-90a17e6d1db)

In [None]:
# ROC curves of the three classifiers on one figure.
# A single plt.figure call is enough: the former plt.figure(0).clf() only
# created an additional, empty figure that showed up in the cell output.
plt.figure(dpi=150)

roc_inputs = [
    ("Decision Tree", dt_pred_proba),
    ("Random Forest", rf_pred_proba),
    ("XGBoost", xgb_pred_proba),
]
for model_name, pred_proba in roc_inputs:
    # Column 1 holds the predicted probability of the class encoded as 1.
    positive_scores = pred_proba[:, 1]
    fpr, tpr, thresh = roc_curve(y_test, positive_scores)
    roc_auc = roc_auc_score(y_test, positive_scores)
    plt.plot(fpr, tpr, label="%s, auc= %.4f" % (model_name, roc_auc))

plt.title('ROC Curves Classifiers Comparison')
plt.xlabel('False Positive Rate (1-Specificity)')
plt.ylabel('True Positive Rate (Recall)')
plt.legend(loc=0)
plt.grid(True)
plt.show()

# PR Curves
"Precision-Recall curves describe the relationship between true TPR and the precision or positive predictive value (PPV), which is the ratio of your true positives to all positives. In other words, it helps you understand how many fake needles you will discover on your way to finding all the real ones." [(source)](https://medium.com/cascade-bio-blog/making-sense-of-real-world-data-roc-curves-and-when-to-use-them-90a17e6d1db)

In [None]:
# Precision-recall curves of the three classifiers on one figure.
# A single plt.figure call is enough: the former plt.figure(0).clf() only
# created an additional, empty figure that showed up in the cell output.
plt.figure(dpi=150)

pr_inputs = [
    ("Decision Tree", dt_pred_proba),
    ("Random Forest", rf_pred_proba),
    ("XGBoost", xgb_pred_proba),
]
for model_name, pred_proba in pr_inputs:
    # Column 1 holds the predicted probability of the class encoded as 1.
    positive_scores = pred_proba[:, 1]
    precision, recall, thresholds = precision_recall_curve(y_test, positive_scores)
    avg_precision = average_precision_score(y_test, positive_scores)
    # Area under the PR curve by trapezoidal integration, reported next to the average precision.
    pr_auc = auc(recall, precision)
    plt.plot(recall, precision, label=f'{model_name}, AP={avg_precision:.3f}; AUC={pr_auc:.3f}')

plt.title('PR Curves Classifiers Comparison')
plt.xlabel('Recall (TPR)')
plt.ylabel('Precision')
plt.legend(loc=0)
plt.grid(True)
plt.show()