# [Predicting the Risk of Diabetes at Early Stage Using Machine Learning](http://)

Diabetes is a chronic, metabolic disease characterized by elevated levels of blood glucose (or blood sugar), which leads over time to serious damage to the heart, blood vessels, eyes, kidneys and nerves. The most common is type 2 diabetes, usually in adults, which occurs when the body becomes resistant to insulin or doesn't make enough insulin. In the past three decades the prevalence of type 2 diabetes has risen dramatically in countries of all income levels. Type 1 diabetes, once known as juvenile diabetes or insulin-dependent diabetes, is a chronic condition in which the pancreas produces little or no insulin by itself. For people living with diabetes, access to affordable treatment, including insulin, is critical to their survival. There is a globally agreed target to halt the rise in diabetes and obesity by 2025. 

![](https://northmemorial.com/wp-content/uploads/2016/10/Diabetes-illustration.png)

> About 422 million people worldwide have diabetes, the majority living in low- and middle-income countries, and 1.6 million deaths are directly attributed to diabetes each year. Both the number of cases and the prevalence of diabetes have been steadily increasing over the past few decades. 

![](https://everydaydiabetes.com/wp-content/uploads/Diabetes-Statistics-Worldwide-Everyday-Diabetes.jpg)

In [None]:
import pandas as pd
import pandas_profiling as pp
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import os
import plotly.graph_objects as go
import plotly.io as pio
import pickle

# Metrics
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, auc, roc_curve

# Validation
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.pipeline import Pipeline

# Tuning
from sklearn.model_selection import GridSearchCV

# Feature Extraction
from sklearn.feature_selection import RFE

# Preprocessing
from sklearn.preprocessing import MinMaxScaler, StandardScaler, Normalizer, Binarizer, LabelEncoder

from sklearn.pipeline import Pipeline, make_pipeline

# Models
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Ensembles
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier

# Suppress warnings so they do not clutter the notebook output.
warnings.filterwarnings('ignore')
# IPython magic (not plain Python): render matplotlib figures inline.
%matplotlib inline

# Plot styling: seaborn "whitegrid" theme with the axes grid switched off,
# and "plotly_white" as the default plotly template.
sns.set_style("whitegrid", {'axes.grid' : False})
pio.templates.default = "plotly_white"

# Load the dataset from the Kaggle input directory and preview its first rows.
df = pd.read_csv('/kaggle/input/early-stage-diabetes-risk-prediction-datasets/diabetes_data_upload.csv')
df.head()


# [Size of Dataset](http://)
> Our data contains the sign and symptom data of newly diabetic or would-be diabetic patients and has 520 rows and 17 columns.

In [None]:
# Report the dataset dimensions as a (rows, columns) tuple.
dataset_shape = df.shape
print("Number of Instances and Attributes:", dataset_shape)

# [Attribute Information:]()

* **Age**: 20-65
* **Gender**: Male, Female ( 1 , 0 )
* **Polyuria**: Yes, No ( 1 , 0 )
* **Polydipsia**: Yes, No ( 1 , 0 )
* **sudden weight loss**: Yes, No ( 1 , 0 )
* **weakness**: Yes, No ( 1 , 0 )
* **Polyphagia**: Yes, No ( 1 , 0 )
* **Genital thrush**: Yes, No ( 1 , 0 )
* **visual blurring**: Yes, No ( 1 , 0 )
* **Itching**: Yes, No ( 1 , 0 )
* **Irritability**: Yes, No ( 1 , 0 )
* **delayed healing**: Yes, No ( 1 , 0 )
* **partial paresis**: Yes, No ( 1 , 0 )
* **muscle stiffness**: Yes, No ( 1 , 0 )
* **Alopecia**: Yes, No ( 1 , 0 )
* **Obesity**: Yes, No ( 1 , 0 )
* **class**: Positive, Negative ( 1 , 0 )



In [None]:
# Display the column labels of the dataset.
df.columns

# [Data type of each column](http://)

In [None]:
# Print the dtype of every column.
print(df.dtypes)

# [Checking for NaN Values](http://)

In [None]:
# Count the missing (NaN) entries in each column and display the result.
missing_per_column = df.isna().sum()
missing_per_column

# [Data Analysis](http://)



In [None]:

# Normalise the column names: spaces become underscores, and several
# lower-case symptom columns are capitalised.
df.columns = df.columns.str.replace(" ", "_")
df.rename(columns={'weakness':'Weakness', 'visual_blurring':'Visual_blurring', 'delayed_healing':'Delayed_healing', 'partial_paresis':'Partial_paresis','muscle_stiffness':'Muscle_stiffness'}, inplace=True)

# Take the slice labels from the counts themselves. value_counts() orders its
# result by frequency, so a hard-coded ['Male', 'Female'] list is only correct
# while males happen to be the larger group.
values = df.Gender.value_counts()
labels = values.index.tolist()

# Key the colours by gender so each colour follows its label, not its position.
gender_colors = {'Male': 'STEELBLUE', 'Female': 'crimson'}
colors = [gender_colors.get(label, 'gray') for label in labels]

# Pie chart of the gender split; each slice shows its label and percentage.
fig_1 = go.Figure(data=[go.Pie(labels=labels, values=values, textinfo='label+percent', showlegend=False)])

fig_1.update_traces(marker=dict(colors=colors, line=dict(color='#000000', width=0.5)))

fig_1.update_layout(
    margin={"r":0,"t":100,"l":0,"b":0},
    title_text="<br>Gender Distribution:<br>",
    font=dict(size=15, color='black', family="Arial, Balto, Courier New, Droid Sans"),
)
fig_1.show()

In [None]:
# Age distribution of the two classes, drawn side by side as a swarm plot
# (left) and a violin plot (right).
fig, (swarm_ax, violin_ax) = plt.subplots(1, 2, figsize=(22, 10))
age_by_class = dict(x="class", y="Age", data=df, palette="Set1")

sns.swarmplot(ax=swarm_ax, **age_by_class)
sns.violinplot(ax=violin_ax, **age_by_class)
fig.suptitle('Age of Positives vs Negatives', fontweight="bold");



In [None]:
# Positive/negative counts, broken down by gender.
gender_palette = ['STEELBLUE', 'crimson']
plt.figure(figsize=(12, 8))
sns.countplot(data=df, x="class", hue="Gender", palette=gender_palette);
plt.title('Gender of Positives vs Negatives', fontweight="bold");

In [None]:
# Overall balance between the two outcome classes.
outcome = df['class']
title_style = dict(fontweight="bold", alpha=0.8)

plt.figure(figsize=(10, 5))
sns.countplot(x=outcome, palette='Set1');
plt.title('Class Distribution', **title_style);

In [None]:
# Replace every object-dtype column with integer codes; columns of any other
# dtype are left untouched. A fresh encoder is fitted per column.
text_columns = [col for col in df.columns if df[col].dtype == 'object']
for col in text_columns:
    raw_values = list(df[col].values)
    encoder = LabelEncoder().fit(raw_values)
    df[col] = encoder.transform(raw_values)

# Summary statistics of the encoded frame, one row per column.
df.describe().T

In [None]:
# Annotated heat map of the pairwise correlations between all columns.
correlations = df.corr()

plt.figure(figsize=(18, 15));
plt.title('Pearson Correlation of Features', y=1.05, size=50);
sns.heatmap(
    correlations,
    annot=True,
    square=True,
    cmap=plt.cm.RdBu,
    vmax=1.0,
    linewidths=0.1,
    linecolor='white',
);

# [Splitting data into training and validation sets](http://)



In [None]:
# Separate the target column from the features, then hold out 20% of the rows
# for validation, with a fixed seed so the split is repeatable.
y = df['class']
X = df.drop(columns='class')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Show the shape of each of the four pieces.
for label, part in (('X_train', X_train), ('X_test', X_test), ('y_train', y_train), ('y_test', y_test)):
    print(f'{label}: ', part.shape)

# [Training model without preprocessing techniques](http://)



In [None]:
# Baseline comparison: score each classifier with its default settings using
# 10-fold cross-validation on the training split.
models = [
    (' LR ', LogisticRegression()),
    (' LDA ', LinearDiscriminantAnalysis()),
    (' KNN ', KNeighborsClassifier()),
    (' NB ', GaussianNB()),
    (' SVM ', SVC()),
    ('CART', DecisionTreeClassifier()),
]

# random_state only has an effect when the folds are shuffled, and current
# scikit-learn raises a ValueError for KFold(random_state=0) without
# shuffle=True. The splitter is built once because it does not depend on the
# model; with an integer seed it yields the same folds on every call.
Kfold = KFold(n_splits=10, shuffle=True, random_state=0)

results = []  # per-model arrays of fold accuracies
names = []    # model names, in the same order as `results`

for name, model in models:
    cv_results = cross_val_score(model, X_train, y_train, cv=Kfold, scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    # name: mean accuracy (standard deviation across folds)
    print(f"{name}: {cv_results.mean():f} ({cv_results.std():f})")


# [Improve Performance with Ensembles](http://)



In [None]:
# Ensemble comparison: score each ensemble classifier with its default
# settings using 10-fold cross-validation on the training split.
models = [
    ('Adab', AdaBoostClassifier()),
    ('Bagging', BaggingClassifier()),
    ('GBC', GradientBoostingClassifier()),
    ('RF', RandomForestClassifier()),
    ('ET', ExtraTreesClassifier()),
]

# random_state only has an effect when the folds are shuffled, and current
# scikit-learn raises a ValueError for KFold(random_state=0) without
# shuffle=True. The splitter is built once because it does not depend on the
# model; with an integer seed it yields the same folds on every call.
Kfold = KFold(n_splits=10, shuffle=True, random_state=0)

results = []  # per-model arrays of fold accuracies
names = []    # model names, in the same order as `results`

for name, model in models:
    cv_results = cross_val_score(model, X_train, y_train, cv=Kfold, scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    # name: mean accuracy (standard deviation across folds)
    print(f"{name}: {cv_results.mean():f} ({cv_results.std():f})")

# [Parameter tuning](http://)

In [None]:
# Exhaustive grid search over ExtraTrees hyper-parameters, scored by accuracy
# with cv=10 on the training split.
model = ExtraTreesClassifier()
grid = {
    'n_estimators': [5, 10, 15, 100, 1000],
    'criterion': ['gini', 'entropy'],
    'max_depth': [3, 5, 15, 25],
    'max_features': ['sqrt', 'log2'],
}

grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=10, scoring='accuracy')
grid_result = grid_search.fit(X_train, y_train)

# Best combination first, then the mean/std test score of every candidate.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
cv_summary = grid_result.cv_results_
for mean, stdev, param in zip(cv_summary['mean_test_score'], cv_summary['std_test_score'], cv_summary['params']):
    print("%f (%f) with: %r" % (mean, stdev, param))

# [Classification Metrics](http://)

In [None]:
# Final model: standardise the features, fit ExtraTrees with fixed
# hyper-parameters, then report accuracy on both splits and a per-class report.
classifier = ExtraTreesClassifier(n_estimators=15, max_depth=15, max_features='sqrt', criterion='gini')
pipeline = make_pipeline(StandardScaler(), classifier)
model = pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

train_accuracy = model.score(X_train, y_train) * 100
validation_accuracy = model.score(X_test, y_test) * 100
print(f"Training Accuracy Score: {train_accuracy:.1f}%")
print(f"Validation Accuracy Score: {validation_accuracy:.1f}%")
print("-----"*30)
print(classification_report(y_test,y_pred))

# [Confusion Matrix](http://)

In [None]:
# Confusion matrix of the validation predictions as an annotated heat map.
matrix = confusion_matrix(y_test, y_pred)
axis_font = dict(fontsize=15)

plt.figure(figsize=(10, 5))
sns.heatmap(matrix, annot=True, fmt='2.0f');
plt.title("Confusion Matrix", fontweight="bold", fontsize=20);
plt.ylabel('Actual label', **axis_font)
plt.xlabel('Predicted label', **axis_font)
plt.show()

# [ROC Curve and AUC](http://)

In [None]:
# A ROC curve needs a continuous score per sample. Hard 0/1 predictions leave
# roc_curve with a single usable threshold, which collapses the curve to one
# elbow point and misreports the AUC. Use the predicted probability of the
# positive class instead: column 1 of predict_proba, i.e. the larger label.
y_score = pipeline.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_score)

plt.figure(figsize=(8,6))
print(f"roc_auc score: {auc(fpr, tpr)*100:.1f}%")
plt.plot(fpr, tpr, color='orange', label='ROC')
# Diagonal reference line: the performance of a random classifier.
plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
plt.xlabel('False Positive Rate',fontsize=12)
plt.ylabel('True Positive Rate', fontsize=12)
plt.title('Receiver Operating Characteristic (ROC) Curve', fontsize=20)
plt.legend()
plt.show()

In [None]:
# Save the fitted model to model.pkl.
# The context manager flushes and closes the file handle even if pickling
# fails; a bare open() call would leave it open.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

# [Reference](http://)

* https://en.wikipedia.org/wiki/Diabetes
* https://www.who.int/health-topics/diabetes#tab=tab_1
* https://www.medicalnewstoday.com/articles/323185
* https://www.diabetes.org/diabetes/type-1/symptoms
* https://www.medicinenet.com/diabetes_symptoms_in_men/article.htm