In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1. Load libraries and read the data

## 1.1. Load libraries

Loading the libraries

In [None]:
import os
from os.path import join
import copy
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd

#Plots
import sklearn
import missingno as msno
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
import graphviz
from sklearn import tree
from sklearn.pipeline import make_pipeline
import plotly.graph_objs as go
import plotly.offline as py


#Data processing, metrics and modeling
from hyperopt import fmin, tpe, hp
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from xgboost import plot_tree
from mlxtend.plotting import plot_decision_regions
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, auc
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve

## 1.2. Read data

In [None]:
# Load the five follow_0N_data.csv files. The first set provides the predictors
# and the second set provides the diabetes outcome used as the label.
koges_dir = '../input/korean-genome-and-epidemiology-study-koges'
data1, data2, data3, data4, data5 = (
    pd.read_csv(f'{koges_dir}/follow_0{wave}_data.csv') for wave in range(1, 6)
)

# 2. Data cleaning and overview

## 2.1. Cleaning Data

In [None]:
# Shuffle the rows; the fixed seed keeps the shuffle reproducible across runs.
df1 = data1.sample(frac=1, random_state=42)

# Columns whose N/A code (77777) can be converted into zero.
replace_column = ['T01_DRDU', 'T01_SOJUAM', 'T01_BEERAM', 'T01_SMDU', 'T01_SMAM',
                  'T01_PREG', 'T01_CHILD', 'T01_PMYN_C']
# Assign the result back rather than calling replace(inplace=True) on df1[col]:
# the chained in-place form acts on a temporary object under pandas copy-on-write
# and then leaves df1 unchanged.
df1[replace_column] = df1[replace_column].replace(77777.0, 0)

# Columns coded 1/2, recoded to 0/1 respectively.
# NOTE(review): T01_PREG, T01_CHILD and T01_PMYN_C appear in both lists, so after
# this step a former 77777 (-> 0) is indistinguishable from a former 1 (-> 0).
# Confirm this is intended.
categorical_1_2 = ['T00_SEX', 'T01_PSM', 'T01_EXER', 'T01_HTN', 'T01_LIP', 'T01_FMFHT',
                   'T01_FMMHT', 'T01_FMFDM', 'T01_FMMDM', 'T01_PREG', 'T01_CHILD', 'T01_PMYN_C']
df1[categorical_1_2] = df1[categorical_1_2].replace({1: 0, 2: 1})

# Remove the samples that have a T2DM history or a fasting blood glucose >= 126.
# The sentinel codes (66666, 77777, 99999) are still present at this point and are
# numerically >= 126, so rows with a coded-missing T01_GLU0 are removed here too.
already_diabetic = (df1.T01_DM == 2) | (df1.T01_GLU0 >= 126)

# Columns that cannot be used for T2DM prediction.
missing_column = ['T00_DATA_CLASS', 'T01_EDATE', 'T01_SMAG', 'T01_HTNAG', 'T01_DM', 'T01_DMAG',
                  'T01_LIPAG', 'T01_FMFHTAG', 'T01_FMMHTAG', 'T01_FMFDMAG', 'T01_FMMDMAG',
                  'T01_MNSAG', 'T01_FPREGAG', 'T01_FLABAG', 'T01_PMAG_C', 'T01_TAKAM',
                  'T01_RICEAM', 'T01_WINEAM', 'T01_HLIQAM', 'T01_TAKFQ', 'T01_RICEFQ',
                  'T01_WINEFQ', 'T01_HLIQFQ']
df2 = df1.loc[~already_diabetic].drop(columns=missing_column)

# Categorical columns that are one-hot encoded later with get_dummies.
categorical = ['T01_MARRY', 'T01_DRINK', 'T01_SMOKE']

# Replace the unsurveyed (66666) and unanswered (99999) codes with NaN.
# np.nan (not None) keeps the columns numeric, so hist/mean/median keep working.
df3 = df2.replace([66666.0, 99999.0], np.nan)

df3.head()

## 2.2. Data overview

In [None]:
# One histogram per numeric column of the cleaned frame.
hist_figsize = (20, 20)
p = df3.hist(figsize=hist_figsize)

In [None]:
def missing_plot(dataset, key, name):
    """Draw a bar chart summarising missing values per column.

    Each bar's height is the number of non-null entries in that column; the
    text on the bar is the percentage of null entries, rounded to 2 decimals.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose columns are summarised.
    key : hashable
        Label of a column in ``dataset``; only its length is used (row count).
    name : str
        Figure title.
    """
    total_rows = len(dataset[key])
    null_counts = dataset.isnull().sum()

    null_feat = (total_rows - null_counts).to_frame('Count')
    percentage_null = (null_counts / total_rows * 100).to_frame('Count').round(2)

    bar_style = dict(color='#7EC0EE', line=dict(color='#000000', width=1.5))
    trace = go.Bar(x=null_feat.index,
                   y=null_feat['Count'],
                   opacity=0.8,
                   text=percentage_null['Count'],
                   textposition='auto',
                   marker=bar_style)

    py.iplot(dict(data=[trace], layout=dict(title=name)))


missing_plot(df3, 'T00_ID', 'Missing Values (count & %)')

In [None]:
# Horizontal box plots of every numeric column on one shared axis.
plt.style.use('ggplot')  # ggplot2-style visuals

f, ax = plt.subplots(figsize=(11, 15))
ax.set_facecolor('#fafafa')
# The x range is clipped to 200, so values above that are not shown.
ax.set(xlim=(-.05, 200), ylabel='Variables', title="Overview Data Set")
ax = sns.boxplot(data=df3, orient='h', palette='Set2', ax=ax)

## 2.3. Fill in the empty values

**Replace the nan values by either mean or median value**

In [None]:
# Columns imputed with their mean; any other column containing NaN uses its median.
col_mean = ['T01_HEIGHT', 'T01_WEIGHT', 'T01_WAIST', 'T01_HIP', 'T01_PULSE', 'T01_SBP',
            'T01_DBP', 'T01_HBA1C', 'T01_GLU0', 'T01_TCHL', 'T01_HDL']
for header in df3.columns:
    if df3[header].isna().any():
        fill_value = df3[header].mean() if header in col_mean else df3[header].median()
        # Assign back instead of fillna(inplace=True) on df3[header]: the chained
        # in-place call modifies a temporary under pandas copy-on-write.
        df3[header] = df3[header].fillna(fill_value)
df3.head()

In [None]:
# Same chart as before imputation, for comparison.
missing_plot(dataset=df3, key='T00_ID', name='Missing Values after filling (count & %)')

## 2.4. Build the label from the data collected 2 years later

In [None]:
# Build the outcome from the second dataset using the same rule as the baseline
# exclusion: T02_DM == 2 or fasting glucose >= 126.
# Sentinel codes become NaN (not None) so the columns stay numeric.
d2f1 = data2.replace([66666.0, 77777.0, 99999.0], np.nan)

# Drop rows where either outcome field is missing. The previous `column == None`
# test is element-wise and never True, so it never dropped any row.
d2f1 = d2f1.dropna(subset=['T02_DM', 'T02_GLU0'])

# 1 = T2DM at follow-up, 0 = otherwise.
d2f1 = d2f1.assign(T2DM=((d2f1.T02_DM == 2) | (d2f1.T02_GLU0 >= 126)).astype(int))
T2DM_set = d2f1[['T00_ID', 'T2DM']]

# The inner join keeps only participants present in both datasets.
df4 = df3.merge(T2DM_set, on="T00_ID")
label = df4['T2DM']
label

## 2.5. Target

In [None]:
# Split the merged frame by outcome: D holds rows with T2DM != 0, H rows with T2DM == 0.
is_healthy = df4['T2DM'] == 0
D = df4[~is_healthy]
H = df4[is_healthy]

print(df4['T2DM'].value_counts().values.tolist())
#------------COUNT-----------------------
def target_count():
    """Plot a horizontal bar chart of healthy vs. diabetic counts.

    Reads the notebook-level ``df4`` and uses its ``T2DM`` column
    (0 = healthy, anything else = diabetic). The counts are computed per class
    rather than taken from ``value_counts()``, whose frequency ordering would
    swap the hard-coded labels if diabetics outnumbered healthy participants.
    """
    healthy = int((df4['T2DM'] == 0).sum())
    diabetic = int((df4['T2DM'] != 0).sum())
    counts = [healthy, diabetic]

    trace = go.Bar(x=counts,
                   y=['healthy', 'diabetic'],
                   orientation='h',
                   text=counts,
                   textfont=dict(size=15),
                   textposition='auto',
                   opacity=0.8,
                   marker=dict(color=['lightskyblue', 'gold'],
                               line=dict(color='#000000', width=1.5)))

    layout = dict(title='Count of T2DM variable')

    fig = dict(data=[trace], layout=layout)
    py.iplot(fig)

#------------PERCENTAGE-------------------
def target_percent():
    """Plot a pie chart of the healthy vs. diabetic share.

    Reads the notebook-level ``df4`` and uses its ``T2DM`` column
    (0 = healthy, anything else = diabetic). The values are computed per class
    so they always line up with the hard-coded labels, whichever class is
    more frequent.
    """
    healthy = int((df4['T2DM'] == 0).sum())
    diabetic = int((df4['T2DM'] != 0).sum())

    trace = go.Pie(labels=['healthy', 'diabetic'],
                   values=[healthy, diabetic],
                   textfont=dict(size=15),
                   opacity=0.8,
                   marker=dict(colors=['lightskyblue', 'gold'],
                               line=dict(color='#000000', width=1.5)))

    layout = dict(title='Distribution of T2DM variable')

    fig = dict(data=[trace], layout=layout)
    py.iplot(fig)

# Render both views of the class balance: absolute counts, then proportions.
target_count()
target_percent()

## 2.6. StandardScaler

In [None]:
# Standardise the listed measurement columns on a deep copy, so df4 keeps the raw values.
# NOTE(review): the scaler is fitted on all rows, before the train/test split.
scale_column = ['T01_HEIGHT', 'T01_WEIGHT', 'T01_WAIST', 'T01_HIP', 'T01_PULSE', 'T01_SBP',
                'T01_DBP', 'T01_CREATININE', 'T01_AST', 'T01_ALT', 'T01_TCHL', 'T01_HDL',
                'T01_TG', 'T01_INS0']
df4_1 = df4.copy(deep=True)
scaler = StandardScaler()
features = scaler.fit_transform(df4_1[scale_column].values)
df4_1[scale_column] = features
df4_1.head()

## 2.7. Correlation matrix

In [None]:
# Heat map of the pairwise correlations between all columns of the scaled frame.
colormap = plt.cm.viridis
plt.figure(figsize=(10, 10))
corr_matrix = df4_1.corr()
sns.heatmap(corr_matrix, cmap=colormap, square=True, vmax=1.0, linewidths=0.1, annot=False)

In [None]:
# Bar chart of each predictor's correlation with the T2DM label.
predictors = df4_1.drop('T2DM', axis=1)
corr_with_target = predictors.corrwith(df4_1.T2DM)
corr_with_target.plot(kind='bar', grid=True, figsize=(12, 8), title="Correlation with T2DM")

## 2.8. Convert categorical columns to separate columns (get_dummies)

In [None]:
# One-hot encode the categorical columns, then drop the identifier and the label
# so that df5 contains predictors only.
df5 = (
    pd.get_dummies(data=df4_1, columns=categorical)
    .drop(columns=['T00_ID', 'T2DM'])
)
df5.head()

## 2.9. SMOTE Oversampling

In [None]:
# Hold out one third of the rows, stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    df5, label, test_size=1/3, random_state=42, stratify=label)

# Oversample the minority class in the TRAINING split only; the seed makes the
# synthetic rows reproducible.
smote = SMOTE(k_neighbors=5, random_state=42)
smoted_X_train, smoted_y_train = smote.fit_resample(X_train, y_train)

# The held-out split is NOT resampled: evaluating on synthetic SMOTE rows would
# not measure performance on real participants. The `smoted_*_test` names are
# kept so that later cells keep working; the index is reset to a plain
# 0..n-1 range, like the resampled frames.
smoted_X_test = X_test.reset_index(drop=True)
smoted_y_test = y_test.reset_index(drop=True)
smoted_X_train

# 3. Machine Learning

## 3.1. Linear SVM, Radial SVM, LR, KNN, Decision Tree, XGBoost, LightGBM

In [None]:
def _positive_class_scores(fitted_model, X):
    """Return continuous positive-class scores suitable for roc_auc_score.

    Uses ``predict_proba`` when the estimator exposes it and falls back to
    ``decision_function`` otherwise (e.g. SVC without probability estimates).
    """
    if hasattr(fitted_model, 'predict_proba'):
        return fitted_model.predict_proba(X)[:, 1]
    return fitted_model.decision_function(X)


classifiers = ['Linear Svm', 'Radial Svm', 'Logistic Regression', 'KNN', 'Decision Tree',
               'XGBoost', 'LightGBM']
models = [svm.SVC(kernel='linear'), svm.SVC(kernel='rbf'), LogisticRegression(),
          KNeighborsClassifier(), DecisionTreeClassifier(), XGBClassifier(), LGBMClassifier()]

# One [AUC, accuracy] row per model, in the same order as `classifiers`.
abc = []
for model in models:
    model.fit(smoted_X_train, smoted_y_train)
    prediction = model.predict(smoted_X_test)
    # ROC AUC needs a ranking score; feeding hard 0/1 predictions collapses the
    # curve to a single operating point.
    scores = _positive_class_scores(model, smoted_X_test)
    abc.append([roc_auc_score(smoted_y_test, scores),
                accuracy_score(smoted_y_test, prediction)])

models_dataframe = pd.DataFrame(abc, index=classifiers, columns=['AUC', 'Accuracy'])
models_dataframe

## 3.2. Feature Extraction

In [None]:

# Oversample the minority class over the whole predictor frame; the seed makes the
# synthetic rows (and everything derived from them) reproducible.
# NOTE(review): the resampled set is later used for cross-validation, so synthetic
# rows derived from a validation fold can end up in the training folds.
smote = SMOTE(k_neighbors=5, random_state=42)
smoted_X, smoted_y = smote.fit_resample(df5, label)
df6 = smoted_X.copy(deep=True)

##**Remove the columns with high correlation (abs(x) > 0.8)**
# def correlation(dataset, threshold):
#     col_corr = set() # Set of all the names of deleted columns
#     corr_matrix = dataset.corr()
#     for i in range(len(corr_matrix.columns)):
#         for j in range(i):
#             if (abs(corr_matrix.iloc[i, j]) >= threshold) and (corr_matrix.columns[j] not in col_corr):
#                 colname = corr_matrix.columns[i] # getting the name of column
#                 col_corr.add(colname)
#                 if colname in dataset.columns:
#                     print('Delete: '+colname+" "+str(corr_matrix.iloc[i, j]))
#                     del dataset[colname] # deleting the column from the dataset
                    
# correlation(df6,0.8)

**Random Forest Classifier**

In [None]:
from sklearn.ensemble import RandomForestClassifier 
# Fit a random forest on the oversampled data and rank the features by the
# forest's feature_importances_, highest first.
X, Y = df6[df6.columns], smoted_y
model = RandomForestClassifier(n_estimators=100, random_state=0)
model.fit(X, Y)
importances = pd.Series(model.feature_importances_, index=X.columns)
RF_list = importances.sort_values(ascending=False)
RF_list

## 3.3. PCA on top5 features

In [None]:
# For the top 2, 3, 4 and 5 random-forest features, project the rows onto two
# principal components and colour the points by T2DM label (one panel each).
df5_label = df5.copy(deep=True)
df5_label['T2DM'] = label

figure, axes = plt.subplots(nrows=2, ncols=2, figsize=(10, 10))
plt.subplots_adjust(left=0.125,
                    bottom=0.1,
                    right=0.9,
                    top=0.9,
                    wspace=0.2,
                    hspace=0.35)

targets = [0, 1]
colors = ['r', 'g']

# axes.flat walks the grid row by row: (0,0), (0,1), (1,0), (1,1).
for subplot, num in zip(axes.flat, range(2, 6)):
    top_features = list(RF_list[:num].keys())
    print(top_features)

    x = df5.loc[:, top_features].values
    pca = PCA(n_components=2)
    principalComponents = pca.fit_transform(x)
    # Reuse df5's index so the concat below aligns each row with its own label.
    principalDf = pd.DataFrame(data=principalComponents,
                               columns=['principal component 1', 'principal component 2'],
                               index=df5.index)
    finalDf = pd.concat([principalDf, df5_label['T2DM']], axis=1)

    subplot.set_title(str(num) + ' component PCA', fontsize=15)
    for target, color in zip(targets, colors):
        indicesToKeep = finalDf['T2DM'] == target
        subplot.scatter(finalDf.loc[indicesToKeep, 'principal component 1'],
                        finalDf.loc[indicesToKeep, 'principal component 2'],
                        c=color,
                        s=50,
                        label=str(target))
    # Attach the legend to this panel, not to an axis left over from an earlier cell.
    subplot.legend()

# Shared axis captions, drawn once for the whole figure.
figure.text(0.5, 0.04, 'Principal Component 1', fontsize=20, ha='center')
figure.text(0.04, 0.5, 'Principal Component 2', fontsize=20, va='center', rotation='vertical')
    


**Observations:**
1. Using only the top 2 features shows the most stratification

## 3.4. Re-calculate the models using GLU0 and HBA1C

In [None]:
# Scatter of the two features selected above, coloured by T2DM label.
df5_label = df5.copy(deep=True)
df5_label['T2DM'] = label
# x and y are passed by keyword: recent seaborn releases reject them positionally.
sns.lmplot(x='T01_HBA1C', y='T01_GLU0', data=df5_label, fit_reg=False,
           scatter_kws={"s": 50}, markers=["o", "x"], hue="T2DM")
plt.title('HBA1C and GLU0 in 2d plane')

# Two-feature views of the oversampled full set and of the train/test splits.
simple_features = ['T01_HBA1C', 'T01_GLU0']
smoted_X_simple = smoted_X.loc[:, simple_features]

smoted_X_train_simple = smoted_X_train.loc[:, simple_features]
smoted_X_test_simple = smoted_X_test.loc[:, simple_features]

## 3.5. Cross Validation

In [None]:
# Cross-validated ROC AUC of each model on the two selected features.
# `roc` keeps the per-fold scores (plotted in the next cell); `xyz` keeps their means.
classifiers = ['Linear Svm', 'Radial Svm', 'LR', 'KNN', 'Decision Tree', 'XGBoost', 'LightGBM']
models = [svm.SVC(kernel='linear'), svm.SVC(kernel='rbf'), LogisticRegression(),
          KNeighborsClassifier(), DecisionTreeClassifier(), XGBClassifier(), LGBMClassifier()]

roc = [
    cross_val_score(estimator, smoted_X_simple[['T01_HBA1C', 'T01_GLU0']], smoted_y,
                    scoring="roc_auc")
    for estimator in models
]
xyz = [fold_scores.mean() for fold_scores in roc]

new_models_dataframe = pd.DataFrame(xyz, index=classifiers)
new_models_dataframe.columns = ['AUC mean']
new_models_dataframe

In [None]:
# Distribution of the per-fold AUC scores, one box per classifier.
box = pd.DataFrame(roc, index=classifiers)
# Set the figure size BEFORE plotting: rc settings only affect figures created
# afterwards, so the previous order left this plot at the default size.
sns.set(rc={'figure.figsize': (15.7, 8.27)})
sns.boxplot(data=box.T)
plt.show()

The boxplot above shows that the XGBoost and LightGBM models perform best, while the radial SVM performs worst.

## 3.6. XGBoost - Decision region

In [None]:
# Fit a default XGBoost model on the two features and draw its decision regions.
# Plain arrays are used for both fitting and plotting. The labels are cast with the
# concrete `int` type, since the abstract `np.integer` is deprecated as a dtype,
# and `.values` replaces the deprecated Series.ravel().
X_2d = smoted_X_simple.values
y_2d = smoted_y.values.astype(int)

clf = XGBClassifier().fit(X_2d, y_2d)
plot_decision_regions(X_2d, y_2d, clf=clf, legend=2)

# Adding axes annotations
plt.xlabel('T01_HBA1C')
plt.ylabel('T01_GLU0')
plt.title('XGBoost with Diabetes Data')
plt.show()

* The decision-region plot shows over-fitting.
* To reduce over-fitting, lower the `max_depth` (default = 6) and `num_leaves` parameters.

## 3.7. Hyperparameter

In [None]:
# Define the hyperparameter search space
param_space = {
    'subsample': hp.uniform('subsample', 0.1, 0.9),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.05),
}


# Define the objective function
def objective_xgb(params):
    """Return the loss (1 - mean 5-fold ROC AUC) for one sampled parameter set.

    ``params`` supplies ``subsample`` and ``learning_rate``; ``max_depth`` is
    fixed at 2 and ``n_estimators`` at 100. The model is scored on the
    notebook-level two-feature frame ``smoted_X_simple`` against ``smoted_y``.
    """
    xgb_clf = XGBClassifier(
        n_estimators=100,
        max_depth=2,
        subsample=params['subsample'],
        learning_rate=params['learning_rate'],
    )
    fold_auc = cross_val_score(
        xgb_clf,
        smoted_X_simple[['T01_HBA1C', 'T01_GLU0']],
        smoted_y,
        scoring='roc_auc',
        cv=5,
        n_jobs=8,
    )
    return 1 - fold_auc.mean()


# Run the search
# NOTE(review): some hyperopt releases expect a numpy Generator for `rstate`
# rather than a RandomState -- confirm against the installed version.
best_xgb_param = fmin(
    fn=objective_xgb,
    space=param_space,
    algo=tpe.suggest,
    max_evals=50,
    rstate=np.random.RandomState(777),
)

print(best_xgb_param)

**Observation**
1. Optimal XGB parameters: {'learning_rate': 0.04988810025292616, 'subsample': 0.5780885404563175, 'max_depth': 2}

In [None]:
# Cross-validate the tuned XGBoost model and compare it with the default one.
# The hyperparameters are the values reported by the search above.
new_xgb = XGBClassifier(max_depth=2, subsample=0.5780885404563175, learning_rate=0.04988810025292616)

cv_result = cross_val_score(new_xgb, smoted_X_simple[['T01_HBA1C', 'T01_GLU0']], smoted_y,
                            scoring="roc_auc")
new_models_dataframe2 = pd.DataFrame({'New AUC mean': [cv_result.mean()]}, index=['XGBoost'])

# Build the comparison in its own frame. Overwriting `new_models_dataframe` (and
# `classifiers` / `roc` / `xyz`) made this cell fail or produce duplicated columns
# when re-run, and broke re-running the cross-validation boxplot cell.
xgb_auc_comparison = new_models_dataframe.loc[['XGBoost'], ['AUC mean']].merge(
    new_models_dataframe2, left_index=True, right_index=True, how='left')
xgb_auc_comparison['Increase'] = xgb_auc_comparison['New AUC mean'] - xgb_auc_comparison['AUC mean']

xgb_auc_comparison[['AUC mean', 'New AUC mean', 'Increase']]

## 3.8. XGBoost (Optimized) - Decision region

In [None]:
# Refit XGBoost with the tuned hyperparameters and draw its decision regions.
# `clf` is the model used by the evaluation cells that follow.
# The labels are cast with the concrete `int` type, since the abstract `np.integer`
# is deprecated as a dtype, and `.values` replaces the deprecated Series.ravel().
X_2d = smoted_X_simple.values
y_2d = smoted_y.values.astype(int)

clf = XGBClassifier(max_depth=2, subsample=0.5780885404563175,
                    learning_rate=0.04988810025292616).fit(X_2d, y_2d)
plot_decision_regions(X_2d, y_2d, clf=clf, legend=2)

# Adding axes annotations
plt.xlabel('T01_HBA1C')
plt.ylabel('T01_GLU0')
plt.title('Optimized XGBoost with Diabetes Data')
plt.show()

# 4. Evaluation

## 4.1. Confusion matrix

In [None]:
# Predict on the held-out two-feature frame, then tabulate true vs. predicted
# labels (including row/column totals).
smoted_y_pred = clf.predict(smoted_X_test_simple.values)
cf_matrix = confusion_matrix(smoted_y_test, smoted_y_pred)
pd.crosstab(
    index=smoted_y_test,
    columns=smoted_y_pred,
    rownames=['True'],
    colnames=['Predicted'],
    margins=True,
)

In [None]:
# Annotated heat map of the confusion matrix computed above.
cf_frame = pd.DataFrame(cf_matrix)
p = sns.heatmap(cf_frame, annot=True, cmap="YlGnBu", fmt='g')
p.set_title('Confusion matrix', y=1.1)
p.set_ylabel('Actual label')
p.set_xlabel('Predicted label')

## 4.2. Classification Report

In [None]:
# Per-class precision, recall and F1 on the held-out predictions.
report_text = classification_report(smoted_y_test, smoted_y_pred)
print(report_text)

## 4.3. ROC curve

In [None]:
# ROC curve of the tuned XGBoost model on the held-out data.
# `.values` is used so that the input type matches what `clf` was fitted on.
smoted_y_pred_proba = clf.predict_proba(smoted_X_test_simple.values)[:, 1]
fpr, tpr, thresholds = roc_curve(smoted_y_test, smoted_y_pred_proba)
test_auc = roc_auc_score(smoted_y_test, smoted_y_pred_proba)

plt.plot([0, 1], [0, 1], 'k--')  # chance line
# The curve belongs to the XGBoost model; it was mislabelled 'Knn' / 'Decision Tree'.
plt.plot(fpr, tpr, label='XGBoost (AUC = %.3f)' % test_auc)
plt.xlabel('fpr')
plt.ylabel('tpr')
plt.title('XGBoost ROC curve')
plt.legend(loc='lower right')  # without a legend the curve label is never shown
plt.show()

## 4.4. Decision Tree graph

In [None]:
# Draw a tree of the fitted booster, laid out left to right, on a large canvas.
tree_canvas = plt.subplots(figsize=(20, 20))
fig, ax = tree_canvas
plot_tree(clf, ax=ax, rankdir="LR")
plt.show()

# 5. Conclusion

**Results**
* If HbA1C > 5.8%, the fasting glucose cut-off is 90.5 mg/dL.
* Otherwise, the cut-off decreases to 88.5 mg/dL.

**Interpretation**
![Diabetes_cutoff](https://cdn.rcsb.org/pdb101/global-health/diabetes-mellitus/files/Blood-Test-Levels-for-Diagnosing-DM_0.PNG)
* According to the American Diabetes Association (2012) guideline, HbA1C > 5.7% and fasting glucose > 100 mg/dL are diagnosed as 'prediabetes', and this is a global standard.
* However, the fasting glucose cut-off calculated by the ML model, which was trained on Korean electronic medical records (EMR), was significantly lower than the guideline (88.5-90.5 vs. 100 mg/dL).
* The difference could be explained by ancestral (genetic) or lifestyle differences, and further studies comparing the groups could elucidate the reason behind it.

## 6. Usage: Diabetes risk calculator

Q) Predict the diabetes risk of a patient with following features:
* fasting glucose = 140mg/dL
* HbA1C = 7%

In [None]:
# Patient from the question above: HbA1C = 7 %, fasting glucose = 140 mg/dL.
# The column order must match the training features: [T01_HBA1C, T01_GLU0].
# (The previous input, [6.5, 106], did not match the stated patient.)
example = np.asarray([[7.0, 140]])
prediction = clf.predict_proba(example)[0][1]
print('Diabetes risk = '+str(round(prediction*100,1))+"%")

## Thank you all !