In [None]:
import  numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import plotly.express as px
import lightgbm as lgb

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE

# Name of the plotly template to use for plotly figures. plotly registers its
# built-in template as "simple_white" (with an underscore); "simple white" is
# not a known template name.
px_template = "simple_white"

In [None]:
# Load the diabetes health-indicators CSV into the DataFrame `data` that the
# cells below work on. The path is relative, so it is resolved against the
# kernel's current working directory.
data = pd.read_csv('./data/diabetes_012_health_indicators_BRFSS2015.csv')

# **1. Data Preparation**

Data: https://www.kaggle.com/code/bharat04/diabetes-detailed-eda-with-conclusion/data

target variable Diabetes_012:
- 0 : no diabetes, or diabetes only during pregnancy
- 1 : prediabetes
- 2 : diabetes


## Summary: 

- 22 features (15 Discrete & 7 continuous)
- Major feature variables for diabetes are: HighBP, HighChol, BMI, Stroke, GenHlth, MentHlth, PhysHlth, Age, Education and Income.
- Features that seem to increase the diabetes risk are: Smoking and HvyAlcoholConsump, Stroke and HeartDiseaseorAttack, HighBP and HighChol.
- Features with the least effect are: PhysActivity, Fruits, Veggies, AnyHealthcare, CholCheck. (They seem to help in decreasing the risk.)

In [None]:
# Summary statistics of every column, transposed so that each feature is a row.
data.describe().transpose()

In [None]:
# How many distinct (non-null) values does each feature take?
unique_values = {column: data[column].nunique() for column in data.columns}

pd.Series(unique_values, name='unique value count').to_frame()

In [None]:
# Number of missing (null) entries in each column of data.
data.isna().sum()

In [None]:
# Check for duplicated rows in data and remove them.
duplicates = data[data.duplicated()]
# A bare expression in the middle of a cell is never displayed, so report the
# number of duplicated rows explicitly.
print("Duplicate rows:", len(duplicates))

# Dropping rows leaves gaps in the index. Reset it so the index is contiguous
# (0..n-1) again: later cells look rows up with integer keys such as
# `groundtruth[i]`, which pandas resolves by index *label*, not by position.
data = data.drop_duplicates().reset_index(drop=True)
data.shape

In [None]:
# Visualize every column of data as a histogram to get a first impression of
# the distributions. The trailing semicolon suppresses the textual repr of the
# object returned by `hist` in the notebook output.
data.hist(figsize=(20,15));

In [None]:
# Correlation heatmap of all columns.
fig, ax = plt.subplots(figsize=(20, 10))
corrmat = data.corr()
sns.heatmap(corrmat, annot=True, cmap='YlOrRd', ax=ax)
ax.set_title("Correlation of features")

In [None]:
# Check the BMI feature for outliers by condensing it into three groups:
# BMI <= 20, 20 < BMI <= 50 and BMI > 50.
bmi = data["BMI"]
BMI_0 = data[bmi <= 20]
BMI_1 = data[(bmi > 20) & (bmi <= 50)]
BMI_2 = data[bmi > 50]

bmi_groups = {"BMI_0": BMI_0, "BMI_1": BMI_1, "BMI_2": BMI_2}

# Size of each group.
for group_name, group in bmi_groups.items():
    print(f"len of {group_name} : ", len(group))

# Percentage of each Diabetes_012 class within every group.
for group_name, group in bmi_groups.items():
    class_share = (group["Diabetes_012"].value_counts() / len(group)) * 100
    print(f"{group_name}: \n\n ", class_share)

# **2. model**

In [None]:
# Building the model inputs: separate the target column from the features.
target = data['Diabetes_012']
features = data.drop(columns='Diabetes_012')
groundtruth = target

# Hold out 20 % of the rows as test set; the fixed random_state keeps the
# split reproducible between runs.
(features_train, features_test,
 target_train, target_test) = train_test_split(
    features, target, test_size=0.2, random_state=42
)

In [None]:
# Shapes of the four partitions produced by the train/test split.
for partition in (features_train, features_test, target_train, target_test):
    print(partition.shape)

In [None]:
# Using SMOTE for imbalanced classification: resample the training split only.
smt = SMOTE(random_state=42)
# Series.ravel() is deprecated in recent pandas releases; to_numpy() hands the
# resampler the same 1-D array of labels without triggering the warning.
X_train_res, Y_train_res = smt.fit_resample(features_train, target_train.to_numpy())

In [None]:
#target variable Diabetes_012:
#- 0 : no diabetes/ only during pregnancy
#- 1 : prediabetes
#- 2 : diabetes

### Classifier: #######################################################################################################


def no_diabetes_classifier() -> bool:
    """Placeholder rule for the "no diabetes" class: currently always True."""
    return True

def diabetes_classifier() -> bool:
    """Placeholder rule for the "diabetes" class: currently always False."""
    return False

def classifier(features):
    """Predict a Diabetes_012 label for every row of ``features``.

    Only the number of rows (``len(features)``) is used; the per-row decision
    is delegated to ``diabetes_classifier`` and ``no_diabetes_classifier``,
    which take no arguments.

    Returns a list with one float per row: 2.0 (diabetes), 0.0 (no diabetes)
    or 1.0 (neither rule fired, i.e. the patient is at risk).
    """
    def label_for_row():
        # is it diabetes?
        if diabetes_classifier():
            return 2.0
        # is it no diabetes?
        if no_diabetes_classifier():
            return 0.0
        # it is neither, so the patient is at risk
        return 1.0

    return [label_for_row() for _ in range(len(features))]


predictions = classifier(features)
# Spot-check a single row. `predictions` is a plain list and therefore indexed
# by position, so the ground truth has to be read by position as well (.iloc).
# `groundtruth[1242]` would look up the index *label* 1242 instead, which is a
# different row (or a KeyError) whenever the index is not exactly 0..n-1.
print(predictions[1242])
print(groundtruth.iloc[1242])


In [None]:
# Recompute the predictions for all rows; the performance metrics (true/false
# positives and negatives) are calculated from them.
predictions = classifier(features)

In [None]:
def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def calc_metrics(predictions, groundtruth):
    """Compute micro-averaged recall, specificity and precision.

    Each class (0 = no diabetes, 1 = prediabetes, 2 = diabetes) is scored
    one-vs-rest; the per-class true/false positive/negative counts are summed
    over all three classes before the ratios are taken.

    Parameters
    ----------
    predictions : sequence
        Predicted labels, one per sample.
    groundtruth : sequence or pandas Series
        True labels in the same order as ``predictions``. The two inputs are
        paired by *position*; index labels of a Series are ignored, so a
        Series with a non-contiguous index (e.g. after dropping rows) is
        handled correctly.

    Returns
    -------
    tuple of float
        ``(recall, specificity, precision)`` where
        recall = TP / (TP + FN), specificity = TN / (TN + FP) and
        precision = TP / (TP + FP). A ratio whose denominator is zero
        (e.g. for empty input) is reported as 0.0.

    Raises
    ------
    ValueError
        If ``predictions`` and ``groundtruth`` differ in length.

    Notes
    -----
    Labels other than 0, 1 and 2 are never counted as a positive of any class.
    """
    # Materialise both inputs as lists so that elements are paired strictly by
    # position. Indexing a pandas Series with an integer is a label lookup.
    predicted = list(predictions)
    actual = list(groundtruth)
    if len(predicted) != len(actual):
        raise ValueError(
            "predictions and groundtruth must have the same length, got "
            f"{len(predicted)} and {len(actual)}"
        )

    class_labels = (0, 1, 2)
    TP_total = TN_total = FP_total = FN_total = 0
    for predicted_label, true_label in zip(predicted, actual):
        # One-vs-rest: every sample contributes exactly one count per class.
        for label in class_labels:
            is_predicted = predicted_label == label
            is_actual = true_label == label
            if is_predicted and is_actual:
                TP_total += 1
            elif is_predicted:
                FP_total += 1
            elif is_actual:
                FN_total += 1
            else:
                TN_total += 1

    # recall = sensitivity = true positive rate
    recall = _safe_ratio(TP_total, TP_total + FN_total)
    # specificity = true negative rate
    specificity = _safe_ratio(TN_total, TN_total + FP_total)
    precision = _safe_ratio(TP_total, TP_total + FP_total)
    return (recall, specificity, precision)


# Score the predictions against the ground truth and show the three metrics.
recall, specificity, precision = calc_metrics(predictions, groundtruth)
print(recall, specificity, precision)
    