In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
df = pd.read_csv("diabetes_binary_health_indicators_BRFSS2015.csv")
X = df.drop("Diabetes_binary", axis = 1)
y = df["Diabetes_binary"]
# Assuming X_train, y_train are your training data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [18]:
from sklearn.ensemble import AdaBoostClassifier
model = AdaBoostClassifier()

In [19]:
model.fit(X_train, y_train)

In [20]:
pred = model.predict(X_test)

In [21]:
from sklearn.metrics import classification_report
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

         0.0       0.88      0.97      0.93     43739
         1.0       0.54      0.20      0.29      6997

    accuracy                           0.87     50736
   macro avg       0.71      0.59      0.61     50736
weighted avg       0.84      0.87      0.84     50736



In [32]:
df_train = X_train.copy()
df_train["Diabetes_binary"] = y_train

In [33]:
df_train["Diabetes_binary"].value_counts() # the train data is imbalanced

0.0    174595
1.0     28349
Name: Diabetes_binary, dtype: int64

In [39]:
df_class1 = df_train[df_train["Diabetes_binary"] == 1]
df_class0 = df_train[df_train["Diabetes_binary"] != 1]

In [41]:
df_sampled = df_class1.sample(n = len(df_class0), replace=True)

In [43]:
len(df_sampled) == len(df_class0)

True

In [46]:
df_train_oversampled = pd.concat([df_sampled, df_class0], axis = 0)

In [47]:
X_train = df_train_oversampled.drop("Diabetes_binary", axis = 1)
y_train = df_train_oversampled["Diabetes_binary"]

In [48]:
model = AdaBoostClassifier()
model.fit(X_train, y_train)

In [49]:
pred = model.predict(X_test)

In [50]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

         0.0       0.95      0.72      0.82     43739
         1.0       0.31      0.79      0.45      6997

    accuracy                           0.73     50736
   macro avg       0.63      0.75      0.63     50736
weighted avg       0.87      0.73      0.77     50736



**we can see an improvement in the f1 score but the accuracy has taken a hit**

In [52]:
from xgboost import XGBClassifier

In [53]:
model = XGBClassifier()
model.fit(X_train, y_train)

In [54]:
pred = model.predict(X_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

         0.0       0.95      0.72      0.82     43739
         1.0       0.31      0.78      0.45      6997

    accuracy                           0.73     50736
   macro avg       0.63      0.75      0.63     50736
weighted avg       0.87      0.73      0.77     50736



**we will try a different oversampling method - SMOTE**

In [77]:
df = pd.read_csv("diabetes_binary_health_indicators_BRFSS2015.csv")
X = df.drop("Diabetes_binary", axis = 1)
y = df["Diabetes_binary"]
# Assuming X_train, y_train are your training data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [56]:
df_train = X_train.copy()
df_train["Diabetes_binary"] = y_train
df_class1 = df_train[df_train["Diabetes_binary"] == 1]
df_class0 = df_train[df_train["Diabetes_binary"] != 1]

In [78]:
from imblearn.over_sampling import SMOTE

In [79]:
y_train.value_counts()

0.0    174809
1.0     28135
Name: Diabetes_binary, dtype: int64

In [80]:
smote = SMOTE(sampling_strategy='minority', random_state= 42)
X_train, y_train = smote.fit_resample(X_train, y_train)

In [73]:
y_train.value_counts()

0.0    174595
1.0    174595
Name: Diabetes_binary, dtype: int64

In [64]:
len(X_train) == len(y_train)

True

now we will check adaboost and XGboost again

In [81]:
model = AdaBoostClassifier()
model.fit(X_train, y_train)

In [82]:
pred = model.predict(X_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

         0.0       0.92      0.87      0.89     43525
         1.0       0.40      0.52      0.45      7211

    accuracy                           0.82     50736
   macro avg       0.66      0.70      0.67     50736
weighted avg       0.84      0.82      0.83     50736



In [83]:
model = XGBClassifier()
model.fit(X_train, y_train)
pred = model.predict(X_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

         0.0       0.88      0.98      0.93     43525
         1.0       0.58      0.18      0.28      7211

    accuracy                           0.87     50736
   macro avg       0.73      0.58      0.60     50736
weighted avg       0.84      0.87      0.83     50736



**we can see that the f1 score improved a litle bit with the adaboost model.
same result with different random_state, meaning that the data is pretty representitive**

**we will now try cross validation ensamble undersample**

In [84]:
df = pd.read_csv("diabetes_binary_health_indicators_BRFSS2015.csv")
X = df.drop("Diabetes_binary", axis = 1)
y = df["Diabetes_binary"]
# Assuming X_train, y_train are your training data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

df_train = X_train.copy()
df_train["Diabetes_binary"] = y_train
df_class1 = df_train[df_train["Diabetes_binary"] == 1]
df_class0 = df_train[df_train["Diabetes_binary"] != 1]

In [87]:
def run_model(start, end, model):
    X_train = pd.concat([df_class0[start:end],df_class1 ], axis = 0).drop("Diabetes_binary", axis = 1)
    y_train = pd.concat([df_class0[start:end],df_class1 ], axis = 0)["Diabetes_binary"]
    
    model = model
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    print(classification_report(y_test, pred))
    return pred

In [105]:
y_pred1 = run_model(0,len(df_class1),AdaBoostClassifier())
y_pred2 = run_model(len(df_class1),len(df_class1)*2,AdaBoostClassifier())
y_pred3 = run_model(len(df_class1)*2 ,len(df_class1)*3,AdaBoostClassifier())
y_pred4 = run_model(len(df_class1)*3 ,len(df_class1)*4,AdaBoostClassifier())
y_pred5 = run_model(len(df_class1)*4 ,len(df_class1)*5,AdaBoostClassifier())
y_pred6 = run_model(len(df_class1)*5, len(df_class0),AdaBoostClassifier())



              precision    recall  f1-score   support

         0.0       0.95      0.72      0.82     43739
         1.0       0.31      0.78      0.45      6997

    accuracy                           0.73     50736
   macro avg       0.63      0.75      0.63     50736
weighted avg       0.87      0.73      0.77     50736

              precision    recall  f1-score   support

         0.0       0.96      0.72      0.82     43739
         1.0       0.31      0.79      0.45      6997

    accuracy                           0.73     50736
   macro avg       0.63      0.75      0.63     50736
weighted avg       0.87      0.73      0.77     50736

              precision    recall  f1-score   support

         0.0       0.95      0.72      0.82     43739
         1.0       0.31      0.78      0.45      6997

    accuracy                           0.73     50736
   macro avg       0.63      0.75      0.63     50736
weighted avg       0.87      0.73      0.77     50736

              preci

array([ True,  True,  True, ...,  True,  True,  True])

In [103]:
y_pred_final = y_pred1
for i in range(len(y_test)):
    n_ones = y_pred1[i] + y_pred2[i] + y_pred3[i] + y_pred4[i] + y_pred5[i]+ y_pred6[i]
    if n_ones >=3:
        y_pred_final[i] =1
    else:
        y_pred_final[i] = 0

In [104]:
print(classification_report(y_pred_final, y_test))

              precision    recall  f1-score   support

         0.0       0.72      0.96      0.82     32717
         1.0       0.79      0.31      0.44     18019

    accuracy                           0.73     50736
   macro avg       0.75      0.63      0.63     50736
weighted avg       0.74      0.73      0.69     50736



In [107]:
 a = y_pred1 == y_pred2

In [110]:
pd.Series(a).value_counts()

True     49840
False      896
dtype: int64

In [111]:
y_pred1 = run_model(0,len(df_class1),XGBClassifier())
y_pred2 = run_model(len(df_class1),len(df_class1)*2,XGBClassifier())
y_pred3 = run_model(len(df_class1)*2 ,len(df_class1)*3,XGBClassifier())
y_pred4 = run_model(len(df_class1)*3 ,len(df_class1)*4,XGBClassifier())
y_pred5 = run_model(len(df_class1)*4 ,len(df_class1)*5,XGBClassifier())
y_pred6 = run_model(len(df_class1)*5, len(df_class0),XGBClassifier())

              precision    recall  f1-score   support

         0.0       0.96      0.70      0.81     43739
         1.0       0.30      0.80      0.44      6997

    accuracy                           0.72     50736
   macro avg       0.63      0.75      0.62     50736
weighted avg       0.87      0.72      0.76     50736

              precision    recall  f1-score   support

         0.0       0.96      0.70      0.81     43739
         1.0       0.30      0.80      0.44      6997

    accuracy                           0.72     50736
   macro avg       0.63      0.75      0.63     50736
weighted avg       0.87      0.72      0.76     50736

              precision    recall  f1-score   support

         0.0       0.96      0.70      0.81     43739
         1.0       0.30      0.80      0.44      6997

    accuracy                           0.71     50736
   macro avg       0.63      0.75      0.62     50736
weighted avg       0.87      0.71      0.76     50736

              preci

In [112]:
y_pred_final = y_pred1
for i in range(len(y_test)):
    n_ones = y_pred1[i] + y_pred2[i] + y_pred3[i] + y_pred4[i] + y_pred5[i]+ y_pred6[i]
    if n_ones >=3:
        y_pred_final[i] =1
    else:
        y_pred_final[i] = 0

In [113]:
print(classification_report(y_pred_final, y_test))

              precision    recall  f1-score   support

           0       0.70      0.96      0.81     31806
           1       0.81      0.30      0.44     18930

    accuracy                           0.71     50736
   macro avg       0.76      0.63      0.62     50736
weighted avg       0.74      0.71      0.67     50736

