In [None]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

# Load the health-indicator table from the CSV path below.
df = pd.read_csv('/content/diabetes_binary_health_indicators_BRFSS2015.csv')

# Two targets are modelled separately from the same table.
y_diabetes = df['Diabetes_binary']
y_heart = df['HeartDiseaseorAttack']

# Features are every column except the two targets and 'Age'. 'Age' is kept
# in `df` because it is used further down to slice the rows into groups.
non_feature_columns = ['Diabetes_binary', 'HeartDiseaseorAttack', 'Age']
X = df.drop(columns=non_feature_columns)

# Hold out 20% of the rows as the diabetes test set (seeded so the split is
# reproducible).
X_train, X_test, y_train_diabetes, y_test_diabetes = train_test_split(
    X,
    y_diabetes,
    test_size=0.2,
    random_state=42,
)

# Apply SMOTE to the training portion only; the test split is left as-is so
# the metrics are computed on un-resampled data.
smote = SMOTE(random_state=42)
diabetes_resampled = smote.fit_resample(X_train, y_train_diabetes)
X_train_resampled_diabetes, y_train_resampled_diabetes = diabetes_resampled

# Base learners for the ensemble, each with a fixed seed.
rf_classifier = RandomForestClassifier(random_state=42)
gb_classifier = GradientBoostingClassifier(random_state=42)
ada_classifier = AdaBoostClassifier(random_state=42)
xgb_classifier = XGBClassifier(random_state=42)

# Soft voting: the ensemble averages the base learners' predicted class
# probabilities instead of counting hard votes.
base_estimators_diabetes = [
    ('rf', rf_classifier),
    ('gb', gb_classifier),
    ('ada', ada_classifier),
    ('xgb', xgb_classifier),
]
ensemble_classifier_diabetes = VotingClassifier(
    estimators=base_estimators_diabetes,
    voting='soft',
)

# Fit the ensemble on the SMOTE-resampled training data, then predict on the
# held-out test rows: hard labels for the threshold metrics and the
# positive-class probability (column 1) for AUC-ROC.
ensemble_classifier_diabetes.fit(X_train_resampled_diabetes, y_train_resampled_diabetes)
y_pred_diabetes_all = ensemble_classifier_diabetes.predict(X_test)
y_pred_proba_diabetes_all = ensemble_classifier_diabetes.predict_proba(X_test)[:, 1]

# Test-set metrics for the diabetes model.
auc_roc_diabetes_all = roc_auc_score(y_test_diabetes, y_pred_proba_diabetes_all)
accuracy_diabetes_all = accuracy_score(y_test_diabetes, y_pred_diabetes_all)
precision_diabetes_all = precision_score(y_test_diabetes, y_pred_diabetes_all)
recall_diabetes_all = recall_score(y_test_diabetes, y_pred_diabetes_all)
f1_diabetes_all = f1_score(y_test_diabetes, y_pred_diabetes_all)

# Report every metric with three decimals, followed by a blank separator line.
print("Ensemble - Diabetes Prediction Metrics:")
for metric_label, metric_value in (
    ("AUC-ROC", auc_roc_diabetes_all),
    ("Accuracy", accuracy_diabetes_all),
    ("Precision", precision_diabetes_all),
    ("Recall", recall_diabetes_all),
    ("F1 Score", f1_diabetes_all),
):
    print(f"{metric_label}: {metric_value:.3f}")
print("")

# Split again, this time pairing the features with the heart-disease target.
# This rebinds X_train / X_test; the SMOTE instance created earlier is reused.
X_train, X_test, y_train_heart, y_test_heart = train_test_split(
    X,
    y_heart,
    test_size=0.2,
    random_state=42,
)
heart_resampled = smote.fit_resample(X_train, y_train_heart)
X_train_resampled_heart, y_train_resampled_heart = heart_resampled

# Heart-disease ensemble: the same four base learners and soft voting that
# the diabetes ensemble uses. Passing the same estimator objects is safe
# because VotingClassifier fits clones of the estimators it is given.
estimator_names = ('rf', 'gb', 'ada', 'xgb')
estimator_objects = (rf_classifier, gb_classifier, ada_classifier, xgb_classifier)
ensemble_classifier_heart = VotingClassifier(
    estimators=list(zip(estimator_names, estimator_objects)),
    voting='soft',
)

# Fit on the SMOTE-resampled heart-disease training data, then predict on the
# held-out test rows: hard labels for the threshold metrics and the
# positive-class probability (column 1) for AUC-ROC.
ensemble_classifier_heart.fit(X_train_resampled_heart, y_train_resampled_heart)
y_pred_heart_all = ensemble_classifier_heart.predict(X_test)
y_pred_proba_heart_all = ensemble_classifier_heart.predict_proba(X_test)[:, 1]

# Test-set metrics for the heart-disease model.
auc_roc_heart_all = roc_auc_score(y_test_heart, y_pred_proba_heart_all)
accuracy_heart_all = accuracy_score(y_test_heart, y_pred_heart_all)
precision_heart_all = precision_score(y_test_heart, y_pred_heart_all)
recall_heart_all = recall_score(y_test_heart, y_pred_heart_all)
f1_heart_all = f1_score(y_test_heart, y_pred_heart_all)

# Report every metric with three decimals, followed by a blank separator line.
print("Ensemble - Heart Disease Prediction Metrics:")
for metric_label, metric_value in (
    ("AUC-ROC", auc_roc_heart_all),
    ("Accuracy", accuracy_heart_all),
    ("Precision", precision_heart_all),
    ("Recall", recall_heart_all),
    ("F1 Score", f1_heart_all),
):
    print(f"{metric_label}: {metric_value:.3f}")
print("")

# Per-age-group evaluation: for every distinct value of the 'Age' column,
# train a fresh copy of each ensemble on that group's rows only and record
# its hold-out AUC-ROC.
MIN_GROUP_SAMPLES = 50


def _age_group_auc(base_model, sampler, X_group, y_group):
    """Return the hold-out AUC-ROC of a fresh copy of *base_model* on one group.

    The group is split 80/20 with a fixed seed, the training part is
    resampled with *sampler*, and an unfitted clone of *base_model* is
    trained and scored on the untouched test part.

    Args:
        base_model: Estimator used as a template; it is cloned, never refit.
        sampler: Resampler exposing ``fit_resample`` and an integer
            ``k_neighbors`` (the module-level SMOTE instance).
        X_group: Feature rows of the group.
        y_group: Target values aligned with ``X_group``.

    Returns:
        The AUC-ROC as a float, or ``None`` when the split cannot be
        resampled or scored (see the guards below).
    """
    X_tr, X_te, y_tr, y_te = train_test_split(
        X_group, y_group, test_size=0.2, random_state=42
    )

    # AUC-ROC is undefined when the test labels contain a single class, and
    # there is nothing to oversample when the training labels do.
    if y_tr.nunique() < 2 or y_te.nunique() < 2:
        return None

    # SMOTE looks up k_neighbors neighbours for each minority sample within
    # the minority class itself, so it needs more than k_neighbors of them.
    if y_tr.value_counts().min() <= sampler.k_neighbors:
        return None

    X_balanced, y_balanced = sampler.fit_resample(X_tr, y_tr)

    # Fit a clone so the ensembles trained on the full data set above keep
    # their fitted state instead of being overwritten by each age group.
    group_model = clone(base_model)
    group_model.fit(X_balanced, y_balanced)
    return roc_auc_score(y_te, group_model.predict_proba(X_te)[:, 1])


results_diabetes = {}
results_heart = {}

for age_value in sorted(df['Age'].unique()):
    # Build the row mask once and reuse it for the features and both targets.
    age_mask = df['Age'] == age_value
    X_age = X[age_mask]

    if X_age.shape[0] < MIN_GROUP_SAMPLES:
        print(f"Skipping age value {age_value} due to insufficient samples.")
        continue

    # Same procedure for both targets; each writes into its own result dict.
    for target_name, y_target, base_model, results in (
        ("diabetes", y_diabetes, ensemble_classifier_diabetes, results_diabetes),
        ("heart disease", y_heart, ensemble_classifier_heart, results_heart),
    ):
        group_auc = _age_group_auc(base_model, smote, X_age, y_target[age_mask])
        if group_auc is None:
            print(
                f"Skipping {target_name} for age value {age_value}: "
                "too few samples of one class."
            )
            continue
        results[age_value] = group_auc

# One row per evaluated age value, indexed by that value.
results_df_diabetes = pd.DataFrame.from_dict(results_diabetes, orient='index', columns=['AUC-ROC'])
results_df_heart = pd.DataFrame.from_dict(results_heart, orient='index', columns=['AUC-ROC'])

print("Age Value Model Evaluation Results for Diabetes")
print(results_df_diabetes)
print("Age Value Model Evaluation Results for Heart Disease")
print(results_df_heart)


Ensemble - Diabetes Prediction Metrics:
AUC-ROC: 0.814
Accuracy: 0.858
Precision: 0.476
Recall: 0.260
F1 Score: 0.336

Ensemble - Heart Disease Prediction Metrics:
AUC-ROC: 0.813
Accuracy: 0.900
Precision: 0.425
Recall: 0.174
F1 Score: 0.247

Age Value Model Evaluation Results for Diabetes
       AUC-ROC
1.0   0.589333
2.0   0.781133
3.0   0.814639
4.0   0.846956
5.0   0.830446
6.0   0.814214
7.0   0.822626
8.0   0.815680
9.0   0.800991
10.0  0.783113
11.0  0.745845
12.0  0.724059
13.0  0.698460
Age Value Model Evaluation Results for Heart Disease
       AUC-ROC
1.0   0.587820
2.0   0.859670
3.0   0.788965
4.0   0.822337
5.0   0.819928
6.0   0.837692
7.0   0.821727
8.0   0.822220
9.0   0.802057
10.0  0.770479
11.0  0.754623
12.0  0.718011
13.0  0.696102
