In [None]:
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.metrics import roc_auc_score, classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

In [None]:
# Load the BRFSS 2015 diabetes health-indicator file.
df = pd.read_csv('/content/diabetes_binary_health_indicators_BRFSS2015.csv')

# Two targets are modelled from the same feature matrix.
y_diabetes = df['Diabetes_binary']
y_heart = df['HeartDiseaseorAttack']

# Features: every column except both targets and Age
# (Age is only used further down to slice the rows into age groups).
X = df.drop(columns=['Diabetes_binary', 'HeartDiseaseorAttack', 'Age'])

In [None]:
# Every model in this cell is trained on the full column set of X.
all_features = X.columns

# Hold out 20% of the rows to evaluate the diabetes model.
X_train, X_test, y_train_diabetes, y_test_diabetes = train_test_split(
    X, y_diabetes, test_size=0.2, random_state=42
)
X_train_all = X_train[all_features]
X_test_all = X_test[all_features]

# Base learners for the soft-voting ensembles, each seeded with 42.
rf_classifier = RandomForestClassifier(random_state=42)
gb_classifier = GradientBoostingClassifier(random_state=42)
ada_classifier = AdaBoostClassifier(random_state=42)
xgb_classifier = XGBClassifier(random_state=42)
base_estimators = [
    ('rf', rf_classifier),
    ('gb', gb_classifier),
    ('ada', ada_classifier),
    ('xgb', xgb_classifier),
]

# --- Diabetes: fit the ensemble and score it on the held-out rows ---
ensemble_classifier_diabetes = VotingClassifier(estimators=list(base_estimators), voting='soft')
ensemble_classifier_diabetes.fit(X_train_all, y_train_diabetes)
y_pred_diabetes_all = ensemble_classifier_diabetes.predict(X_test_all)
y_pred_proba_diabetes_all = ensemble_classifier_diabetes.predict_proba(X_test_all)
# AUC-ROC is computed from the second probability column.
diabetes_scores = y_pred_proba_diabetes_all[:, 1]
auc_roc_diabetes_all = roc_auc_score(y_test_diabetes, diabetes_scores)
print(f"AUC-ROC for diabetes prediction using all features: {auc_roc_diabetes_all}")

# --- Heart disease: same features, same split parameters, different target ---
X_train, X_test, y_train_heart, y_test_heart = train_test_split(
    X, y_heart, test_size=0.2, random_state=42
)
X_train_all_heart = X_train[all_features]
X_test_all_heart = X_test[all_features]

ensemble_classifier_heart = VotingClassifier(estimators=list(base_estimators), voting='soft')
ensemble_classifier_heart.fit(X_train_all_heart, y_train_heart)
y_pred_heart_all = ensemble_classifier_heart.predict(X_test_all_heart)
y_pred_proba_heart_all = ensemble_classifier_heart.predict_proba(X_test_all_heart)
heart_scores = y_pred_proba_heart_all[:, 1]
auc_roc_heart_all = roc_auc_score(y_test_heart, heart_scores)
print(f"AUC-ROC for heart disease prediction using all features: {auc_roc_heart_all}")

# Age group performance
MIN_AGE_GROUP_SAMPLES = 50  # age groups with fewer rows than this are skipped


def _age_group_auc(template, X_subset, y_subset, feature_columns):
    """Fit a fresh copy of ``template`` on one subset and return its test AUC-ROC.

    The subset is split 80/20 with ``random_state=42``. The model is a
    ``clone`` of ``template``, so the estimator passed in is never refitted
    and keeps whatever state it had before the call.

    Parameters
    ----------
    template : estimator
        Classifier whose constructor parameters define the model to fit.
    X_subset : pandas.DataFrame
        Feature rows of the subset.
    y_subset : pandas.Series
        Target values aligned with ``X_subset``.
    feature_columns : sequence
        Columns of ``X_subset`` used for fitting and scoring.

    Returns
    -------
    float or None
        AUC-ROC on the held-out rows, or ``None`` when either side of the
        split contains a single class (AUC-ROC is undefined in that case and
        ``predict_proba`` would not have a second column).
    """
    X_tr, X_te, y_tr, y_te = train_test_split(X_subset, y_subset, test_size=0.2, random_state=42)
    if y_tr.nunique() < 2 or y_te.nunique() < 2:
        return None
    model = clone(template)
    model.fit(X_tr[feature_columns], y_tr)
    positive_scores = model.predict_proba(X_te[feature_columns])[:, 1]
    return roc_auc_score(y_te, positive_scores)


ages = df['Age']
results_diabetes = {}
results_heart = {}
for age_value in sorted(ages.unique()):
    age_mask = ages == age_value
    X_age = X[age_mask]

    if X_age.shape[0] < MIN_AGE_GROUP_SAMPLES:
        print(f"Skipping age value {age_value} due to insufficient samples.")
        continue

    # Evaluate both targets on the same rows; each gets its own freshly
    # cloned ensemble so the all-ages models fitted earlier stay intact.
    for target_label, y_all, template, results in (
        ("diabetes", y_diabetes, ensemble_classifier_diabetes, results_diabetes),
        ("heart disease", y_heart, ensemble_classifier_heart, results_heart),
    ):
        auc = _age_group_auc(template, X_age, y_all[age_mask], all_features)
        if auc is None:
            print(f"Skipping age value {age_value} for {target_label}: only one class present after the split.")
            continue
        results[age_value] = auc

results_df_diabetes = pd.DataFrame.from_dict(results_diabetes, orient='index', columns=['AUC-ROC'])
results_df_heart = pd.DataFrame.from_dict(results_heart, orient='index', columns=['AUC-ROC'])

print("Age Value Model Evaluation Results for Diabetes")
print(results_df_diabetes)
print("Age Value Model Evaluation Results for Heart Disease")
print(results_df_heart)

# Accuracy, precision, recall and F1 score per class
def print_classification_report(y_test, y_pred, y_pred_proba, disease_name):
    """Print a per-class metrics table for a binary (0/1) classifier.

    Precision, recall and F1 are always computed for the labels 0 and 1, so
    the table has exactly two rows even when one of the classes is missing
    from both ``y_test`` and ``y_pred``. Without the explicit label list the
    metric arrays would have one entry and building the table would fail.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels.
    y_pred_proba : array-like
        Not used by this function; kept so existing call sites keep working.
    disease_name : str
        Text appended to the "Classification Report for" header line.

    Returns
    -------
    None
        The table is printed, not returned.
    """
    class_labels = [0, 1]

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, labels=class_labels, average=None)
    recall = recall_score(y_test, y_pred, labels=class_labels, average=None)
    f1 = f1_score(y_test, y_pred, labels=class_labels, average=None)

    report = pd.DataFrame({
        # Accuracy is a single overall number, repeated on every row.
        'Accuracy': [accuracy] * len(class_labels),
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1
    }, index=[f'Class {label}' for label in class_labels])

    print(f"\nClassification Report for {disease_name}")
    print(report)

# Print the classification report for each target, diabetes first, then heart disease.
for true_labels, predicted_labels, predicted_probas, report_title in (
    (y_test_diabetes, y_pred_diabetes_all, y_pred_proba_diabetes_all, "Diabetes (All Features)"),
    (y_test_heart, y_pred_heart_all, y_pred_proba_heart_all, "Heart Disease (All Features)"),
):
    print_classification_report(true_labels, predicted_labels, predicted_probas, report_title)


AUC-ROC for diabetes prediction using all features: 0.8168264641825069
AUC-ROC for heart disease prediction using all features: 0.8168885965061484
Age Value Model Evaluation Results for Diabetes
       AUC-ROC
1.0   0.643259
2.0   0.810540
3.0   0.803810
4.0   0.829684
5.0   0.832616
6.0   0.814340
7.0   0.824205
8.0   0.811291
9.0   0.802157
10.0  0.785281
11.0  0.745971
12.0  0.728365
13.0  0.697070
Age Value Model Evaluation Results for Heart Disease
       AUC-ROC
1.0   0.501072
2.0   0.874191
3.0   0.806751
4.0   0.821778
5.0   0.807612
6.0   0.840810
7.0   0.816215
8.0   0.825219
9.0   0.801467
10.0  0.772305
11.0  0.750857
12.0  0.716036
13.0  0.694243

Classification Report for Diabetes (All Features)
         Accuracy  Precision    Recall  F1 Score
Class 0  0.865835   0.878247  0.980269  0.926458
Class 1  0.865835   0.549582  0.150493  0.236284

Classification Report for Heart Disease (All Features)
         Accuracy  Precision    Recall  F1 Score
Class 0  0.907403   0.912799 

In [None]:
# Hold out 20% of the rows for testing; only the training part is resampled below.
X_train, X_test, y_train_diabetes, y_test_diabetes = train_test_split(
    X, y_diabetes, test_size=0.2, random_state=42
)

# Oversample the diabetes training rows with SMOTE.
smote = SMOTE(random_state=42)
X_train_resampled_diabetes, y_train_resampled_diabetes = smote.fit_resample(X_train, y_train_diabetes)
all_features = X_train_resampled_diabetes.columns
X_train_all = X_train_resampled_diabetes[all_features]
X_test_all = X_test[all_features]

# Base learners for the soft-voting ensembles, each seeded with 42.
rf_classifier = RandomForestClassifier(random_state=42)
gb_classifier = GradientBoostingClassifier(random_state=42)
ada_classifier = AdaBoostClassifier(random_state=42)
xgb_classifier = XGBClassifier(random_state=42)
base_estimators = [
    ('rf', rf_classifier),
    ('gb', gb_classifier),
    ('ada', ada_classifier),
    ('xgb', xgb_classifier),
]

# --- Diabetes: fit on the resampled rows, score on the untouched test rows ---
ensemble_classifier_diabetes = VotingClassifier(estimators=list(base_estimators), voting='soft')
ensemble_classifier_diabetes.fit(X_train_all, y_train_resampled_diabetes)
y_pred_diabetes_all = ensemble_classifier_diabetes.predict(X_test_all)
y_pred_proba_diabetes_all = ensemble_classifier_diabetes.predict_proba(X_test_all)
# AUC-ROC is computed from the second probability column.
diabetes_scores = y_pred_proba_diabetes_all[:, 1]
auc_roc_diabetes_all = roc_auc_score(y_test_diabetes, diabetes_scores)
print(f"AUC-ROC for diabetes prediction using all features: {auc_roc_diabetes_all}")

# --- Heart disease: new split on the heart target, then the same SMOTE + ensemble recipe ---
X_train, X_test, y_train_heart, y_test_heart = train_test_split(
    X, y_heart, test_size=0.2, random_state=42
)
X_train_resampled_heart, y_train_resampled_heart = smote.fit_resample(X_train, y_train_heart)
X_train_all_heart = X_train_resampled_heart[all_features]
X_test_all_heart = X_test[all_features]

ensemble_classifier_heart = VotingClassifier(estimators=list(base_estimators), voting='soft')
ensemble_classifier_heart.fit(X_train_all_heart, y_train_resampled_heart)
y_pred_heart_all = ensemble_classifier_heart.predict(X_test_all_heart)
y_pred_proba_heart_all = ensemble_classifier_heart.predict_proba(X_test_all_heart)
heart_scores = y_pred_proba_heart_all[:, 1]
auc_roc_heart_all = roc_auc_score(y_test_heart, heart_scores)
print(f"AUC-ROC for heart disease prediction using all features: {auc_roc_heart_all}")

# Performance evaluation across different age groups (training rows oversampled with SMOTE)
MIN_AGE_GROUP_SAMPLES = 50  # age groups with fewer rows than this are skipped


def _age_group_auc_smote(template, sampler, X_subset, y_subset, feature_columns):
    """Fit a fresh copy of ``template`` on one resampled subset and return its test AUC-ROC.

    The subset is split 80/20 with ``random_state=42``; only the training
    part is passed through ``sampler.fit_resample``. The model is a ``clone``
    of ``template``, so the estimator passed in is never refitted and keeps
    whatever state it had before the call.

    Parameters
    ----------
    template : estimator
        Classifier whose constructor parameters define the model to fit.
    sampler : object with ``fit_resample(X, y)``
        Resampler applied to the training rows.
    X_subset : pandas.DataFrame
        Feature rows of the subset.
    y_subset : pandas.Series
        Target values aligned with ``X_subset``.
    feature_columns : sequence
        Columns used for fitting and scoring.

    Returns
    -------
    float or None
        AUC-ROC on the held-out rows, or ``None`` when either side of the
        split contains a single class (resampling and AUC-ROC both need two
        classes).
    """
    X_tr, X_te, y_tr, y_te = train_test_split(X_subset, y_subset, test_size=0.2, random_state=42)
    if y_tr.nunique() < 2 or y_te.nunique() < 2:
        return None
    X_tr_resampled, y_tr_resampled = sampler.fit_resample(X_tr, y_tr)
    model = clone(template)
    model.fit(X_tr_resampled[feature_columns], y_tr_resampled)
    positive_scores = model.predict_proba(X_te[feature_columns])[:, 1]
    return roc_auc_score(y_te, positive_scores)


ages = df['Age']
results_diabetes = {}
results_heart = {}
for age_value in sorted(ages.unique()):
    age_mask = ages == age_value
    X_age = X[age_mask]

    if X_age.shape[0] < MIN_AGE_GROUP_SAMPLES:
        print(f"Skipping age value {age_value} due to insufficient samples.")
        continue

    # Evaluate both targets on the same rows; each gets its own freshly
    # cloned ensemble so the all-ages models fitted earlier stay intact.
    for target_label, y_all, template, results in (
        ("diabetes", y_diabetes, ensemble_classifier_diabetes, results_diabetes),
        ("heart disease", y_heart, ensemble_classifier_heart, results_heart),
    ):
        auc = _age_group_auc_smote(template, smote, X_age, y_all[age_mask], all_features)
        if auc is None:
            print(f"Skipping age value {age_value} for {target_label}: only one class present after the split.")
            continue
        results[age_value] = auc

results_df_diabetes = pd.DataFrame.from_dict(results_diabetes, orient='index', columns=['AUC-ROC'])
results_df_heart = pd.DataFrame.from_dict(results_heart, orient='index', columns=['AUC-ROC'])

print("Age Value Model Evaluation Results for Diabetes")
print(results_df_diabetes)
print("Age Value Model Evaluation Results for Heart Disease")
print(results_df_heart)

# Evaluation metrics: accuracy, precision, recall and F1 score per class
# NOTE(review): the recorded output of this cell shows classification reports,
# but no call to print_classification_report is visible in this cell. Confirm
# whether the calls were lost.
def print_classification_report(y_test, y_pred, y_pred_proba, disease_name):
    """Print a per-class metrics table for a binary (0/1) classifier.

    Precision, recall and F1 are always computed for the labels 0 and 1, so
    the table has exactly two rows even when one of the classes is missing
    from both ``y_test`` and ``y_pred``. Without the explicit label list the
    metric arrays would have one entry and building the table would fail.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels.
    y_pred_proba : array-like
        Not used by this function; kept so existing call sites keep working.
    disease_name : str
        Text appended to the "Classification Report for" header line.

    Returns
    -------
    None
        The table is printed, not returned.
    """
    class_labels = [0, 1]

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, labels=class_labels, average=None)
    recall = recall_score(y_test, y_pred, labels=class_labels, average=None)
    f1 = f1_score(y_test, y_pred, labels=class_labels, average=None)

    report = pd.DataFrame({
        # Accuracy is a single overall number, repeated on every row.
        'Accuracy': [accuracy] * len(class_labels),
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1
    }, index=[f'Class {label}' for label in class_labels])

    print(f"\nClassification Report for {disease_name}")
    print(report)

AUC-ROC for diabetes prediction using all features: 0.8143749394506697
AUC-ROC for heart disease prediction using all features: 0.8128162603668557
Age Value Model Evaluation Results for Diabetes
       AUC-ROC
1.0   0.589333
2.0   0.781133
3.0   0.814639
4.0   0.841246
5.0   0.830127
6.0   0.818661
7.0   0.822626
8.0   0.815680
9.0   0.800991
10.0  0.783113
11.0  0.745845
12.0  0.724059
13.0  0.698460
Age Value Model Evaluation Results for Heart Disease
       AUC-ROC
1.0   0.587820
2.0   0.859670
3.0   0.788965
4.0   0.822337
5.0   0.811998
6.0   0.838232
7.0   0.818607
8.0   0.820558
9.0   0.802057
10.0  0.770479
11.0  0.754623
12.0  0.718011
13.0  0.696102

Classification Report for Diabetes (All Features)
         Accuracy  Precision    Recall  F1 Score
Class 0  0.858444   0.889642  0.954160  0.920772
Class 1  0.858444   0.475817  0.260111  0.336352

Classification Report for Heart Disease (All Features)
         Accuracy  Precision    Recall  F1 Score
Class 0  0.900229   0.919256 