# **AdaBoost**

## **Load Dataset from scratch**

### Dataset balancing and discretization must be computed from the train splits only in each K-fold cross-validation iteration, without using any information from the test splits.

In [None]:
# load the preprocessed dataset: the columns listed below are read as pandas
# categoricals, while `age` is read as a 64-bit integer
categorical_columns = [
    'gender', 'patient_type', 'pneumonia', 'pregnancy', 'diabetes', 'copd',
    'asthma', 'immunosuppression', 'hypertension', 'other_diseases', 'cardiovascular_disease',
    'obesity', 'chronic_kidney_disease', 'smoking',
]
column_dtypes = {column: 'category' for column in categorical_columns}
column_dtypes['age'] = 'int64'
data = pd.read_csv("./datasets/preprocessed.csv", dtype=column_dtypes)

# print dataset
data

In [None]:
# summarise every column (dtype and non-null count) of the freshly loaded frame
data.info(verbose=True)

In [None]:
# use numerical encoding for categorical attributes: each categorical column
# is replaced by its integer category codes
code_encoded_columns = [
    'gender', 'pneumonia', 'pregnancy', 'diabetes', 'copd', 'asthma',
    'immunosuppression', 'hypertension', 'other_diseases', 'cardiovascular_disease',
    'obesity', 'chronic_kidney_disease', 'smoking',
]
for column in code_encoded_columns:
    data[column] = data[column].cat.codes

# the class label gets an explicit mapping so that HOSPITALIZED is always 1,
# independent of the category order
data['patient_type'] = data['patient_type'].map({'HOSPITALIZED': 1, 'NOT-HOSPITALIZED': 0}).astype('int8')

# print dataset
data

In [None]:
# summarise the columns again to check the dtypes after numerical encoding
data.info(verbose=True)

In [None]:
# pull the class label out as a NumPy array, then keep only the feature columns
target_column = 'patient_type'
labels = np.array(data[target_column])
data = data.drop(columns=[target_column])

In [None]:
# number of folds used by the stratified k-fold cross validation
k: int = 12

# how many of the top-ranked attributes are kept by the per-fold selection
attributes_selection_threshold: int = 9

## AdaBoost with 100 Categorical Naive Bayes Classifiers

In [None]:
# store accuracy, f1, precision and recall (one value appended per fold)
adaboost_accuracies_train = []
adaboost_accuracies_test = []
adaboost_f1_train = []
adaboost_f1_test = []
adaboost_precision_train = []
adaboost_precision_test = []
adaboost_recall_train = []
adaboost_recall_test = []

# store ROC curve parameters: running sums over the folds, divided by k after
# the loop (they start as scalars and become arrays on the first `+=`)
adaboost_fpr = 0
adaboost_tpr = 0
adaboost_thresh = 0
adaboost_auc = 0

# store confusion matrix scores (one matrix per fold)
adaboost_conf_matrix_list = []

# materialise the feature matrix once instead of rebuilding it twice per fold
feature_matrix = data.values

# k-fold cross validation
# NOTE: shuffle=True with random_state=None means the splits are not seeded,
# so the folds differ from run to run
skf = StratifiedKFold(n_splits=k, random_state=None, shuffle=True)
for fold, (train_index, test_index) in enumerate(skf.split(data, labels), start=1):
    display(Markdown("# **FOLD " + str(fold) + "**"))

    # split data in train and test set
    X_train, X_test = feature_matrix[train_index], feature_matrix[test_index]
    y_train, y_test = labels[train_index], labels[test_index]

    # random undersampling for the majority class on the training set only;
    # the test split is left untouched
    initial_counter = Counter(y_train)
    undersample = RandomUnderSampler(sampling_strategy='majority')
    X_train, y_train = undersample.fit_resample(X_train, y_train)
    final_counter = Counter(y_train)

    # bar heights as (before, after) pairs; label 0 is drawn as the majority
    # class and label 1 as the minority class
    majority_records_count = (initial_counter[0], final_counter[0])
    minority_records_count = (initial_counter[1], final_counter[1])
    ind = np.arange(len(majority_records_count))
    width = 0.25
    # NOTE(review): autolabel() receives no axes argument, so it presumably
    # annotates the global `ax` - keep this name and keep the plotting code at
    # cell level (confirm against the helper's definition)
    fig, ax = plt.subplots(figsize=(7, 5), dpi=200, facecolor='w', edgecolor='k')
    rects_majority = ax.bar(ind - width/2, majority_records_count, width, label='Majority Class')
    rects_minority = ax.bar(ind + width/2, minority_records_count, width, label='Minority Class')
    ax.set_xticks(ind)
    ax.set_xticklabels(('Initial Class Distribution', 'Class Distribution after Undersampling'))
    ax.legend(loc=0, prop={'size': 10})
    autolabel(rects_majority, "center")
    autolabel(rects_minority, "center")
    fig.suptitle('Train Set Class Balancing', fontsize=15, y=0.98)
    plt.xlabel('Class Distributions Before and After Undersampling', fontsize=12, labelpad=12)
    plt.ylabel('Count', fontsize=12, labelpad=12)
    plt.savefig('plots/adaboost/balancing-k' + str(fold) + '.png', dpi=200, bbox_inches='tight')
    plt.show()

    # same chart for the test split: it is not resampled, so the "before" and
    # "after" bars show the same counts
    test_counter = Counter(y_test)
    majority_records_count = (test_counter[0], test_counter[0])
    minority_records_count = (test_counter[1], test_counter[1])
    ind = np.arange(len(majority_records_count))
    width = 0.25
    fig, ax = plt.subplots(figsize=(7, 5), dpi=200, facecolor='w', edgecolor='k')
    rects_majority = ax.bar(ind - width/2, majority_records_count, width, label='Majority Class')
    rects_minority = ax.bar(ind + width/2, minority_records_count, width, label='Minority Class')
    ax.set_xticks(ind)
    ax.set_xticklabels(('Initial Class Distribution', 'Class Distribution after Undersampling'))
    ax.legend(loc=0, prop={'size': 10})
    autolabel(rects_majority, "center")
    autolabel(rects_minority, "center")
    fig.suptitle('Test Set Class Balancing', fontsize=15, y=0.98)
    plt.xlabel('Class Distributions Before and After Undersampling', fontsize=12, labelpad=12)
    plt.ylabel('Count', fontsize=12, labelpad=12)
    plt.show()

    # attribute selection using mutual information on train set
    feature_scores = mutual_info_classif(X_train, y_train, discrete_features=True, random_state=None)
    # rank once (highest score first) and reuse the ranking for both the
    # selection and the bar chart
    ranked_features = sorted(zip(feature_scores, data.columns), reverse=True)
    high_score_features = [f_name for _, f_name in ranked_features[:attributes_selection_threshold]]
    high_score_features_idx = [data.columns.get_loc(f_name) for f_name in high_score_features]
    fig, ax = plt.subplots(figsize=(10, 5), dpi=200, facecolor='w', edgecolor='k')
    # the bars are drawn on a second axes that spans the whole figure
    ax = fig.add_axes([0, 0, 1, 1])
    for score, f_name in ranked_features:
        p = ax.bar(f_name, round(score, 3), color='g')
        ax.bar_label(p, label_type='edge', color='black')
    for tick in ax.get_xticklabels():
        tick.set_rotation(90)
    fig.suptitle('Attributes Mutual Information Score', fontsize=15, y=1.05)
    plt.xlabel('Attributes', fontsize=12, labelpad=12)
    plt.ylabel('Score', fontsize=12, labelpad=12)
    plt.savefig('plots/adaboost/features-k' + str(fold) + '.png', dpi=200, bbox_inches='tight')
    plt.show()
    # keep only the selected columns, using the train-derived ranking for both splits
    X_train = X_train[:, high_score_features_idx]
    X_test = X_test[:, high_score_features_idx]

    # train model
    # NOTE(review): recent scikit-learn releases renamed `base_estimator` to
    # `estimator` - confirm against the installed version before upgrading
    adaboost = AdaBoostClassifier(n_estimators=100, base_estimator=CategoricalNB(), random_state=None)
    adaboost = adaboost.fit(X_train, y_train)

    # test model on train and test set
    adaboost_pred_train = adaboost.predict(X_train)
    adaboost_pred_test = adaboost.predict(X_test)
    # positive-class probabilities; not consumed anywhere in this cell
    adaboost_probs = adaboost.predict_proba(X_test)[:, 1]

    # collect statistics
    adaboost_accuracies_train.append(metrics.accuracy_score(y_train, adaboost_pred_train))
    adaboost_accuracies_test.append(metrics.accuracy_score(y_test, adaboost_pred_test))
    adaboost_f1_train.append(metrics.f1_score(y_train, adaboost_pred_train, pos_label=1))
    adaboost_f1_test.append(metrics.f1_score(y_test, adaboost_pred_test, pos_label=1))
    adaboost_precision_train.append(metrics.precision_score(y_train, adaboost_pred_train, pos_label=1))
    adaboost_precision_test.append(metrics.precision_score(y_test, adaboost_pred_test, pos_label=1))
    adaboost_recall_train.append(metrics.recall_score(y_train, adaboost_pred_train, pos_label=1))
    adaboost_recall_test.append(metrics.recall_score(y_test, adaboost_pred_test, pos_label=1))

    # NOTE(review): the ROC curve and AUC are computed from the hard 0/1
    # predictions, not from `adaboost_probs`, so the curve has only a few
    # operating points. The element-wise `+=` below relies on every fold
    # returning arrays of the same length; switching to probabilities would
    # require interpolating each fold onto a common FPR grid first.
    adaboost_fpr_new, adaboost_tpr_new, adaboost_thresh_new = metrics.roc_curve(y_test, adaboost_pred_test, pos_label=1)
    adaboost_fpr += adaboost_fpr_new
    adaboost_tpr += adaboost_tpr_new
    adaboost_thresh += adaboost_thresh_new
    adaboost_auc_new = metrics.roc_auc_score(y_test, adaboost_pred_test)
    adaboost_auc += adaboost_auc_new

    # confusion matrix score: reuse the test predictions computed above and pin
    # the label order so every fold yields a 2x2 matrix that can be averaged
    adaboost_conf_matrix = confusion_matrix(y_test, adaboost_pred_test, labels=[0, 1])
    adaboost_conf_matrix_list.append(adaboost_conf_matrix)

    # print model report
    print(classification_report(y_train, adaboost_pred_train, labels=[0,1]))
    print(classification_report(y_test, adaboost_pred_test, labels=[0,1]))

# average ROC curve parameters over the k folds
adaboost_fpr = adaboost_fpr/k
adaboost_tpr = adaboost_tpr/k
adaboost_thresh = adaboost_thresh/k
adaboost_auc = adaboost_auc/k

## **Overtraining**

In [None]:
# accuracy per fold: train set in blue, test set in red
fold_numbers = range(1, k+1)
fig, ax = plt.subplots(figsize=(7, 3), dpi=200, facecolor='w', edgecolor='k')
ax.plot(fold_numbers, adaboost_accuracies_train, '-bo')
ax.plot(fold_numbers, adaboost_accuracies_test, '-ro')
plt.xticks(fold_numbers)
fig.suptitle('AdaBoost Accuracy', fontsize=18, y=0.99)
ax.legend(['Train Set', 'Test Set'], loc=2, prop={'size': 10})
plt.savefig('plots/adaboost/accuracy.png', dpi=200, bbox_inches='tight')
plt.show()

# print average values over all folds
mean_train_accuracy = sum(adaboost_accuracies_train)/len(adaboost_accuracies_train)
mean_test_accuracy = sum(adaboost_accuracies_test)/len(adaboost_accuracies_test)
print("Average Train Set Accuracy: " + str(mean_train_accuracy))
print("Average Test Set Accuracy: " + str(mean_test_accuracy))

In [None]:
# F1 score per fold: train set in blue, test set in red
fold_numbers = range(1, k+1)
fig, ax = plt.subplots(figsize=(7, 3), dpi=200, facecolor='w', edgecolor='k')
ax.plot(fold_numbers, adaboost_f1_train, '-bo')
ax.plot(fold_numbers, adaboost_f1_test, '-ro')
plt.xticks(fold_numbers)
fig.suptitle('AdaBoost F1 Score', fontsize=18, y=0.99)
ax.legend(['Train Set', 'Test Set'], loc=2, prop={'size': 10})
plt.savefig('plots/adaboost/f1.png', dpi=200, bbox_inches='tight')
plt.show()

# print average values over all folds
mean_train_f1 = sum(adaboost_f1_train)/len(adaboost_f1_train)
mean_test_f1 = sum(adaboost_f1_test)/len(adaboost_f1_test)
print("Average Train Set F1 Score: " + str(mean_train_f1))
print("Average Test Set F1 Score: " + str(mean_test_f1))

In [None]:
# precision per fold: train set in blue, test set in red
fold_numbers = range(1, k+1)
fig, ax = plt.subplots(figsize=(7, 3), dpi=200, facecolor='w', edgecolor='k')
ax.plot(fold_numbers, adaboost_precision_train, '-bo')
ax.plot(fold_numbers, adaboost_precision_test, '-ro')
plt.xticks(fold_numbers)
fig.suptitle('AdaBoost Precision Score', fontsize=18, y=0.99)
ax.legend(['Train Set', 'Test Set'], loc=2, prop={'size': 10})
plt.savefig('plots/adaboost/precision.png', dpi=200, bbox_inches='tight')
plt.show()

# print average values over all folds
mean_train_precision = sum(adaboost_precision_train)/len(adaboost_precision_train)
mean_test_precision = sum(adaboost_precision_test)/len(adaboost_precision_test)
print("Average Train Set Precision Score: " + str(mean_train_precision))
print("Average Test Set Precision Score: " + str(mean_test_precision))

In [None]:
# recall per fold: train set in blue, test set in red
fold_numbers = range(1, k+1)
fig, ax = plt.subplots(figsize=(7, 3), dpi=200, facecolor='w', edgecolor='k')
ax.plot(fold_numbers, adaboost_recall_train, '-bo')
ax.plot(fold_numbers, adaboost_recall_test, '-ro')
plt.xticks(fold_numbers)
fig.suptitle('AdaBoost Recall Score', fontsize=18, y=0.99)
ax.legend(['Train Set', 'Test Set'], loc=2, prop={'size': 10})
plt.savefig('plots/adaboost/recall.png', dpi=200, bbox_inches='tight')
plt.show()

# print average values over all folds
mean_train_recall = sum(adaboost_recall_train)/len(adaboost_recall_train)
mean_test_recall = sum(adaboost_recall_test)/len(adaboost_recall_test)
print("Average Train Set Recall Score: " + str(mean_train_recall))
print("Average Test Set Recall Score: " + str(mean_test_recall))

## **Confusion Matrix**

In [None]:
# Plot average normalized confusion matrix
mean_of_conf_matrix_list = np.mean(adaboost_conf_matrix_list, axis=0)
plt.figure(figsize=(5, 5), dpi=200, facecolor='w', edgecolor='k')
generate_confusion_matrix(mean_of_conf_matrix_list, classes=['NON-HOSPITALIZED', 'HOSPITALIZED'], normalize=True, title='Normalized confusion matrix')
plt.savefig('plots/adaboost/confusion_matrix.png', dpi=200, bbox_inches='tight')
plt.show()