# Bagging

### Imports

In [12]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier

from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import roc_auc_score

from data import get_data

### Load data

In [2]:
ROOT = './tadpole_challenge/'
X, y, label_dict = get_data(ROOT)

In [3]:
# TODO: split with class balance in mind
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

### Helper functions

In [46]:
def accuracy(clf, X, y):
    y_pred = clf.predict(X)
    acc = sum(y_pred == y) / len(y)
    return acc

def metrics(clf, X, y):
    y_pred = clf.predict(X)
    y_prob = clf.predict_proba(X)
    BCA = balanced_accuracy_score(y, y_pred)
    mAUC = roc_auc_score(y, y_prob, multi_class="ovo", average="macro")
    return BCA, mAUC

In [71]:
def report_accuracy(clf):
    train_acc = accuracy(clf, X_train, y_train)
    test_acc = accuracy(clf, X_test, y_test)
    print('Train accuracy:', train_acc)
    print('Test accuracy:', test_acc)
    
def report_metrics(clf, verbose):
    BCA_train, mAUC_train = metrics(clf, X_train, y_train)
    BCA_test, mAUC_test = metrics(clf, X_test, y_test)
    
    if verbose:
        print('\tTrain metrics')    
        print('BCA_train:', BCA_train)
        print('mAUC_train:', mAUC_train)

        print('\tTest metrics')   
        print('BCA_test:', BCA_test)
        print('mAUC_test:', mAUC_test)
    
    return BCA_train, BCA_test, mAUC_train, mAUC_test

Settings are not final.

In [72]:
def bagging_clf(clf):
    return BaggingClassifier(base_estimator = clf, 
                             n_estimators = 100, 
                             max_samples = 0.3, 
                             random_state = 0)

In [73]:
def evaluate(Classifier, verbose=False, **kwargs):
    if verbose: print('+ Base learner')
    clf = Classifier(**kwargs).fit(X_train, y_train)
    clf_perf = report_metrics(clf, verbose)
    
    if verbose: print('\n+ Ensemble')
    bag_clf = bagging_clf(Classifier(**kwargs)).fit(X_train, y_train)
    bag_clf_perf = report_metrics(bag_clf, verbose)
    
    # Values in dict are lists of the following format: 
    # BCA_train, BCA_test, mAUC_train, mAUC_test
    if not verbose:
        return {'clf': list(clf_perf), 'bag_clf': list(bag_clf_perf)}

In [78]:
# Make a main for every baselearner that runs them multiple times
# because SVC requires extra arguments, which makes it difficult
# to generalize into one function.

# def main(verbose=False):
#     baselearners = [DecisionTreeClassifier, 
#                     SVC, 
#                     MLPClassifier,
#                     LogisticRegression]
    
#     for learner in baselearners:           
#         if verbose:
#             print(learner)
#             evaluate(learner, True)
#             print()
#         else:
#             perf = evaluate(learner)

## Models

### DTC

In [79]:
evaluate(DecisionTreeClassifier, True)

+ Base learner
	Train metrics
BCA_train: 1.0
mAUC_train: 1.0
	Test metrics
BCA_test: 0.8249257552483359
mAUC_test: 0.868694316436252

+ Ensemble
	Train metrics
BCA_train: 0.917524219782151
mAUC_train: 0.9940879940039782
	Test metrics
BCA_test: 0.8913722478238607
mAUC_test: 0.9723823924731182


### SVM

In [38]:
evaluate(SVC, True, probability=True)

+ Base learner
	Train metrics
BCA_train: 0.8802071684432008
mAUC_train: 0.9742760310804081
	Test metrics
BCA_test: 0.8583384536610343
mAUC_test: 0.9650310419866872

+ Ensemble
	Train metrics
BCA_train: 0.8229333344551334
mAUC_train: 0.9630247237742404
	Test metrics
BCA_test: 0.8276241679467486
mAUC_test: 0.9594310035842294


### ANN

In [40]:
evaluate(MLPClassifier, True)

+ Base learner
	Train metrics
BCA_train: 0.8872009613827309
mAUC_train: 0.9741941233096751
	Test metrics
BCA_test: 0.8875448028673835
mAUC_test: 0.9695721326164874

+ Ensemble
	Train metrics
BCA_train: 0.8052836460000696
mAUC_train: 0.9566161506035402
	Test metrics
BCA_test: 0.793200204813108
mAUC_test: 0.9537624807987711


### LRA

In [84]:
evaluate(LogisticRegression, True)

+ Base learner
	Train metrics
BCA_train: 0.8097880755915684
mAUC_train: 0.9601234307359686
	Test metrics
BCA_test: 0.7994905273937531
mAUC_test: 0.9592767537122375

+ Ensemble
	Train metrics
BCA_train: 0.7232919225481128
mAUC_train: 0.9453789312376237
	Test metrics
BCA_test: 0.7256426011264722
mAUC_test: 0.9463194444444444
