# Boosting Runs

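This notebook runs AdaBoost with four different base learners (decision tree, SVC, logistic regression, and MLP). Each boosted ensemble is evaluated over $n$ runs, and the results for each base learner are saved to a separate CSV file.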

## Imports

In [4]:
# sklearn
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier

# custom modules
from tadpole import Tadpole
from classifier import Classifier
from main import run, get_params

## Helper Class

In [20]:
# Source: https://stackoverflow.com/questions/55632010/using-scikit-learns-mlpclassifier-in-adaboostclassifier
import numpy as np
class CustomMLPClassifier(MLPClassifier):
    """MLPClassifier whose ``fit`` accepts a ``sample_weight`` argument.

    The weights are not forwarded to the network. When ``sample_weight`` is
    given, the training set is resampled with replacement using the
    normalized weights as draw probabilities, and the network is then fitted
    on that resampled set. This lets the estimator be used as the base
    learner of ``AdaBoostClassifier`` (see the "ANN" section of this
    notebook).
    """

    def resample_with_replacement(self, X_train, y_train, sample_weight):
        """Draw a weighted bootstrap sample of the training data.

        Parameters
        ----------
        X_train : array-like, one row per sample
        y_train : array-like of integer-convertible labels, one per sample
        sample_weight : array-like of non-negative weights, one per sample;
            normalized here, so it does not need to sum to 1.

        Returns
        -------
        (X_resampled, y_resampled) : tuple of ndarrays
            Same number of rows as the input. ``X_resampled`` is float32 and
            ``y_resampled`` is an integer array, as in the original helper.

        Raises
        ------
        ValueError
            If the weights do not sum to a positive, finite number.
        """
        X_train = np.asarray(X_train)
        y_train = np.asarray(y_train)
        weights = np.asarray(sample_weight, dtype=np.float64)

        # Fail loudly instead of silently turning every probability into NaN.
        total = weights.sum()
        if not np.isfinite(total) or total <= 0:
            raise ValueError(
                "sample_weight must sum to a positive, finite number"
            )
        weights = weights / total

        # One vectorised draw of row indices replaces the per-row Python loop.
        # NOTE: this uses NumPy's global RNG, so the draws are not controlled
        # by the estimator's ``random_state``.
        n_samples = len(X_train)
        drawn = np.random.choice(n_samples, size=n_samples, p=weights)

        # ``np.int`` was removed in NumPy 1.24; the builtin ``int`` is the
        # dtype it used to alias.
        X_train_resampled = X_train[drawn].astype(np.float32)
        y_train_resampled = y_train[drawn].astype(int)
        return X_train_resampled, y_train_resampled

    def fit(self, X, y, sample_weight=None):
        """Fit the network, honouring ``sample_weight`` through resampling.

        With ``sample_weight=None`` this is exactly the parent ``fit``.
        Otherwise ``X`` and ``y`` are first replaced by a weighted bootstrap
        sample (see ``resample_with_replacement``). Returns the fitted
        estimator, as returned by the parent class.
        """
        if sample_weight is not None:
            X, y = self.resample_with_replacement(X, y, sample_weight)
        # Delegate to the public parent method rather than the private
        # ``_fit``, so warm-start handling and input validation stay whatever
        # the installed scikit-learn version implements.
        return super().fit(X, y)

## Data Loading

In [5]:
%%time
# Build the dataset object that every run() call below receives as `tp`.
# NOTE(review): presumably the constructor loads/preprocesses the data — confirm in tadpole.py.
tp = Tadpole()

Wall time: 8.74 s


In [6]:
# number of runs per base learner; passed to run() as `n_runs` in every boosting cell
n_runs = 30

In [8]:
# base-learner parameters, looked up by short name (same order as before)
params_dtc, params_svc, params_lra, params_ann = (
    get_params(name) for name in ('dtc', 'svc', 'lra', 'ann')
)

# AdaBoost settings, one dict per base learner
params_boost_dtc = dict(learning_rate=0.0001, n_estimators=400)
params_boost_svc = dict(learning_rate=0.01, n_estimators=300)
params_boost_lra = dict(learning_rate=0.1, n_estimators=50)
params_boost_ann = dict(learning_rate=0.0001, n_estimators=300)

In [9]:
# output: one CSV path per base learner, passed to run() via `output=`
dtc_out = './results/boost_dtc.csv'
svc_out = './results/boost_svc.csv'
lra_out = './results/boost_lra.csv'
ann_out = './results/boost_ann.csv'

## Boosting Ensembles

### DTC

In [12]:
# Base learner: decision tree configured with the 'dtc' parameters.
dtc = DecisionTreeClassifier().set_params(**params_dtc)

# The base learner is passed positionally on purpose: the keyword was renamed
# from `base_estimator` to `estimator` in scikit-learn 1.2 (old name removed
# in 1.4), while the first positional slot works on every version.
boost_clf = AdaBoostClassifier(dtc, random_state=0, **params_boost_dtc)

# Wrap the ensemble and evaluate it n_runs times, writing results to dtc_out.
boost_dtc = Classifier(boost_clf)
run(boost_dtc, tp, n_runs=n_runs, output=dtc_out)

100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [00:54<00:00,  1.83s/it]


Unnamed: 0,BCA_train,BCA_test,mAUC_train,mAUC_test
0,0.885154,0.905549,0.971989,0.973762
1,0.891859,0.856264,0.975105,0.95738
2,0.886009,0.898321,0.972166,0.970868
3,0.89365,0.871793,0.974369,0.96266
4,0.893053,0.869713,0.973628,0.963681
5,0.889294,0.88036,0.972965,0.966297
6,0.883502,0.880176,0.972226,0.970626
7,0.880994,0.903902,0.972698,0.972162
8,0.888702,0.892903,0.973492,0.971792
9,0.889408,0.879002,0.972288,0.969631


### SVC

In [13]:
# Base learner: SVC with probability estimates enabled, then configured with
# the 'svc' parameters (set_params is kept so they may override `probability`).
svc = SVC(probability=True)
svc.set_params(**params_svc)

# The base learner is passed positionally on purpose: the keyword was renamed
# from `base_estimator` to `estimator` in scikit-learn 1.2 (old name removed
# in 1.4), while the first positional slot works on every version.
boost_clf = AdaBoostClassifier(svc, random_state=0, **params_boost_svc)

# Wrap the ensemble and evaluate it n_runs times, writing results to svc_out.
boost_svc = Classifier(boost_clf)
run(boost_svc, tp, n_runs=n_runs, output=svc_out)

100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [41:51<00:00, 83.71s/it]


Unnamed: 0,BCA_train,BCA_test,mAUC_train,mAUC_test
0,0.859466,0.873502,0.960325,0.970581
1,0.863945,0.820509,0.966538,0.947693
2,0.852751,0.876984,0.96194,0.960259
3,0.860465,0.831671,0.962776,0.954503
4,0.838149,0.849425,0.959133,0.956996
5,0.833067,0.862894,0.955525,0.95755
6,0.850661,0.871999,0.959941,0.967145
7,0.858647,0.862379,0.961021,0.961519
8,0.866818,0.862894,0.961196,0.96316
9,0.853204,0.879907,0.957286,0.969721


### LRA

In [14]:
# Base learner: logistic regression configured with the 'lra' parameters.
lra = LogisticRegression().set_params(**params_lra)

# NOTE(review): the saved output of this cell reports BCA = 0.333333 and
# mAUC = 0.5 for train and test on every listed run. That looks like a
# constant / chance-level predictor rather than a working ensemble —
# revisit params_lra and params_boost_lra before using these results.

# The base learner is passed positionally on purpose: the keyword was renamed
# from `base_estimator` to `estimator` in scikit-learn 1.2 (old name removed
# in 1.4), while the first positional slot works on every version.
boost_clf = AdaBoostClassifier(lra, random_state=0, **params_boost_lra)

# Wrap the ensemble and evaluate it n_runs times, writing results to lra_out.
boost_lra = Classifier(boost_clf)
run(boost_lra, tp, n_runs=n_runs, output=lra_out)

100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [00:08<00:00,  3.68it/s]


Unnamed: 0,BCA_train,BCA_test,mAUC_train,mAUC_test
0,0.333333,0.333333,0.5,0.5
1,0.333333,0.333333,0.5,0.5
2,0.333333,0.333333,0.5,0.5
3,0.333333,0.333333,0.5,0.5
4,0.333333,0.333333,0.5,0.5
5,0.333333,0.333333,0.5,0.5
6,0.333333,0.333333,0.5,0.5
7,0.333333,0.333333,0.5,0.5
8,0.333333,0.333333,0.5,0.5
9,0.333333,0.333333,0.5,0.5


### ANN

In [21]:
# Base learner: the resampling MLP wrapper defined in "Helper Class",
# configured with the 'ann' parameters.
ann = CustomMLPClassifier().set_params(**params_ann)

# The base learner is passed positionally on purpose: the keyword was renamed
# from `base_estimator` to `estimator` in scikit-learn 1.2 (old name removed
# in 1.4), while the first positional slot works on every version.
boost_clf = AdaBoostClassifier(ann, random_state=0, **params_boost_ann)

# Wrap the ensemble and evaluate it n_runs times, writing results to ann_out.
# NOTE: the recorded progress bar shows roughly 6h50m for 30 runs — this is
# by far the most expensive cell in the notebook.
boost_ann = Classifier(boost_clf)
run(boost_ann, tp, n_runs=n_runs, output=ann_out)

100%|███████████████████████████████████████████████████████████████████████████████| 30/30 [6:49:56<00:00, 819.89s/it]


Unnamed: 0,BCA_train,BCA_test,mAUC_train,mAUC_test
0,0.895642,0.917143,0.981065,0.976742
1,0.899045,0.852926,0.98302,0.960995
2,0.895235,0.887178,0.982113,0.970665
3,0.901645,0.876405,0.982247,0.968391
4,0.904081,0.863471,0.982612,0.969722
5,0.897052,0.874283,0.98156,0.97298
6,0.894086,0.890823,0.980603,0.976971
7,0.90372,0.846396,0.9831,0.967871
8,0.898041,0.866539,0.982256,0.972532
9,0.889751,0.914837,0.980985,0.977063
