In [15]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier

from data import get_data

**Load data**

In [16]:
ROOT = './tadpole_challenge/'
X, y = get_data(ROOT)

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

**Helper functions**

Some temporary performance metrics.

In [18]:
def accuracy(clf, X, y):
    y_pred = clf.predict(X)
    acc = sum(y_pred == y) / len(y)
    return acc

In [19]:
def report_accuracy(clf):
    train_acc = accuracy(clf, X_train, y_train)
    test_acc = accuracy(clf, X_test, y_test)
    print('Train accuracy:', train_acc)
    print('Test accuracy:', test_acc)

Settings are not final.

In [20]:
def bagging_clf(clf):
    return BaggingClassifier(base_estimator = clf, 
                             n_estimators = 100, 
                             max_samples = 0.3, 
                             random_state = 0)

In [71]:
def evaluate(Classifier, **kwargs):
    print('+ Base learner')
    clf = Classifier(**kwargs).fit(X_train, y_train)
    report_accuracy(clf)
    print('+ Ensemble')
    bag_clf = bagging_clf(Classifier(**kwargs)).fit(X_train, y_train)
    report_accuracy(bag_clf)

In [72]:
def main():
    baselearners = [DecisionTreeClassifier, 
                    LinearSVC, 
                    MLPClassifier,
                    LogisticRegression]
    
    for learner in baselearners:
        print(learner)
        evaluate(learner)
        print()

## DTC

In [73]:
evaluate(DecisionTreeClassifier)

+ Base learner
Train accuracy: 1.0
Test accuracy: 0.7109826589595376
+ Ensemble
Train accuracy: 0.8728323699421965
Test accuracy: 0.7976878612716763


## SVM

In [74]:
evaluate(LinearSVC)

+ Base learner
Train accuracy: 0.7182080924855492
Test accuracy: 0.7745664739884393
+ Ensemble
Train accuracy: 0.7052023121387283
Test accuracy: 0.7687861271676301


## ANN

In [34]:
evaluate(MLPClassifier)

+ Base learner
Train accuracy: 0.763728323699422
Test accuracy: 0.7572254335260116
+ Ensemble
Train accuracy: 0.6921965317919075
Test accuracy: 0.7398843930635838


## LRA

In [35]:
evaluate(LogisticRegression)

+ Base learner
Train accuracy: 0.7044797687861272
Test accuracy: 0.7601156069364162
+ Ensemble
Train accuracy: 0.6784682080924855
Test accuracy: 0.7109826589595376


## All base learners at once

In [36]:
main()

<class 'sklearn.tree._classes.DecisionTreeClassifier'>
+ Base learner
Train accuracy: 1.0
Test accuracy: 0.6878612716763006
+ Ensemble
Train accuracy: 0.8728323699421965
Test accuracy: 0.7976878612716763

<class 'sklearn.svm._classes.LinearSVC'>
+ Base learner
Train accuracy: 0.7182080924855492
Test accuracy: 0.7745664739884393
+ Ensemble
Train accuracy: 0.7052023121387283
Test accuracy: 0.7687861271676301

<class 'sklearn.neural_network._multilayer_perceptron.MLPClassifier'>
+ Base learner
Train accuracy: 0.755057803468208
Test accuracy: 0.7572254335260116
+ Ensemble
Train accuracy: 0.6921965317919075
Test accuracy: 0.7398843930635838

<class 'sklearn.linear_model._logistic.LogisticRegression'>
+ Base learner
Train accuracy: 0.7044797687861272
Test accuracy: 0.7601156069364162
+ Ensemble
Train accuracy: 0.6784682080924855
Test accuracy: 0.7109826589595376

