In [1]:
# RUSboost implementation
# https://imbalanced-learn.org/stable/references/generated/imblearn.ensemble.RUSBoostClassifier.html
# https://ieeexplore.ieee.org/document/4761297

In [2]:
import numpy as np
import pandas as pd
import os, warnings
warnings.filterwarnings(action='ignore')
from imblearn.ensemble import RUSBoostClassifier
from sklearn.metrics import roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [3]:
repeat = 100

In [4]:
df = pd.read_csv('final_data.csv')

training_df = df.loc[df['Outside'] == 0]
test_df = df.loc[df['Outside'] == 1]

X_train_full = training_df.iloc[:, 5:]
y_train = training_df.iloc[:, 1 ]
X_test_full = test_df.iloc[:, 5:]
y_test = test_df.iloc[:, 1 ]

datasets = ['X_T2', 'X_T1', 'X_T1T2', 'X_adc']
estimators = ['decisiontree', 'randomforest']
for d, dataset in enumerate(datasets):
    if dataset == 'X_adc_T1_T2':
        X = X_train_full
        X_test = X_test_full
    elif dataset == 'X_T1':
        X = X_train_full.iloc[:, :107]
        X_test = X_test_full.iloc[:, :107]
    elif dataset == 'X_T2':
        X = X_train_full.iloc[:,107:214]
        X_test = X_test_full.iloc[:,107:214]
    elif dataset == 'X_adc_T1':
        X = X_train_full.iloc[:, np.r_[0:107, 214:321]]
        X_test = X_test_full.iloc[:, np.r_[0:107, 214:321]]
    elif dataset == 'X_adc_T2':
        X = X_train_full.iloc[:,107:321]
        X_test = X_test_full.iloc[:,107:321]
    elif dataset == 'X_adc':
        X = X_train_full.iloc[:, 214:]
        X_test = X_test_full.iloc[:, 214:]
    elif dataset == 'X_T1_T2':
        X = X_train_full.iloc[:, :214]
        X_test = X_test_full.iloc[:, :214]
    for _, estimator in enumerate(estimators):
        if estimator == 'decisiontree':
            base_estimator = DecisionTreeClassifier()
        elif estimator == 'randomforest':
            base_estimator = RandomForestClassifier(max_depth=4, random_state=seed)       
        print ("[ {} / {} ]".format(d, len(datasets)), "estimator:", estimator, "with dataset : ", dataset)
        AUC_trains = 0
        AUC_tests = 0        
        for seed in range(repeat):
            classifier = RUSBoostClassifier(base_estimator=base_estimator, random_state=seed)
            classifier.fit(X, y_train)
            preds_train = classifier.predict(X)
            preds_test = classifier.predict(X_test)
            performance_train = roc_auc_score(y_train, preds_train)
            performance_test = roc_auc_score(y_test, preds_test)
            AUC_trains += performance_train
            AUC_tests += performance_test  
        print ('training AUC:', AUC_trains/repeat)
        print ('validation AUC:',AUC_tests/repeat)        

[ 0 / 4 ] estimator: decisiontree with dataset :  X_T2
training AUC: 0.8180102040816327
validation AUC: 0.49193650793650784
[ 0 / 4 ] estimator: randomforest with dataset :  X_T2
training AUC: 0.8469387755102044
validation AUC: 0.5793650793650797
[ 1 / 4 ] estimator: decisiontree with dataset :  X_T1
training AUC: 0.8339795918367345
validation AUC: 0.44631746031746017
[ 1 / 4 ] estimator: randomforest with dataset :  X_T1
training AUC: 0.8619897959183679
validation AUC: 0.4201746031746029
[ 2 / 4 ] estimator: decisiontree with dataset :  X_T1T2
training AUC: 0.8339795918367345
validation AUC: 0.44631746031746017
[ 2 / 4 ] estimator: randomforest with dataset :  X_T1T2
training AUC: 0.8619897959183679
validation AUC: 0.4201746031746029
[ 3 / 4 ] estimator: decisiontree with dataset :  X_adc
training AUC: 0.8399489795918363
validation AUC: 0.5532539682539682
[ 3 / 4 ] estimator: randomforest with dataset :  X_adc
training AUC: 0.8622448979591845
validation AUC: 0.5514126984126986
