# Playground for Ensemble Slides - Study on Many Datasets

- Stephen W. Thomas
- Used for MMA 869, MMAI 869, and GMMA 869

In [1]:
import datetime
print(datetime.datetime.now())

2020-07-10 07:31:35.859945


In [2]:
import pandas as pd
pd.show_versions(as_json=False)

import sklearn
sklearn.__version__


INSTALLED VERSIONS
------------------
commit           : None
python           : 3.6.10.final.0
python-bits      : 64
OS               : Windows
OS-release       : 10
machine          : AMD64
processor        : Intel64 Family 6 Model 142 Stepping 10, GenuineIntel
byteorder        : little
LC_ALL           : None
LANG             : None
LOCALE           : None.None

pandas           : 1.0.3
numpy            : 1.18.5
pytz             : 2020.1
dateutil         : 2.8.1
pip              : 20.1.1
setuptools       : 49.1.0.post20200704
Cython           : None
pytest           : 5.4.3
hypothesis       : None
sphinx           : None
blosc            : None
feather          : None
xlsxwriter       : None
lxml.etree       : 4.5.0
html5lib         : None
pymysql          : None
psycopg2         : None
jinja2           : 2.11.2
IPython          : 7.16.1
pandas_datareader: None
bs4              : 4.9.0
bottleneck       : None
fastparquet      : None
gcsfs            : None
lxml.etree       : 4.5.0


'0.23.1'

In [3]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns 

from sklearn.model_selection import train_test_split

import itertools

import scipy

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

examples.directory is deprecated; in the future, examples will be found relative to the 'datapath' directory.
  "found relative to the 'datapath' directory.".format(key))


In [4]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import RandomForestClassifier, StackingClassifier, VotingClassifier, BaggingClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier

from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingClassifier


from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, recall_score, precision_score, roc_auc_score

import time

# Helper function
def do_all_for_dataset(X_train, X_test, y_train, y_test):
    
    nb = GaussianNB()   
    lr = LogisticRegression(random_state=42, solver='lbfgs', max_iter=2000)
    dt = DecisionTreeClassifier(random_state=42)
    rf = RandomForestClassifier(random_state=42, n_estimators=200)
    ada = AdaBoostClassifier(random_state=42, n_estimators=200)
       
    dict_classifiers = {
        "LR": lr, 
        "NB": nb,
        "DT": dt,
        "Voting": VotingClassifier(estimators = [('DT', dt), ('LR', lr), ('NB', nb)], voting='soft'),
        "Bagging": BaggingClassifier(DecisionTreeClassifier(), n_estimators=200, random_state=42),
        "RF": rf,
        "ExtraTrees": ExtraTreesClassifier(random_state=42, n_estimators=200),
        "Adaboost": ada,
        "GBC": GradientBoostingClassifier(random_state=42, n_estimators=200),
        "Stacking": StackingClassifier(estimators=[('DT', dt), ('LR', lr), ('NB', nb), ('RF', rf), ('ADA', ada)], final_estimator=LogisticRegression())
    }
    
    evals = list()
    
    for model_name, model in dict_classifiers.items():
        start = time.time()
        y_pred = model.fit(X_train, y_train).predict(X_test)
        end = time.time()
        total = end - start
        
        accuracy       = accuracy_score(y_test, y_pred)
        f1             = f1_score(y_test, y_pred)
        recall         = recall_score(y_test, y_pred)
        precision      = precision_score(y_test, y_pred)
        roc_auc        = roc_auc_score(y_test, y_pred)
    
        df = pd.DataFrame({"Method"    : [model_name],
                           "Time"      : [total],
                           "Accuracy"  : [accuracy],
                           "Recall"    : [recall],
                           "Precision" : [precision],
                           "F1"        : [f1],
                           "AUC"       : [roc_auc],
                          })
        evals.append(df)
   

    evals_all = pd.concat([m for m in evals], axis = 0).reset_index()

    evals_all = evals_all.drop(columns = "index",axis =1)
    evals_all = evals_all.sort_values(by=['F1'], ascending=False)
    
    return evals_all

In [5]:
df = pd.read_csv('data/creditcard_sample.csv')
# This dataset is huge, so let's take a sample to speed things up
df = df.sample(frac=0.1, replace=False, random_state=1, axis=0)

X = df.drop(['Class'], axis=1)
y = df['Class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
y.value_counts()
y_test.value_counts()

do_all_for_dataset(X_train, X_test, y_train, y_test)

0    14214
1       26
Name: Class, dtype: int64

0    2843
1       5
Name: Class, dtype: int64

Unnamed: 0,Method,Time,Accuracy,Recall,Precision,F1,AUC
3,Voting,1.460999,0.998947,1.0,0.625,0.769231,0.999472
0,LR,1.265998,0.998947,0.8,0.666667,0.727273,0.899648
6,ExtraTrees,0.789999,0.998947,0.4,1.0,0.571429,0.7
4,Bagging,19.357263,0.998596,0.4,0.666667,0.5,0.699824
5,RF,4.046808,0.998596,0.4,0.666667,0.5,0.699824
7,Adaboost,7.974034,0.998596,0.4,0.666667,0.5,0.699824
9,Stacking,65.740811,0.998596,0.4,0.666667,0.5,0.699824
2,DT,0.227002,0.997893,0.4,0.4,0.4,0.699472
8,GBC,18.909412,0.997893,0.2,0.333333,0.25,0.599648
1,NB,0.016,0.987008,1.0,0.119048,0.212766,0.993493


In [6]:
df = pd.read_csv('data/diabetes_orig.csv')
X = df.drop(['Id', 'diabetes'], axis=1)
y = df['diabetes']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
y.value_counts()
y_test.value_counts()

do_all_for_dataset(X_train, X_test, y_train, y_test)

0    500
1    268
Name: diabetes, dtype: int64

0    100
1     54
Name: diabetes, dtype: int64

Unnamed: 0,Method,Time,Accuracy,Recall,Precision,F1,AUC
7,Adaboost,0.334999,0.772727,0.648148,0.686275,0.666667,0.744074
8,GBC,0.238961,0.753247,0.62963,0.653846,0.641509,0.724815
3,Voting,0.057001,0.746753,0.592593,0.653061,0.621359,0.711296
5,RF,0.350259,0.746753,0.592593,0.653061,0.621359,0.711296
4,Bagging,0.582039,0.746753,0.574074,0.659574,0.613861,0.707037
1,NB,0.006997,0.707792,0.648148,0.57377,0.608696,0.694074
9,Stacking,4.725806,0.733766,0.555556,0.638298,0.594059,0.692778
6,ExtraTrees,0.262999,0.727273,0.555556,0.625,0.588235,0.687778
2,DT,0.011,0.727273,0.5,0.642857,0.5625,0.675
0,LR,0.052,0.714286,0.518519,0.608696,0.56,0.669259


In [7]:
df = pd.read_csv('data/GermanCredit.csv')
df['Class'] = (df['Class'] == 'Good').astype(int)
X = df.drop(['Class'], axis=1)
y = df['Class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=44)

y.value_counts()
y_test.value_counts()

do_all_for_dataset(X_train, X_test, y_train, y_test)

1    700
0    300
Name: Class, dtype: int64

1    140
0     60
Name: Class, dtype: int64

Unnamed: 0,Method,Time,Accuracy,Recall,Precision,F1,AUC
5,RF,0.441998,0.76,0.9,0.7875,0.84,0.666667
9,Stacking,6.201161,0.76,0.857143,0.810811,0.833333,0.695238
6,ExtraTrees,0.379036,0.755,0.864286,0.801325,0.831615,0.682143
0,LR,0.182006,0.75,0.857143,0.8,0.827586,0.678571
7,Adaboost,0.485,0.75,0.85,0.804054,0.826389,0.683333
4,Bagging,1.048002,0.75,0.842857,0.808219,0.825175,0.688095
8,GBC,0.410963,0.735,0.828571,0.8,0.814035,0.672619
3,Voting,0.179001,0.725,0.807143,0.801418,0.80427,0.670238
2,DT,0.012998,0.685,0.764286,0.781022,0.772563,0.632143
1,NB,0.007999,0.685,0.707143,0.818182,0.758621,0.670238


In [8]:
df = pd.read_csv('data/laheart.csv')
X = df.drop(['ID', 'DEATH_YR', 'DEATH'], axis=1)
y = df['DEATH']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)


y.value_counts()
y_train.value_counts()

do_all_for_dataset(X_train, X_test, y_train, y_test)

0    136
1     64
Name: DEATH, dtype: int64

Unnamed: 0,Method,Time,Accuracy,Recall,Precision,F1,AUC
7,Adaboost,0.280997,0.8,0.692308,0.692308,0.692308,0.77208
1,NB,0.003006,0.725,0.384615,0.625,0.47619,0.636752
4,Bagging,0.369671,0.7,0.384615,0.555556,0.454545,0.618234
8,GBC,0.139038,0.675,0.384615,0.5,0.434783,0.599715
5,RF,0.263384,0.725,0.307692,0.666667,0.421053,0.616809
2,DT,0.006002,0.6,0.384615,0.384615,0.384615,0.54416
3,Voting,0.208954,0.675,0.307692,0.5,0.380952,0.579772
6,ExtraTrees,0.199961,0.725,0.230769,0.75,0.352941,0.596866
0,LR,0.197035,0.675,0.230769,0.5,0.315789,0.559829
9,Stacking,4.469018,0.7,0.153846,0.666667,0.25,0.558405


In [None]:
df = pd.read_csv('data/HR_comma_sep.csv')



X = df.drop(['ID', 'DEATH_YR', 'DEATH'], axis=1)
y = df['left']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)


y.value_counts()
y_train.value_counts()

do_all_for_dataset(X_train, X_test, y_train, y_test)