<a href="https://colab.research.google.com/github/stevenhastings/DS_Workshops/blob/main/SuperEnsembleClassifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import os
from time import perf_counter

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split

# ***gen_data.py***

In [2]:
import os
from string import ascii_uppercase

from pandas import DataFrame
from sklearn import datasets


# Size of the synthetic dataset and the names of its columns.
n_features = 5
n_samples = 1000
target = "Target"
features = list(ascii_uppercase[:n_features])  # one capital letter per feature

# No repeated or redundant features are requested, so the informative count
# is tied to n_features rather than repeated as a separate literal that
# could drift out of sync with it.
X, y = datasets.make_classification(
    n_samples=n_samples,
    n_features=n_features,
    n_informative=n_features,
    n_repeated=0,
    n_redundant=0,
    n_classes=5,
    random_state=42,
)

df = DataFrame(data=X, columns=features)
df[target] = y

# Written next to the notebook (current working directory); index=False
# keeps the row index out of the file.
df.to_csv("dataset2.csv", index=False)

# The preview goes last: a notebook cell only renders its final expression.
df.head()

In [6]:
# Both CSV files are read straight from raw GitHub URLs.
first_path = "https://raw.githubusercontent.com/BrokenShell/SuperEnsembleClassifier/main/data/dataset.csv"
second_path = "https://raw.githubusercontent.com/BrokenShell/SuperEnsembleClassifier/main/data/dataset2.csv"
df = pd.read_csv(first_path)
df2 = pd.read_csv(second_path)

# Seeded with the same random_state=42 the notebook uses for its models and
# split, so the previewed rows are identical after Restart & Run All.
df.sample(10, random_state=42)

Unnamed: 0,A,B,C,D,E,Target
4740,-0.189699,-0.582538,-2.568042,1.215547,2.068821,3
312,2.863868,2.580598,-1.990148,4.335736,-0.814625,1
1710,0.679644,-0.54451,1.334222,0.178388,0.363296,1
2958,-1.574501,1.670298,-3.054169,-0.96448,2.22003,1
2148,-0.97175,-0.418527,-0.490103,-0.951355,-1.839895,4
4794,-0.77011,1.819862,0.803217,-0.117204,0.028863,2
146,2.630792,2.476772,-2.113398,2.269013,-3.745115,1
1398,-2.300902,-0.520139,2.524671,-0.441127,0.030031,2
2212,-2.076105,2.092011,-1.659259,-2.919632,1.814562,2
3168,-1.533323,0.797433,-3.420137,-0.095002,2.061979,1


In [7]:
# Preview of the second dataset; seeded so the sampled rows are reproducible.
df2.sample(10, random_state=42)

Unnamed: 0,A,B,C,D,E,Target
499,-0.380621,-1.063205,2.526326,-0.195112,2.629562,2
742,-2.712791,-3.527402,-1.969738,-2.667003,1.493164,4
317,-1.776457,-0.265082,1.542769,1.366614,0.532151,0
103,-2.867411,-2.085744,1.614996,0.91272,2.393075,2
140,0.179392,-0.25191,0.270208,-1.642694,2.63016,3
414,-1.834358,1.136924,1.609472,-0.476674,1.784506,2
416,-1.158206,0.576693,-0.277032,-0.744612,0.153791,3
467,-0.666197,1.843392,-0.331834,4.04304,-0.039171,1
558,1.424989,2.165713,-1.550297,2.238547,-2.043095,1
909,-0.148005,1.626791,-2.046746,-1.24086,1.617276,3


In [8]:
# (rows, columns) of df.
df.shape

(5000, 6)

In [9]:
# Per-column summary statistics of df.
df.describe()

Unnamed: 0,A,B,C,D,E,Target
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,-0.587581,0.614006,0.005137,-0.215878,0.006497,2.0014
std,1.493213,1.498148,1.72805,1.652978,1.632831,1.413576
min,-6.69598,-5.024347,-6.114165,-6.489088,-6.87751,0.0
25%,-1.575304,-0.346215,-1.198611,-1.386536,-1.075645,1.0
50%,-0.691885,0.703356,-0.068541,-0.253239,0.05598,2.0
75%,0.326658,1.63805,1.169151,0.945687,1.154812,3.0
max,5.128072,6.357553,6.419446,4.993319,4.938408,4.0


In [10]:
# Pairwise correlation between the columns of df (label column included).
df.corr()

Unnamed: 0,A,B,C,D,E,Target
A,1.0,0.187545,0.135122,0.341305,-0.110537,-0.181537
B,0.187545,1.0,0.037219,0.265218,-0.088616,-0.28738
C,0.135122,0.037219,1.0,-0.029098,0.14531,-0.15968
D,0.341305,0.265218,-0.029098,1.0,-0.057682,-0.252799
E,-0.110537,-0.088616,0.14531,-0.057682,1.0,-0.354342
Target,-0.181537,-0.28738,-0.15968,-0.252799,-0.354342,1.0


In [11]:
# The last column is the label; every remaining column is a feature.
target = df.columns[-1]
features = df.columns.drop(target)

# Hold out 20% of the rows for testing, with a fixed seed for repeatability.
X_all, y_all = df[features], df[target]
X_train, X_test, y_train, y_test = train_test_split(
    X_all, y_all, test_size=0.2, random_state=42
)

In [12]:
# Accuracy of guessing uniformly at random among the distinct labels.
# Uses the `target` column name instead of repeating the literal.
print(f"{1 / df[target].nunique():.2%}")

20.00%


In [13]:
# Baseline: a logistic regression, with only the fit call being timed.
base_model = LogisticRegression(max_iter=1024, random_state=42)

start = perf_counter()
base_model.fit(X_train, y_train)
stop = perf_counter()
duration = stop - start

accuracy = base_model.score(X_test, y_test)
print(
    f"Algorithm: {base_model}",
    f"Train Time: {duration:.2f}s",
    f"Accuracy Score: {accuracy:.2%}",
    sep="\n",
)

Algorithm: LogisticRegression(max_iter=1024, random_state=42)
Train Time: 0.07s
Accuracy Score: 54.80%


In [14]:
# Stand-alone classifiers to compare; each is fitted on the training split,
# timed, and scored on the test split.
models = [
    KNeighborsClassifier(),
    SVC(random_state=42),
    DecisionTreeClassifier(random_state=42),
    RandomForestClassifier(random_state=42),
    AdaBoostClassifier(random_state=42),
    GaussianNB(),
]

for model in models:
    start = perf_counter()
    model.fit(X_train, y_train)
    stop = perf_counter()
    duration = stop - start

    test_score = model.score(X_test, y_test)
    # The trailing "\n" leaves a blank line between consecutive reports.
    print(
        f"Algorithm: {model}",
        f"Train Time: {duration:.2f}s",
        f"Test Score: {test_score:.2%}\n",
        sep="\n",
    )

Algorithm: KNeighborsClassifier()
Train Time: 0.01s
Test Score: 75.30%

Algorithm: SVC(random_state=42)
Train Time: 0.50s
Test Score: 74.70%

Algorithm: DecisionTreeClassifier(random_state=42)
Train Time: 0.03s
Test Score: 62.20%

Algorithm: RandomForestClassifier(random_state=42)
Train Time: 0.92s
Test Score: 73.80%

Algorithm: AdaBoostClassifier(random_state=42)
Train Time: 0.34s
Test Score: 46.80%

Algorithm: GaussianNB()
Train Time: 0.00s
Test Score: 57.10%



In [15]:
from sklearn.ensemble import StackingClassifier

In [16]:
# Each candidate "executive" takes a turn as the final estimator stacked on
# top of the same five workers.
# NOTE(review): every candidate is scored on X_test; if these scores are used
# to pick the executive, the test split doubles as a selection set — confirm
# that is acceptable.
executives = [
    KNeighborsClassifier(),
    SVC(random_state=42),
    RandomForestClassifier(random_state=42),
    AdaBoostClassifier(random_state=42),
    GaussianNB(),
]

for executive in executives:
    model = StackingClassifier(
        final_estimator=executive,
        estimators=[
            ("KNC", KNeighborsClassifier()),
            ("SVC", SVC(random_state=42)),
            ("RFC", RandomForestClassifier(random_state=42)),
            ("ABC", AdaBoostClassifier(random_state=42)),
            ("GNB", GaussianNB()),
        ],
    )

    start = perf_counter()
    model.fit(X_train, y_train)
    stop = perf_counter()
    duration = stop - start

    worker_names = ", ".join(model.named_estimators)
    test_score = model.score(X_test, y_test)
    # The trailing "\n" leaves a blank line between consecutive reports.
    print(
        f"Workers: {worker_names}",
        f"Executive: {model.final_estimator}",
        f"Train Time: {duration:.2f}s",
        f"Test Score: {test_score:.2%}\n",
        sep="\n",
    )

Workers: KNC, SVC, RFC, ABC, GNB
Executive: KNeighborsClassifier()
Train Time: 9.33s
Test Score: 72.90%

Workers: KNC, SVC, RFC, ABC, GNB
Executive: SVC(random_state=42)
Train Time: 9.75s
Test Score: 75.10%

Workers: KNC, SVC, RFC, ABC, GNB
Executive: RandomForestClassifier(random_state=42)
Train Time: 10.69s
Test Score: 77.10%

Workers: KNC, SVC, RFC, ABC, GNB
Executive: AdaBoostClassifier(random_state=42)
Train Time: 9.90s
Test Score: 66.90%

Workers: KNC, SVC, RFC, ABC, GNB
Executive: GaussianNB()
Train Time: 9.22s
Test Score: 74.10%



In [17]:
# Leave-one-out over the workers: each round drops exactly one worker and
# stacks the remaining ones under a random-forest executive.
workers = [
    ("KNC", KNeighborsClassifier()),
    ("SVC", SVC(random_state=42)),
    ("RFC", RandomForestClassifier(random_state=42)),
    ("ABC", AdaBoostClassifier(random_state=42)),
    ("GNB", GaussianNB()),
]

for i, _ in enumerate(workers):
    # Keep every worker except the one at position i, in the original order.
    worker_set = [worker for j, worker in enumerate(workers) if j != i]
    model = StackingClassifier(
        final_estimator=RandomForestClassifier(random_state=42),
        estimators=worker_set,
    )

    start = perf_counter()
    model.fit(X_train, y_train)
    stop = perf_counter()
    duration = stop - start

    worker_names = ", ".join(model.named_estimators)
    test_score = model.score(X_test, y_test)
    # The trailing "\n" leaves a blank line between consecutive reports.
    print(
        f"Workers: {worker_names}",
        f"Executive: {model.final_estimator}",
        f"Train Time: {duration:.2f}s",
        f"Test Score: {test_score:.2%}\n",
        sep="\n",
    )

Workers: SVC, RFC, ABC, GNB
Executive: RandomForestClassifier(random_state=42)
Train Time: 10.65s
Test Score: 75.90%

Workers: KNC, RFC, ABC, GNB
Executive: RandomForestClassifier(random_state=42)
Train Time: 7.81s
Test Score: 75.40%

Workers: KNC, SVC, ABC, GNB
Executive: RandomForestClassifier(random_state=42)
Train Time: 5.79s
Test Score: 76.20%

Workers: KNC, SVC, RFC, GNB
Executive: RandomForestClassifier(random_state=42)
Train Time: 8.61s
Test Score: 76.20%

Workers: KNC, SVC, RFC, ABC
Executive: RandomForestClassifier(random_state=42)
Train Time: 10.32s
Test Score: 76.10%



In [18]:
# Stacking ensemble: all five workers under a random-forest executive.
model = StackingClassifier(
    final_estimator=RandomForestClassifier(random_state=42),
    estimators=[
        ("KNC", KNeighborsClassifier()),
        ("SVC", SVC(random_state=42)),
        ("RFC", RandomForestClassifier(random_state=42)),
        ("ABC", AdaBoostClassifier(random_state=42)),
        ("GNB", GaussianNB()),
    ],
)

start = perf_counter()
model.fit(X_train, y_train)
stop = perf_counter()
duration = stop - start

worker_names = ", ".join(model.named_estimators)
test_score = model.score(X_test, y_test)
# The trailing "\n" ends the report with a blank line.
print(
    f"Workers: {worker_names}",
    f"Executive: {model.final_estimator}",
    f"Train Time: {duration:.2f}s",
    f"Test Score: {test_score:.2%}\n",
    sep="\n",
)

Workers: KNC, SVC, RFC, ABC, GNB
Executive: RandomForestClassifier(random_state=42)
Train Time: 10.67s
Test Score: 77.10%



In [19]:
from sklearn.ensemble import VotingClassifier

In [20]:
# Voting ensemble over the same five workers, for comparison with stacking.
# No voting mode is passed, so the library default applies.
voters = [
    ("KNC", KNeighborsClassifier()),
    ("SVC", SVC(random_state=42)),
    ("RFC", RandomForestClassifier(random_state=42)),
    ("ABC", AdaBoostClassifier(random_state=42)),
    ("GNB", GaussianNB()),
]
model = VotingClassifier(estimators=voters)

start = perf_counter()
model.fit(X_train, y_train)
stop = perf_counter()
duration = stop - start

worker_names = ", ".join(model.named_estimators)
test_score = model.score(X_test, y_test)
# The trailing "\n" ends the report with a blank line.
print(
    f"Workers: {worker_names}",
    f"Train Time: {duration:.2f}s",
    f"Test Score: {test_score:.2%}\n",
    sep="\n",
)

Workers: KNC, SVC, RFC, ABC, GNB
Train Time: 1.75s
Test Score: 73.70%

