# Stacking

Training D1_D2 dataset using the `Stacking` ensemble method.

### Author
Ajinkya Indulkar

In [1]:
# import libraries
import warnings
import numpy as np

warnings.filterwarnings(action='ignore')

# import custom class
from tadpole import Tadpole

# import sklearn model training
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# set seed (for reproducibility)
np.random.seed(43)

### Data Loading + Pre-processing

In [2]:
# initialize class
tp = Tadpole(debug=True)

In [3]:
# load and pre-process tadpole dataset
tp.load()
print(tp.label_dict)

loading tadpole dataset
pre-processing dataset


In [4]:
# train-test split
tp.split()

splitting dataset to train and test datasets


### Model Training

#### Train a Decision Tree model (base learner)

In [5]:
# define and train DT base learner
dtc = DecisionTreeClassifier()
dtc.fit(tp.X_train, tp.y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [6]:
print('DT Classifier accuracy:', dtc.score(tp.X_test, tp.y_test))

DT Classifier accuracy: 0.8352601156069365


#### Train a Stacking model with DT as base learner

In [7]:
# define estimator pipeline
estimators = [
    ('dt', DecisionTreeClassifier()),
    ('svr', make_pipeline(StandardScaler(), LinearSVC()))]

In [8]:
# define and train stacking ensemble method
stk_clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression(), verbose=2)
stk_clf.fit(tp.X_train, tp.y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.4s finished


StackingClassifier(cv=None,
                   estimators=[('dt',
                                DecisionTreeClassifier(ccp_alpha=0.0,
                                                       class_weight=None,
                                                       criterion='gini',
                                                       max_depth=None,
                                                       max_features=None,
                                                       max_leaf_nodes=None,
                                                       min_impurity_decrease=0.0,
                                                       min_impurity_split=None,
                                                       min_samples_leaf=1,
                                                       min_samples_split=2,
                                                       min_weight_fraction_leaf=0.0,
                                                       presort='deprecated',
                     

In [9]:
print('Stacking Classifier accuracy:', stk_clf.score(tp.X_test, tp.y_test))

Stacking Classifier accuracy: 0.8786127167630058


### Save Best Model

In [10]:
model_name = "stack_200522_03.pkl" #ensemble-date-modelversion

tp.save(stk_clf, model_name)

saving trained model
