In [99]:
# General
import numpy as np
import pandas as pd

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.svm import SVC

# Utilities
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score
from copy import copy as make_copy

In [100]:
# Synthetic binary classification problem: 10000 samples, 40 non-redundant features
SEED = 2018

X, y = make_classification(
    n_samples=10000,
    n_features=40,
    n_redundant=0,
    n_classes=2,
    random_state=SEED,
)

# Feature frame with 1-based integer column labels, plus the label column 'Target'
XDF = pd.DataFrame(X, columns=range(1, 41))
df = pd.concat([XDF, pd.Series(y, name='Target')], axis=1)

# Re-derive features / target from the combined frame so both are pandas objects
X = df.drop(columns='Target')
y = df['Target']

# 70 % training rows, 30 % validation rows
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.3, random_state=SEED
)

In [101]:
# Preview the first rows of the combined frame (feature columns plus 'Target')
df.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,32,33,34,35,36,37,38,39,40,Target
0,2.136398,-1.479217,-0.703011,2.065569,0.326215,0.680216,0.158314,0.475397,0.199404,0.336694,...,0.113876,-1.164264,1.154697,0.181554,0.491464,0.102339,-1.158225,1.428284,0.597758,1
1,-0.504827,-1.015827,-0.903809,0.176179,1.056003,2.216257,-0.336319,-0.350847,-0.490535,1.283877,...,0.457354,1.029749,-0.522809,-0.41161,0.851378,-1.263698,-0.329216,0.52213,0.955266,1
2,0.935382,0.918816,-1.756534,0.469305,0.864305,0.147526,-1.737556,-0.274645,0.172316,0.102401,...,-0.347611,-0.922729,0.944388,-0.230285,1.053182,0.610122,-0.531289,0.177155,1.271056,0
3,0.563239,-0.089968,-1.697253,-1.857505,-0.520472,-0.197492,1.828199,0.259581,-1.307481,1.490092,...,0.588355,-0.481022,0.786705,-0.770595,-0.410716,-1.84844,-0.658107,-0.312612,1.459883,0
4,0.367298,0.46803,0.240915,0.479645,-0.590921,0.252307,0.738193,0.91374,-1.238055,-0.506258,...,0.492747,1.156892,1.067635,0.45759,-0.964262,1.35202,-1.716665,0.413292,1.242477,1


In [102]:
# Preview the first rows of the feature-only frame (no 'Target' column)
X.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,31,32,33,34,35,36,37,38,39,40
0,2.136398,-1.479217,-0.703011,2.065569,0.326215,0.680216,0.158314,0.475397,0.199404,0.336694,...,2.927491,0.113876,-1.164264,1.154697,0.181554,0.491464,0.102339,-1.158225,1.428284,0.597758
1,-0.504827,-1.015827,-0.903809,0.176179,1.056003,2.216257,-0.336319,-0.350847,-0.490535,1.283877,...,-0.918507,0.457354,1.029749,-0.522809,-0.41161,0.851378,-1.263698,-0.329216,0.52213,0.955266
2,0.935382,0.918816,-1.756534,0.469305,0.864305,0.147526,-1.737556,-0.274645,0.172316,0.102401,...,0.559697,-0.347611,-0.922729,0.944388,-0.230285,1.053182,0.610122,-0.531289,0.177155,1.271056
3,0.563239,-0.089968,-1.697253,-1.857505,-0.520472,-0.197492,1.828199,0.259581,-1.307481,1.490092,...,1.2965,0.588355,-0.481022,0.786705,-0.770595,-0.410716,-1.84844,-0.658107,-0.312612,1.459883
4,0.367298,0.46803,0.240915,0.479645,-0.590921,0.252307,0.738193,0.91374,-1.238055,-0.506258,...,-0.648702,0.492747,1.156892,1.067635,0.45759,-0.964262,1.35202,-1.716665,0.413292,1.242477


In [103]:
# Preview the training labels; the index holds the original row labels after the split
y_train.head()

5608    0
9811    1
8748    1
1223    0
794     0
Name: Target, dtype: int64

In [104]:
# Preview the training features; row labels line up with those of y_train
X_train.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,31,32,33,34,35,36,37,38,39,40
5608,-0.097796,1.802956,0.261603,1.855426,-0.257414,-0.668384,-0.144844,-0.418893,-0.597511,1.688892,...,0.400077,-0.655398,0.49869,0.475219,0.252238,0.477526,0.655597,1.375399,1.171419,0.854399
9811,0.282825,-0.465071,-1.669404,0.605164,-0.490462,-0.998484,0.14022,1.147137,0.172802,-0.638729,...,2.536403,0.301944,0.845361,0.374867,0.008023,-0.070534,-0.062904,-0.084143,-1.348575,1.081723
8748,-1.081339,0.178929,-1.218006,-0.662687,0.089189,0.379952,-0.281489,-1.589863,-0.138283,0.405542,...,-1.230865,-0.562558,-0.036891,-0.309939,-1.088222,0.270362,-1.503581,-0.984401,-0.130068,2.189865
1223,0.474709,0.289101,0.89263,-0.623837,0.546225,-0.873852,-0.191336,0.150735,-0.432376,0.013506,...,2.6908,1.924172,0.317753,0.321746,-1.24369,0.990459,-0.554268,-0.676453,1.190681,-1.979539
794,0.124288,-1.480249,0.38419,-0.159436,-1.079928,0.573386,0.064537,-1.510065,-0.813511,1.835092,...,1.222295,-4.124583,-0.560597,0.532232,-0.350833,-0.001426,-1.372551,-1.23221,0.758144,1.44306


In [105]:
# Level 0 (base) estimators -- SVC needs probability=True to expose predict_proba
base_clf = [
    LogisticRegression(solver='lbfgs', random_state=5),
    RandomForestClassifier(random_state=5, n_estimators=100),
    AdaBoostClassifier(random_state=5),
    SVC(gamma='scale', probability=True, random_state=5),
]

# Level 1 (stacking) estimator
stck_clf = LogisticRegression(solver='lbfgs', random_state=5)

In [106]:
# Score each base estimator on its own against the validation split
for clf in base_clf:

    # Train on the training split, then label the validation rows
    y_pred = clf.fit(X_train, y_train).predict(X_val)

    # Report validation accuracy as a percentage
    acc = accuracy_score(y_val, y_pred)
    print('{} Accuracy: {:.2f}%'.format(type(clf).__name__, acc * 100))

LogisticRegression Accuracy: 95.20%
RandomForestClassifier Accuracy: 96.17%
AdaBoostClassifier Accuracy: 95.40%
SVC Accuracy: 96.17%


In [114]:
# Create Hold Out predictions (meta-features)
def hold_out_predict(clf, X, y, cv):
    """Produce out-of-fold class-probability predictions for stacking.

    For every (train, hold-out) index pair yielded by ``cv``, a copy of ``clf``
    is fitted on the training rows and ``predict_proba`` is evaluated on the
    hold-out rows, so each row's prediction comes from a model that was not
    fitted on that row.

    Parameters
    ----------
    clf : estimator
        Object exposing ``fit(X, y)`` and ``predict_proba(X)``. A shallow copy
        is made for each fold.
    X : pandas.DataFrame or array-like, shape (n_samples, n_features)
        Feature rows; indexed positionally.
    y : pandas.Series or array-like, shape (n_samples,)
        Labels aligned positionally with ``X``.
    cv : cross-validation splitter
        Must provide ``get_n_splits(X, y)`` and ``split(X, y)`` yielding
        positional index arrays.

    Returns
    -------
    numpy.ndarray, shape (n_samples, n_classes)
        One probability column per class found in ``y``. Rows that never
        appear in a hold-out fold keep their initial value of zero.
    """

    def _take(data, idx):
        # pandas objects need positional .iloc; anything else is indexed as an array
        return data.iloc[idx] if hasattr(data, "iloc") else np.asarray(data)[idx]

    # Initialize
    n_classes = len(np.unique(y))  # Assuming that every training fold contains all classes
    meta_features = np.zeros((len(X), n_classes))
    n_splits = cv.get_n_splits(X, y)

    # Loop over folds
    print("Starting hold out prediction with {} splits for {}.".format(n_splits, clf.__class__.__name__))
    # y is forwarded so that label-aware splitters (e.g. stratified ones) work too
    for train_idx, hold_out_idx in cv.split(X, y):

        # Fit estimator to K-1 parts and predict on hold out part
        # NOTE(review): a shallow copy is kept from the original code;
        # sklearn.base.clone may be the safer choice -- confirm before switching.
        est = make_copy(clf)
        est.fit(_take(X, train_idx), _take(y, train_idx))

        # Fill in meta features
        meta_features[hold_out_idx] = est.predict_proba(_take(X, hold_out_idx))

    return meta_features

In [117]:
# Create meta-features for training data

# Define 4-fold CV. random_state is intentionally not passed: without
# shuffle=True it has no effect, and recent scikit-learn releases raise a
# ValueError when it is set on a non-shuffling KFold.
cv = KFold(n_splits=4)

# Loop over classifiers to produce meta features
meta_train = []
for clf in base_clf:

    # Create hold out predictions for a classifier
    meta_train_clf = hold_out_predict(clf, X_train, y_train, cv)

    # Remove redundant column: with two classes the first probability column
    # is fully determined by the second, so only the second one is kept
    meta_train_clf = np.delete(meta_train_clf, 0, axis=1).ravel()

    # Gather meta training data
    meta_train.append(meta_train_clf)

# One row per training sample, one column per base classifier
meta_train = np.array(meta_train).T
print(pd.DataFrame(meta_train).head())

Starting hold out prediction with 4 splits for LogisticRegression.
Starting hold out prediction with 4 splits for RandomForestClassifier.
Starting hold out prediction with 4 splits for AdaBoostClassifier.
Starting hold out prediction with 4 splits for SVC.
          0     1         2         3
0  0.001152  0.03  0.467789  0.000708
1  0.098924  0.12  0.486892  0.024204
2  0.997904  0.91  0.536437  1.000000
3  0.117561  0.33  0.492631  0.071219
4  0.002137  0.08  0.467043  0.008766


In [119]:
# Create meta-features for the validation data

meta_val_columns = []
for clf in base_clf:

    # Refit the base model on the whole training split, then score the validation rows
    val_proba = clf.fit(X_train, y_train).predict_proba(X_val)

    # Drop the first probability column and flatten what is left
    meta_val_columns.append(val_proba[:, 1:].ravel())

# One row per validation sample, one column per base classifier
meta_val = np.array(meta_val_columns).T
print(pd.DataFrame(meta_val).head())

          0     1         2         3
0  0.986661  0.96  0.527771  0.992024
1  0.500428  0.53  0.507079  0.307345
2  0.000301  0.10  0.463241  0.000201
3  0.975511  0.89  0.534609  0.970372
4  0.999992  0.95  0.536192  1.000000


In [None]:
# Predict on Stacking Classifier

# Set seed (only for estimators that actually expose a random_state parameter)
if 'random_state' in stck_clf.get_params():
    stck_clf.set_params(random_state=SEED)

# Optional (Add original features to meta).
# The stacked inputs get their own names so that meta_train / meta_val are
# never overwritten and re-running this cell does not append the features twice.
original_flag = False
if original_flag:
    stack_train = np.concatenate((meta_train, X_train), axis=1)
    stack_val = np.concatenate((meta_val, X_val), axis=1)
else:
    stack_train, stack_val = meta_train, meta_val

# Fit model on the out-of-fold meta-features of the training split
stck_clf.fit(stack_train, y_train)

# Predict on the validation meta-features
y_pred = stck_clf.predict(stack_val)

# Calculate accuracy against the validation labels
acc = accuracy_score(y_val, y_pred)
print('Stacking {} Accuracy: {:.2f}%'.format(stck_clf.__class__.__name__, acc * 100))