In [1]:
import pandas as pd
import numpy as np
import csv
import random

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier, AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.base import BaseEstimator
from scipy.optimize import minimize
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.calibration import CalibratedClassifierCV
from xgboost import XGBClassifier
from sknn.mlp import Classifier, Layer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler



Couldn't import dot_parser, loading of dot files will not be possible.


### First ensemble technique (EN_optA)
Given a set of predictions  `X1,X2,...,XnX1,X2,...,Xn ,` it computes the optimal set of weights  w1,w2,...,wnw1,w2,...,wn ; such that minimizes  log_loss(yT,yE)log_loss(yT,yE) , where  yE=X1∗w1+X2∗w2+...+Xn∗wnyE=X1∗w1+X2∗w2+...+Xn∗wn  and  yTyT  is the true solution.

In [2]:
def objf_ens_optA(w, Xs, y, n_class):
    """
    Function to be minimized in the EN_optA ensembler.
    
    Parameters:
    ----------
    w: array-like, shape=(n_preds)
       Candidate solution to the optimization problem (vector of weights).
    Xs: list of predictions to combine
       Each prediction is the solution of an individual classifier and has a
       shape=(n_samples, n_classes).
    y: array-like sahpe=(n_samples,)
       Class labels
    n_class: int
       Number of classes in the problem (12 in Airbnb competition)
    
    Return:
    ------
    score: Score of the candidate solution.
    """
    w = np.abs(w)
    sol = np.zeros(Xs[0].shape)
    for i in range(len(w)):
        sol += Xs[i] * w[i]
    #Using log-loss as objective function (different objective functions can be used here). 
    score = log_loss(y, sol)   
    return score
        

class EN_optA(BaseEstimator):
    """
    Given a set of predictions $X_1, X_2, ..., X_n$,  it computes the optimal set of weights
    $w_1, w_2, ..., w_n$; such that minimizes $log\_loss(y_T, y_E)$, 
    where $y_E = X_1*w_1 + X_2*w_2 +...+ X_n*w_n$ and $y_T$ is the true solution.
    """
    def __init__(self, n_class):
        super(EN_optA, self).__init__()
        self.n_class = n_class
        
    def fit(self, X, y):
        """
        Learn the optimal weights by solving an optimization problem.
        
        Parameters:
        ----------
        Xs: list of predictions to be ensembled
           Each prediction is the solution of an individual classifier and has 
           shape=(n_samples, n_classes).
        y: array-like
           Class labels
        """
        #print X.shape[1], self.n_class
        
        Xs = np.hsplit(X, X.shape[1]/self.n_class)
        #Initial solution has equal weight for all individual predictions.
        x0 = np.ones(len(Xs)) / float(len(Xs)) 
        #Weights must be bounded in [0, 1]
        bounds = [(0,1)]*len(x0)   
        #All weights must sum to 1
        cons = ({'type':'eq','fun':lambda w: 1-sum(w)})
        #Calling the solver
        res = minimize(objf_ens_optA, x0, args=(Xs, y, self.n_class), 
                       method='SLSQP', 
                       bounds=bounds,
                       constraints=cons
                       )
        self.w = res.x
        return self
    
    def predict_proba(self, X):
        """
        Use the weights learned in training to predict class probabilities.
        
        Parameters:
        ----------
        Xs: list of predictions to be blended.
            Each prediction is the solution of an individual classifier and has 
            shape=(n_samples, n_classes).
            
        Return:
        ------
        y_pred: array_like, shape=(n_samples, n_class)
                The blended prediction.
        """
        Xs = np.hsplit(X, X.shape[1]/self.n_class)
        y_pred = np.zeros(Xs[0].shape)
        for i in range(len(self.w)):
            y_pred += Xs[i] * self.w[i] 
        return y_pred

### Second ensemble technique (EN_optB)
Given a set of predictions  X1,X2,...,XnX1,X2,...,Xn , where each  Xi  has  m=12  clases, i.e.  Xi=Xi1,Xi2,...,XimXi=Xi1,Xi2,...,Xim . The algorithm finds the optimal set of weights  w11,w12,...,wnmw11,w12,...,wnm ; such that minimizes  log_loss(yT,yE)log_loss(yT,yE) , where  yE=X11∗w11+...+X21∗w21+...+Xnm∗wnmyE=X11∗w11+...+X21∗w21+...+Xnm∗wnm  and and  yTyT  is the true solution.

In [3]:
def objf_ens_optB(w, Xs, y, n_class):
    """
    Function to be minimized in the EN_optB ensembler.
    
    Parameters:
    ----------
    w: array-like, shape=(n_preds)
       Candidate solution to the optimization problem (vector of weights).
    Xs: list of predictions to combine
       Each prediction is the solution of an individual classifier and has a
       shape=(n_samples, n_classes).
    y: array-like sahpe=(n_samples,)
       Class labels
    n_class: int
       Number of classes in the problem, i.e. = 12
    
    Return:
    ------
    score: Score of the candidate solution.
    """
    #Constraining the weights for each class to sum up to 1.
    #This constraint can be defined in the scipy.minimize function, but doing 
    #it here gives more flexibility to the scipy.minimize function 
    #(e.g. more solvers are allowed).
    w_range = np.arange(len(w))%n_class 
    for i in range(n_class): 
        w[w_range==i] = w[w_range==i] / np.sum(w[w_range==i])
        
    sol = np.zeros(Xs[0].shape)
    for i in range(len(w)):
        sol[:, i % n_class] += Xs[int(i / n_class)][:, i % n_class] * w[i] 
        
    #Using log-loss as objective function (different objective functions can be used here). 
    score = log_loss(y, sol)   
    return score
    

class EN_optB(BaseEstimator):
    """
    Given a set of predictions $X_1, X_2, ..., X_n$, where each $X_i$ has
    $m=12$ clases, i.e. $X_i = X_{i1}, X_{i2},...,X_{im}$. The algorithm finds the optimal 
    set of weights $w_{11}, w_{12}, ..., w_{nm}$; such that minimizes 
    $log\_loss(y_T, y_E)$, where $y_E = X_{11}*w_{11} +... + X_{21}*w_{21} + ... 
    + X_{nm}*w_{nm}$ and and $y_T$ is the true solution.
    """
    def __init__(self, n_class):
        super(EN_optB, self).__init__()
        self.n_class = n_class
        
    def fit(self, X, y):
        """
        Learn the optimal weights by solving an optimization problem.
        
        Parameters:
        ----------
        Xs: list of predictions to be ensembled
           Each prediction is the solution of an individual classifier and has 
           shape=(n_samples, n_classes).
        y: array-like
           Class labels
        """
        #print X.shape[1], self.n_class
        
        Xs = np.hsplit(X, X.shape[1]/self.n_class)
        #Initial solution has equal weight for all individual predictions.
        x0 = np.ones(self.n_class * len(Xs)) / float(len(Xs)) 
        #Weights must be bounded in [0, 1]
        bounds = [(0,1)]*len(x0)   
        #Calling the solver (constraints are directly defined in the objective
        #function)
        res = minimize(objf_ens_optB, x0, args=(Xs, y, self.n_class), 
                       method='L-BFGS-B', 
                       bounds=bounds, 
                       )
        self.w = res.x
        return self
    
    def predict_proba(self, X):
        """
        Use the weights learned in training to predict class probabilities.
        
        Parameters:
        ----------
        Xs: list of predictions to be ensembled
            Each prediction is the solution of an individual classifier and has 
            shape=(n_samples, n_classes).
            
        Return:
        ------
        y_pred: array_like, shape=(n_samples, n_class)
                The ensembled prediction.
        """
        
        Xs = np.hsplit(X, X.shape[1]/self.n_class)
        y_pred = np.zeros(Xs[0].shape)
        for i in range(len(self.w)):
            y_pred[:, i % self.n_class] += \
                   Xs[int(i / self.n_class)][:, i % self.n_class] * self.w[i]  
        return y_pred

In [4]:
print('Load data...')
DATA_DIR = "/Users/patrickkennedy/Desktop/Data_Science/Kaggle"
train = pd.read_csv(DATA_DIR + "/Santander/train.csv")
test = pd.read_csv(DATA_DIR + "/Santander/test.csv")

target = train['TARGET'].values

train = train.drop(['ID','TARGET'],axis=1)
id_test = test['ID'].values
test = test.drop(['ID'],axis=1)

print('Clearing...')
for (train_name, train_series), (test_name, test_series) in zip(train.iteritems(),test.iteritems()):
    if train_series.dtype == 'O':
        #for objects: factorize
        train[train_name], tmp_indexer = pd.factorize(train[train_name])
        test[test_name] = tmp_indexer.get_indexer(test[test_name])
        #but now we have -1 values (NaN)
    else:
        #for int or float: fill NaN
        tmp_len = len(train[train_series.isnull()])
        if tmp_len>0:
            #print "mean", train_series.mean()
            train.loc[train_series.isnull(), train_name] = -9999 #train_series.mean()
        #and Test
        tmp_len = len(test[test_series.isnull()])
        if tmp_len>0:
            test.loc[test_series.isnull(), test_name] = -9999 #train_series.mean()  #TODO
            


Load data...
Clearing...


In [8]:
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import Pipeline


In [9]:
n_classes = 2 
for_real = True
random_state = 8312 #3900, 3233

#this is what i'll change when i run the whole data set...
#essentially my train and test sets are already split


#Spliting data into train and test sets.
#try splitting the data differently? .33 test_size?

X, X_test, y, y_test = train_test_split(train, target, test_size=0.2, random_state=random_state)
    
#Spliting train data into training and validation sets.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.25, random_state=random_state)

print('Data shape:')
print('X_train: %s, X_valid: %s, X_test: %s \n' %(X_train.shape, X_valid.shape, 
                                                  X_test.shape))

if for_real:
    #take the train, target and test data, and come up with a validation set from train
    X_real = train
    X_test_real = test
    y_real = target
    
    X_train_real, X_valid_real, y_train_real, y_valid_real = train_test_split(X_real, y_real, 
                                                                              test_size=0.25, 
                                                                              random_state=random_state)
    
    

Data shape:
X_train: (45612, 369), X_valid: (15204, 369), X_test: (15204, 369) 



### First layer (individual classifiers)
All classifiers are applied twice:
Training on (X_train, y_train) and predicting on (X_valid)
Training on (X, y) and predicting on (X_test)
You can add / remove classifiers or change parameter values to see the effect on final results.

In [27]:
%%time
#Defining the classifiers
#think about jittering the random state and then averaging the predictions together ...
#only good for extra trees, RF?, native XGB, NN
clfs = {#'LR'  : LogisticRegression(), 
        #'linSVM' : CalibratedClassifierCV(LinearSVC(), method='isotonic', cv=3), 
        #'RF'  : RandomForestClassifier(n_estimators=1000, n_jobs=-1), 
        #'GBM' : GradientBoostingClassifier(n_estimators=50), 
        'ETC1' : ExtraTreesClassifier(n_estimators=100),#, max_features=130, max_depth=12, n_jobs=-1),
        #'ETC2' : ExtraTreesClassifier(n_estimators=250),#, max_features=130, max_depth=12, n_jobs=-1),
        #'ETC3' : ExtraTreesClassifier(n_estimators=1000),#, max_features=130, max_depth=12, n_jobs=-1),
        #'ETC4' : ExtraTreesClassifier(n_estimators=2500),#, max_features=130, max_depth=12, n_jobs=-1),
        #'ETC5' : ExtraTreesClassifier(n_estimators=10000),#, max_features=130, max_depth=12, n_jobs=-1),

        #'KNN35' : KNeighborsClassifier(n_neighbors=35)#,
        'XGBc1': XGBClassifier(objective='binary:logistic')#, learning_rate=0.03, n_estimators=2500),
        #'XGBc2': XGBClassifier(objective='binary:logistic', learning_rate=0.01),
        #'XGBc3': XGBClassifier(objective='binary:logistic',
        #                      colsample_bytree=0.77638333498678636,
        #                      learning_rate=0.030567867858705199,
        #                      max_delta_step=4.6626180513766657,
        #                      min_child_weight=57.354121041109124,
        #                      n_estimators=478,
        #                      subsample=0.8069399976204783,
        #                      max_depth=6,
        #                      gamma=0.2966938071810209),
        #'NN1'  : Pipeline([('min/max scaler', MinMaxScaler(feature_range=(-1.0, 1.0))),
        #                  ('neural network', Classifier(layers=[Layer("Rectifier", units=10),
        #                                                        Layer("Tanh", units=10),
        #                                                        Layer("Softmax")], 
        #                                                n_iter=15))])#,    
        #'NN2'  : Pipeline([('min/max scaler', MinMaxScaler(feature_range=(-1.0, 1.0))),
        #                  ('neural network', Classifier(layers=[Layer("Sigmoid", units=25),
        #                                                        Layer("Tanh", units=40),
        #                                                        Layer("Sigmoid", units=100),
        #                                                        Layer("Softmax")], 
        #                                                learning_rate=0.03,
        #                                                n_iter=15))])  
       }

#score with 7 models on kaggle - .4503 -- cv = .4478 (diff = +.0025)
#added second NN, 1&2 xgb, linSVM - .4506 -- cv = .4474
#added etc with 2500,10000, xgb with 1000, removed 1 xgb, 1 nn, linsvm, etc1-3 - 
#changed NN to 4-level, learning rate .03, 15 iter - .4506
#just ETC 2500, XGB1/3, NN1 - .4504
#just ETC 2500, XGB3, NN1 - .4503



#predictions on the validation and test sets
p_valid = []
p_test = []

p_valid_real = []
p_test_real = []
   
print('Performance of individual classifiers (1st layer) on X_test')   
print('------------------------------------------------------------')

for w in [9, 25, 50, 100, 'all']:
    select = SelectKBest(k=w)
    for nm, clf in clfs.items():
        pipeline = Pipeline([('select', select), (nm, clf)])
        #First run. Training on (X_train, y_train) and predicting on X_valid.
        pipeline.fit(X_train, y_train)
        yv = pipeline.predict_proba(X_valid)
        p_valid.append(yv)
        
        #Second run. Training on (X, y) and predicting on X_test.
        pipeline.fit(X, y)
        yt = pipeline.predict_proba(X_test)
        p_test.append(yt)
        
        if for_real:
            #First run. Training on (X_train, y_train) and predicting on X_valid.
            pipeline.fit(X_train_real, y_train_real)
            yv_real = pipeline.predict_proba(X_valid_real)
            p_valid_real.append(yv_real)
        
            #Second run. Training on (X, y) and predicting on X_test.
            pipeline.fit(X_real, y_real)
            yt_real = pipeline.predict_proba(X_test_real)
            p_test_real.append(yt_real)
        
        #Printing out the performance of the classifier
        #use AUC as the eval metric and see how that changes things
        print('{:10s} {:2s} {:1.7f} {:1s}'.format('%s: ' %(nm), 'auc  =>', roc_auc_score(y_test, yt[:,1]), str(w)))
        #print('CV score: ' + str(np.mean(cross_val_score(clf, X_test, y_test, scoring='roc_auc', cv=10))))
        #print('Select k: ' + str(w))
print('')

#when running the full data
#take out the logloss function... or alternatively, run both the split data and full data model so that
#i can compare my training logloss vs kaggle logloss

Performance of individual classifiers (1st layer) on X_test
------------------------------------------------------------
ETC1:      auc  => 0.7154475 9
XGBc1:     auc  => 0.8021603 9
ETC1:      auc  => 0.7267709 25
XGBc1:     auc  => 0.8228904 25
ETC1:      auc  => 0.7461476 50
XGBc1:     auc  => 0.8287535 50
ETC1:      auc  => 0.6938513 100
XGBc1:     auc  => 0.8403234 100
ETC1:      auc  => 0.6877166 all
XGBc1:     auc  => 0.8423246 all

CPU times: user 7min 13s, sys: 18.5 s, total: 7min 31s
Wall time: 7min 32s


In [12]:
yt

array([[ 1. ,  0. ],
       [ 1. ,  0. ],
       [ 1. ,  0. ],
       ..., 
       [ 0.5,  0.5],
       [ 1. ,  0. ],
       [ 1. ,  0. ]])

In [35]:
#think about correlating the model predictions... keep the uncorrelated ones (at least before sending them below?)
#df = pd.DataFrame()
cols = [keys for keys in clfs]
for i, col in enumerate(cols):
    df[col] = p_test[i][:,0]

df['y_vals'] = y_test
df.corr()

#redefine p_test with different models

Unnamed: 0,XGBc3,XGBc1,ETC4,NN1,y_vals,linSVM,RF,LR,KNN35
XGBc3,1.0,0.975546,0.732923,0.616782,-0.409715,0.341253,0.878139,0.206674,0.153196
XGBc1,0.975546,1.0,0.728703,0.610413,-0.406189,0.344285,0.862866,0.209698,0.158608
ETC4,0.732923,0.728703,1.0,0.620368,-0.409946,0.305509,0.879908,0.243988,0.187497
NN1,0.616782,0.610413,0.620368,1.0,-0.256992,0.257482,0.588221,0.220373,0.150482
y_vals,-0.409715,-0.406189,-0.409946,-0.256992,1.0,-0.138285,-0.403574,-0.091318,-0.083557
linSVM,0.341253,0.344285,0.305509,0.257482,-0.138285,1.0,0.329116,0.36385,0.146265
RF,0.878139,0.862866,0.879908,0.588221,-0.403574,0.329116,1.0,0.246036,0.176294
LR,0.206674,0.209698,0.243988,0.220373,-0.091318,0.36385,0.246036,1.0,0.202442
KNN35,0.153196,0.158608,0.187497,0.150482,-0.083557,0.146265,0.176294,0.202442,1.0


In [37]:
#decompose df back into p_test for analyses below...

p_test_built = []
p_valid_built = []

#do for p_valid too
p_test_built.append(df['XGBc3'].values)
p_test_built.append(df['XGBc1'].values)
p_test_built.append(df['ETC4'].values)
p_test_built.append(df['NN1'].values)
p_test_built.append(df['linSVM'].values)
p_test_built.append(df['RF'].values)
p_test_built.append(df['LR'].values)
p_test_built.append(df['KNN35'].values)


In [15]:
%%time
print('MEAN MODELS: Performance of individual classifiers (1st layer) on X_test')   
print('------------------------------------------------------------------------')

rs_holder = []

for r in xrange(1):    
    rs = random.randint(1,10000)
    x1 = True
    while x1 == True:
        if rs in rs_holder:
            rs = random.randint(1,10000)
        else:
            x1 = False
    rs_holder.append(rs)
    
    rs = 8312
    
    n_classes = 2 
    for_real = True

    #this is what i'll change when i run the whole data set...
    #essentially my train and test sets are already split

    #Spliting data into train and test sets.
    X, X_test, y, y_test = train_test_split(train, target, test_size=0.2, random_state=rs)
    
    #Spliting train data into training and validation sets.
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.25, random_state=rs)

    #print('Data shape:')
    #print('X_train: %s, X_valid: %s, X_test: %s \n' %(X_train.shape, X_valid.shape, 
    #                                              X_test.shape))

    if for_real:
        #take the train, target and test data, and come up with a validation set from train
        X_real = train
        X_test_real = test
        y_real = target
    
        X_train_real, X_valid_real, y_train_real, y_valid_real = train_test_split(X_real, y_real, 
                                                                              test_size=0.25, random_state=rs)
    




    p_valid_mean = []
    p_test_mean = []

    p_valid_mean_real = []
    p_test_mean_real = []

    for nm, clf in clfs.items():
        p_valid_clf = []
        p_test_clf = []
        p_valid_clf_real = []
        p_test_clf_real = []
    
        holder = []
    
        for i in range(1):
    
            dummy = random.randint(1,10000)
            x = True
            while x == True:
                if dummy in holder:
                    dummy = random.randint(1,10000)
                else:
                    x = False
            holder.append(dummy)
    
            #random.seed(rs)
    
            if nm == 'NN':
                #First run. Training on (X_train, y_train) and predicting on X_valid.
                clf.fit(X_train.as_matrix(), y_train)
                yv = clf.predict_proba(X_valid.as_matrix())
                p_valid_clf.append(yv)
        
                #Second run. Training on (X, y) and predicting on X_test.
                clf.fit(X.as_matrix(), y)
                yt = clf.predict_proba(X_test.as_matrix())
                p_test_clf.append(yt)
        
                if for_real:
                    #First run. Training on (X_train, y_train) and predicting on X_valid.
                    clf.fit(X_train_real.as_matrix(), y_train_real)
                    yv_real = clf.predict_proba(X_valid_real.as_matrix())
                    p_valid_clf_real.append(yv_real)
        
                    #Second run. Training on (X, y) and predicting on X_test.
                    clf.fit(X_real.as_matrix(), y_real)
                    yt_real = clf.predict_proba(X_test_real.as_matrix())
                    p_test_clf_real.append(yt_real)
    
            else:
                #First run. Training on (X_train, y_train) and predicting on X_valid.
                clf.fit(X_train, y_train)
                yv = clf.predict_proba(X_valid)
                p_valid_clf.append(yv)
        
                #Second run. Training on (X, y) and predicting on X_test.
                clf.fit(X, y)
                yt = clf.predict_proba(X_test)
                p_test_clf.append(yt)
        
                if for_real:
                    #First run. Training on (X_train, y_train) and predicting on X_valid.
                    clf.fit(X_train_real, y_train_real)
                    yv_real = clf.predict_proba(X_valid_real)
                    p_valid_clf_real.append(yv_real)
        
                    #Second run. Training on (X, y) and predicting on X_test.
                    clf.fit(X_real, y_real)
                    yt_real = clf.predict_proba(X_test_real)
                    p_test_clf_real.append(yt_real)
            
       
        #Printing out the performance of the classifier
        mean_pred_cv = np.mean(p_valid_clf, axis=0)
        mean_pred_test = np.mean(p_test_clf, axis=0)
        p_valid_mean.append(mean_pred_cv)
        p_test_mean.append(mean_pred_test)
    
        mean_real_pred_cv = np.mean(p_valid_clf_real, axis=0)
        mean_real_pred_test = np.mean(p_test_clf_real, axis=0)
        p_valid_mean_real.append(mean_real_pred_cv)
        p_test_mean_real.append(mean_real_pred_test)
    
        print('{:10s} {:2s} {:1.7f}'.format('%s - mean: ' %(nm), 'logloss  =>', log_loss(y_test, mean_pred_test)))
        cv_score = np.mean(cross_val_score(clf, X_test, y_test, scoring='log_loss', cv=10))
        print('rs: ' + str(rs) + ';   CV score: ' + str(cv_score))
        
    print('')

    

    
#also try setting different parameters for the XGB and add a NN to the mix
#either use bayesopt for each classifier and putting those into this model, -or- 
#randomize both parameters and by random_state... i could do several loops here
#lots of comp time but each random_state run a series of different parameters and
#take the average result for that particular random_state, then run the same
#parameters on the next random_state (don't want random X random as that is hard to replicate)...
#too many combos... let's do bayes_opt

#Random state 8312 = mean xgb: .464, cv: .468 -- confirmed rs, ___ seed
#Random state 3900 = mean xgb: .465, cv: .468
#Random state 3233 = mean xgb: .466, cv: .469




MEAN MODELS: Performance of individual classifiers (1st layer) on X_test
------------------------------------------------------------------------
XGBc - mean:  logloss  => 0.4644625
rs: 8312;   CV score: -0.468037340424

CPU times: user 4min 39s, sys: 2.02 s, total: 4min 41s
Wall time: 4min 41s


In [22]:
for i in range(2):
    seed = random.randint(1,10000)
    clf = XGBClassifier(objective="binary:logistic", seed=seed)
    clf.fit(X, y)
    print('{:10s} {:2s} {:1.7f}'.format('%s - mean: ' %(nm), 'logloss  =>', log_loss(y_test, clf.predict_proba(X_test))))
    print str(seed)
    print('')
    #next step is to change the seed to a randint

XGBc - mean:  logloss  => 0.4644625
8560

XGBc - mean:  logloss  => 0.4644625
5663



In [28]:
print('Performance of optimization based ensemblers (2nd layer) on X_test')   
print('------------------------------------------------------------')
    
#Creating the data for the 2nd layer.
XV = np.hstack(p_valid)
XT = np.hstack(p_test)  

n_classes = 2

#EN_optA
enA = EN_optA(n_classes)
enA.fit(XV, y_valid)
w_enA = enA.w
y_enA = enA.predict_proba(XT)
print('{:20s} {:2s} {:1.7f}'.format('EN_optA:', 'auc  =>', roc_auc_score(y_test, y_enA[:,1])))
    
#Calibrated version of EN_optA 
cc_optA = CalibratedClassifierCV(enA, method='isotonic', cv=5)
cc_optA.fit(XV, y_valid)
y_ccA = cc_optA.predict_proba(XT)
print('{:20s} {:2s} {:1.7f}'.format('Calibrated_EN_optA:', 'auc  =>', roc_auc_score(y_test, y_ccA[:,1])))
        
#EN_optB
enB = EN_optB(n_classes) 
enB.fit(XV, y_valid)
w_enB = enB.w
y_enB = enB.predict_proba(XT)
print('{:20s} {:2s} {:1.7f}'.format('EN_optB:', 'auc  =>', roc_auc_score(y_test, y_enB[:,1])))

#Calibrated version of EN_optB
cc_optB = CalibratedClassifierCV(enB, method='isotonic', cv=5)
cc_optB.fit(XV, y_valid)
y_ccB = cc_optB.predict_proba(XT)  
print('{:20s} {:2s} {:1.7f}'.format('Calibrated_EN_optB:', 'auc  =>', roc_auc_score(y_test, y_ccB[:,1])))
print('')


if for_real:
    print('REAL: Performance of optimization based ensemblers (2nd layer) on X_test')   
    print('------------------------------------------------------------')
    
    #Creating the data for the 2nd layer.
    XV_real = np.hstack(p_valid_real)
    XT_real = np.hstack(p_test_real)  

    n_classes = 2

    #EN_optA
    enA_real = EN_optA(n_classes)
    enA_real.fit(XV_real, y_valid_real)
    w_enA_real = enA_real.w
    y_enA_real = enA_real.predict_proba(XT_real)
    #print('{:20s} {:2s} {:1.7f}'.format('EN_optA:', 'logloss  =>', log_loss(y_test, y_enA)))
    
    #Calibrated version of EN_optA 
    cc_optA_real = CalibratedClassifierCV(enA_real, method='isotonic')
    cc_optA_real.fit(XV_real, y_valid_real)
    y_ccA_real = cc_optA_real.predict_proba(XT_real)
    #print('{:20s} {:2s} {:1.7f}'.format('Calibrated_EN_optA:', 'logloss  =>', log_loss(y_test, y_ccA)))
        
    #EN_optB
    enB_real = EN_optB(n_classes) 
    enB_real.fit(XV_real, y_valid_real)
    w_enB_real = enB_real.w
    y_enB_real = enB_real.predict_proba(XT_real)
    #print('{:20s} {:2s} {:1.7f}'.format('EN_optB:', 'logloss  =>', log_loss(y_test, y_enB)))

    #Calibrated version of EN_optB
    #cc_optB_real = CalibratedClassifierCV(enB_real, method='isotonic')
    #cc_optB_real.fit(XV_real, y_valid_real)
    #y_ccB_real = cc_optB_real.predict_proba(XT_real)  
    #print('{:20s} {:2s} {:1.7f}'.format('Calibrated_EN_optB:', 'logloss  =>', log_loss(y_test, y_ccB)))
    #print('')

Performance of optimization based ensemblers (2nd layer) on X_test
------------------------------------------------------------
EN_optA:             auc  => 0.8418034
Calibrated_EN_optA:  auc  => 0.8412124
EN_optB:             auc  => 0.8413589
Calibrated_EN_optB:  auc  => 0.8408967

REAL: Performance of optimization based ensemblers (2nd layer) on X_test
------------------------------------------------------------


In [133]:
print('MEAN:  Performance of optimization based ensemblers (2nd layer) on X_test')   
print('------------------------------------------------------------')
    
#Creating the data for the 2nd layer.
XV_mean = np.hstack(p_valid_mean)
XT_mean = np.hstack(p_test_mean)  
        
#EN_optA
enA_mean = EN_optA(n_classes)
enA_mean.fit(XV_mean, y_valid)
w_enA_mean = enA_mean.w
y_enA_mean = enA_mean.predict_proba(XT_mean)
print('{:20s} {:2s} {:1.7f}'.format('EN_optA:', 'logloss  =>', log_loss(y_test, y_enA_mean)))
    
#Calibrated version of EN_optA 
cc_optA_mean = CalibratedClassifierCV(enA_mean, method='isotonic')
cc_optA_mean.fit(XV_mean, y_valid)
y_ccA_mean = cc_optA_mean.predict_proba(XT_mean)
print('{:20s} {:2s} {:1.7f}'.format('Calibrated_EN_optA:', 'logloss  =>', log_loss(y_test, y_ccA_mean)))
        
#EN_optB
enB_mean = EN_optB(n_classes) 
enB_mean.fit(XV_mean, y_valid)
w_enB_mean = enB_mean.w
y_enB_mean = enB_mean.predict_proba(XT_mean)
print('{:20s} {:2s} {:1.7f}'.format('EN_optB:', 'logloss  =>', log_loss(y_test, y_enB_mean)))

#Calibrated version of EN_optB
cc_optB_mean = CalibratedClassifierCV(enB_mean, method='isotonic')
cc_optB_mean.fit(XV_mean, y_valid)
y_ccB_mean = cc_optB_mean.predict_proba(XT_mean)  
print('{:20s} {:2s} {:1.7f}'.format('Calibrated_EN_optB:', 'logloss  =>', log_loss(y_test, y_ccB_mean)))
print('')

if for_real:
    print('MEAN:  Performance of optimization based ensemblers (2nd layer) on X_test')   
    print('------------------------------------------------------------')
    
    #Creating the data for the 2nd layer.
    XV_mean_real = np.hstack(p_valid_mean_real)
    XT_mean_real = np.hstack(p_test_mean_real)  
        
    #EN_optA
    enA_mean_real = EN_optA(n_classes)
    enA_mean_real.fit(XV_mean_real, y_valid_real)
    w_enA_mean_real = enA_mean_real.w
    y_enA_mean_real = enA_mean_real.predict_proba(XT_mean_real)
    #print('{:20s} {:2s} {:1.7f}'.format('EN_optA:', 'logloss  =>', log_loss(y_test, y_enA_mean)))
    
    #Calibrated version of EN_optA 
    cc_optA_mean_real = CalibratedClassifierCV(enA_mean_real, method='isotonic')
    cc_optA_mean_real.fit(XV_mean_real, y_valid_real)
    y_ccA_mean_real = cc_optA_mean_real.predict_proba(XT_mean_real)
    #print('{:20s} {:2s} {:1.7f}'.format('Calibrated_EN_optA:', 'logloss  =>', log_loss(y_test, y_ccA_mean)))
        
    #EN_optB
    enB_mean_real = EN_optB(n_classes) 
    enB_mean_real.fit(XV_mean_real, y_valid_real)
    w_enB_mean_real = enB_mean_real.w
    y_enB_mean_real = enB_mean_real.predict_proba(XT_mean_real)
    #print('{:20s} {:2s} {:1.7f}'.format('EN_optB:', 'logloss  =>', log_loss(y_test, y_enB_mean)))

    #Calibrated version of EN_optB
    cc_optB_mean_real = CalibratedClassifierCV(enB_mean_real, method='isotonic')
    cc_optB_mean_real.fit(XV_mean_real, y_valid_real)
    y_ccB_mean_real = cc_optB_mean_real.predict_proba(XT_mean_real)  
    #print('{:20s} {:2s} {:1.7f}'.format('Calibrated_EN_optB:', 'logloss  =>', log_loss(y_test, y_ccB_mean)))
    print('')

MEAN:  Performance of optimization based ensemblers (2nd layer) on X_test
------------------------------------------------------------
EN_optA:             logloss  => 0.4596911
Calibrated_EN_optA:  logloss  => 0.4564359
EN_optB:             logloss  => 0.4582061
Calibrated_EN_optB:  logloss  => 0.4705139

MEAN:  Performance of optimization based ensemblers (2nd layer) on X_test
------------------------------------------------------------



### Weighted averages

In [148]:
#come up with better weights here... reflect that in calibration performance
y_3l = (y_enA * 2./9.) + (y_ccA * 4./9.) + (y_enB * 2./9.) + (y_ccB * 1./9.)
print('{:20s} {:2s} {:1.7f}'.format('3rd_layer:', 'logloss  =>', log_loss(y_test, y_3l)))

3rd_layer:           logloss  => 0.4623964


In [139]:
#come up with better weights here... reflect that in calibration performance
y_3l_mean = (y_enA_mean * 2./9.) + (y_ccA_mean * 4./9.) + (y_enB_mean * 2./9.) + (y_ccB_mean * 1./9.)
print('{:20s} {:2s} {:1.7f}'.format('3rd_layer:', 'logloss  =>', log_loss(y_test, y_3l_mean)))

3rd_layer:           logloss  => 0.4545998


In [116]:
#top 10% baby!  currently: 155/1942.

0.07981462409886715

In [29]:
#optimize weighting of the 3rd level - keep the same weighting for real data
best_score = 0.0

for i in range(10000):
    first = random.randint(0,20)
    second = random.randint(0,20)
    third = random.randint(0,20)
    fourth = random.randint(0,20)
    total = first + second + third + fourth
    first = first / (total * 1.0)
    second = second / (total * 1.0)
    third = third / (total * 1.0)
    fourth = fourth / (total * 1.0)
    
    y_3l = (y_enA * first) + (y_ccA * second) + (y_enB * third) + (y_ccB * fourth)
    current_score = roc_auc_score(y_test, y_3l[:,1])
    
    if current_score > best_score:
        print('{:20s} {:2s} {:1.7f}'.format('3rd_layer:', 'auc  =>', roc_auc_score(y_test, y_3l[:,1])))
        #print first, second, third, fourth
        best_score = current_score
        best_first = first
        best_second = second
        best_third = third
        best_fourth = fourth

3rd_layer:           auc  => 0.8414608
3rd_layer:           auc  => 0.8414829
3rd_layer:           auc  => 0.8415056
3rd_layer:           auc  => 0.8415115
3rd_layer:           auc  => 0.8417328
3rd_layer:           auc  => 0.8417669
3rd_layer:           auc  => 0.8417813
3rd_layer:           auc  => 0.8418001
3rd_layer:           auc  => 0.8418034


In [141]:
#optimize weighting of the 3rd level - keep the same weighting for real data
best_mean_score = 10.0

for i in range(10000):
    first = random.randint(0,20)
    second = random.randint(0,20)
    third = random.randint(0,20)
    fourth = random.randint(0,20)
    total = first + second + third + fourth
    first = first / (total * 1.0)
    second = second / (total * 1.0)
    third = third / (total * 1.0)
    fourth = fourth / (total * 1.0)
    
    y_3l_mean = (y_enA_mean * first) + (y_ccA_mean * second) + (y_enB_mean * third) + (y_ccB_mean * fourth)
    current_score = log_loss(y_test, y_3l_mean)
    
    if current_score < best_mean_score:
        print('{:20s} {:2s} {:1.7f}'.format('3rd_layer:', 'logloss  =>', log_loss(y_test, y_3l)))
        print first, second, third, fourth
        best_mean_score = current_score
        best_mean_first = first
        best_mean_second = second
        best_mean_third = third
        best_mean_fourth = fourth
        

3rd_layer:           logloss  => 0.4576157
0.295081967213 0.262295081967 0.196721311475 0.245901639344
3rd_layer:           logloss  => 0.4576157
0.409090909091 0.340909090909 0.159090909091 0.0909090909091
3rd_layer:           logloss  => 0.4576157
0.461538461538 0.487179487179 0.025641025641 0.025641025641
3rd_layer:           logloss  => 0.4576157
0.333333333333 0.555555555556 0.0 0.111111111111
3rd_layer:           logloss  => 0.4576157
0.0857142857143 0.542857142857 0.285714285714 0.0857142857143
3rd_layer:           logloss  => 0.4576157
0.24 0.56 0.16 0.04
3rd_layer:           logloss  => 0.4576157
0.0 0.933333333333 0.0 0.0666666666667
3rd_layer:           logloss  => 0.4576157
0.130434782609 0.739130434783 0.130434782609 0.0
3rd_layer:           logloss  => 0.4576157
0.0434782608696 0.869565217391 0.0434782608696 0.0434782608696
3rd_layer:           logloss  => 0.4576157
0.0625 0.8125 0.125 0.0
3rd_layer:           logloss  => 0.4576157
0.0714285714286 0.857142857143 0.0714285

In [33]:
#well awesome .. CV score is .4577 and my kaggle score is .45373
best_first, best_second, best_third, best_fourth

array([[ 0.9920762 ,  0.00792379],
       [ 0.98702986,  0.01297014],
       [ 0.99611793,  0.00388207],
       ..., 
       [ 0.88766522,  0.11233478],
       [ 0.9956001 ,  0.00439989],
       [ 0.97207571,  0.02792431]])

In [31]:
if for_real:
    preds = (y_ccA_real * best_second) + \
            (y_enB_real * best_third)
            
    #preds_mean = (y_enA_mean_real * best_mean_first) + \
    #            (y_ccA_mean_real * best_mean_second) + \
    #            (y_enB_mean_real * best_mean_third) + \
    #            (y_ccB_mean_real * best_mean_fourth)

    pd.DataFrame({"ID": id_test, "TARGET": preds[:,1]}).to_csv('oldmanpat_selectkbest_trial2.csv',index=False)

In [None]:
#best optimized score i can get with training data is .44956 using untrained XGBClassifier and ExtraTrees, 
#jittered 100 random_states with mean predictions with weights [0.0, .9047619047, 0.095238095, 0.0]

#let's try with additional models (logistic regression, random forest, NN), try with maybe randomized params,
#try maybe with bayes_optimized params

### Plotting the weights of each ensemble
In the case of EN_optA, there is a weight for each prediction and in the case of EN_optB there is a weight for each class for each prediction.

In [80]:
from tabulate import tabulate
print('         Weights of EN_optA:')
print('|---------------------------------------|')
wA = np.round(w_enA, decimals=2).reshape(1,-1)
print(tabulate(wA, headers=clfs.keys(), tablefmt="orgtbl"))
print('')
print('     Weights of EN_optB:')
print('|---------------------------|')
wB = np.round(w_enB.reshape((-1,n_classes)), decimals=2)
wB = np.hstack((np.array(list(clfs.keys()), dtype=str).reshape(-1,1), wB))
print(tabulate(wB, headers=['y%s'%(i) for i in range(n_classes)], tablefmt="orgtbl"))

         Weights of EN_optA:
|---------------------------------------|
|   ETC |   XGBc |
|-------+--------|
|  0.41 |   0.59 |

     Weights of EN_optB:
|---------------------------|
|      |   y0 |   y1 |
|------+------+------|
| ETC  | 0.65 |    0 |
| XGBc | 0.35 |    1 |


### Comparing our ensemble results with sklearn LogisticRegression based stacking of classifiers.¶
Both techniques EN_optA and EN_optB optimizes an objective function. In this experiment I am using the multi-class logloss as objective function. Therefore, the two proposed methods basically become implementations of LogisticRegression. The following code allows to compare the results of sklearn implementation of LogisticRegression with the proposed ensembles.

In [None]:

#By default the best C parameter is obtained with a cross-validation approach, doing grid search with
#10 values defined in a logarithmic scale between 1e-4 and 1e4.
#Change parameters to see how they affect the final results.
lr = LogisticRegressionCV(Cs=10, dual=False, fit_intercept=True, 
                          intercept_scaling=1.0, max_iter=100,
                          multi_class='ovr', n_jobs=1, penalty='l2', 
                          random_state=random_state,
                          solver='lbfgs', tol=0.0001)

lr.fit(XV, y_valid)
y_lr = lr.predict_proba(XT)
print('{:20s} {:2s} {:1.7f}'.format('Log_Reg:', 'logloss  =>', log_loss(y_test, y_lr)))

In [57]:
print len(p_valid), len(p_valid[0])
print len(np.hstack(p_valid))

4 22864
91456


In [None]:
for i in range(500):
    
    dummy = random.randint(1,10000)
    x = True
    while x == True:
        print dummy, str(len(holder)+1), holder
        #print holder
        if dummy in holder:
            dummy = random.randint(1,10000)
        else:
            x = False
    holder.append(dummy)
    
    random.seed(dummy)

In [None]:
#pd.DataFrame({"ID": id_test, "PredictedProb": np.mean(y_pred, axis=0)}).to_csv('extra_trees_and_log_and_gradientboost_with adas_jitteredrandomstate_500iterations.csv',index=False)