In [2]:
import math
import pandas as pd
import numpy as np
import scipy as sp

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier
from xgboost.sklearn import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import GridSearchCV, KFold, train_test_split, cross_val_score, cross_val_predict
from sklearn import metrics

In [17]:
# Training labels; the first CSV column is used as the row index.
y_train = pd.read_csv(
    '../ensemble/train/y.csv',
    index_col=0,
)

# Ensembling

In [16]:
class Ensemble(object):
    """Stacking ensemble built on pre-computed base-model predictions.

    Base-model predictions ("metafeatures") are read from CSV files named
    ``../ensemble/<mode>/<model>_<mode>.csv`` where ``mode`` is ``'train'`` or
    ``'test'``. Each file is read with its first column as the index and must
    contain a ``'Y'`` column. Every stacker is fitted on the training
    metafeatures, and the stackers' ``predict_proba(...)[:, 1]`` outputs on
    the test metafeatures are combined with an unweighted harmonic mean.

    Parameters
    ----------
    stacker_models : sequence
        Classifiers exposing ``fit(X, y)`` and ``predict_proba(X)``.
    stacker_weights : sequence
        Stored as ``self.weights``. NOTE: the weights are not used when
        combining the stackers; the combination is an unweighted harmonic mean.
    base_models : sequence of str
        Base-model names used to build the metafeature file names.
    num_train_points : int
        Expected number of rows in every ``*_train.csv`` file and in y_train.
    num_test_points : int
        Expected number of rows in every ``*_test.csv`` file.
    """

    def __init__(self, stacker_models, stacker_weights,
                 base_models, num_train_points, num_test_points):
        self.stackers = stacker_models
        self.weights = stacker_weights
        self.n_train = num_train_points
        self.n_test = num_test_points
        self.models = base_models

    def fit_predict(self, y_train):
        """Fit every stacker on the train metafeatures and predict the test set.

        Parameters
        ----------
        y_train : array-like with a ``shape`` attribute
            Training labels; flattened with ``np.ravel`` before fitting.

        Returns
        -------
        numpy.ndarray
            One value per test point: the harmonic mean across stackers of
            their ``predict_proba(...)[:, 1]`` outputs. Also stored as
            ``self.y_pred``.

        Raises
        ------
        ValueError
            If ``y_train`` or any metafeature file has an unexpected number
            of rows.
        """
        n_labels = y_train.shape[0]
        if n_labels != self.n_train:
            raise ValueError(
                'Number of points in y_train (%d) does not equal number of '
                'data points in training set (%d)' % (n_labels, self.n_train))

        # Imported explicitly: on older SciPy releases a bare `import scipy`
        # does not load the `stats` submodule, so `sp.stats` may be missing.
        from scipy import stats

        # Load all train and test data from the base models
        self.S_train = self.load_metafeatures('train', self.models, self.n_train)
        self.S_test = self.load_metafeatures('test', self.models, self.n_test)

        # Train each stacker on the metafeatures; one output column per stacker
        labels = np.ravel(y_train)
        self.S_preds = np.zeros((self.n_test, len(self.stackers)))
        for i, stacker in enumerate(self.stackers):
            stacker.fit(self.S_train, labels)
            self.S_preds[:, i] = stacker.predict_proba(self.S_test)[:, 1]

        # Combine the stackers row-wise (self.weights is intentionally unused)
        self.y_pred = stats.hmean(self.S_preds, axis=1)

        return self.y_pred

    def load_metafeatures(self, mode, models, num_points):
        """Read the 'Y' column of every base model's CSV into one matrix.

        Parameters
        ----------
        mode : str
            Sub-directory and file-name suffix (``'train'`` or ``'test'``).
        models : sequence of str
            Base-model names; column ``i`` of the result belongs to ``models[i]``.
        num_points : int
            Number of rows every file is required to have.

        Returns
        -------
        numpy.ndarray of shape (num_points, len(models))

        Raises
        ------
        ValueError
            If a file does not contain exactly ``num_points`` rows.
        """
        S_data = np.zeros((num_points, len(models)))

        for i, model in enumerate(models):
            path = '../ensemble/' + mode + '/' + model + '_' + mode + '.csv'
            S_data_i = pd.read_csv(path, index_col=0)
            if S_data_i.shape[0] != num_points:
                raise ValueError(
                    'Number of points in %s_%s (%d) does not equal the expected '
                    'number of data points in the %s set (%d)'
                    % (model, mode, S_data_i.shape[0], mode, num_points))
            S_data[:, i] = np.ravel(S_data_i['Y'])

        return S_data

    def make_submission(self, filename):
        """Write the predictions and the list of base models to disk.

        Creates ``../ensemble/results/<filename>.csv`` (column ``'Y'``, indexed
        like the first base model's test file) and
        ``../ensemble/results/<filename>.txt`` listing the base models.

        Raises
        ------
        AttributeError
            If called before ``fit_predict`` has produced ``self.y_pred``.
        """
        if getattr(self, 'y_pred', None) is None:
            raise AttributeError('fit_predict must be called before make_submission')

        # The first base model's test file is read only to reuse its row index
        test_data = pd.read_csv('../ensemble/test/' + self.models[0] + '_test.csv', index_col=0)
        submission = pd.DataFrame(columns=['Y'], index=test_data.index, data=self.y_pred)
        submission.to_csv('../ensemble/results/' + filename + ".csv")

        # Context manager guarantees the file is flushed and closed
        with open('../ensemble/results/' + filename + '.txt', 'w') as models_file:
            models_file.write('Models used:\n')
            for model in self.models:
                models_file.write('%s\n' % model)

def sanity_check (y_train, submission):
    """Print the mean training label next to the mean predicted value.

    Parameters
    ----------
    y_train : array-like
        Training labels. May be a one-column DataFrame, a Series or any
        array-like; it is flattened so the mean is always a single number.
    submission : mapping with a 'Y' entry exposing ``.mean()``
        Predictions, e.g. a DataFrame with a 'Y' column.
    """
    # A one-column DataFrame's .mean() is a Series, which would make every
    # line below print a Series repr; flatten first to get a plain scalar.
    train_mean = float(pd.Series(np.ravel(y_train)).mean())
    pred_mean = float(submission['Y'].mean())

    print ()
    print ("Sanity Check:")
    print ("Mean of Y in training data:", train_mean)
    print ("Versus Mean of predicted Y:", pred_mean)
    print ("Difference = ", round(abs(train_mean - pred_mean), 3))
    print ("If these differ by more than 0.01 or so, something may have gone wrong")

# Setup ensemble

In [10]:
# Names of the base models whose saved predictions feed the stackers.
# Ensemble.load_metafeatures reads '../ensemble/<mode>/<name>_<mode>.csv'
# for each name, so every entry needs both a train and a test file.
base_models = [
    'Ada',
    'Chris',
    'ChrisNoBag',
    'ExtraTrees',
    'KNN_2',
    'KNN_4',
    'KNN_8',
    'KNN_16',
    'KNN_32',
    'KNN_64',
    'KNN_128',
    'KNN_256',
    'KNN_512',
    'LogReg',
    'RandomForest',
    'XGB_Bagged',
    'XGB_Clean',
    'XGB_Inter',
    'XGB_Missing',
    'XGB_Raw'
]

# random_state is fixed so that re-running the notebook reproduces the same
# forest; without it every run yields slightly different predictions.
rf_stacker_model = RandomForestClassifier(criterion='entropy', max_depth=5, n_estimators=200,
                                          n_jobs=-1, random_state=0)
# NOTE(review): `silent` appears to have been replaced by `verbosity` in newer
# xgboost releases — confirm against the installed version before upgrading.
xgb_stacker_model = XGBClassifier(max_depth=6, subsample=1.0, n_estimators=100, 
                                  gamma=7, objective='binary:logistic', silent=1,
                                  reg_lambda=10, learning_rate=0.3,
                                  min_child_weight=10, colsample_bytree=1.0, colsample_bylevel=0.25)
stacker_models = [rf_stacker_model, xgb_stacker_model]
# One weight per stacker. Ensemble stores these but combines the stackers with
# an unweighted harmonic mean, so they do not affect the predictions.
stacker_weights = [0.85, 0.15]

In [18]:
# Build the ensemble, fit the stackers on the train metafeatures and predict
# the test set; the row counts must match the CSV files on disk.
ens = Ensemble(
    stacker_models,
    stacker_weights,
    base_models,
    num_train_points=49998,
    num_test_points=50000,
)
preds = ens.fit_predict(y_train)
# Mean predicted value over the test set, as a quick plausibility check.
print(np.mean(preds))

0.0588657462828


In [19]:
# Writes ../ensemble/results/combined1.csv plus a combined1.txt model list.
ens.make_submission(filename='combined1')