Stack all the models we selected and build the pipeline

In [2]:
import numpy as np
np.random.seed(1234) # set seed 
import pandas as pd
from scipy import sparse
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA


In [3]:
# 2nd layer: ensemble the XGBoost and CNN models, using data from stacking

bagging = True
bagging_size = 50  # bagging size; intended to stabilize the predictions

n_folds = 5  # folds for cross validation

# Load the data written by the previous steps.
# The backslashes are escaped explicitly: an unescaped '\U' is an invalid
# unicode escape in a Python 3 string literal and stops the cell from parsing.
# The resulting path string is unchanged.
path = 'C:\\Users\\shuyi\\Documents\\StudyResource\\Kaggle\\'
train_file = "step2.csv"
# Read the training data
train_data = pd.read_csv(path + train_file)

print("number of rows:", train_data.shape[0])
print("number of columns:", train_data.shape[1])
# Drop rows whose Upc equals -99990 (presumably a missing-value placeholder
# written by an earlier step -- TODO confirm).
train_data = train_data[train_data.Upc != -99990]
# Preview rows 1..9 of the filtered frame (notebook cell output).
train_data[1:10]

('number of rows:', 647054)
('number of columns:', 20)


Unnamed: 0.1,Unnamed: 0,TripType,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber,Count,Count_Null,ScanCount_Neg,FinelineNumber_Missing,N_Fineline,N_Upc,N_Dep,Min_Count,Max_Count,Mean_Count,Upc_full,company
1,1,30,7,0,605388159800,1,62,8931.0,1,0.0,0.0,0.0,2,2,2,1,1,1.0,605388159800,605388
2,2,30,7,0,74108110990,1,50,4504.0,1,0.0,0.0,0.0,2,2,2,1,1,1.0,741081109908,741081
3,3,26,8,0,22384035100,2,49,3565.0,2,0.0,1.0,1.0,17,21,7,1,18,4.0,223840351006,223840
4,4,26,8,0,20066137440,2,49,1017.0,2,0.0,1.0,1.0,17,21,7,1,18,4.0,200661374407,200661
5,5,26,8,0,20066187830,2,49,1017.0,2,0.0,1.0,1.0,17,21,7,1,18,4.0,200661878301,200661
6,6,26,8,0,20066137430,1,49,1017.0,1,0.0,1.0,1.0,17,21,7,1,18,4.0,200661374308,200661
7,7,26,8,0,70048027370,1,49,2802.0,1,0.0,1.0,1.0,17,21,7,1,18,4.0,700480273702,700480
8,8,26,8,0,22384953180,1,49,4501.0,1,0.0,1.0,1.0,17,21,7,1,18,4.0,223849531805,223849
9,9,26,8,0,22384002000,-1,49,3565.0,0,0.0,1.0,1.0,17,21,7,1,18,4.0,223840020001,223840


In [None]:
# stack algorithms for multiple models
def _make_folds(y, n_folds):
    """Return a list of ``(train_idx, cv_idx)`` index-array pairs for ``y``.

    ``n_folds`` is either an integer (stratified K-fold splits are generated,
    preserving the percentage of samples of each class) or an iterable of
    ready-made ``(train_idx, cv_idx)`` pairs, which is used as given.
    """
    if not isinstance(n_folds, (int, np.integer)):
        return [(np.asarray(tr), np.asarray(cv)) for tr, cv in n_folds]

    try:
        from sklearn.model_selection import StratifiedKFold
    except ImportError:
        # Old scikit-learn releases only ship the pre-0.18 API.
        from sklearn.cross_validation import StratifiedKFold
        return list(StratifiedKFold(y, n_folds))

    splitter = StratifiedKFold(n_splits=n_folds)
    # Only the labels drive the stratification, so a placeholder X suffices.
    return list(splitter.split(np.zeros(len(y)), y))


def StackModels(train, test, y, models, n_folds):
    """Build stacking features from several classifiers.

    Every model is fitted on the training part of each fold and its class
    probabilities for the held-out part are stored, giving out-of-fold
    predictions for the whole training set. Each model is then refitted on
    the full training data to predict the test data.

    Parameters
    ----------
    train, test : row-indexable 2-D data (numpy array, scipy sparse matrix,
        or pandas DataFrame, which is converted to its underlying array).
    y : array-like of class labels, one per row of ``train``.
    models : sequence of estimators exposing ``fit(X, y)`` and
        ``predict_proba(X)``.
    n_folds : int, or iterable of ``(train_idx, cv_idx)`` pairs.

    Returns
    -------
    (blend_train, blend_test) : two dense arrays with
        ``num_class * len(models)`` columns; columns
        ``j*num_class:(j+1)*num_class`` belong to the j-th model.
    """
    # A DataFrame indexed with an integer array selects columns, not rows.
    if isinstance(train, pd.DataFrame):
        train = train.values
    if isinstance(test, pd.DataFrame):
        test = test.values
    # Positional indexing is required; a Series would index by label.
    y = np.asarray(y)

    num_class = np.unique(y).shape[0]
    folds = _make_folds(y, n_folds)

    # number of rows * (number of classes * number of classifiers)
    blend_train = np.zeros((train.shape[0], num_class * len(models)))
    blend_test = np.zeros((test.shape[0], num_class * len(models)))

    for j, model in enumerate(models):
        print("Training the model [%s]" % (j))
        cols = slice(j * num_class, (j + 1) * num_class)

        for i, (train_i, cv_i) in enumerate(folds):
            print("Now training the fold [%s]" % (i))

            # Fit on this fold's training rows, predict its held-out rows.
            model.fit(train[train_i], y[train_i])
            blend_train[cv_i, cols] = model.predict_proba(train[cv_i])

        print("Stacking test data")
        model.fit(train, y)
        blend_test[:, cols] = model.predict_proba(test)

    return blend_train, blend_test
  

In [5]:
def Model_stacking(train, test, y):
    """Fit the first-layer models and return their stacked probabilities.

    Converts ``train`` and ``test`` to CSR sparse matrices, builds an XGBoost
    wrapper model, a random forest and an AdaBoost-over-decision-tree model,
    and passes them to ``StackModels`` together with the module-level
    ``n_folds``.

    Returns the ``(train_probs, test_probs)`` pair produced by ``StackModels``.
    """
    # transform the data to sparse matrix format, which is quick in matrix
    # operations and row slicing
    train = sparse.csr_matrix(train)
    test = sparse.csr_matrix(test)

    # load the models predefined in the wrapper
    # NOTE(review): `models` is expected to be a wrapper module providing
    # XGBoost_multilabel; its import is not visible in this file -- confirm.
    model1 = models.XGBoost_multilabel(nthread = 4, eta = 0.05, gamma = 0.1, max_depth = 15,
                                       min_child_weight = 2, max_delta_step = None, subsample = 0.7,
                                       colsample_bytree = 0.3, silent = 1, seed = 1337,
                                       l2_reg = 1.8, l1_reg = 0.15, num_round = 300)

    model2 = RandomForestClassifier(n_estimators = 200, criterion = "entropy", max_depth = 15, min_samples_split = 2,
                                    min_samples_leaf = 1, min_weight_fraction_leaf = 0.,max_features = 0.6,
                                    max_leaf_nodes = None, bootstrap = True, oob_score = False, n_jobs = 2,
                                    random_state = 1337, verbose = 0)

    model3 = AdaBoostClassifier(DecisionTreeClassifier(criterion='entropy', max_depth=15, min_samples_leaf=1,
                                min_weight_fraction_leaf=0.0, max_features=0.4, random_state=1301),
                                n_estimators=300, learning_rate=0.07, random_state=1337 )

    # The list must not be named `models`: assigning that name anywhere in
    # this function would make it local and turn the `models.XGBoost_multilabel`
    # lookup above into an UnboundLocalError.
    stacked_models = [model1, model2, model3]
    train_probs, test_probs = StackModels(train, test, y, stacked_models, n_folds)

    return train_probs, test_probs
    

In [None]:
# Split the loaded frame into the TripType target and the remaining feature
# columns, then run the stacking step.
y = train_data['TripType']
train = train_data.drop('TripType', axis=1)
# NOTE(review): `test` is not defined in any cell visible here -- it has to
# be created before this cell is run.
Model_stacking(train, test, y)