# Stacking

In [79]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import datasets, metrics
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.linear_model import Lasso

In [80]:
# import data
df_beijing = pd.read_csv('data/beijing.csv', delimiter= ",",header=0)
dataset = df_beijing.as_matrix()

def data_split(dataset, n, a):
    dataset_train, dataset_test = train_test_split(dataset, test_size = a, random_state=n)
    X_train = dataset_train[:,1:13]
    y_train = dataset_train[:,15]
    X_test = dataset_test[:,1:13]
    y_test = dataset_test[:,15]
    return (dataset_train, dataset_test, X_train, y_train, X_test, y_test)

dataset_train, dataset_test, X_train, y_train, X_test, y_test = data_split(dataset, 0, 0.3)
np.savetxt("y_test.csv", y_test, delimiter=',', fmt='%d')


In [81]:
#output stack file 
class stack_model(object):
    def __init__(self, model, n_splits=5):
        self.n_splits = n_splits
        self.model = model

    def fit_predict(self, trainX, trainy, testX):

        self.stackTrain = np.zeros(len(trainX))
        self.stackTest = np.zeros(len(testX))

        skf = StratifiedKFold(n_splits=self.n_splits, shuffle=True, random_state=44)

        for train_index, test_index in skf.split(trainX, trainy):
            X_train, X_test = trainX[train_index], trainX[test_index]
            y_train, y_test = trainy[train_index], trainy[test_index]

            self.model.fit(X_train, y_train)
            y_pred = self.model.predict_proba(X_test)[:,1]
            self.stackTrain[test_index] = y_pred
            self.stackTest += self.model.predict_proba(testX)[:,1]
        
        self.stackTest /= self.n_splits
            
    def output(self,train_file_name='stack_train.csv',
                    test_file_name='stack_test.csv',
                    col_name='stack_pred'):

        pd.DataFrame({col_name:self.stackTrain}).to_csv(train_file_name,index=False) 
        pd.DataFrame({col_name:self.stackTest}).to_csv(test_file_name,index=False)

In [82]:
model_DT = DecisionTreeClassifier(min_samples_split=2, max_features=12)
model_RF = RandomForestClassifier(n_estimators = 200, max_features= 3)
model_XGB = XGBClassifier(max_depth=16, min_child_weight=1, learning_rate=0.3, n_estimators=1000)

MS = stack_model(model_XGB) # model is your model
MS.fit_predict(X_train, y_train, X_test)
MS.output()

In [83]:
# foldernames = os.popen('ls StackData').readlines()
# train = {}
# test = {}
# for foldername in foldernames:
#     foldername = foldername[:-1]
#     train[foldername] = pd.read_csv('StackData//{0}/stackTrain.csv'.format(foldername))
#     test[foldername] = pd.read_csv('StackData//{0}/stackTest.csv'.format(foldername))

In [84]:
# import data
df_test = pd.read_csv('stackTest1.csv', delimiter= ",",header=0)
df_train = pd.read_csv('stackTrain1.csv', delimiter= ",",header=0)
dataset_test = df_test.as_matrix()
dataset_train = df_train.as_matrix()

X1_train = dataset_train[0:5719,0:2]
y1_train = dataset_train[0:5719,3]
X1_test = dataset_test[0:5719,0:2]
y1_test = dataset_test[0:5719,3]
X1_train.shape

model = XGBClassifier()
trained_model_rf = model.fit(X1_train, y1_train)
predictions = trained_model_rf.predict(X1_test)
print('Test accuracy is \n',accuracy_score(y1_test, predictions))
print(metrics.classification_report(y1_test, predictions))


Test accuracy is 
 0.409861863962
             precision    recall  f1-score   support

        1.0       0.41      0.98      0.58      2306
        2.0       0.54      0.05      0.09      1516
        3.0       0.00      0.00      0.00       892
        4.0       0.00      0.00      0.00       449
        5.0       0.00      0.00      0.00       556

avg / total       0.31      0.41      0.26      5719



  'precision', 'predicted', average, warn_for)
