In [278]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn import metrics
import xgboost as xgb
import operator
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier  #GBM algorithm
from sklearn.linear_model import LogisticRegression

In [295]:
# Load the raw competition tables; both reads are independent of each other.
test = pd.read_csv("./new/test.csv")
train = pd.read_csv("./new/train.csv")

In [296]:
# Pull the submission ids and the supervised target out of the feature tables.
test_ids = test.pop("Id")
label = train.pop("Target")

# Remove the identifier and the DetectedCamera column (in place) so that neither
# ends up in the feature matrices.
test.drop(['DetectedCamera'], axis=1, inplace=True)
train.drop(['Id', 'DetectedCamera'], axis=1, inplace=True)

# Hold out 20% of the training rows for validation, stratified on the target
# and seeded so the split is repeatable.
x_train, x_valid, label_train, label_valid = train_test_split(
    train,
    label,
    test_size=0.2,
    random_state=4242,
    stratify=label,
)

In [207]:
def run_xgb(x_train, label_train, x_valid = None, label_valid = None):
    """Train a multi-class (softprob) XGBoost booster.

    Parameters
    ----------
    x_train : features used to build the training ``DMatrix``.
    label_train : labels attached to the training ``DMatrix``.
    x_valid, label_valid : optional hold-out features/labels. When ``x_valid``
        is given it is added to the watchlist as ``'validation'`` and drives
        early stopping.

    Returns
    -------
    The booster returned by ``xgb.train``.

    Notes
    -----
    Early stopping is only enabled when a validation set is supplied. Without
    one the only watched set is the training data itself, so stopping on it
    would just track training loss; in that case all 500 rounds are run.
    """
    # Set our parameters for xgboost
    params = {
        'objective': 'multi:softprob',
        'eval_metric': 'mlogloss',
        'eta': 0.02,
        'num_class': 4,
        'max_depth': 4,
        # NOTE(review): newer xgboost releases replace 'silent' with
        # 'verbosity' -- confirm against the installed version.
        'silent': 1,
        'min_child_weight': 0,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'nthread': 13,
    }

    d_train = xgb.DMatrix(x_train, label=label_train)
    watchlist = [(d_train, 'train')]

    # None disables early stopping in xgb.train.
    early_stopping_rounds = None
    if x_valid is not None:
        d_valid = xgb.DMatrix(x_valid, label=label_valid)
        # Early stopping is driven by the last watchlist entry, so the
        # validation set is appended after the training set.
        watchlist.append((d_valid, 'validation'))
        early_stopping_rounds = 50

    bst = xgb.train(params, d_train, 500, watchlist,
                    early_stopping_rounds=early_stopping_rounds, verbose_eval=50)

    return bst

In [292]:
def init_xgb():
    """Fit XGBoost on the validation split and return its class probabilities.

    Reads the notebook globals ``x_train``, ``label_train``, ``x_valid`` and
    ``label_valid``; the booster is trained via ``run_xgb`` with the validation
    set in its watchlist.

    Returns
    -------
    (train_pred, valid_pred) : booster predictions for ``x_train`` and ``x_valid``.

    For the final test-set run the same steps apply with the full data instead:
    fit with ``run_xgb(train, label)`` and predict on ``train`` and ``test``.
    """
    booster = run_xgb(x_train, label_train, x_valid, label_valid)

    # Labels are not needed for prediction, so the matrices are built without them.
    train_pred, valid_pred = [
        booster.predict(xgb.DMatrix(features)) for features in (x_train, x_valid)
    ]
    return train_pred, valid_pred


In [208]:
def run_gbm(x_train, label_train, x_valid = None, label_valid = None):
    """Fit a scikit-learn gradient boosting classifier.

    ``x_valid`` and ``label_valid`` are accepted for signature parity with
    ``run_xgb`` but are not used by the fit.

    Returns the fitted ``GradientBoostingClassifier``.
    """
    hyper_params = dict(n_estimators=500, max_depth=5, learning_rate=0.05, random_state=10)

    classifier = GradientBoostingClassifier(**hyper_params)
    classifier.fit(x_train, label_train)
    return classifier

In [293]:
def init_gbm():
    """Fit the gradient boosting model on the validation split and score it.

    Reads the notebook globals ``x_train``, ``label_train``, ``x_valid`` and
    ``label_valid``.

    Returns
    -------
    (train_pred, valid_pred) : ``predict_proba`` output for ``x_train`` and
    ``x_valid``.

    For the final test-set run the same steps apply with the full data instead:
    fit with ``run_gbm(train, label)`` and call ``predict_proba`` on ``train``
    and ``test``.
    """
    fitted = run_gbm(x_train, label_train, x_valid, label_valid)
    return fitted.predict_proba(x_train), fitted.predict_proba(x_valid)

In [209]:
def run_rf(x_train, label_train, random_state=10):
    """Fit a random forest classifier on the given features and labels.

    Parameters
    ----------
    x_train : feature matrix passed to ``RandomForestClassifier.fit``.
    label_train : target labels for ``x_train``.
    random_state : seed forwarded to the forest so that repeated runs build
        the same trees. Defaults to 10, the seed ``run_gbm`` uses.

    Returns
    -------
    The fitted ``RandomForestClassifier`` (300 trees, ``max_depth=6``,
    ``max_features=10``).
    """
    # Without a seed every run yields a different forest, which makes the
    # stacked features and the reported score impossible to reproduce.
    rf_model = RandomForestClassifier(
        n_estimators=300,
        max_depth=6,
        max_features=10,
        random_state=random_state,
    )
    rf_model.fit(x_train, label_train)

    return rf_model

In [294]:
def init_rf():
    """Fit the random forest on the validation split and score it.

    Reads the notebook globals ``x_train``, ``label_train`` and ``x_valid``.

    Returns
    -------
    (train_pred, valid_pred) : ``predict_proba`` output for ``x_train`` and
    ``x_valid``.

    For the final test-set run the same steps apply with the full data instead:
    fit with ``run_rf(train, label)`` and call ``predict_proba`` on ``train``
    and ``test``.
    """
    forest = run_rf(x_train, label_train)

    probabilities = [forest.predict_proba(features) for features in (x_train, x_valid)]
    return probabilities[0], probabilities[1]


In [132]:
def run_model(model):
    """Run one first-level model and return its class probabilities as frames.

    Parameters
    ----------
    model : str
        One of ``"xgb"``, ``"gbm"`` or ``"rf"``; selects which ``init_*``
        helper is called.

    Returns
    -------
    (model_train_res, model_test_res) : two DataFrames holding the helper's
    two prediction arrays, with columns named ``<model>_zero``,
    ``<model>_one``, ``<model>_two`` and ``<model>_three``.

    Raises
    ------
    ValueError
        If ``model`` is not one of the supported names. Previously this case
        fell through the if/elif chain and failed later with an
        ``UnboundLocalError``.
    """
    if model == "xgb":
        model_train_res, model_test_res = init_xgb()
    elif model == "gbm":
        model_train_res, model_test_res = init_gbm()
    elif model == "rf":
        model_train_res, model_test_res = init_rf()
    else:
        raise ValueError(
            "Unknown model %r; expected 'xgb', 'gbm' or 'rf'" % (model,)
        )

    model_train_res = pd.DataFrame(model_train_res)
    model_test_res = pd.DataFrame(model_test_res)

    # Prefix the four class-probability columns with the model name so the
    # outputs of different models can sit side by side in one frame.
    column_names = [model + suffix for suffix in ("_zero", "_one", "_two", "_three")]
    model_train_res.columns = column_names
    model_test_res.columns = list(column_names)

    return model_train_res, model_test_res

In [228]:
def first_level_stack(x_train, x_test):
    """Append the first-level models' class probabilities to both frames.

    Each of the "xgb", "gbm" and "rf" models is run through ``run_model``;
    its two result frames are re-indexed to match ``x_train`` / ``x_test`` and
    concatenated column-wise, in that model order.

    Note that ``run_model`` is called with the model name only, so the data the
    models are fitted on is not taken from these arguments.

    Parameters
    ----------
    x_train, x_test : DataFrames to extend. They are not modified; new frames
        are returned.

    Returns
    -------
    (x_train, x_test) with the extra probability columns from each model.
    """
    # Run every base model first, then attach the outputs.
    model_outputs = [run_model(model_name) for model_name in ("xgb", "gbm", "rf")]

    for train_probs, test_probs in model_outputs:
        # Align on the caller's index so concat pairs rows positionally.
        train_probs.index = x_train.index
        test_probs.index = x_test.index
        x_train = pd.concat([x_train, train_probs], axis = 1)
        x_test = pd.concat([x_test, test_probs], axis = 1)

    return x_train, x_test

In [297]:
# Append the first-level model probabilities to the validation split.
# NOTE: this rebinds x_train / x_valid to the stacked frames, and the init_* helpers
# read those same globals. Re-running this cell without re-creating the split would
# therefore fit the base models on features that already contain stacked columns.
x_train, x_valid = first_level_stack(x_train, x_valid)
# Test-set variant (the init_* helpers must then fit on train/label and predict test):
# train, test = first_level_stack(train, test)

[0]	train-mlogloss:1.34957	validation-mlogloss:1.34955
Multiple eval metrics have been passed: 'validation-mlogloss' will be used for early stopping.

Will train until validation-mlogloss hasn't improved in 50 rounds.
[50]	train-mlogloss:0.477504	validation-mlogloss:0.478027
[100]	train-mlogloss:0.229744	validation-mlogloss:0.231475
[150]	train-mlogloss:0.144189	validation-mlogloss:0.147829
[200]	train-mlogloss:0.112468	validation-mlogloss:0.118512
[250]	train-mlogloss:0.09985	validation-mlogloss:0.108274
[300]	train-mlogloss:0.094068	validation-mlogloss:0.104621
[350]	train-mlogloss:0.090906	validation-mlogloss:0.103252
[400]	train-mlogloss:0.088709	validation-mlogloss:0.102583
[450]	train-mlogloss:0.086852	validation-mlogloss:0.102273
[499]	train-mlogloss:0.08519	validation-mlogloss:0.102166


In [300]:
def second_level_stack():
    """Fit the second-level logistic regression and predict the validation set.

    Reads the notebook globals ``x_train``, ``label_train`` and ``x_valid``.
    The model is fitted without an intercept term.

    Returns
    -------
    The ``predict_proba`` output for ``x_valid``.

    For the final test-set run the same steps apply with the full data instead:
    fit on ``train`` / ``label`` and call ``predict_proba`` on ``test``.
    """
    meta_model = LogisticRegression(fit_intercept=False)
    meta_model.fit(x_train, label_train)
    return meta_model.predict_proba(x_valid)

In [301]:
# Fit the second-level logistic regression and keep its validation-set class probabilities.
pred_final = second_level_stack()

In [240]:
def first_level_average():
    """Average the three first-level models' probabilities for each class.

    Reads the notebook global ``test``, which must already contain the
    ``xgb_*``, ``gbm_*`` and ``rf_*`` probability columns.

    Returns
    -------
    A DataFrame with columns ``pred_zero``, ``pred_one``, ``pred_two`` and
    ``pred_three``, each the unweighted mean of the matching three columns.
    """
    def _mean_of(xgb_col, gbm_col, rf_col):
        # Unweighted mean of one class's probability across the three models.
        return (test[xgb_col] + test[gbm_col] + test[rf_col])/3

    averaged = pd.DataFrame()
    averaged["pred_zero"] = _mean_of("xgb_zero", "gbm_zero", "rf_zero")
    averaged["pred_one"] = _mean_of("xgb_one", "gbm_one", "rf_one")
    averaged["pred_two"] = _mean_of("xgb_two", "gbm_two", "rf_two")
    averaged["pred_three"] = _mean_of("xgb_three", "gbm_three", "rf_three")

    return averaged

In [259]:
# Alternative to the second-level model: average the three base models' probabilities.
# This rebinds pred_final, and it needs `test` to already carry the
# xgb_*/gbm_*/rf_* columns that first_level_stack(train, test) adds.
pred_final = first_level_average()

In [255]:
# Submit: name the four probability columns, attach the test ids, and write the
# file with Id first followed by the class columns.
# NOTE(review): assumes pred_final is the four-column DataFrame returned by
# first_level_average() and that ./subs/ already exists -- confirm before running.

pred_final.columns = ['Front','Left','Rear','Right']
pred_final['Id'] = test_ids

pred_final = pred_final[['Id','Front','Left','Rear','Right']]
pred_final.to_csv("./subs/ens_1.csv", index=False)

In [303]:
# Validation score, reported as 100 minus the multiclass log loss.
# pred_final must hold validation-set probabilities here (e.g. the output of
# second_level_stack()), not the test-set submission frame.
# Parenthesised so the cell runs on both Python 2 and Python 3.
print(100 - metrics.log_loss(label_valid, pred_final))

99.8442153441
