In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn import metrics
import xgboost as xgb
import operator
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier  #GBM algorithm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold



In [40]:
# Load the raw training and test tables from CSV files under ./new/.
# Later cells pop "segment" and "ID" from these frames, so both columns are expected here.
train = pd.read_csv("./new/train_data.csv")
test = pd.read_csv("./new/test_data.csv")

In [41]:
# Split the target and the submission ids away from the feature frames.
# NOTE: pop() removes the column in place, so this cell can only run once per load.
label = train.pop("segment")
test_ids = test.pop("ID")

# The training row identifier is not a feature.
train = train.drop(['ID'], axis=1)

# Validation split (disabled):
# x_train, x_valid, label_train, label_valid = train_test_split(train, label, test_size=0.2, random_state=4242, stratify = label)

# Turn +/-inf into NaN, then zero-fill every missing value, for both frames.
train = train.replace([np.inf, -np.inf], np.nan).fillna(0)
test = test.replace([np.inf, -np.inf], np.nan).fillna(0)

In [46]:
def run_xgb(x_train, label_train, x_valid = None, label_valid = None):
    """Train an XGBoost booster with a fixed set of hyper-parameters.

    Parameters
    ----------
    x_train : features used for fitting (anything ``xgb.DMatrix`` accepts).
    label_train : labels for ``x_train``.
    x_valid, label_valid : optional hold-out features/labels. When ``x_valid``
        is given it is added to the evaluation list after the training set.

    Returns
    -------
    The booster returned by ``xgb.train`` (at most 500 rounds, early stopping
    patience of 50 rounds, progress logged every 50 rounds).
    """
    booster_params = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'eta': 0.02,
        'max_depth': 5,
        'silent': 1,
        'min_child_weight': 0,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'nthread': 13,
    }

    dtrain = xgb.DMatrix(x_train, label=label_train)

    # The training set is always monitored; the validation set, if any, is
    # appended after it.
    evals = [(dtrain, 'train')]
    if x_valid is not None:
        evals.append((xgb.DMatrix(x_valid, label=label_valid), 'validation'))

    return xgb.train(booster_params, dtrain, 500, evals, early_stopping_rounds=50, verbose_eval=50)

In [14]:
def init_xgb(func_train, func_label, func_test, k_fold_flag = 1):
    """Fit XGBoost on the training data and predict the test set.

    Parameters
    ----------
    func_train : pandas.DataFrame
        Training features.
    func_label : pandas.Series
        Training labels, positionally aligned with ``func_train``.
    func_test : pandas.DataFrame
        Test features to score with the model fitted on all of ``func_train``.
    k_fold_flag : int, default 1
        ``1`` additionally produces 5-fold out-of-fold predictions for the
        training rows; any other value skips the cross-validation loop.

    Returns
    -------
    ``(train_pred, test_pred)`` when ``k_fold_flag == 1``, otherwise just
    ``test_pred``. ``train_pred`` is a one-column DataFrame ("xgb_segment")
    with one row per training row, in the same order and with the same index
    as ``func_train``. ``test_pred`` is the array returned by ``Booster.predict``.
    """
    if k_fold_flag == 1:
        skf = StratifiedKFold(n_splits=5)

        # Stratified test folds are not contiguous row ranges, so each fold's
        # predictions are written back by row position. Concatenating folds in
        # iteration order would attach predictions to the wrong rows.
        oof_pred = np.full(len(func_train), np.nan)

        for train_index, test_index in skf.split(func_train, func_label):

            print(test_index)

            kfold_train = func_train.iloc[train_index]
            kfold_label = func_label.iloc[train_index]
            kfold_test = func_train.iloc[test_index]

            xgb_model = run_xgb(kfold_train, kfold_label)
            oof_pred[test_index] = xgb_model.predict(xgb.DMatrix(kfold_test))

        train_pred = pd.DataFrame({"xgb_segment": oof_pred}, index=func_train.index)

        # Final model on all training rows, used for the test predictions.
        bst = run_xgb(func_train, func_label)
        test_pred = bst.predict(xgb.DMatrix(func_test))

        return train_pred, test_pred
    else:
        bst = run_xgb(func_train, func_label)
        test_pred = bst.predict(xgb.DMatrix(func_test))

        return test_pred
        
#     Test
#     bst = run_xgb(train, label)
    
#     d_train = xgb.DMatrix(train)
#     d_test = xgb.DMatrix(test)
    
#     train_pred = bst.predict(d_train)
#     test_pred = bst.predict(d_test)

#     return train_pred, test_pred


In [7]:
def run_gbm(x_train, label_train, x_valid = None, label_valid = None):
    """Fit and return a scikit-learn GradientBoostingClassifier.

    ``x_valid`` and ``label_valid`` are accepted for signature parity with
    ``run_xgb`` but are not used.
    """
    hyperparams = dict(n_estimators=2000, max_depth=7, learning_rate=0.05, random_state=10)
    # fit() returns the estimator itself.
    return GradientBoostingClassifier(**hyperparams).fit(x_train, label_train)

In [15]:
def init_gbm(func_train, func_label, func_test, k_fold_flag = 1):
    """Fit the gradient-boosting model and predict the test set.

    Parameters
    ----------
    func_train : pandas.DataFrame
        Training features.
    func_label : pandas.Series
        Training labels, positionally aligned with ``func_train``.
        Assumed binary, as implied by the ``binary:logistic`` objective in
        ``run_xgb`` -- TODO confirm.
    func_test : pandas.DataFrame
        Test features to score with the model fitted on all of ``func_train``.
    k_fold_flag : int, default 1
        ``1`` additionally produces 5-fold out-of-fold predictions for the
        training rows; any other value skips the cross-validation loop.

    Returns
    -------
    ``(train_pred, test_pred)`` when ``k_fold_flag == 1``, otherwise just
    ``test_pred``. ``train_pred`` is a one-column DataFrame ("gbm_segment")
    in the row order and index of ``func_train``; ``test_pred`` is a 1-D
    array of positive-class probabilities.

    Raises
    ------
    ValueError
        If the fitted model does not expose exactly two classes.
    """
    def _positive_class_proba(model, features):
        # predict_proba returns one column per class. A single "segment"
        # column only makes sense for two classes, where the second column is
        # the probability of the positive class.
        proba = model.predict_proba(features)
        if proba.shape[1] != 2:
            raise ValueError(
                "expected a binary classifier, got %d probability columns" % proba.shape[1]
            )
        return proba[:, 1]

    if k_fold_flag == 1:

        skf = StratifiedKFold(n_splits=5)

        # Stratified test folds are not contiguous row ranges, so each fold's
        # predictions are written back by row position. Concatenating folds in
        # iteration order would attach predictions to the wrong rows.
        oof_pred = np.full(len(func_train), np.nan)

        for train_index, test_index in skf.split(func_train, func_label):

            print(test_index)

            kfold_train = func_train.iloc[train_index]
            kfold_label = func_label.iloc[train_index]
            kfold_test = func_train.iloc[test_index]

            gbm_model = run_gbm(kfold_train, kfold_label)
            oof_pred[test_index] = _positive_class_proba(gbm_model, kfold_test)

        train_pred = pd.DataFrame({"gbm_segment": oof_pred}, index=func_train.index)

        # Final model on all training rows, used for the test predictions.
        gbm_model = run_gbm(func_train, func_label)
        test_pred = _positive_class_proba(gbm_model, func_test)

        return train_pred, test_pred

    else:
        gbm_model = run_gbm(func_train, func_label)
        test_pred = _positive_class_proba(gbm_model, func_test)
        return test_pred
        
#     Testing
#     gbm_model = run_gbm(train, label)
    
#     train_pred = gbm_model.predict_proba(train)
#     test_pred = gbm_model.predict_proba(test)
    
#     return train_pred, test_pred

In [9]:
def run_rf(x_train, label_train, random_state=10):
    """Fit and return a scikit-learn RandomForestClassifier.

    Parameters
    ----------
    x_train : training features.
    label_train : training labels.
    random_state : int, default 10
        Seed for the forest so repeated runs give the same model. The default
        matches the seed used in ``run_gbm``.

    Returns
    -------
    The fitted ``RandomForestClassifier`` (300 trees, depth 6, 10 features
    considered per split).
    """
    rf_model = RandomForestClassifier(
        n_estimators=300,
        max_depth=6,
        max_features=10,
        random_state=random_state,
    )
    rf_model.fit(x_train, label_train)

    return rf_model

In [16]:
def init_rf(func_train, func_label, func_test, k_fold_flag = 1):
    """Fit the random-forest model and predict the test set.

    Parameters
    ----------
    func_train : pandas.DataFrame
        Training features.
    func_label : pandas.Series
        Training labels, positionally aligned with ``func_train``.
        Assumed binary, as implied by the ``binary:logistic`` objective in
        ``run_xgb`` -- TODO confirm.
    func_test : pandas.DataFrame
        Test features to score with the model fitted on all of ``func_train``.
    k_fold_flag : int, default 1
        ``1`` additionally produces 5-fold out-of-fold predictions for the
        training rows; any other value skips the cross-validation loop.

    Returns
    -------
    ``(train_pred, test_pred)`` when ``k_fold_flag == 1``, otherwise just
    ``test_pred``. ``train_pred`` is a one-column DataFrame ("rf_segment")
    in the row order and index of ``func_train``; ``test_pred`` is a 1-D
    array of positive-class probabilities.

    Raises
    ------
    ValueError
        If the fitted model does not expose exactly two classes.
    """
    def _positive_class_proba(model, features):
        # predict_proba returns one column per class. A single "segment"
        # column only makes sense for two classes, where the second column is
        # the probability of the positive class.
        proba = model.predict_proba(features)
        if proba.shape[1] != 2:
            raise ValueError(
                "expected a binary classifier, got %d probability columns" % proba.shape[1]
            )
        return proba[:, 1]

    if k_fold_flag == 1:

        skf = StratifiedKFold(n_splits=5)

        # Stratified test folds are not contiguous row ranges, so each fold's
        # predictions are written back by row position. Concatenating folds in
        # iteration order would attach predictions to the wrong rows.
        oof_pred = np.full(len(func_train), np.nan)

        for train_index, test_index in skf.split(func_train, func_label):

            print(test_index)

            kfold_train = func_train.iloc[train_index]
            kfold_label = func_label.iloc[train_index]
            kfold_test = func_train.iloc[test_index]

            rf_model = run_rf(kfold_train, kfold_label)
            oof_pred[test_index] = _positive_class_proba(rf_model, kfold_test)

        train_pred = pd.DataFrame({"rf_segment": oof_pred}, index=func_train.index)

        # Final model on all training rows, used for the test predictions.
        rf_model = run_rf(func_train, func_label)
        test_pred = _positive_class_proba(rf_model, func_test)
        return train_pred, test_pred

    else:
        rf_model = run_rf(func_train, func_label)
        test_pred = _positive_class_proba(rf_model, func_test)
        return test_pred

    #Testing
#     rf_model = run_rf(train, label)
    
#     train_pred = rf_model.predict_proba(train)
#     test_pred = rf_model.predict_proba(test)
    
#     return train_pred, test_pred


In [19]:
def run_model(model, k_fold_flag = 1):
    """Run one first-level model on the module-level ``train``/``label``/``test``.

    Parameters
    ----------
    model : str
        One of ``"xgb"``, ``"gbm"`` or ``"rf"``.
    k_fold_flag : int, default 1
        ``1`` also returns out-of-fold predictions for the training rows; any
        other value returns test predictions only.

    Returns
    -------
    ``(model_train_res, model_test_res)`` when ``k_fold_flag == 1``, otherwise
    ``model_test_res``. The test result is a one-column DataFrame named
    ``"<model>_segment"``; the train result is whatever the matching
    ``init_*`` function returns.

    Raises
    ------
    ValueError
        If ``model`` is not a supported name.
    """
    # Validate first so a typo gives a clear message, not an
    # UnboundLocalError further down.
    if model not in ("xgb", "gbm", "rf"):
        raise ValueError(
            "unknown model %r; expected one of 'xgb', 'gbm', 'rf'" % (model,)
        )

    if model == "xgb":
        init_func = init_xgb
    elif model == "gbm":
        init_func = init_gbm
    else:
        init_func = init_rf

    if k_fold_flag == 1:
        model_train_res, model_test_res = init_func(train, label, test)
    else:
        model_test_res = init_func(train, label, test, 0)

    model_test_res = pd.DataFrame(model_test_res)
    model_test_res.columns = [ model + "_segment"]

    if k_fold_flag == 1:
        return model_train_res, model_test_res
    return model_test_res

In [47]:
def first_level_stack(x_train, x_test):
    """Append the xgb, gbm and rf predictions as extra feature columns.

    All three models are run first (via ``run_model`` with its default
    ``k_fold_flag``), then each model's train/test prediction frames are
    re-indexed to ``x_train`` / ``x_test`` and concatenated column-wise, in
    the order xgb, gbm, rf.

    NOTE(review): ``run_model`` fits on the module-level ``train``/``label``/
    ``test``, not on the frames passed in here, so the arguments must have the
    same number of rows as those globals -- confirm with callers.

    Returns
    -------
    ``(x_train, x_test)`` with the three prediction columns added to each.
    """
    level_one_outputs = [run_model(model_name) for model_name in ("xgb", "gbm", "rf")]

    for oof_frame, test_frame in level_one_outputs:
        # Align positionally with the feature frames before the column-wise concat.
        oof_frame.index = x_train.index
        test_frame.index = x_test.index
        x_train = pd.concat([x_train, oof_frame], axis = 1)
        x_test = pd.concat([x_test, test_frame], axis = 1)

    return x_train, x_test

In [48]:
# Validation variant (disabled):
# x_train, x_valid = first_level_stack(x_train, x_valid)

# Rebinds the module-level train/test to the stacked frames (original features plus
# one prediction column per first-level model). Re-running this cell stacks again on
# top of the already-stacked frames.
train, test = first_level_stack(train, test)

[    0     1     2 ..., 40320 40321 40323]
[0]	train-auc:0.825029
Will train until train-auc hasn't improved in 50 rounds.


KeyboardInterrupt: 

In [78]:
def second_level_stack():
    """Average per-class test predictions of the xgb, gbm and rf models.

    Each model is run through ``run_model(<name>, 0)`` (test predictions
    only). For every class suffix the three models' columns are summed and
    divided by 3.

    NOTE(review): this reads columns named ``<model>_zero`` .. ``<model>_three``,
    while the ``run_model`` visible in this notebook names its single output
    column ``<model>_segment`` -- confirm which version of ``run_model`` this
    function is meant to pair with.

    Returns
    -------
    DataFrame with columns ``pred_zero``, ``pred_one``, ``pred_two``, ``pred_three``.
    """
    xgb_test_res = run_model("xgb", 0)
    gbm_test_res = run_model("gbm", 0)
    rf_test_res = run_model("rf", 0)

    pred_final = pd.DataFrame()
    for class_suffix in ("zero", "one", "two", "three"):
        blended = (
            xgb_test_res["xgb_" + class_suffix]
            + gbm_test_res["gbm_" + class_suffix]
            + rf_test_res["rf_" + class_suffix]
        )/3
        pred_final["pred_" + class_suffix] = blended

    # Alternative second level (disabled): fit a LogisticRegression with
    # fit_intercept=False on the stacked features, then predict_proba on the
    # validation or test frame.

    return pred_final

In [79]:
# Final predictions: average of the three models' test predictions.
# NOTE(review): the captured log below reports mlogloss with a validation set, whereas the
# run_xgb visible in this notebook uses auc -- the output appears to come from another version.
pred_final = second_level_stack()

[0]	train-mlogloss:1.34958	validation-mlogloss:1.34947
Multiple eval metrics have been passed: 'validation-mlogloss' will be used for early stopping.

Will train until validation-mlogloss hasn't improved in 50 rounds.
[50]	train-mlogloss:0.475238	validation-mlogloss:0.474808
[100]	train-mlogloss:0.228194	validation-mlogloss:0.229196
[150]	train-mlogloss:0.142697	validation-mlogloss:0.146065
[200]	train-mlogloss:0.111114	validation-mlogloss:0.11749
[250]	train-mlogloss:0.097972	validation-mlogloss:0.107859
[300]	train-mlogloss:0.091509	validation-mlogloss:0.104543
[350]	train-mlogloss:0.087632	validation-mlogloss:0.10341
[400]	train-mlogloss:0.084872	validation-mlogloss:0.103029
[450]	train-mlogloss:0.082236	validation-mlogloss:0.102887


In [240]:
def first_level_average():
    """Average the per-class xgb, gbm and rf columns of the module-level ``test``.

    For every class suffix the three model columns are summed and divided by 3.

    NOTE(review): this expects ``test`` to hold columns named
    ``<model>_zero`` .. ``<model>_three``; the stacking code visible in this
    notebook adds ``<model>_segment`` columns instead -- confirm the expected
    schema.

    Returns
    -------
    DataFrame with columns ``pred_zero``, ``pred_one``, ``pred_two``, ``pred_three``.
    """
    pred_final = pd.DataFrame()
    for class_suffix in ("zero", "one", "two", "three"):
        model_sum = (
            test["xgb_" + class_suffix]
            + test["gbm_" + class_suffix]
            + test["rf_" + class_suffix]
        )
        pred_final["pred_" + class_suffix] = model_sum/3

    return pred_final

In [259]:
# Overwrites pred_final with the plain average of the first-level prediction columns in `test`.
pred_final = first_level_average()

In [255]:
# Submit: rename the four prediction columns, put the ids first and write the CSV.
# NOTE: pred_final is modified here, so the cell is not re-runnable without
# recomputing pred_final first.
class_columns = ['Front','Left','Rear','Right']

pred_final.columns = class_columns
pred_final['Id'] = test_ids

pred_final = pred_final[['Id'] + class_columns]
pred_final.to_csv("./subs/ens_1.csv", index=False)

In [80]:
# Validation score reported as 100 minus the log loss.
# NOTE(review): label_valid is only created by the commented-out train_test_split in the
# cleaning cell, so this fails on a fresh kernel unless that split is re-enabled.
# The parenthesised form runs on both Python 2 and Python 3.
print(100 - metrics.log_loss(label_valid, pred_final))

99.8951027522


In [42]:
# Preview the first five training rows, transposed so each feature is shown as a row.
train.head().transpose()

Unnamed: 0,0,1,2,3,4
genre_Action,0.0,0.0,0.0,0.0,0.0
genre_Awards,0.0,0.0,0.0,0.0,0.0
genre_Crime,0.0,0.0,0.0,0.0,0.0
genre_Documentary,0.0,0.0,0.0,0.0,0.0
genre_Horror,0.0,0.0,0.0,0.0,0.0
genre_Kids,0.0,0.0,0.0,0.0,0.0
genre_LiveTV,0.0,0.0,13.0,0.0,0.0
genre_Mythology,0.0,0.0,0.0,0.0,0.0
genre_NA,0.0,0.0,0.0,0.0,0.0
genre_Science,0.0,0.0,0.0,0.0,0.0
