In [1]:
from __future__ import division
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split



In [12]:
# Read the raw training and test tables from the local ./new directory.
df_train, df_test = map(pd.read_csv, ("./new/train_data.csv", "./new/test_data.csv"))

In [11]:
def run_xgb(x_train, label_train, x_valid = None, label_valid = None,
            num_boost_round = 500, early_stopping_rounds = 50):
    """Train an xgboost booster with the 'binary:logistic' objective.

    Parameters
    ----------
    x_train, label_train
        Training features and labels; passed unchanged to ``xgb.DMatrix``.
    x_valid, label_valid : optional
        Held-out features and labels. When ``x_valid`` is given, the pair is
        appended to the watchlist as 'validation' and early stopping is enabled.
    num_boost_round : int, default 500
        Maximum number of boosting rounds handed to ``xgb.train``.
    early_stopping_rounds : int or None, default 50
        Early-stopping patience. Only forwarded when a validation set is
        supplied; without one the booster trains for ``num_boost_round`` rounds.

    Returns
    -------
    The booster object returned by ``xgb.train``.

    Raises
    ------
    ValueError
        If ``x_valid`` is supplied without ``label_valid``.
    """
    has_valid = x_valid is not None
    if has_valid and label_valid is None:
        raise ValueError("label_valid is required when x_valid is provided")

    # Set our parameters for xgboost
    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'eta': 0.02,
        'max_depth': 5,
        'silent': 1,
        'min_child_weight': 0,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'nthread': 13,
    }

    d_train = xgb.DMatrix(x_train, label=label_train)
    watchlist = [(d_train, 'train')]

    if has_valid:
        # Validation goes last in the watchlist so that it is the entry
        # monitored for early stopping.
        d_valid = xgb.DMatrix(x_valid, label=label_valid)
        watchlist.append((d_valid, 'validation'))

    # Early stopping on the training metric alone carries no information about
    # generalisation, so it is disabled when there is no validation set.
    stopping = early_stopping_rounds if has_valid else None

    bst = xgb.train(params, d_train, num_boost_round, watchlist,
                    early_stopping_rounds=stopping, verbose_eval=50)

    return bst

In [13]:
# Clean useless columns: split off the "segment" target and drop the "ID" column.
# df_train / df_test are modified in place, so the membership guards keep this
# cell re-runnable; without them a second execution raises KeyError on the
# columns that were already removed.
if "segment" in df_train.columns:
    df_label = df_train.pop("segment")
if "ID" in df_train.columns:
    df_train.drop("ID", axis = 1, inplace=True)

# Validation split: hold out 20% of the rows, stratified on the label, with a
# fixed seed so the split is reproducible.
x_train, x_valid, label_train, label_valid = train_test_split(
    df_train, df_label, test_size=0.2, random_state=4242, stratify=df_label)

# Keep the test IDs aside for the submission file; df_test is left with the
# remaining columns only.
if "ID" in df_test.columns:
    test_ids = df_test.pop("ID")

In [14]:
import operator

# Validation run: fit on the training split while monitoring the held-out split.
bst = run_xgb(x_train, label_train, x_valid, label_valid)

# (feature, score) pairs from bst.get_fscore(), ordered from lowest to highest score.
importance = sorted(bst.get_fscore().items(), key=operator.itemgetter(1))

# Tabulate the scores and rescale them so that they sum to 1.
df_imp = pd.DataFrame(importance, columns=['feature', 'fscore'])
df_imp['fscore'] = df_imp['fscore'].div(df_imp['fscore'].sum())

df_imp

[0]	train-auc:0.794685	validation-auc:0.789259
Multiple eval metrics have been passed: 'validation-auc' will be used for early stopping.

Will train until validation-auc hasn't improved in 50 rounds.


KeyboardInterrupt: 

In [None]:
# Real test run + submit: refit on the full training frame (no validation set),
# then score the test frame.
bst = run_xgb(df_train, df_label)

d_test = xgb.DMatrix(df_test)
p_test = bst.predict(d_test)

# Two-column submission: the saved test IDs next to the predicted values.
sub = pd.DataFrame({'ID': test_ids, 'segment': p_test}, columns=['ID', 'segment'])
sub.to_csv("./subs/xgb_1.csv", index=False)

[0]	train-auc:0.795365
Will train until train-auc hasn't improved in 50 rounds.
[50]	train-auc:0.809165
[100]	train-auc:0.811995
[150]	train-auc:0.814577
[200]	train-auc:0.8174
[250]	train-auc:0.82035
[300]	train-auc:0.82314
[350]	train-auc:0.825579
[400]	train-auc:0.827764
[450]	train-auc:0.829938
