In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt

In [3]:
import datetime
from sklearn.cross_validation import KFold
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn import metrics
import xgboost as xgb
import matplotlib.pyplot as plt
import prettyplotlib as ppl
import brewer2mpl
import random
from operator import itemgetter
import time
import copy

In [4]:
# Qualitative 8-colour Brewer palette, converted to matplotlib colour tuples.
set2 = brewer2mpl.get_map('Set2', 'qualitative', 8).mpl_colors

# Font description dictionary kept available for later cells.
font = dict(family='serif', color='darkred', weight='bold', size=16)

# Global font defaults, set one key at a time.
for rc_key, rc_value in (('family', 'serif'), ('size', 16), ('weight', 'bold')):
    plt.rc('font', **{rc_key: rc_value})

# NOTE(review): the style sheet is applied after the rc() calls above, so any
# font keys it defines would take precedence over them - confirm this is intended.
plt.style.use('fivethirtyeight')

# Fetch the current default figure size, make it 6 x 6, and store it back.
fig_size = plt.rcParams["figure.figsize"]
fig_size[0], fig_size[1] = 6, 6
plt.rcParams["figure.figsize"] = fig_size

In [5]:
# Load the raw activity (train/test) and people tables, parsing 'date' as a
# timestamp. Identifier columns are read as plain strings; the builtin `str`
# is used because the `np.str` alias was deprecated in NumPy 1.20 and removed
# in NumPy 1.24 (it was only ever an alias for `str`).
train = pd.read_csv('data/act_train.csv',
                    dtype={'people_id': str,
                           'activity_id': str,
                           'outcome': np.int8},
                    parse_dates=['date'])
test = pd.read_csv('data/act_test.csv',
                   dtype={'people_id': str,
                          'activity_id': str},
                   parse_dates=['date'])
people = pd.read_csv('data/people.csv',
                     dtype={'people_id': str,
                            'activity_id': str,
                            'char_38': np.int32},
                     parse_dates=['date'])

In [6]:
# Preview the first rows of the activity training table as loaded.
train.head()

Unnamed: 0,people_id,activity_id,date,activity_category,char_1,char_2,char_3,char_4,char_5,char_6,char_7,char_8,char_9,char_10,outcome
0,ppl_100,act2_1734928,2023-08-26,type 4,,,,,,,,,,type 76,0
1,ppl_100,act2_2434093,2022-09-27,type 2,,,,,,,,,,type 1,0
2,ppl_100,act2_3404049,2022-09-27,type 2,,,,,,,,,,type 1,0
3,ppl_100,act2_3651215,2023-08-04,type 2,,,,,,,,,,type 1,0
4,ppl_100,act2_4109017,2023-08-26,type 2,,,,,,,,,,type 1,0


In [7]:
print("Process tables...")
for table in [train, test]:
    # Split the timestamp into numeric year/month/day parts, then drop it.
    table['year'] = table['date'].dt.year
    table['month'] = table['date'].dt.month
    table['day'] = table['date'].dt.day
    table.drop('date', axis=1, inplace=True)

    # Values look like 'type 4'; remove the literal prefix and keep the number.
    # (str.lstrip takes a *set of characters*, not a prefix, so it is avoided.)
    table['activity_category'] = (
        table['activity_category'].str.replace('type ', '').astype(np.int32)
    )

    for i in range(1, 11):
        col = 'char_' + str(i)
        # Assign the filled column back instead of calling
        # fillna(inplace=True) on a column selection: that chained in-place
        # update does not modify the frame under pandas Copy-on-Write.
        # Missing values become the sentinel -999.
        table[col] = (
            table[col].fillna('type -999').str.replace('type ', '').astype(np.int32)
        )

Process tables...


In [8]:
# Split the people timestamp into numeric parts, then drop the raw column.
people['year'] = people['date'].dt.year
people['month'] = people['date'].dt.month
people['day'] = people['date'].dt.day
people.drop('date', axis=1, inplace=True)

# 'group 17304' -> 17304. The literal prefix is removed explicitly because
# str.lstrip takes a set of characters rather than a prefix.
people['group_1'] = people['group_1'].str.replace('group ', '').astype(np.int32)
for i in range(1, 10):
    col = 'char_' + str(i)
    # 'type N' -> N
    people[col] = people[col].str.replace('type ', '').astype(np.int32)
for i in range(10, 38):
    col = 'char_' + str(i)
    # These columns are cast to int32 directly, with no prefix handling.
    people[col] = people[col].astype(np.int32)

print("Merge...")
# Left-join the people attributes onto every activity row. Only `on` is
# passed: combining `on` with `left_index=True` is contradictory and current
# pandas releases reject it with a MergeError. Column names shared by both
# frames receive the default '_x' (activity) / '_y' (people) suffixes.
train = pd.merge(train, people, how='left', on='people_id')
train.fillna(-999, inplace=True)
test = pd.merge(test, people, how='left', on='people_id')
test.fillna(-999, inplace=True)

Merge...


In [9]:
# Label column, and the model inputs: every column of the merged training
# frame except the two identifiers and the label itself (column order kept).
target = 'outcome'
non_feature_cols = ('people_id', 'activity_id', 'outcome')
feature_cols = [name for name in train.columns if name not in non_feature_cols]

In [10]:
# Report the row counts of both frames and the alphabetised feature list.
train_rows, test_rows = len(train), len(test)
print('Length of train: ', train_rows)
print('Length of test: ', test_rows)
feature_names_sorted = sorted(feature_cols)
print('Features [{}]: {}'.format(len(feature_cols), feature_names_sorted))


Length of train:  2197291
Length of test:  498687
Features [56]: ['activity_category', 'char_10_x', 'char_10_y', 'char_11', 'char_12', 'char_13', 'char_14', 'char_15', 'char_16', 'char_17', 'char_18', 'char_19', 'char_1_x', 'char_1_y', 'char_20', 'char_21', 'char_22', 'char_23', 'char_24', 'char_25', 'char_26', 'char_27', 'char_28', 'char_29', 'char_2_x', 'char_2_y', 'char_30', 'char_31', 'char_32', 'char_33', 'char_34', 'char_35', 'char_36', 'char_37', 'char_38', 'char_3_x', 'char_3_y', 'char_4_x', 'char_4_y', 'char_5_x', 'char_5_y', 'char_6_x', 'char_6_y', 'char_7_x', 'char_7_y', 'char_8_x', 'char_8_y', 'char_9_x', 'char_9_y', 'day_x', 'day_y', 'group_1', 'month_x', 'month_y', 'year_x', 'year_y']


In [33]:
def create_feature_map(features):
    """Write the feature map file ``xgb.fmap`` in the current directory.

    One line is written per feature, in iteration order, formatted as
    ``<index>\\t<name>\\tq``. An existing file is overwritten; an empty
    ``features`` iterable produces an empty file.

    Parameters
    ----------
    features : iterable
        Feature names, in the order their indices should be assigned.
    """
    # The context manager closes the handle even if a write raises.
    with open('xgb.fmap', 'w') as outfile:
        for i, feat in enumerate(features):
            outfile.write('{0}\t{1}\tq\n'.format(i, feat))

In [34]:
def get_importance(gbm, features):
    """Return ``(feature, score)`` pairs ordered from highest to lowest score.

    The feature map file is (re)written first via ``create_feature_map`` so
    that ``gbm.get_fscore`` can report scores under the given feature names.
    """
    create_feature_map(features)
    scores = gbm.get_fscore(fmap='xgb.fmap')
    ranked = list(scores.items())
    # Stable descending sort on the score component of each pair.
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked

In [35]:
def run_single(train, test, features, target, random_state=0):
    """Fit one XGBoost model on a train/validation split and score ``test``.

    ``train`` is split 90/10 with ``train_test_split``; the booster is trained
    on the larger part while the smaller part is watched for early stopping
    (AUC metric). Progress, the validation AUC and the feature importances
    are printed along the way.

    Parameters
    ----------
    train, test : frames indexable by a list of column names
    features : list of column names used as model inputs
    target : name of the label column in ``train``
    random_state : seed used for both the split and the booster

    Returns
    -------
    (list, float)
        Predictions for ``test`` as a plain list, and the validation AUC.
    """
    learning_rate = 0.2
    tree_depth = 5
    row_fraction = 0.8
    column_fraction = 0.8
    started_at = time.time()

    print('XGBoost params. ETA: {}, MAX_DEPTH: {}, SUBSAMPLE: {}, COLSAMPLE_BY_TREE: {}'.format(
        learning_rate, tree_depth, row_fraction, column_fraction))
    booster_params = {
        "objective": "binary:logistic",
        "booster" : "gbtree",
        "eval_metric": "auc",
        "eta": learning_rate,
        "tree_method": 'exact',
        "max_depth": tree_depth,
        "subsample": row_fraction,
        "colsample_bytree": column_fraction,
        "silent": 1,
        "seed": random_state,
    }
    max_rounds = 115
    patience = 10
    holdout_fraction = 0.1

    # Hold out a slice of the training rows to monitor early stopping.
    fit_part, holdout_part = train_test_split(
        train, test_size=holdout_fraction, random_state=random_state)
    print('Length train:', len(fit_part.index))
    print('Length valid:', len(holdout_part.index))
    holdout_labels = holdout_part[target]
    dtrain = xgb.DMatrix(fit_part[features], fit_part[target])
    dvalid = xgb.DMatrix(holdout_part[features], holdout_labels)

    gbm = xgb.train(
        booster_params,
        dtrain,
        max_rounds,
        evals=[(dtrain, 'train'), (dvalid, 'eval')],
        early_stopping_rounds=patience,
        verbose_eval=True,
    )

    print("Validating...")
    # Predict with the trees up to and including the best iteration.
    holdout_pred = gbm.predict(xgb.DMatrix(holdout_part[features]),
                               ntree_limit=gbm.best_iteration+1)
    score = roc_auc_score(holdout_labels.values, holdout_pred)
    print('Check error value: {:.6f}'.format(score))

    ranked_features = get_importance(gbm, features)
    print('Importance array: ', ranked_features)

    print("Predict test set...")
    test_pred = gbm.predict(xgb.DMatrix(test[features]),
                            ntree_limit=gbm.best_iteration+1)

    elapsed_minutes = round((time.time() - started_at)/60, 2)
    print('Training time: {} minutes'.format(elapsed_minutes))
    return test_pred.tolist(), score

In [36]:
# Train on the merged frame and keep the test predictions plus validation AUC.
test_prediction, score = run_single(train, test, features=feature_cols, target='outcome')

XGBoost params. ETA: 0.2, MAX_DEPTH: 5, SUBSAMPLE: 0.8, COLSAMPLE_BY_TREE: 0.8
Length train:

Will train until eval error hasn't decreased in 10 rounds.
[0]	train-auc:0.907602	eval-auc:0.907374
[1]	train-auc:0.913504	eval-auc:0.913421
[2]	train-auc:0.924558	eval-auc:0.924228
[3]	train-auc:0.927926	eval-auc:0.927518
[4]	train-auc:0.929106	eval-auc:0.928761
[5]	train-auc:0.929533	eval-auc:0.929015
[6]	train-auc:0.929751	eval-auc:0.929260
[7]	train-auc:0.930174	eval-auc:0.929591
[8]	train-auc:0.930535	eval-auc:0.929916
[9]	train-auc:0.930653	eval-auc:0.930069
[10]	train-auc:0.931911	eval-auc:0.931246
[11]	train-auc:0.932197	eval-auc:0.931533
[12]	train-auc:0.933671	eval-auc:0.933043
[13]	train-auc:0.934529	eval-auc:0.933922
[14]	train-auc:0.935246	eval-auc:0.934619
[15]	train-auc:0.936155	eval-auc:0.935594
[16]	train-auc:0.936445	eval-auc:0.935894
[17]	train-auc:0.936629	eval-auc:0.936072
[18]	train-auc:0.937377	eval-auc:0.936798
[19]	train-auc:0.937730	eval-auc:0.937151
[20]	train-auc:0.938241	eval-auc:0.937686
[21]	train-auc:0.938755	eval-auc:0.938209
[22]	train-auc:0.939387	eva

 1977561
Length valid: 219730
Validating...
Check error value: 0.962806
Importance array:  [('group_1', 831), ('char_38', 346), ('char_7_y', 330), ('char_6_y', 168), ('day_y', 160), ('char_9_y', 128), ('char_8_y', 111), ('month_y', 89), ('char_3_y', 87), ('char_10_x', 80), ('char_4_y', 75), ('year_x', 57), ('year_y', 55), ('month_x', 54), ('char_5_y', 50), ('char_25', 43), ('char_10_y', 41), ('char_2_y', 38), ('char_29', 34), ('char_20', 33), ('char_33', 31), ('char_11', 30), ('day_x', 29), ('char_1_y', 29), ('char_13', 29), ('char_23', 20), ('char_26', 20), ('char_35', 20), ('char_19', 19), ('char_24', 19), ('char_18', 18), ('char_30', 18), ('char_31', 17), ('char_34', 16), ('char_27', 16), ('char_36', 15), ('char_32', 14), ('char_14', 13), ('char_37', 13), ('activity_category', 12), ('char_17', 11), ('char_16', 9), ('char_21', 9), ('char_28', 9), ('char_12', 8), ('char_22', 8), ('char_15', 7), ('char_9_x', 6), ('char_2_x', 3), ('char_8_x', 2), ('char_7_x', 2), ('char_3_x', 2), ('char

In [37]:
def create_submission(test, prediction):
    """Write a timestamped submission CSV pairing activity ids with predictions.

    The file is created in the current directory and named
    ``submission_xg_<YYYY-mm-dd-HH-MM>.csv``. It starts with the header
    ``activity_id,outcome`` followed by one ``<id>,<prediction>`` row per
    entry of ``test['activity_id']``, matched to ``prediction`` by position.

    Parameters
    ----------
    test : mapping or frame providing an ``'activity_id'`` sequence
    prediction : sequence of predicted values, at least as long as the ids
        (extra trailing predictions are ignored)

    Raises
    ------
    IndexError
        If ``prediction`` has fewer entries than there are activity ids.
        This is checked before the file is opened, so no partial file is
        left behind.
    """
    activity_ids = test['activity_id']
    if len(prediction) < len(activity_ids):
        raise IndexError(
            'prediction has {} entries but test has {} activity ids'.format(
                len(prediction), len(activity_ids)))

    now = datetime.datetime.now()
    sub_file = 'submission_xg_' + now.strftime("%Y-%m-%d-%H-%M") + '.csv'
    print('Writing submission: ', sub_file)
    # The context manager closes the file even if a write fails midway.
    with open(sub_file, 'w') as f:
        f.write('activity_id,outcome\n')
        for activity_id, outcome in zip(activity_ids, prediction):
            f.write(str(activity_id) + ',' + str(outcome) + '\n')

In [38]:
# Persist the test-set predictions as a submission file.
create_submission(test=test, prediction=test_prediction)

Writing submission:  submission_xg_2016-08-14-12-19.csv
