In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb

In [2]:
# Read the target table and the feature table (in that order).
dependent, independent = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/model/dependent/dependent_n.csv',
        '../data/model/independent/independent_n.csv',
    )
)
print(dependent.shape, independent.shape)

# Inner-join the two tables on their shared key columns, then drop the
# source frames so only the merged copy stays referenced.
data = pd.merge(
    dependent,
    independent,
    how='inner',
    on=['user_id', 'product_id', 'eval_set'],
)
del dependent, independent

# Partition the merged rows by their eval_set tag; rows tagged with anything
# other than 'train' / 'valid' are not kept.
eval_tag = data['eval_set']
train_data = data[eval_tag == 'train']
valid_data = data[eval_tag == 'valid']
del data, eval_tag

print(train_data.shape, valid_data.shape)

(13514162, 4) (13514162, 145)
(6879915, 146) (1725955, 146)


In [3]:
# Build the XGBoost input matrices by column position: column 3 is used as
# the label and every column from index 4 onward as a feature.
# NOTE(review): presumably columns 0-2 are the merge keys (user_id,
# product_id, eval_set) -- confirm the column order of the merged frame.
train_features = train_data.iloc[:, 4:]
train_labels = train_data.iloc[:, 3]
train_matrix = xgb.DMatrix(data=train_features, label=train_labels)
# Drop the pandas objects once the DMatrix has been built.
del train_data, train_features, train_labels

valid_features = valid_data.iloc[:, 4:]
valid_labels = valid_data.iloc[:, 3]
valid_matrix = xgb.DMatrix(data=valid_features, label=valid_labels)
del valid_data, valid_features, valid_labels

In [4]:
# Booster-level XGBoost parameters, declared in one literal so the whole
# configuration can be read at a glance.
booster = {
    'booster': 'gbtree',              # tree-based booster
    'nthread': 63,                    # NOTE(review): hard-coded thread count -- confirm it suits the host
    'max_depth': 10,
    'min_child_weight': 10,
    'subsample': 0.8,                 # row subsampling ratio
    'colsample_bytree': 1.0,          # column subsampling ratio per tree
    'colsample_bylevel': 0.9,         # column subsampling ratio per tree level
    'lambda': 1.0,                    # L2 regularisation weight
    'alpha': 0.0,                     # L1 regularisation weight
    'objective': 'binary:logistic',
    'eval_metric': ['logloss'],
    'base_score': 0.1,                # initial prediction; presumably close to the positive rate -- TODO confirm
    'seed': 108,
}

In [5]:
# Keyword arguments for xgb.train().
#
# The learning rate is a constant 0.02, so it is set through the `eta`
# booster parameter. The previous version obtained the same constant rate
# with xgb.callback.reset_learning_rate([0.02] * 2000); that legacy callback
# helper is no longer available in recent XGBoost releases, and its
# hard-coded list length had to be kept in sync with num_boost_round by hand.
params = {
    # Copy of the booster settings with the learning rate added; the
    # `booster` dict itself is left unmodified.
    'params': dict(booster, eta=0.02),
    'dtrain': train_matrix,
    # Upper bound on boosting rounds; early stopping may end training sooner.
    'num_boost_round': 2000,
    # The validation set is listed last, so it is the one monitored for
    # early stopping.
    'evals': [(train_matrix, 'train_matrix'), (valid_matrix, 'valid_matrix')],
    # Stop once the monitored metric has not improved for 10 rounds.
    'early_stopping_rounds': 10,
    # Print the evaluation metrics every 150 rounds.
    'verbose_eval': 150,
}

In [6]:
# Train the booster using the keyword arguments collected in `params`
# (training data, evaluation sets, round limit, early stopping, logging).
# NOTE(review): per the XGBoost docs, train() returns the model from the last
# boosting round, not the best one -- confirm downstream prediction code
# accounts for the best iteration.
model = xgb.train(**params)

[0]	train_matrix-logloss:0.315511	valid_matrix-logloss:0.31418
Multiple eval metrics have been passed: 'valid_matrix-logloss' will be used for early stopping.

Will train until valid_matrix-logloss hasn't improved in 10 rounds.
[150]	train_matrix-logloss:0.23888	valid_matrix-logloss:0.241151
[300]	train_matrix-logloss:0.235049	valid_matrix-logloss:0.239996
[450]	train_matrix-logloss:0.232357	valid_matrix-logloss:0.239756
[600]	train_matrix-logloss:0.230004	valid_matrix-logloss:0.239644
Stopping. Best iteration:
[662]	train_matrix-logloss:0.229128	valid_matrix-logloss:0.239615



In [7]:
# Write the trained booster to disk.
# NOTE(review): newer XGBoost releases recommend the JSON/UBJSON formats
# (selected via a .json / .ubj extension) -- confirm before upgrading, since
# any loader of this file depends on the current path.
model.save_model('../data/model/xgb_binary.model')

In [8]:
# Top 50 features ranked by F-score (how often each feature is used in a
# split), highest first. Displayed as (feature_name, count) pairs.
sorted(
    model.get_fscore().items(),
    key=lambda name_and_count: name_and_count[1],
    reverse=True,
)[:50]

[('usr_lag_rdr', 7970),
 ('ratio6', 7818),
 ('ratio10', 7733),
 ('likelihood_days', 6827),
 ('ratio9', 6714),
 ('usr_post', 6691),
 ('usr_label', 6602),
 ('bm25_score', 6434),
 ('fscore_23', 6401),
 ('fscore_24', 6124),
 ('fscore_34', 6100),
 ('usr_dysc_cnt', 5949),
 ('usr_avg_rdr', 5941),
 ('xgb_w2v_score', 5884),
 ('usr_dow_cnt', 5878),
 ('ratio4', 5821),
 ('fmean', 5751),
 ('ratio7', 5670),
 ('prd_dysc_int', 5490),
 ('order_hour_of_day', 5339),
 ('usr_hod_cnt', 5305),
 ('cartdiv', 5276),
 ('prd2way2', 5032),
 ('cartlen', 4793),
 ('prd2way1', 4754),
 ('prd_ordn_int', 4753),
 ('usr_med_dysc', 4752),
 ('ffm', 4505),
 ('prd_ais_post_rt', 4495),
 ('diff_hod', 4468),
 ('ratio5', 4348),
 ('usr_cds_ais', 4336),
 ('usr_ais_perc_cnt', 4230),
 ('usr_dep_cnt', 4128),
 ('catboost1', 4076),
 ('usr_cds_prd', 4063),
 ('prd2way3', 4051),
 ('ratio3', 4030),
 ('ratio2', 4009),
 ('ratio8', 3974),
 ('prd2way4', 3950),
 ('mcrt1', 3890),
 ('max_tfidf', 3809),
 ('decay', 3786),
 ('days_since_prior_order', 

In [9]:
# Display the attributes stored on the booster; after early stopping these
# include best_iteration, best_score and best_msg (all as strings).
model.attributes()

{'best_iteration': '662',
 'best_msg': '[662]\ttrain_matrix-logloss:0.229128\tvalid_matrix-logloss:0.239615',
 'best_score': '0.239615'}