In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb

np.random.seed(2019)

In [2]:
trn = pd.read_csv('./dataset/train_ver2.csv')
tst = pd.read_csv('./dataset/test_ver2.csv')

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


# Preprocessing

In [3]:
print('before tranin set shape : ', trn.shape)
prods = trn.columns[24:].tolist()

trn[prods] = trn[prods].fillna(0.0).astype(np.int8)

# if some customer buy anyting, remove
no_product = trn[prods].sum(axis=1) == 0
trn = trn[~no_product]
print('after tranin set shape : ', trn.shape)

before tranin set shape :  (13647309, 48)
after tranin set shape :  (11091070, 48)


In [4]:
for col in trn.columns[24:]:
    tst[col] = 0

df = pd.concat([trn, tst], axis=0)

In [5]:
features = []

# 범주형을 0, 1, 2와 같은 꼴로 변경시킴
categorical_cols = ['ind_empleado', 'pais_residencia', 'sexo', 'tiprel_1mes', 'indresi', 'indext', 'conyuemp', 'canal_entrada', 'indfall', 'nomprov', 'segmento']
for col in categorical_cols:
    df[col], _ = df[col].factorize(na_sentinel=-99)
features += categorical_cols

In [6]:
df['age'].replace(' NA', -99, inplace=True)
df['age'] = df['age'].astype(np.int8)

In [7]:
df['antiguedad'].replace('     NA', -99, inplace=True)
df['antiguedad'] = df['antiguedad'].astype(np.int8)

In [8]:
df['renta'].replace('         NA', -99, inplace=True)
df['renta'].fillna(-99, inplace=True)
df['renta'] = df['renta'].astype(float).astype(np.int8)

In [9]:
df['indrel_1mes'].replace('P', 5, inplace=True)
df['indrel_1mes'].fillna(-99, inplace=True)
df['indrel_1mes'] = df['indrel_1mes'].astype(float).astype(np.int8)

In [10]:
features += ['age', 'antiguedad', 'renta', 'ind_nuevo', 'indrel', 'intrel_1mes', 'ind_actividad_cliente']

In [11]:
del df['tipodom']

# Feature Engineering

In [12]:
df['fecha_alta_month'] = df['fecha_alta'].map(lambda x: 0.0 if x.__class__ is float else float(x.split('-')[1])).astype(np.int8)
df['fecha_alta_year'] = df['fecha_alta'].map(lambda x: 0.0 if x.__class__ is float else float(x.split('-')[0])).astype(np.int16)
features += ['fecha_alta_month', 'fecha_alta_year']

In [13]:
df['ult_fec_cli_1t_month'] = df['ult_fec_cli_1t'].map(lambda x: 0.0 if x.__class__ is float else float(x.split('-')[1])).astype(np.int8)
df['ult_fec_cli_1t_year'] = df['ult_fec_cli_1t'].map(lambda x: 0.0 if x.__class__ is float else float(x.split('-')[0])).astype(np.int16)
features += ['ult_fec_cli_1t_month', 'ult_fec_cli_1t_year']

In [14]:
features

['ind_empleado',
 'pais_residencia',
 'sexo',
 'tiprel_1mes',
 'indresi',
 'indext',
 'conyuemp',
 'canal_entrada',
 'indfall',
 'nomprov',
 'segmento',
 'age',
 'antiguedad',
 'renta',
 'ind_nuevo',
 'indrel',
 'intrel_1mes',
 'ind_actividad_cliente',
 'fecha_alta_month',
 'fecha_alta_year',
 'ult_fec_cli_1t_month',
 'ult_fec_cli_1t_year']

In [15]:
df.fillna(-99, inplace=True)

In [16]:
def date_to_int(str_date):
    Y, M, D = [int(a) for a in str_date.strip().split("-")]
    int_date = (int(Y) - 2015) * 12 + int(M)
    
    return int_date

In [17]:
df['int_date'] = trn['fecha_dato'].map(date_to_int).astype(np.int8)

In [18]:
df_lag = df.copy()
df_lag.columns = [col + '_prev' if col not in ['ncodpers', 'int_date'] else col for col in df.columns]
df_lag['int_date'] += 1

In [19]:
print(len(df_lag.columns))
print(len(df.columns))

52
52


In [20]:
df_trn = df.merge(df_lag, on=['ncodpers', 'int_date'], how='left')
del df, df_lag

In [21]:
for prod in prods:
    prev = prod + '_prev'
    df_trn[prev].fillna(0, inplace=True)
df_trn.fillna(-99, inplace=True)

In [22]:
features += [feature + '_prev' for feature in features]
features += [prod + '_prev' for prod in prods]

# Seperate validation set

In [23]:
use_dates = ['2016-01-28', '2016-02-28', '2016-03-28', '2016-04-28', '2016-05-28']
trn = df_trn[df_trn['fecha_dato'].isin(use_dates)]
tst = df_trn[df_trn['fecha_dato'] == '2016-06-28']
del df_trn

In [24]:
X = []
Y = []
for i, prod in enumerate(prods):
    prev = prod + '_prev'
    #Extract new purchase
    prX = trn[(trn[prod] == 1) & (trn[prev] == 0)]
    prY = np.zeros(prX.shape[0], dtype = np.int8) + i
    X.append(prX)
    Y.append(prY)
XY = pd.concat(X)
Y = np.hstack(Y)
XY['y'] = Y

In [25]:
vld_date = '2016-05-28'
XY_trn = XY[XY['fecha_dato'] != vld_date]
XY_vld = XY[XY['fecha_dato'] == vld_date]

# Model training

In [26]:
param = {
    'booster' : 'gbtree',
    'max_depth' : 8,
    'nthread' : 4,
    'num_class' : len(prods),
    'objective' : 'multi:softprob',
    'silent' : 1,
    'eval_metric' : 'mlogloss',
    'eta' : 0.1,
    'min_child_weight' : 10,
    'colsample_bytree' : 0.8,
    'colsample_bylevel' : 0.9,
    'seed': 2019,
}

In [27]:
X_trn = XY_trn.as_matrix(columns=features)
Y_trn = XY_trn.as_matrix(columns=['y'])
dtrn = xgb.DMatrix(X_trn, label=Y_trn, feature_names=features)

  """Entry point for launching an IPython kernel.
  


In [28]:
X_vld = XY_vld.as_matrix(columns=features)
Y_vld = XY_vld.as_matrix(columns=['y'])
dvld = xgb.DMatrix(X_vld, label=Y_vld, feature_names=features)

  """Entry point for launching an IPython kernel.
  


In [29]:
watch_list = [(dtrn, 'train'), (dvld, 'eval')]
model = xgb.train(param, dtrn, num_boost_round=1000, evals=watch_list, early_stopping_rounds=20)

[0]	train-mlogloss:2.74803	eval-mlogloss:2.7587
Multiple eval metrics have been passed: 'eval-mlogloss' will be used for early stopping.

Will train until eval-mlogloss hasn't improved in 20 rounds.
[1]	train-mlogloss:2.5136	eval-mlogloss:2.53046
[2]	train-mlogloss:2.32368	eval-mlogloss:2.34079
[3]	train-mlogloss:2.17125	eval-mlogloss:2.18883
[4]	train-mlogloss:2.05224	eval-mlogloss:2.07009
[5]	train-mlogloss:1.9533	eval-mlogloss:1.97154
[6]	train-mlogloss:1.86737	eval-mlogloss:1.88576
[7]	train-mlogloss:1.79415	eval-mlogloss:1.81287
[8]	train-mlogloss:1.73029	eval-mlogloss:1.74942
[9]	train-mlogloss:1.67555	eval-mlogloss:1.69534
[10]	train-mlogloss:1.63034	eval-mlogloss:1.65059
[11]	train-mlogloss:1.58562	eval-mlogloss:1.60597
[12]	train-mlogloss:1.54695	eval-mlogloss:1.5678
[13]	train-mlogloss:1.51234	eval-mlogloss:1.5333
[14]	train-mlogloss:1.47955	eval-mlogloss:1.50079
[15]	train-mlogloss:1.44957	eval-mlogloss:1.47118
[16]	train-mlogloss:1.42302	eval-mlogloss:1.44483
[17]	train-mlo

[161]	train-mlogloss:1.00024	eval-mlogloss:1.08796
[162]	train-mlogloss:0.999688	eval-mlogloss:1.08793
[163]	train-mlogloss:0.999086	eval-mlogloss:1.08793
[164]	train-mlogloss:0.998575	eval-mlogloss:1.08793
[165]	train-mlogloss:0.998012	eval-mlogloss:1.08789
[166]	train-mlogloss:0.997261	eval-mlogloss:1.08785
[167]	train-mlogloss:0.996696	eval-mlogloss:1.08781
[168]	train-mlogloss:0.996144	eval-mlogloss:1.08783
[169]	train-mlogloss:0.995657	eval-mlogloss:1.08777
[170]	train-mlogloss:0.995063	eval-mlogloss:1.08777
[171]	train-mlogloss:0.994647	eval-mlogloss:1.08778
[172]	train-mlogloss:0.994214	eval-mlogloss:1.08774
[173]	train-mlogloss:0.993711	eval-mlogloss:1.08774
[174]	train-mlogloss:0.993065	eval-mlogloss:1.08771
[175]	train-mlogloss:0.992392	eval-mlogloss:1.08773
[176]	train-mlogloss:0.991855	eval-mlogloss:1.08775
[177]	train-mlogloss:0.991375	eval-mlogloss:1.08773
[178]	train-mlogloss:0.990732	eval-mlogloss:1.0877
[179]	train-mlogloss:0.99022	eval-mlogloss:1.08766
[180]	train-mlo

# Save trained model

In [37]:
import pickle

pickle.dump(model, open("model/xgb.baseline.pkl", "wb"))
best_ntree_limit = model.best_ntree_limit

# Validation

In [30]:
from mapk import *

In [31]:
vld = trn[trn['fecha_dato'] == vld_date]
ncodpers_vld = vld.as_matrix(columns=['ncodpers'])
for prod in prods:
    prev = prod + '_prev'
    padd = prod + '_add'
    vld[padd] = vld[prod] - vld[prev]
    
add_vld = vld.as_matrix(columns=[prod + '_add' for prod in prods])
add_vld_list = [list() for i in range(len(ncodpers_vld))]

  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
  


In [32]:
add_vld

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.]])

In [33]:
count_vld = 0
for ncodper in range(len(ncodpers_vld)):
    for prod in range(len(prods)):
        if add_vld[ncodper, prod] > 0:
            add_vld_list[ncodper].append(prod)
            count_vld += 1

In [34]:
print(mapk(add_vld_list, add_vld_list, 7, 0.0))

0.04266379915553903


In [38]:
X_vld = vld.as_matrix(columns=features)
Y_vld = vld.as_matrix(columns=['y'])
dvld = xgb.DMatrix(X_vld, label=Y_vld, feature_names=features)
preds_vld = model.predict(dvld, ntree_limit=best_ntree_limit)

  """Entry point for launching an IPython kernel.
  


In [39]:
preds_vld = preds_vld - vld.as_matrix(columns=[prod + '_prev' for prod in prods])

  """Entry point for launching an IPython kernel.


In [42]:
result_vld = []
for ncodper, pred in zip(ncodpers_vld, preds_vld):
    y_prods = [(y, p, ip) for y,p,ip in zip(pred, prods, range(len(prods)))]
    y_prods = sorted(y_prods, key=lambda a: a[0], reverse=True)[:7]
    result_vld.append([ip for y,p,ip in y_prods])

In [43]:
print(mapk(add_vld_list, result_vld, 7, 0.0))

0.03645581690898608


# Training Total Dataset

In [45]:
X_all = XY.as_matrix(columns=features)
Y_all = XY.as_matrix(columns=['y'])
dall = xgb.DMatrix(X_all, label=Y_all, feature_names=features)
watch_list = [(dall, 'train')]

  """Entry point for launching an IPython kernel.
  


In [46]:
best_ntree_limit = int(best_ntree_limit * (len(XY_trn) + len(XY_vld)) / len(XY_trn))
model = xgb.train(param, dall, num_boost_round=best_ntree_limit, evals=watch_list)

[0]	train-mlogloss:2.74834
[1]	train-mlogloss:2.51448
[2]	train-mlogloss:2.32418
[3]	train-mlogloss:2.17134
[4]	train-mlogloss:2.05217
[5]	train-mlogloss:1.95331
[6]	train-mlogloss:1.86703
[7]	train-mlogloss:1.79362
[8]	train-mlogloss:1.7297
[9]	train-mlogloss:1.67507
[10]	train-mlogloss:1.62983
[11]	train-mlogloss:1.58495
[12]	train-mlogloss:1.54624
[13]	train-mlogloss:1.51151
[14]	train-mlogloss:1.47872
[15]	train-mlogloss:1.44883
[16]	train-mlogloss:1.4223
[17]	train-mlogloss:1.39709
[18]	train-mlogloss:1.37508
[19]	train-mlogloss:1.35382
[20]	train-mlogloss:1.335
[21]	train-mlogloss:1.31823
[22]	train-mlogloss:1.30168
[23]	train-mlogloss:1.28713
[24]	train-mlogloss:1.27323
[25]	train-mlogloss:1.26112
[26]	train-mlogloss:1.24877
[27]	train-mlogloss:1.23767
[28]	train-mlogloss:1.22734
[29]	train-mlogloss:1.21743
[30]	train-mlogloss:1.20832
[31]	train-mlogloss:1.19983
[32]	train-mlogloss:1.19221
[33]	train-mlogloss:1.18441
[34]	train-mlogloss:1.17759
[35]	train-mlogloss:1.17078
[36]	t

In [48]:
print("Feature importance:")
for kv in sorted([(k,v) for k,v in model.get_fscore().items()], key=lambda kv: kv[1], reverse=True):
    print(kv)

Feature importance:
('renta', 19253)
('age', 18196)
('antiguedad', 17060)
('age_prev', 12423)
('antiguedad_prev', 11739)
('fecha_alta_month', 11090)
('nomprov', 10902)
('fecha_alta_year', 8821)
('renta_prev', 7626)
('canal_entrada', 7387)
('nomprov_prev', 6004)
('canal_entrada_prev', 4385)
('fecha_alta_month_prev', 4123)
('ind_recibo_ult1_prev', 3188)
('sexo', 3178)
('fecha_alta_year_prev', 3097)
('ind_ecue_fin_ult1_prev', 2972)
('ind_cco_fin_ult1_prev', 2912)
('ind_cno_fin_ult1_prev', 2692)
('segmento', 2192)
('ind_tjcr_fin_ult1_prev', 2043)
('ind_reca_fin_ult1_prev', 1994)
('segmento_prev', 1892)
('tiprel_1mes', 1781)
('ind_nom_pens_ult1_prev', 1617)
('ind_nomina_ult1_prev', 1506)
('ind_valo_fin_ult1_prev', 1482)
('ind_dela_fin_ult1_prev', 1447)
('ind_ctop_fin_ult1_prev', 1337)
('ind_actividad_cliente', 1286)
('tiprel_1mes_prev', 1240)
('sexo_prev', 1189)
('ind_ctpp_fin_ult1_prev', 1124)
('ind_fond_fin_ult1_prev', 968)
('ind_ctma_fin_ult1_prev', 865)
('ind_actividad_cliente_prev', 86

In [50]:
X_tst = tst.as_matrix(columns=features)
dtst = xgb.DMatrix(X_tst, feature_names=features)
preds_tst = model.predict(dtst, ntree_limit=best_ntree_limit)

  """Entry point for launching an IPython kernel.


In [51]:
ncodpers_tst = tst.as_matrix(columns=['ncodpers'])
preds_tst = preds_tst - tst.as_matrix(columns=[prod + '_prev' for prod in prods])

  """Entry point for launching an IPython kernel.
  


# Create submit file

In [57]:
submit_file = open('model/xgb.baseline.2015-06-28(2)', 'w')
submit_file.write('ncodpers,added_products\n')
for ncodper, pred in zip(ncodpers_tst, preds_tst):
    y_prods = [(y,p,ip) for y, p, ip in zip(pred, prods, range(len(prods)))]
    y_prods = sorted(y_prods, key=lambda a: a[0], reverse=True)[:7]
    y_prods = [p for y,p,ip in y_prods]
    submit_file.write('{}, {}\n'.format(int(ncodper), ''.join(y_prods)))
submit_file.close()