In [1]:
import multiprocessing as mp
import time
import os
import gc

import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.metrics import log_loss, auc, roc_curve, f1_score, average_precision_score, mean_squared_error
from sklearn.model_selection import StratifiedKFold
import catboost as cat

from reduce_memory import reduce_numeric_mem_usage, reduce_object_mem_usage

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
# List the working directory (shell escape) to see which data files and notebooks are present.
!ls

catboost_info						   session_fts.csv
cat_feature_index.4e8a1a48-f9a4c943-ce42059a-58e90a3d.tmp  test-Copy1.ipynb
data							   test.ipynb
data_exploring.ipynb					   train.ipynb
__pycache__						   Untitled.ipynb
reduce_memory.py


In [3]:
# Load the training frame stored under key 'xtrain' in the HDF5 file ./data/train.h5.
xtrain = pd.read_hdf('./data/train.h5', 'xtrain')

In [4]:
# Load the validation frame stored under key 'xval' in the same HDF5 file as the training frame.
xval = pd.read_hdf('./data/train.h5', 'xval')

In [5]:
# Long listing sorted by modification time (oldest first, human-readable sizes) to check what was written most recently.
!ls -lthr

total 83M
-rw-rw-r-- 1 sam sam 4.5K Mar 15 20:06 reduce_memory.py
-rw-rw-r-- 1 sam sam  76M Mar 16 13:53 session_fts.csv
-rw-rw-r-- 1 sam sam  21K Mar 26 22:53 test-Copy1.ipynb
drwxrwxr-x 2 sam sam 4.0K Mar 28 23:14 data
-rw-rw-r-- 1 sam sam  75K Mar 28 23:30 test.ipynb
-rw-rw-r-- 1 sam sam  74K Mar 28 23:30 data_exploring.ipynb
drwxrwxr-x 2 sam sam 4.0K Mar 28 23:31 __pycache__
drwxrwxr-x 4 sam sam 4.0K Mar 28 23:47 catboost_info
-rw-rw-r-- 1 sam sam  40K Mar 29 09:14 Untitled.ipynb
-rw-rw-r-- 1 sam sam 5.9M Mar 29 12:00 cat_feature_index.4e8a1a48-f9a4c943-ce42059a-58e90a3d.tmp
-rw-rw-r-- 1 sam sam 7.4K Mar 29 12:08 train.ipynb


In [6]:
# Peek at the first rows of the raw training frame, before any encoding or downcasting.
xtrain.head()

Unnamed: 0,item_id,appeared,location,price,rel_price_rank,price_mean,price_median,diff_mean,diff_median,diff_mean_rel,...,action_type_click_rel_pos_avg,city_nunique,city_get_first,platform_get_first,device_get_first,nfilters_mean,nfilters_max,nfilters_min,nfilters_get_last,nimps_max
00000510f1adc,2661832,0,0,46,0.6,27.32,20.0,18.68,26.0,0.406087,...,,,,,,,,,,
00000510f1adc,9222426,0,1,26,0.76,27.32,20.0,-1.32,6.0,-0.050769,...,,,,,,,,,,
00000510f1adc,7051844,0,2,16,0.48,27.32,20.0,-11.32,-4.0,-0.7075,...,,,,,,,,,,
00000510f1adc,4079190,0,3,38,0.2,27.32,20.0,10.68,18.0,0.281053,...,,,,,,,,,,
00000510f1adc,5752778,0,4,12,0.36,27.32,20.0,-15.32,-8.0,-1.276667,...,,,,,,,,,,


In [7]:
# Show every column name; the list includes the label ('target') alongside the features.
xtrain.columns

Index(['item_id', 'appeared', 'location', 'price', 'rel_price_rank',
       'price_mean', 'price_median', 'diff_mean', 'diff_median',
       'diff_mean_rel', 'diff_median_rel', 'nprop', 'n_clicks', 'star',
       'good_rating', 'satisfactory_rating', 'excellent_rating', 'p_mean',
       'star_mean', 'gr_mean', 'sr_mean', 'er_mean', 'target', 'session_id',
       'timestamp_ptp', 'timestamp_mean_dwell_time',
       'timestamp_var_dwell_time', 'step_max', 'action_type_nunique',
       'action_type_n_clickouts', 'action_type_click_rel_pos_avg',
       'city_nunique', 'city_get_first', 'platform_get_first',
       'device_get_first', 'nfilters_mean', 'nfilters_max', 'nfilters_min',
       'nfilters_get_last', 'nimps_max'],
      dtype='object')

In [8]:
# Names of the columns stored with object dtype (candidates for categorical encoding).
xtrain.dtypes.loc[lambda dtypes: dtypes == 'O'].index

Index(['session_id', 'city_get_first', 'platform_get_first',
       'device_get_first'],
      dtype='object')

In [9]:
cat_fts = ['city_get_first', 'platform_get_first', 'device_get_first', 'item_id', 'location']
# Integer-encode each categorical feature in place.
# The code book is built from the training frame only, so the validation frame
# can contain values that have no code.
for c in cat_fts:
    print(c)
    maps = xtrain[c].unique()
    mapper = dict(zip(maps, np.arange(len(maps), dtype=int)))
    xtrain[c] = xtrain[c].map(mapper)
    # Series.map yields NaN for keys missing from the dict, which would silently
    # turn the column into floats. Reserve -1 for "not seen in training" and
    # cast back so the column stays an integer categorical code.
    xval[c] = xval[c].map(mapper).fillna(-1).astype(int)


city_get_first
platform_get_first
device_get_first
item_id
location


In [10]:
# Shrink the numeric columns of the training frame using the project helper from reduce_memory.py.
# The return value is not used, so the helper presumably modifies xtrain in place — confirm in reduce_memory.py.
# NOTE(review): only xtrain is passed here; xval keeps its original dtypes.
reduce_numeric_mem_usage(xtrain)

Memory usage before optimization is: 6104.31 MB
Memory usage after optimization is: 3745.16 MB
Decreased by 38.6%


In [11]:
# Separate the label from the features: pop() hands back the 'target' column
# and removes it from the frame in a single step.
y_trn = xtrain.pop('target')
y_val = xval.pop('target')

In [12]:
# Inspect the training frame again after the categorical encoding, the memory reduction and the removal of 'target'.
xtrain.head()

Unnamed: 0,item_id,appeared,location,price,rel_price_rank,price_mean,price_median,diff_mean,diff_median,diff_mean_rel,...,action_type_click_rel_pos_avg,city_nunique,city_get_first,platform_get_first,device_get_first,nfilters_mean,nfilters_max,nfilters_min,nfilters_get_last,nimps_max
00000510f1adc,0,0,0,46,0.600098,27.3125,20.0,18.6875,26.0,0.406006,...,,,0,0,0,,,,,
00000510f1adc,1,0,1,26,0.759766,27.3125,20.0,-1.320312,6.0,-0.050781,...,,,0,0,0,,,,,
00000510f1adc,2,0,2,16,0.47998,27.3125,20.0,-11.320312,-4.0,-0.70752,...,,,0,0,0,,,,,
00000510f1adc,3,0,3,38,0.199951,27.3125,20.0,10.679688,18.0,0.281006,...,,,0,0,0,,,,,
00000510f1adc,4,0,4,12,0.360107,27.3125,20.0,-15.320312,-8.0,-1.276367,...,,,0,0,0,,,,,


In [None]:
# CatBoost hyper-parameters; training runs on the GPU.
params = {'iterations': 3000,
          'learning_rate': 0.02,
          'depth': 8,
         'task_type': 'GPU'}

# Positional indices of the categorical columns, as expected by `cat_features`.
categorical_ind = [k for k, v in enumerate(xtrain.columns) if v in cat_fts ]
# NOTE(review): xtrain.columns also lists 'session_id' (object dtype) which is not
# in cat_fts — confirm it is really meant to be passed to fit() as a feature.

# train model
clf = cat.CatBoostClassifier(**params)
clf.fit(xtrain, y_trn,
        cat_features=categorical_ind,
        eval_set=(xval, y_val),
        early_stopping_rounds=100,  # stop once the eval metric has not improved for 100 rounds
        verbose=100,                # log every 100 iterations
        plot=False)
print('Done!')
# Persist the fitted model. save_model is a method of the model object;
# the catboost package has no module-level save_model function.
model_path = './cat_model'
clf.save_model(model_path)

# saves for each fold
clfs = []
cv_scores = []
val_inds = []

# append model
clfs.append(clf)
# make prediction on validation set (probability of the positive class)
val_pred = clf.predict_proba(xval)[:, 1]
logloss_i = log_loss(y_val, val_pred)
cv_scores.append(logloss_i)
# compute roc auc
fpr, tpr, thresholds = roc_curve(y_val, val_pred, pos_label=1)
auc_i = auc(fpr, tpr)
# compute average precision (reported as "map" below)
map_i = average_precision_score(y_val, val_pred)
print('logloss={0:.4f} | map={1:.4f} | auc={2:.4f}'.format(logloss_i, map_i, auc_i))

0:	learn: 0.6525134	test: 0.6524462	best: 0.6524462 (0)	total: 2.04s	remaining: 1h 42m 8s
100:	learn: 0.1362686	test: 0.1353415	best: 0.1353415 (100)	total: 3m 45s	remaining: 1h 47m 46s
200:	learn: 0.1313499	test: 0.1331250	best: 0.1331250 (200)	total: 7m	remaining: 1h 37m 29s
300:	learn: 0.1304304	test: 0.1330986	best: 0.1330903 (297)	total: 10m 15s	remaining: 1h 31m 58s
400:	learn: 0.1299058	test: 0.1337767	best: 0.1328616 (339)	total: 13m 35s	remaining: 1h 28m 5s
bestTest = 0.1328616121
bestIteration = 339
Shrink model to first 340 iterations.


In [None]:
# Attach the validation-set positive-class probabilities to xval so that a
# per-group ranking metric can be computed from the frame in the cells below.
xval['pred'] = val_pred

In [None]:
# xval['pred']

In [None]:
# xtrain[train=='084dbb067c71e']

In [None]:
def reciprocal_rank(df):
    """Return the 1-based rank of the positive row within one group.

    Rows are ordered by descending ``pred``; the result is the position
    (starting at 1) that the first row with ``target == 1`` occupies in that
    ordering. Despite the name, the rank itself is returned rather than its
    reciprocal, so callers can invert it afterwards.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of a single group; must contain the columns ``pred`` and ``target``.

    Returns
    -------
    int
        Rank of the positive row (>= 1), or 0 when the group has no row with
        ``target == 1``.
    """
    scores = df['pred'].values
    # Positional row indices ordered from highest to lowest predicted score.
    sorted_ind = np.argsort(scores)[::-1]
    positive_rows = np.flatnonzero(df['target'].values == 1)
    if positive_rows.size == 0:
        # No clicked item in this group: 0 is the sentinel for "no rank".
        return 0
    target_ind = positive_rows[0]
    # Where the positive row ended up in the score ordering (0-based).
    rank = np.where(sorted_ind == target_ind)[0][0]
#     return 1/(rank+1)
    return rank + 1

# Put the labels back on xval so every group handed to reciprocal_rank carries
# both the 'pred' and the 'target' column.
xval['target'] = y_val
# val_rr = xval.groupby('session_id').apply(reciprocal_rank)
# Group on the first index level (presumably the session id — confirm) and
# compute one rank value per group.
val_rr = xval.groupby(level=0).apply(reciprocal_rank)

In [None]:
# xtrain[cat_fts].nunique()

In [None]:
# Average rank over the groups that produced a rank (0 marks "no rank" and is excluded).
val_rr.loc[val_rr.ne(0)].mean()

In [None]:
# Distribution of the ranks, leaving out the 0 entries that mark "no rank".
val_rr.loc[val_rr.ne(0)].hist(bins=50)

In [None]:
# Mean reciprocal rank: invert each non-zero rank (1 / rank), then average.
val_rr.loc[val_rr.ne(0)].rdiv(1).mean()

In [None]:
# Distribution of the reciprocal ranks (1 / rank) over the groups with a non-zero rank.
val_rr.loc[val_rr.ne(0)].rdiv(1).hist(bins=50)