In [1]:
from utils import load_data, check_gpu, check_dir
from clean_session import preprocess_sessions
import pandas as pd
import numpy as np
from tqdm import tqdm
import gc
import catboost as cat
import matplotlib.pyplot as plt

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:

def plot_imp(data, fold_, plot_n=15):
    check_dir('./imps')
    imp = pd.DataFrame.from_records(data)
    imp.to_csv(f'./imps/{fold_}.csv', index=False)
    imp.columns = ['features', 'feature_importance']
    imp_des = imp.sort_values(by='feature_importance', ascending=False)
    imp_asc = imp.sort_values(by='feature_importance', ascending=True)

    fig, axes = plt.subplots(figsize=(8, 8), nrows=2, ncols=1)
    imp_des[:plot_n].plot(x='features', y='feature_importance', ax=axes[0], kind='barh', grid=True)
    imp_asc[:plot_n].plot(x='features', y='feature_importance', ax=axes[1], kind='barh', grid=True)
    plt.tight_layout()
    fig.savefig('./imps/{}.png'.format(fold_))

In [3]:
train = load_data('train')#, nrows=10000)

In [51]:
sub = load_data('submission_popular')

In [52]:
sub.head()

Unnamed: 0,user_id,session_id,timestamp,step,item_recommendations
0,000324D9BBUC,89643988fdbfb,1541593942,10,924795 106315 1033140 119494 101758 903037 105...
1,0004Q49X39PY,9de47d9a66494,1541641157,1,3505150 3812004 2227896 2292254 3184842 222702...
2,0004Q49X39PY,beea5c27030cb,1541561202,1,4476010 3505150 3812004 2227896 2292254 222702...
3,00071784XQ6B,9617600e1ba7c,1541630328,2,22854 3067559 22721 22713 16121 22772 22727 22...
4,0008BO33KUQ0,2d0e2102ee0dc,1541636411,6,9857656 5849628 655716 1352530 502066 1405084 ...


In [53]:
sub.shape

(253573, 5)

In [58]:
test = load_data('test')#, nrows=10000)

In [54]:
test = test[test.session_id.isin(sub.session_id.unique())].reset_index()

In [55]:
test.shape

(1613617, 13)

In [59]:
%time
duplicated_mask = test[[c for c in test.columns if c != 'step']].duplicated(keep='last')
test = test[~duplicated_mask].reset_index(drop=True)

CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 6.2 µs


In [60]:
test.shape

(1795391, 12)

notes: raw test: original shape = (3782335, 12) after drop dups (1795391, 12) |
| sub shape: (253573, 5) | test in sub shape: (1613617, 13) after drop it's (1613617, 13)


In [31]:
3424793/3424793

1.0

In [4]:
%time
# find duplciates except steps
train.sort_values(by=['user_id', 'session_id', 'timestamp', 'step'], 
                  ascending=[True, True, True, True], 
                  inplace=True)

CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 5.96 µs


In [5]:
train.shape

(15932992, 12)

In [6]:
%time
duplicated_mask = train[[c for c in train.columns if c != 'step']].duplicated(keep='last')
train = train[~duplicated_mask].reset_index(drop=True)

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 5.01 µs


In [7]:
train.shape

(6683369, 12)

In [8]:
train = preprocess_sessions(train, data_source='train')

[>>>>>] Cliping session dataframe up to last click out (if there is clickout)
[>>>>>][te=6.92 mins] filtering out sessions without clickouts, reference, or clickout is nan
train length before filtering: 5,764,987
train length after filtering: 5,200,153


In [9]:
# only take the last row
train = train.groupby('session_id').last().reset_index()
del train['action_type'], train['step']
gc.collect()

77

In [10]:
train.head()

Unnamed: 0,session_id,user_id,timestamp,reference,platform,city,device,current_filters,impressions,prices
0,00000510f1adc,WT30CXPIG450,1541064087,7281198,IN,"Ganpatipule, India",desktop,,2661832|9222426|7051844|4079190|5752778|468398...,46|26|16|38|12|20|21|27|13|21|36|9|144|19|8|19...
1,00003f3b20954,CITFOTN2IT5P,1541097696,979325,ES,"La Manga, Spain",mobile,,87132|886881|486611|979325|87173|87175|149508|...,330|187|437|159|499|324|476|381|424|159|144|19...
2,000056cd97ce2,0TBXPQCK401O,1541484101,3811810,MY,"Port Dickson, Malaysia",desktop,,3811810|496141|674016|1153426|1339848|8400372|...,187|78|100|51|150|81|44|36|53|104|93|48|38|52|...
3,000066611146f,XRIOQ0R1CHLT,1541332330,5479306,BE,"Obaköy, Turkey",desktop,Hotel|Resort,5479306|97126|2195126|4163966|8415348|7829686|...,96|81|87|122|50|21|90|177|81|144|122|44|66|121...
4,0000be39860d7,X931R6LTG2RE,1541104090,79237,US,"Chattanooga, USA",desktop,,1221442|3485396|3396928|2870208|2626700|315318...,171|218|132|209|202|60|123|145|154|174|218|48|...


In [11]:
train.shape

(826842, 10)

In [12]:
# encode city, platform and device
def categorize(df, cols):
    for col in cols:
        print('converting', col)
        unique_values = df[col].unique()
        mapping = {v: k for k, v in enumerate(unique_values)}
        df[col] = df[col].map(mapping)
categorize(train, ['city', 'platform', 'device'])

converting city
converting platform
converting device


In [13]:
# all item id and reference
reference_ids = list(train.reference.unique())
train['nimp'] = train.impressions.str.split('|').str.len()
# pad -1 to impressions length less than 25
train.loc[train.nimp<25, 'impressions'] = train.loc[train.nimp<25].apply(lambda x: f"{x.impressions}|{'|'.join([str(-1) for _ in range(25-x.nimp)])}",
                                                                        axis=1)
train['impression_list'] = train.impressions.str.split('|')
train['nimp'] = train.impressions.str.split('|').str.len()

imp_lists = train.impression_list.values
impression_ids = list(set([j for i in imp_lists for j in i]))

item_ids = list(set(reference_ids + impression_ids))
item_id_mapping = {v: k for k, v in enumerate(item_ids)}


In [14]:
%time
# map it
train['reference'] = train.reference.map(item_id_mapping)
train['impression_list'] = train.impression_list.apply(lambda imp_list: [item_id_mapping[i] for i in imp_list] )

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 5.96 µs


In [15]:
train.head()

Unnamed: 0,session_id,user_id,timestamp,reference,platform,city,device,current_filters,impressions,prices,nimp,impression_list
0,00000510f1adc,WT30CXPIG450,1541064087,65784,0,0,0,,2661832|9222426|7051844|4079190|5752778|468398...,46|26|16|38|12|20|21|27|13|21|36|9|144|19|8|19...,25,"[230851, 577837, 354263, 116114, 737749, 34676..."
1,00003f3b20954,CITFOTN2IT5P,1541097696,409961,1,1,1,,87132|886881|486611|979325|87173|87175|149508|...,330|187|437|159|499|324|476|381|424|159|144|19...,25,"[508873, 195895, 767452, 409961, 246230, 34846..."
2,000056cd97ce2,0TBXPQCK401O,1541484101,129061,2,2,0,,3811810|496141|674016|1153426|1339848|8400372|...,187|78|100|51|150|81|44|36|53|104|93|48|38|52|...,25,"[129061, 720609, 525631, 272581, 23183, 223092..."
3,000066611146f,XRIOQ0R1CHLT,1541332330,76750,3,3,0,Hotel|Resort,5479306|97126|2195126|4163966|8415348|7829686|...,96|81|87|122|50|21|90|177|81|144|122|44|66|121...,25,"[76750, 758455, 300388, 258778, 702154, 58870,..."
4,0000be39860d7,X931R6LTG2RE,1541104090,521957,4,4,0,,1221442|3485396|3396928|2870208|2626700|315318...,171|218|132|209|202|60|123|145|154|174|218|48|...,25,"[192840, 257898, 97964, 10645, 208845, 199100,..."


In [16]:
def get_index(x):
    if x.reference in x.impression_list:
        return x.impression_list.index(x.reference)
    else:
        return 25
train['target'] = train.apply(get_index, axis=1)

In [17]:
train = pd.concat([train, pd.DataFrame(train.impression_list.to_list(), columns=[f'loc{i}' for i in range(25)])], axis=1)
del train['impression_list'], train['impressions']

In [18]:
drop_cols = ['user_id', 'reference', 'current_filters', 'nimp', 'prices']
train.drop(drop_cols, axis=1, inplace=True)

In [19]:
train.head()

Unnamed: 0,session_id,timestamp,platform,city,device,target,loc0,loc1,loc2,loc3,...,loc15,loc16,loc17,loc18,loc19,loc20,loc21,loc22,loc23,loc24
0,00000510f1adc,1541064087,0,0,0,6,230851,577837,354263,116114,...,767989,293496,75489,283467,258794,152444,460390,784621,346735,600896
1,00003f3b20954,1541097696,1,1,1,3,508873,195895,767452,409961,...,667444,538758,198779,276203,461979,517860,67511,490102,316143,69213
2,000056cd97ce2,1541484101,2,2,0,0,129061,720609,525631,272581,...,537305,468663,259126,627054,162275,246478,248536,552162,259799,726311
3,000066611146f,1541332330,3,3,0,0,76750,758455,300388,258778,...,766471,341141,464033,220518,97897,664110,387991,329605,33961,306921
4,0000be39860d7,1541104090,4,4,0,19,192840,257898,97964,10645,...,401788,617082,183158,492387,521957,102948,464461,485634,710140,396897


In [20]:
# take the last 10% timestamp
train.sort_values(by=['timestamp'], inplace=True)
train.reset_index(drop=True, inplace=True)

In [21]:
# split_per = 0.1
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=2)

sids = train.session_id.values
target = train.target.values
del train['target'], train['timestamp'], train['session_id']


In [22]:
device = 'GPU' if check_gpu() else 'CPU'
params = {'iterations': 1000,
          'learning_rate': 0.02,
          'depth': 8,
          'task_type': device,
          'loss_function': 'MultiClass',
          'eval_metric': 'Accuracy'}

In [23]:
cat_fts = [f'loc{i}' for i in range(25)]
cat_fts += ['platform', 'city', 'device']

categorical_ind = [k for k, v in enumerate(train.columns) if v in cat_fts]

for trn_ind, val_ind in skf.split(target, target):
    x_trn, x_val = train.iloc[trn_ind].values, train.iloc[val_ind].values
    y_trn, y_val = target[trn_ind], target[val_ind]
    
    # train model
    clf = cat.CatBoostClassifier(**params)
    clf.fit(x_trn, y_trn,
            cat_features=categorical_ind,
            eval_set=(x_val, y_val),
            early_stopping_rounds=100,
            verbose=100,
            plot=False)
    print('Done!')
    print('Grab feature importance for both train and val')
    # get feature importance
    trn_imp = clf.get_feature_importance(data=cat.Pool(data=x_trn, cat_features=categorical_ind),
                                         prettified=True)
    val_imp = clf.get_feature_importance(data=cat.Pool(data=x_val, cat_features=categorical_ind),
                                         prettified=True)
    plot_imp(trn_imp, 'train')
    plot_imp(val_imp, 'val')
    print('Done feature imp')

#     # make prediction on validation set
#     val_pred = clf.predict_proba(xval.values)[:, 1]
#     logloss_i = log_loss(y_val, val_pred)
#     # compute roc auc
#     fpr, tpr, thresholds = roc_curve(y_val, val_pred, pos_label=1)
#     auc_i = auc(fpr, tpr)
#     # compute map
#     map_i = average_precision_score(y_val, val_pred)
#     print('logloss={0:.4f} | map={1:.4f} | auc={2:.4f}'.format(logloss_i, map_i, auc_i))

#     # mrr
#     print('reciproical rank for validation set')
#     xval['pred'] = val_pred
#     xval['target'] = y_val
#     val_rr = xval.groupby(level=0).apply(reciprocal_rank)
#     mrr = (1/val_rr[val_rr != 0]).mean()
#     print(f'Mean reciporical rank on validation set: {mrr:.4f}')


0:	learn: 0.3779731	test: 0.3779585	best: 0.3779585 (0)	total: 19s	remaining: 5h 16m 40s
100:	learn: 0.3779731	test: 0.3779585	best: 0.3779585 (0)	total: 2h 36m 6s	remaining: 23h 9m 27s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.3779584886
bestIteration = 0

Shrink model to first 1 iterations.
Done!
Grab feature importance for both train and val


NameError: name 'check_dir' is not defined