In [1]:
import gc

import catboost as cat
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import GroupKFold
from tqdm import tqdm

from clean_session import preprocess_sessions
from hotel2vec import hotel2vec
from manual_encoding import action_encoding, click_view_encoding, meta_encoding
from utils import load_data, check_gpu, check_dir

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
def plot_imp(data, fold_, plot_n=15):
    """Save a feature-importance table and a two-panel bar chart under ./imps.

    Args:
        data: Two-column records (feature, importance) accepted by
            ``pd.DataFrame.from_records``.
        fold_: Label used as the file stem for ``./imps/<fold_>.csv`` and
            ``./imps/<fold_>.png``.
        plot_n: Number of features shown in each panel.

    The top panel shows the ``plot_n`` largest importances, the bottom panel
    the ``plot_n`` smallest.
    """
    check_dir('./imps')
    importance = pd.DataFrame.from_records(data)
    # The CSV is written before the rename, so it keeps the incoming column labels.
    importance.to_csv(f'./imps/{fold_}.csv', index=False)
    importance.columns = ['features', 'feature_importance']

    fig, (top_ax, bottom_ax) = plt.subplots(figsize=(8, 8), nrows=2, ncols=1)
    for panel, ascending in ((top_ax, False), (bottom_ax, True)):
        ranked = importance.sort_values(by='feature_importance', ascending=ascending)
        ranked[:plot_n].plot(x='features', y='feature_importance', ax=panel, kind='barh', grid=True)
    plt.tight_layout()
    fig.savefig('./imps/{}.png'.format(fold_))

In [3]:
# Load the training sessions through the project's load_data helper.
# The trailing comment keeps the row-limited variant handy for quick experiments.
train = load_data('train')#, nrows=10000)

In [4]:
# Clean the session log; the captured output reports the frame size after each step.
# NOTE(review): `rd=True` presumably enables the initial duplicate removal shown in
# the log -- confirm against clean_session.preprocess_sessions.
train = preprocess_sessions(train, data_source='train', rd=True)

[>>>>>] Remove initial duplciates
Before dropping duplicates df shape: (15932992, 12)
After dropping duplicates df shape: (6683369, 12)
[>>>>>][te=0.51 mins] Cliping session dataframe up to last click out (if there is clickout)
[>>>>>][te=7.54 mins] filtering out sessions without clickouts, reference, or clickout is nan
train length before filtering: 5,764,987
train length after filtering: 5,200,153


In [5]:
# Collapse every session to one row.
# Caveat: GroupBy.last() returns the last *non-null* value of each column, which is
# the final row of the session only when that row has no missing values.
train_last = train.groupby('session_id').last().reset_index()
# The full session log is not used again in the cells below, so release it.
del train

In [6]:
# Columns that are not carried into the per-impression frame.
drop_cols = ['user_id', 'timestamp', 'current_filters']
train_last = train_last.drop(columns=drop_cols)

In [7]:
# Turn the '|'-delimited strings into Python lists, one list per session row.
for list_col in ('impressions', 'prices'):
    train_last[list_col] = train_last[list_col].str.split('|')

In [8]:
# Sanity check: number of missing values per column.
train_last.isna().sum()

session_id     0
step           0
action_type    0
reference      0
platform       0
city           0
device         0
impressions    0
prices         0
dtype: int64

In [9]:
# (rows, columns) of the one-row-per-session frame.
train_last.shape

(826842, 9)

In [10]:
# NOTE(review): scratch arithmetic; the meaning of 434 and 25 is not shown in this
# notebook -- name the operands or remove the cell.
434*25

10850

In [11]:
# train_last[[c for c in train_last.columns if c!= 'impressions']].values.shape

In [12]:
# # train_last.impressions
# vals = train_last.impressions.values.tolist()


In [13]:
def explode(df):
    """Expand each session row into one row per displayed impression.

    Args:
        df: DataFrame with list-valued ``impressions`` and ``prices`` columns
            plus ``reference`` and ``step`` columns. Every other column is
            repeated unchanged for each impression of its row.

    Returns:
        DataFrame with the carried columns followed by ``impression`` and
        ``price`` (cast to int), ``impression_loc`` (0-based position inside
        the list) and ``rel_impression_loc`` (position divided by the list
        length). ``reference`` and ``step`` are cast to int.

    Raises:
        ValueError: if any row has a different number of impressions and
            prices; pairing them positionally would otherwise shift prices
            onto the wrong items.
    """
    impressions = df['impressions'].tolist()
    prices = df['prices'].tolist()

    # Explicit integer dtype so that an empty frame still yields valid repeat counts.
    counts = np.fromiter((len(row) for row in impressions), dtype=int, count=len(impressions))
    price_counts = np.fromiter((len(row) for row in prices), dtype=int, count=len(prices))
    if not np.array_equal(counts, price_counts):
        raise ValueError('impressions and prices must have the same length in every row')

    # Position within each list, computed without building one small array per row:
    # global position minus the offset at which the row's list starts.
    total = int(counts.sum())
    starts = np.cumsum(counts) - counts
    inds = np.arange(total, dtype=int) - np.repeat(starts, counts)
    # Rows with an empty list contribute no elements, so the divisor is never zero.
    rel_inds = inds / np.repeat(counts, counts)

    # Repeat every carried column on its own so it keeps its dtype instead of
    # being coerced to object by stacking everything into one array.
    rest_cols = [c for c in df.columns if c not in ['impressions', 'prices']]
    data = {c: np.repeat(df[c].values, counts) for c in rest_cols}
    data['impression'] = np.array([v for row in impressions for v in row], dtype=object)
    data['price'] = np.array([v for row in prices for v in row], dtype=object)

    exploded = pd.DataFrame(data, columns=rest_cols + ['impression', 'price'])
    exploded['price'] = exploded['price'].astype(int)
    exploded['impression_loc'] = inds
    exploded['rel_impression_loc'] = rel_inds
    exploded['impression'] = exploded['impression'].astype(int)
    exploded['reference'] = exploded['reference'].astype(int)
    exploded['step'] = exploded['step'].astype(int)

    return exploded

# Build the per-impression frame, then drop the per-session frame and force a
# garbage-collection pass to release its memory.
df = explode(train_last)
del train_last
gc.collect()

59

In [14]:
# Preview the first rows of the exploded frame.
df.head()

Unnamed: 0,session_id,step,action_type,reference,platform,city,device,impression,price,impression_loc,rel_impression_loc
0,00000510f1adc,1,clickout item,7281198,IN,"Ganpatipule, India",desktop,2661832,46,0,0.0
1,00000510f1adc,1,clickout item,7281198,IN,"Ganpatipule, India",desktop,9222426,26,1,0.04
2,00000510f1adc,1,clickout item,7281198,IN,"Ganpatipule, India",desktop,7051844,16,2,0.08
3,00000510f1adc,1,clickout item,7281198,IN,"Ganpatipule, India",desktop,4079190,38,3,0.12
4,00000510f1adc,1,clickout item,7281198,IN,"Ganpatipule, India",desktop,5752778,12,4,0.16


In [15]:
def _join_on_impression(frame, encodings, key):
    """Join `encodings` (keyed by column `key`) onto `frame` by impression id.

    pd.merge defaults to an inner join, so impressions without a matching
    encoding row are dropped from the result.
    """
    joined = pd.merge(frame.set_index('impression'), encodings.set_index(key),
                      left_index=True, right_index=True)
    joined.index.name = 'impression'
    return joined.reset_index()


# 1) manual action encodings (keyed by 'reference')
ae = action_encoding()
ae_cols = [c for c in ae.columns if c != 'reference']
df = _join_on_impression(df, ae, 'reference')

# 2) hotel2vec embeddings
hv = hotel2vec()
hv_cols = [c for c in hv.columns if c != 'item_id']
df = _join_on_impression(df, hv, 'item_id')

# 3) click / view encodings
cv = click_view_encoding()
cv_cols = [c for c in cv.columns if c != 'item_id']
df = _join_on_impression(df, cv, 'item_id')

# 4) item metadata encodings
meta = meta_encoding()
meta_cols = [c for c in meta.columns if c != 'item_id']
df = _join_on_impression(df, meta, 'item_id')

# df.groupby('session_id')['price'].mean()

Load from exsiting file: ./cache/action_encodings.csv
[>>>>>] Load the existing hotel2vec model from ./cache/hotel_2vec/embeddings.csv
Load from exsiting file: ./cache/clickview_encodings.csv
Load from exsiting file: ./cache/meta_encodings.csv


In [16]:
# Group the per-impression rows by session; bound to a module-level name so later
# cells can reuse it.
grp = df.groupby('session_id')

In [17]:
# hv_cols

In [18]:
def compute_diff(df, cols):
    """Append, for each column in `cols`, its deviation from the session mean.

    Args:
        df: DataFrame containing a ``session_id`` column and every column
            named in `cols`.
        cols: Names of the numeric columns to centre within each session.

    Returns:
        A new DataFrame equal to `df` with one extra ``<col>_diff`` column per
        entry of `cols`, holding ``value - mean(value within the row's session)``.
    """
    cols = list(cols)
    # transform('mean') returns the session mean broadcast back onto the rows of
    # `df` in their original order, so every diff stays on the row it was computed
    # for. Grouping the frame that was passed in also removes the dependency on a
    # module-level groupby object.
    session_mean = df.groupby('session_id')[cols].transform('mean')
    diff = df[cols] - session_mean
    diff.columns = [f'{c}_diff' for c in cols]
    return pd.concat([df, diff], axis=1)

# Add within-session deviation features for the price and for each encoding family,
# in the same order as the column groups were created.
for col_group in (['price'], ae_cols, hv_cols, cv_cols, meta_cols):
    df = compute_diff(df, col_group)

In [None]:
# (rows, columns) after the encoding merges and the diff features.
df.shape

(16564452, 542)

In [None]:
# Persist the engineered frame so the expensive cells above need not be re-run.
# DataFrame.to_hdf requires both a target path and a key; calling it with no
# arguments raises TypeError. Writing HDF5 needs the optional PyTables package.
# NOTE(review): the file name and key are placeholders -- adjust to the project's
# cache layout. check_dir is called the same way plot_imp calls it before writing.
check_dir('./cache')
df.to_hdf('./cache/train_features.h5', key='train', mode='w')

In [None]:
# Preview the first rows of the feature frame.
df.head()

Unnamed: 0,impression,session_id,step,action_type,reference,platform,city,device,price,impression_loc,...,safe (hotel)_diff,satisfactory rating_diff,pousada (br)_diff,free wifi (rooms)_diff,guest house_diff,motel_diff,from 3 stars_diff,ironing board_diff,health retreat_diff,honeymoon_diff
0,5001,001373bf276d0,3,clickout item,2296490,DE,"Friedrichshafen, Germany",tablet,82,0,...,0.0,0.5,0.0,0.909091,-0.090909,0.0,0.863636,-0.045455,0.0,-0.090909
1,5001,02b8562f9c8c0,1,clickout item,140076,CH,"Lindau, Germany",desktop,84,16,...,0.0,0.5,0.0,-0.090909,-0.090909,0.0,-0.136364,-0.045455,0.0,0.909091
2,5001,0340fb41c4d2e,2,clickout item,148486,DE,"Friedrichshafen, Germany",tablet,103,2,...,0.0,0.5,0.0,-0.090909,-0.090909,0.0,-0.136364,-0.045455,0.0,0.909091
3,5001,0cb674014b4ea,1,clickout item,51598,DE,"Friedrichshafen, Germany",desktop,93,8,...,0.0,-0.5,0.0,-0.090909,-0.090909,0.0,-0.136364,-0.045455,0.0,-0.090909
4,5001,0f4184661756c,3,clickout item,432041,DE,"Friedrichshafen, Germany",mobile,94,18,...,0.0,0.5,0.0,-0.090909,0.909091,0.0,-0.136364,-0.045455,0.0,-0.090909


In [None]:
# drop_cols = ['user_id', 'timestamp', 'current_filters']
# df = df.drop(drop_cols, axis=1)

In [None]:
# create target
df['target'] = (df.reference == df.impression).astype(int)
del df['reference']

In [None]:
# Class balance of the label: count of rows per target value.
df['target'].value_counts()

0    15738285
1      826167
Name: target, dtype: int64

In [None]:
# NOTE(review): scratch arithmetic; the meaning of 8243 and 434 is not shown in this
# notebook -- name the operands or remove the cell.
8243/434

18.993087557603687

In [None]:
# Count missing values per column once (a full pass over the frame) and show only
# the columns that actually contain missing values.
na_counts = df.isna().sum()
na_counts[na_counts != 0]

In [None]:
# encode city, platform and device
def categorize(df, cols):
    """Replace the values of each column in `cols` with integer codes, in place.

    Codes are assigned in order of first appearance: the first distinct value
    of a column becomes 0, the next new value 1, and so on. `df` is modified
    directly and nothing is returned.
    """
    for col in cols:
        print('converting', col)
        levels = df[col].unique()
        lookup = dict(zip(levels, range(len(levels))))
        df[col] = df[col].map(lookup)
# Columns treated as categorical features; the same list is used later to tell
# CatBoost which columns are categorical. Note that the item id ('impression')
# is included alongside the context columns.
cat_fts = ['city', 'platform', 'device', 'action_type', 'impression']
# Encodes the columns in place.
categorize(df, cat_fts)

In [None]:
# List the columns that are still object dtype after the encoding step.
df.dtypes[df.dtypes=='O']

In [None]:
# split_per = 0.1
from sklearn.model_selection import StratifiedKFold

# Five-fold splitter used by the training cell.
skf = StratifiedKFold(n_splits=5)

# Keep the per-row session ids and labels as plain arrays; the label column is
# removed from the frame so that it cannot be used as a feature.
sids = df['session_id'].values
target = df.pop('target').values


In [None]:
# Use the GPU when the project's check_gpu helper returns a truthy value.
if check_gpu():
    device = 'GPU'
else:
    device = 'CPU'

# NOTE(review): the label built earlier is binary (0/1) but the loss is
# 'MultiClass' -- confirm this is intended rather than a binary loss.
params = dict(
    iterations=1000,
    learning_rate=0.02,
    depth=8,
    task_type=device,
    loss_function='MultiClass',
    eval_metric='Accuracy',
)

In [None]:
# Split by session, not by row. Stratifying individual rows on session_id puts
# rows of almost every session into both folds, so selecting the training rows
# with isin(train session ids) marks nearly everything as training data.
# GroupKFold keeps all rows of a session on the same side of the split.
group_kfold = GroupKFold(n_splits=skf.get_n_splits())

# Select the feature columns by position instead of deleting session_id from df,
# so the cell can be re-run and later folds still work.
feature_pos = [i for i, c in enumerate(df.columns) if c != 'session_id']

for trn_ind, val_ind in group_kfold.split(sids, target, groups=sids):
    x_trn, x_val = df.iloc[trn_ind, feature_pos], df.iloc[val_ind, feature_pos]
    y_trn, y_val = target[trn_ind], target[val_ind]

    categorical_ind = [k for k, v in enumerate(x_trn.columns) if v in cat_fts]

    # Pass DataFrames (wrapped in Pools) rather than `.values`: converting a frame
    # with both int and float columns to one ndarray upcasts the integer category
    # codes to float, which CatBoost does not accept for categorical features.
    # The same Pools are reused for the feature-importance calls below.
    trn_pool = cat.Pool(data=x_trn, label=y_trn, cat_features=categorical_ind)
    val_pool = cat.Pool(data=x_val, label=y_val, cat_features=categorical_ind)

    # train model
    clf = cat.CatBoostClassifier(**params)
    clf.fit(trn_pool,
            eval_set=val_pool,
            early_stopping_rounds=100,
            verbose=100,
            plot=False)
    print('Done!')
    print('Grab feature importance for both train and val')
    # get feature importance
    trn_imp = clf.get_feature_importance(data=trn_pool, prettified=True)
    val_imp = clf.get_feature_importance(data=val_pool, prettified=True)
    plot_imp(trn_imp, 'train')
    plot_imp(val_imp, 'val')
    print('Done feature imp')
    # Only the first fold is trained for now.
    break
#     # make prediction on validation set
#     val_pred = clf.predict_proba(xval.values)[:, 1]
#     logloss_i = log_loss(y_val, val_pred)
#     # compute roc auc
#     fpr, tpr, thresholds = roc_curve(y_val, val_pred, pos_label=1)
#     auc_i = auc(fpr, tpr)
#     # compute map
#     map_i = average_precision_score(y_val, val_pred)
#     print('logloss={0:.4f} | map={1:.4f} | auc={2:.4f}'.format(logloss_i, map_i, auc_i))

#     # mrr
#     print('reciproical rank for validation set')
#     xval['pred'] = val_pred
#     xval['target'] = y_val
#     val_rr = xval.groupby(level=0).apply(reciprocal_rank)
#     mrr = (1/val_rr[val_rr != 0]).mean()
#     print(f'Mean reciporical rank on validation set: {mrr:.4f}')
