In [19]:
import time
import pandas as pd
import numpy as np
import datetime
import os
import gc
from functools import partial
import matplotlib.pyplot as plt
import shap
import catboost as cat 

from utils import load_data, get_logger, get_data_path
from clean_session import preprocess_sessions
%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
# Load SHAP's JavaScript into the notebook; the interactive force plot further down needs it.
shap.initjs()

In [None]:
# Cached parquet table (session features, judging by the file name).
# In the visible cells `fts` is only referenced from commented-out code.
fts = pd.read_parquet('./cache/train_session_fts.snappy')

In [61]:
# Load the first 5,000,000 rows of the raw training log.
train_raw = load_data('train', nrows=5000000)
# Convert epoch seconds to naive UTC timestamps in one vectorised call.
# This replaces a per-row apply of datetime.utcfromtimestamp, which is slow on
# millions of rows and deprecated since Python 3.12.
train_raw['timestamp'] = pd.to_datetime(train_raw['timestamp'], unit='s')


[05-20 10:39:20 - utils-106 - load_data - INFO] Loading train using 5,000,000 rows (4,999,976 trimmed) which is 31.38% out of total train data


In [3]:
# Cached model-input frame; later cells select its rows with the mask `train_sids == sid`.
train = pd.read_parquet('./cache/train_inputs.snappy')

In [5]:
# Session ids saved alongside the model inputs.
# NOTE(review): assumed to be row-aligned with `train`, since `train_sids == sid`
# is used below as a boolean row mask on `train` — confirm.
# sids = train.session_id.unique()
train_sids = np.load('./cache/train_session_ids.npy')
# train_input.sort_values('session_id').head()

In [7]:
from create_model_inputs import create_action_type_mapping
# Displays the action-type -> integer code mapping together with a count (10 in the
# captured output); per the log line it is loaded from ./cache/action_types_mapping.npy
# when that file already exists.
create_action_type_mapping()

[05-20 09:50:09 - create_model_inputs-25 - create_action_type_mapping - INFO] Load action_types mapping from existing: ./cache/action_types_mapping.npy


({'clickout item': 0,
  'search for poi': 1,
  'interaction item image': 2,
  'interaction item info': 3,
  'interaction item deals': 4,
  'search for destination': 5,
  'filter selection': 6,
  'interaction item rating': 7,
  'search for item': 8,
  'change of sort order': 9},
 10)

In [63]:
# Pick one session id at random to inspect.
# No seed is set in the visible cells, so a re-run selects a different session.
sid = np.random.choice(train_sids, 1)[0]
sid

'd1d8bc3a53a50'

### raw data

In [64]:
# Raw log rows that belong to the sampled session.
train_raw.loc[train_raw['session_id'] == sid]

Unnamed: 0,user_id,session_id,timestamp,step,action_type,reference,platform,city,device,current_filters,impressions,prices
676364,F4YD0BG4WI5P,d1d8bc3a53a50,2018-11-03 07:52:05,1,clickout item,28679,IT,"Montreux, Switzerland",mobile,,28689|28682|28693|14695|28679|28691|28694|2869...,247|168|140|319|109|293|166|157|311|409|281|44...
676365,F4YD0BG4WI5P,d1d8bc3a53a50,2018-11-03 07:54:58,2,clickout item,28679,IT,"Montreux, Switzerland",mobile,,28689|28682|28693|14695|28679|28691|28694|2869...,247|168|140|319|109|293|166|157|311|409|281|44...


In [65]:
# Same rows as a {column: [value per row]} dict, so long strings are shown untruncated.
train_raw.loc[train_raw['session_id'] == sid].to_dict(orient='list')

{'user_id': ['F4YD0BG4WI5P', 'F4YD0BG4WI5P'],
 'session_id': ['d1d8bc3a53a50', 'd1d8bc3a53a50'],
 'timestamp': [Timestamp('2018-11-03 07:52:05'),
  Timestamp('2018-11-03 07:54:58')],
 'step': [1, 2],
 'action_type': ['clickout item', 'clickout item'],
 'reference': ['28679', '28679'],
 'platform': ['IT', 'IT'],
 'city': ['Montreux, Switzerland', 'Montreux, Switzerland'],
 'device': ['mobile', 'mobile'],
 'current_filters': [nan, nan],
 'impressions': ['28689|28682|28693|14695|28679|28691|28694|28698|7941730|6622690|9144432|28695|28696|28688|8005226|9541130|9543588|10547466',
  '28689|28682|28693|14695|28679|28691|28694|28698|7941730|6622690|9144432|28695|28696|28688|8005226|9541130|9543588|10547466'],
 'prices': ['247|168|140|319|109|293|166|157|311|409|281|447|229|310|1051|282|783|197',
  '247|168|140|319|109|293|166|157|311|409|281|447|229|310|1051|282|783|197']}

In [66]:
# Load the trained CatBoost classifier from disk (the file name suggests CV fold 0).
clf = cat.CatBoostClassifier()  # constructor parameters are not required; the saved model supplies them
clf.load_model('./models/cv0.model')

<catboost.core.CatBoostClassifier at 0x7ff8b0553b38>

In [67]:
# Class probabilities for the rows of the sampled session:
# one output row per selected input row, one column per class (2 x 25 in the captured output).
pred = clf.predict_proba(train[train_sids==sid])
pred

array([[3.72124330e-01, 1.37851592e-01, 1.16590504e-01, 4.21556475e-02,
        9.75452636e-02, 2.35821845e-02, 4.05272898e-02, 3.38485214e-02,
        1.48334986e-02, 1.16292388e-02, 8.87330106e-03, 1.26075024e-02,
        1.14066102e-02, 9.33394663e-03, 5.99602651e-03, 2.37823694e-02,
        5.50599638e-03, 1.50237382e-02, 6.34455831e-03, 5.30412180e-03,
        1.44353356e-03, 1.05306035e-03, 1.67616513e-03, 7.66901457e-04,
        1.94099103e-04],
       [1.18523102e-01, 7.11242151e-02, 6.17304970e-02, 4.74739185e-02,
        3.45607008e-01, 5.49468908e-02, 6.41713192e-02, 6.08395054e-02,
        4.07144438e-02, 1.68898127e-02, 1.79735039e-02, 1.19222079e-02,
        2.93183716e-02, 1.30709265e-02, 6.02371400e-03, 1.27233484e-02,
        5.98957608e-03, 1.06736479e-02, 2.73441039e-03, 2.37785062e-03,
        1.52326820e-03, 1.52441650e-03, 1.00732995e-03, 7.15362173e-04,
        4.01353413e-04]])

In [68]:
# Class indices of each row ordered from most to least probable:
# ascending argsort along the class axis, then reversed.
np.argsort(pred, axis=1)[:, ::-1]

array([[ 0,  1,  2,  4,  3,  6,  7, 15,  5, 17,  8, 11,  9, 12, 13, 10,
        18, 14, 16, 19, 22, 20, 21, 23, 24],
       [ 4,  0,  1,  6,  2,  7,  5,  3,  8, 12, 10,  9, 13, 15, 11, 17,
        14, 16, 18, 19, 21, 20, 22, 23, 24]])

In [None]:
%%time
# Build a SHAP tree explainer around the loaded CatBoost model; %%time reports how long this takes.
explainer = shap.TreeExplainer(clf)

In [69]:
# SHAP values for the sampled session's rows (same row selection as the predict_proba call).
shap_values = explainer.shap_values(train[train_sids==sid])



In [71]:
# visualize the first prediction's explanation (use matplotlib=True to avoid Javascript)
ncls = 0  # class index: selects explainer.expected_value[ncls] and shap_values[ncls]
ninstance = 0  # row position within the sampled session's rows
shap.force_plot(explainer.expected_value[ncls], shap_values[ncls][ninstance],
                train[train_sids==sid].iloc[ninstance,:])

### train inputs

In [72]:
# Model inputs of the sampled session as a {column: [value per row]} dict.
train[train_sids==sid].to_dict(orient='list')

{'step': [1, 2],
 'n_imps': [18.0, 18.0],
 'n_cf': [0.0, 0.0],
 'session_size': [0.0, 0.6931471805599453],
 'session_duration': [0.0, 173.0],
 'last_duration': [0.0, 5.159055299214529],
 'ref_shift': [nan, 4.0],
 'at_shift': [nan, 0.0],
 'price_0': [0.23501427212178877, 0.23501427212178877],
 'price_1': [0.1598477640342531, 0.1598477640342531],
 'price_2': [0.13320647002854424, 0.13320647002854424],
 'price_3': [0.30352045670789723, 0.30352045670789723],
 'price_4': [0.10371075166508087, 0.10371075166508087],
 'price_5': [0.2787821122740247, 0.2787821122740247],
 'price_6': [0.15794481446241673, 0.15794481446241673],
 'price_7': [0.1493815413891532, 0.1493815413891532],
 'price_8': [0.29590865842055186, 0.29590865842055186],
 'price_9': [0.38915318744053284, 0.38915318744053284],
 'price_10': [0.26736441484300666, 0.26736441484300666],
 'price_11': [0.4253092293054234, 0.4253092293054234],
 'price_12': [0.21788772597526165, 0.21788772597526165],
 'price_13': [0.2949571836346337, 0.2949

In [17]:
# NOTE(review): scratch arithmetic — what 152 and 578 count is not recorded in the visible cells.
152/578

0.2629757785467128

In [None]:
# fts[fts.session_id==sid].to_dict(orient='list')

In [None]:
# fts[fts.session_id==sid][['ref_shift', 'at_shift']].values

In [None]:
# train_input[train_input.session_id==sid].impressions_natural.values

In [None]:
# train_raw[train_raw.session_id==sid]

In [None]:
import pandas as pd  # already imported in the first cell; the re-import is harmless
# Load the submission stored at ./subs/sub.csv.
sub_original = pd.read_csv('./subs/sub.csv')

In [None]:
# Load the newer submission (05-19, going by the file name).
sub_new = pd.read_csv('./subs/sub_05-19.csv')

In [None]:
# First rows of the original submission.
sub_original.head()

In [None]:
# First ten rows of the newer submission.
sub_new.head(10)

In [None]:
# Raw item_recommendations value of the row at position 5 of the newer submission.
sub_new.iloc[5].item_recommendations

In [None]:
# (rows, columns) of the original submission.
sub_original.shape

In [None]:
# Load ./subs/test_sub.csv; the next cell reads its `impressions` column.
test_sub = pd.read_csv('./subs/test_sub.csv')

In [None]:
# Raw impressions value of the first row.
test_sub.impressions.iloc[0]

In [None]:
# Parse each recommendation string into a list of integer item ids.
# The string must be split into tokens first: iterating the raw string yields single
# characters, so int() would fail on the separators and break multi-digit ids apart.
# NOTE(review): assumes ids are whitespace-separated in the CSV — confirm against the file.
# Values that are already lists are passed through, so the cell can be re-run safely.
sub_original['item_recommendations'] = sub_original.item_recommendations.apply(
    lambda xs: [int(x) for x in (xs.split() if isinstance(xs, str) else xs)]
)

In [None]:
# Distribution of the last price slot.
# `train_input` is never defined in the visible cells (it only appears in commented-out
# code); the model inputs are loaded as `train`, so use that frame to survive Restart & Run All.
train.price_24.hist()

In [None]:
# Summary statistics of the 25 price slots.
# `train_input` is never defined in the visible cells; the model inputs are loaded as `train`.
train[[f'price_{i}' for i in range(25)]].describe()

In [None]:
from create_model_inputs import compute_session_fts, prepare_data, flogger

In [None]:
# Logger used for the progress messages in the cells below.
logger = get_logger('create_model_inputs')
# Presumably the data directory path; not referenced again in the visible cells.
Filepath = get_data_path()


In [None]:
# Prepare the training data step by step so intermediate frames can be inspected.
mode='train'
nrows=1000000  # row cap passed to prepare_data
logger.info(f'Prepare {mode} data')
t_init = time.time()  # start time; not read again in the visible cells
df = prepare_data(mode, convert_action_type=True, nrows=nrows, recompute=False)
logger.info('Compute session features')

In [None]:
# Flatten every '|'-separated price string into one list of ints and plot the distribution.
str_prices = [
    int(sp)
    for sp in np.concatenate(df['prices'].dropna().str.split('|').values)
]
# pd.value_counts(str_prices)
_ = plt.hist(str_prices)

In [None]:
%%time
# Replace df with the output of compute_session_fts; %%time reports the cost.
# Re-running this cell feeds the already-processed frame back in.
df = compute_session_fts(df, mode)

In [None]:
# Price distribution again, now on the frame returned by compute_session_fts.
str_prices = [
    int(sp)
    for sp in np.concatenate(df['prices'].dropna().str.split('|').values)
]
# pd.value_counts(str_prices)
_ = plt.hist(str_prices)

In [None]:
logger.info('Only select last click-out from each session')
# NOTE(review): GroupBy.last() returns the last non-null value of each column per group,
# not the last row as a whole, so a result row can combine values from different rows
# when a column contains NaN (earlier cells call .dropna() on `prices`). Confirm this
# is the intended selection.
df = df.groupby('session_id').last().reset_index()
flogger(df, 'df shape after only selecting last click-out row each session')

# # log-transform on session_size feature
# logger.info('Log-transform on session_size feature')
# df['session_id_size'] = np.log(df['session_id_size'])

# # log1p-transform on timestamp_dwell_time_prior_clickout but will cliping upper to 1hr
# logger.info('Also log-transform on timestamp_dwell_time_prior_clickout but will cliping upper to 1hr')
# df['timestamp_dwell_time_prior_clickout'] = np.log1p(df['timestamp_dwell_time_prior_clickout'].clip(upper=60 ** 2))

In [None]:
# Price distribution once more, after reducing df to one row per session.
str_prices = [
    int(sp)
    for sp in np.concatenate(df['prices'].dropna().str.split('|').values)
]
# pd.value_counts(str_prices)
_ = plt.hist(str_prices)

In [None]:
# Split each '|'-separated price string into a list of strings.
# This overwrites `prices` in place, so re-running the cell on the already-split
# column will not reproduce the same result.
df['prices'] = df['prices'].str.split('|')
# Integer version of the same lists.
df['prices_int'] = df['prices'].apply(lambda x: [int(p) for p in x])

In [None]:
# Histogram of all prices, built from the integer lists in `prices_int`.
str_prices = np.concatenate(df.prices_int.dropna().values)
# pd.value_counts(str_prices)
_ = plt.hist(str_prices)

In [None]:
# Rows whose price list holds fewer than 25 entries and therefore needs padding.
# The mask is computed directly, without writing a scratch `time_steps` column and
# dropping it again with inplace=True; df is left untouched and the cell is re-runnable.
padding_mask = df['prices_int'].str.len() < 25

In [None]:
# Peek at the first values of the prices column.
df.prices.head()

In [None]:
# Full prices value of the first row.
df.prices.iloc[0]

In [None]:
# First prices entry that has fewer than 25 elements.
df[df.prices.str.len()<25].prices.iloc[0]

In [None]:
# Integer copy of the first short price list, used to try out the np.pad call in the next cell.
x = [int(i) for i in df[df.prices.str.len()<25].prices.iloc[0]]
x

In [None]:
# Right-pad x with zeros up to length 25.
np.pad(x, (0, 25-len(x)), mode='constant', constant_values=0)

In [None]:
# np.nan is a Python float, so it cannot be stored in an integer array.
type(np.nan)

In [None]:
# Right-pad the short price lists to length 25 with NaN.
# The list is cast to float first: np.pad keeps the input dtype, and NaN (a float)
# cannot be written into the integer array it would otherwise build.
df.loc[padding_mask, 'prices_int'] = df.loc[padding_mask, 'prices_int'].apply(
    lambda x: np.pad(np.asarray(x, dtype=float), (0, 25 - len(x)),
                     mode='constant',
                     constant_values=np.nan))

In [None]:
# Histogram of all values in `prices_int`, re-drawn after the padding step.
str_prices = np.concatenate(df.prices_int.dropna().values)
# pd.value_counts(str_prices)
_ = plt.hist(str_prices)

In [None]:
def normalize(ps):
    """Scale a sequence of prices by its largest value.

    Parameters
    ----------
    ps : sequence
        Prices as numbers or numeric strings (the `prices` column holds lists
        of strings after the split step). NaN entries are allowed.

    Returns
    -------
    numpy.ndarray
        Float array with each price divided by the largest non-NaN price.
        NaN entries stay NaN; an empty input returns an empty array.
    """
    # Cast explicitly: dividing an array of strings raises a TypeError.
    p_arr = np.asarray(ps, dtype=float)
    if p_arr.size == 0:
        return p_arr
    # nanmax ignores NaN padding, which would otherwise turn the whole result into NaN.
    return p_arr / np.nanmax(p_arr)
# Per row: divide the price list by its own maximum (see normalize).
df['prices_percentage_int'] = df['prices'].apply(normalize)