In [1]:
import time
import pandas as pd
import numpy as np
import datetime
import os
import gc
from functools import partial
import matplotlib.pyplot as plt
from utils import load_data, get_logger, get_data_path
from clean_session import preprocess_sessions
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [7]:
from create_model_inputs import compute_session_fts, prepare_data, flogger

In [None]:
# Logger used by the cells below; it shares its name with the create_model_inputs module imported above.
logger = get_logger('create_model_inputs')
# NOTE(review): Filepath is not referenced by any later cell shown here — confirm it is still needed.
Filepath = get_data_path()


In [None]:
# Run configuration for this exploratory session.
mode='train'
# Row cap handed to prepare_data — presumably limits how many raw rows are read; TODO confirm.
nrows=1000000
logger.info(f'Prepare {mode} data')
# NOTE(review): t_init is captured but never read in the cells shown here.
t_init = time.time()
df = prepare_data(mode, convert_action_type=True, nrows=nrows, recompute=False)
# This message is logged here, but compute_session_fts itself is only called in a later cell.
logger.info('Compute session features')

In [None]:
# Flatten every '|'-separated price string in df into one list of ints and plot the distribution.
str_prices = np.concatenate(df['prices'].dropna().str.split('|').values)
str_prices = list(map(int, str_prices))
_ = plt.hist(str_prices)

In [None]:
%%time
# Rebinds df to the feature-augmented result, so re-running this cell feeds its own output back in.
df = compute_session_fts(df, mode)

In [None]:
# Same price-distribution check as before, repeated after compute_session_fts has run.
str_prices = np.concatenate(df['prices'].dropna().str.split('|').values)
str_prices = list(map(int, str_prices))
_ = plt.hist(str_prices)

In [None]:
logger.info('Only select last click-out from each session')
# NOTE(review): GroupBy.last() returns the last non-null value of each column within a group,
# not the last row as a unit, so a session whose final row contains nulls can yield a row
# stitched together from several steps. Confirm that is intended; groupby(...).tail(1) keeps
# whole rows instead.
df = df.groupby('session_id').last().reset_index()
flogger(df, 'df shape after only selecting last click-out row each session')

# Disabled experiment: log-transform of the session_size feature.
# # log-transform on session_size feature
# logger.info('Log-transform on session_size feature')
# df['session_id_size'] = np.log(df['session_id_size'])

# Disabled experiment: log1p-transform of timestamp_dwell_time_prior_clickout, clipping the upper bound at 1hr.
# # log1p-transform on timestamp_dwell_time_prior_clickout but will cliping upper to 1hr
# logger.info('Also log-transform on timestamp_dwell_time_prior_clickout but will cliping upper to 1hr')
# df['timestamp_dwell_time_prior_clickout'] = np.log1p(df['timestamp_dwell_time_prior_clickout'].clip(upper=60 ** 2))

In [None]:
# Price-distribution check once more, now on the one-row-per-session frame.
str_prices = np.concatenate(df['prices'].dropna().str.split('|').values)
str_prices = list(map(int, str_prices))
_ = plt.hist(str_prices)

In [None]:
# Turn each row's raw '|'-separated price string into a list of tokens ('prices') and a
# parallel list of ints ('prices_int'). Guarded so that
#   (a) re-running the cell leaves already-split lists untouched (Series.str.split would
#       turn non-string elements into NaN), and
#   (b) rows with a missing price string stay NaN instead of raising in the int conversion.
df['prices'] = df['prices'].apply(lambda raw: raw.split('|') if isinstance(raw, str) else raw)
df['prices_int'] = df['prices'].apply(
    lambda tokens: [int(p) for p in tokens]
    if isinstance(tokens, (list, tuple, np.ndarray)) else np.nan
)

In [None]:
# Same distribution check, now on the parsed integer lists in prices_int
# (the variable keeps the name str_prices although it no longer holds strings).
str_prices = np.concatenate(df.prices_int.dropna().values)
# pd.value_counts(str_prices)
_ = plt.hist(str_prices)

In [None]:
# Boolean mask of rows whose price list has fewer than 25 entries and therefore needs padding.
# Computed directly rather than through a temporary 'time_steps' column that was added to df
# and then dropped in place.
padding_mask = df['prices_int'].str.len() < 25

In [None]:
df.prices.head()

In [None]:
df.prices.iloc[0]

In [None]:
df[df.prices.str.len()<25].prices.iloc[0]

In [None]:
# Integer prices of the first row whose price list has fewer than 25 entries (scratch value).
x = list(map(int, df.loc[df['prices'].str.len() < 25, 'prices'].iloc[0]))
x

In [None]:
# Scratch check: right-pad x with zeros up to length 25 to preview the padded result.
np.pad(x, (0, 25-len(x)), mode='constant', constant_values=0)

In [None]:
type(np.nan)

In [None]:
# Right-pad every short price list to 25 entries with NaN. The values are cast to float first:
# np.pad keeps the dtype of its input, and NaN cannot be represented in an integer array.
df.loc[padding_mask, 'prices_int'] = df.loc[padding_mask, 'prices_int'].apply(
    lambda x: np.pad(np.asarray(x, dtype=float), (0, 25 - len(x)),
                     mode='constant', constant_values=np.nan))

In [None]:
# Re-plot the price distribution after padding.
# NOTE(review): padded rows now contribute NaN entries — confirm plt.hist treats them as intended.
str_prices = np.concatenate(df.prices_int.dropna().values)
# pd.value_counts(str_prices)
_ = plt.hist(str_prices)

In [None]:
def normalize(ps):
    """Scale a sequence of prices by its largest value.

    Parameters
    ----------
    ps : sequence
        Prices given as numbers or numeric strings. NaN entries (for example
        padding) are allowed.

    Returns
    -------
    numpy.ndarray
        Float array holding ``ps / max(ps)``. NaN entries stay NaN and are
        ignored when the maximum is determined. Empty input yields an empty array.
    """
    # Coerce to float so that lists of string tokens (e.g. '54') can be divided.
    p_arr = np.asarray(ps, dtype=float)
    if p_arr.size == 0:
        # Nothing to scale; a max over an empty array would raise.
        return p_arr
    # nanmax: a plain max would return NaN as soon as one padded entry is present.
    return p_arr / np.nanmax(p_arr)
# NOTE(review): at this point df['prices'] holds the '|'-split string tokens (set in an earlier
# cell), not numbers, so normalize receives lists of strings here — confirm whether
# 'prices_int' was the intended source column.
df['prices_percentage_int'] = df['prices'].apply(normalize)

In [4]:
# Round-trip check: exp(log(3)) comes back as 3.0000000000000004, i.e. not exactly 3.
np.exp(np.log(3))

3.0000000000000004

In [2]:
train = pd.read_parquet('./cache/train_inputs_no_imp.snappy')

In [3]:
train.head()

Unnamed: 0,step,timestamp_session_duration,timestamp_dwell_time_prior_clickout,session_id_size,nf,price_0,price_1,price_2,price_3,price_4,...,price_19,price_20,price_21,price_22,price_23,price_24,n_imps,target,pos,at
0,5,293.0,5.46806,1.609438,,0.164444,0.195556,0.193333,0.16,0.124444,...,0.195556,0.222222,0.246667,0.411111,0.16,0.082222,25,0,0.04,2.0
1,3,1.0,0.693147,0.693147,,0.327801,1.0,0.39834,0.211618,0.282158,...,0.248963,0.319502,0.190871,0.568465,0.33195,0.248963,25,19,0.8,1.0
2,18,197.0,3.828641,1.94591,,0.756494,0.313312,0.478896,0.537338,0.277597,...,0.548701,0.146104,0.068182,0.097403,0.068182,0.199675,25,18,0.68,1.0
3,6,149.0,3.044522,1.386294,,,,,,,...,,,,,,,14,13,0.52,1.0
4,1,,,0.0,,0.489899,0.510101,0.393939,0.378788,0.5,...,0.580808,0.39899,1.0,0.671717,0.570707,0.575758,25,0,,


In [5]:
# Map session_id_size back from log space (the cached column presumably stores log(count) —
# TODO confirm against the pipeline). exp(log(k)) is not exact in floating point
# (exp(log(3)) evaluates to 3.0000000000000004), so round to whole numbers to get clean
# group keys. NOTE: this overwrites the column in place, so run it only once per load.
train['session_id_size'] = np.exp(train['session_id_size']).round()

In [7]:
# Distribution of the target value within each session_id_size group.
train.groupby('session_id_size')['target'].value_counts()

session_id_size  target
1.0              0         7051
                 1         1203
                 2          703
                 3          539
                 4          493
                 5          361
                 6          263
                 7          253
                 8          229
                 9          211
                 10         207
                 11         176
                 12         153
                 13         137
                 17         116
                 14         112
                 15         102
                 18          99
                 19          97
                 24          86
                 16          84
                 23          83
                 20          66
                 22          62
                 21          57
2.0              0         4227
                 1         1024
                 2          674
                 3          531
                 4          443
                

In [2]:
# test data
test = pd.read_csv('./data/test.csv')

In [3]:
subs = pd.read_csv('./data/submission_popular.csv')

In [4]:
# Keep only the test rows whose session_id also appears in the baseline submission file.
test_sub = test[test.session_id.isin(subs.session_id.unique())].reset_index(drop=True)

In [5]:
# One row per session, built from the last non-null value of every column.
# NOTE(review): GroupBy.last() works per column, so a returned row can mix values from different
# steps; the preview below shows a 'clickout item' row carrying the reference 'Sassi di Matera',
# which looks like a value carried over from an earlier step — confirm this is acceptable.
last = test_sub.groupby('session_id').last().reset_index()

In [6]:
# test_sub.platform.value_counts()

In [7]:
last.head()

Unnamed: 0,session_id,user_id,timestamp,step,action_type,reference,platform,city,device,current_filters,impressions,prices
0,0000059a39020,P2L70S1Y60IF,1541719950,1,clickout item,,JP,"Sapporo, Japan",mobile,,2251200|924581|4775012|10090928|2282660|895299...,54|42|64|51|62|42|61|54|53|56|54|53|57|59|44|4...
1,0000b9394297b,SKDEGHT3I151,1541606244,2,clickout item,Sassi di Matera,AR,"Matera, Italy",mobile,,1044228|5670638|5875672|6033244|7917004|507857...,242|193|288|180|196|129|155|232|245|191|188|17...
2,0001650138d30,V3SQM6X57BNX,1541614783,1,clickout item,,CA,"Banff, Canada",mobile,,44394|44103|44247|44155|43972|44292|44404|4449...,259|220|99|210|283|252|152|192|179|213|199|175...
3,000177b850519,13UA01X5MOHZ,1541574580,4,clickout item,17011,TR,"Rome, Italy",desktop,,17011|16122|17403|17040|17266|17295|17535|1757...,875|726|216|396|263|174|182|264|327|265|364|27...
4,00017b3b2c136,K0LALBSISIAC,1541565393,11,clickout item,6698142,CO,"Piedecuesta, Colombia",mobile,,5723818|4341718|6298194|6698142|7952264|285957...,46|89|21|14|59|74|37|33|30|46|17|77|12|37|44|3...


In [8]:
# Split the '|'-separated impressions string into a list of tokens.
last['imps'] = last.impressions.str.split('|')

In [9]:
# Number of entries in each row's impressions list.
last['nimp'] = last.imps.str.len()

In [10]:
# Percentage of sessions represented by 866 rows.
# NOTE(review): 866 is hard-coded — presumably the number of sessions with nimp == 1; confirm
# and derive it from the data (e.g. (last.nimp == 1).sum()) so it cannot go stale.
866/last.shape[0]*100

0.34151901030472487

In [11]:
test_pred = pd.read_csv('./subs/lgb_sub_05-23.csv')

In [12]:
# Predictions restricted to sessions whose last row lists exactly one impression.
result = test_pred[test_pred.session_id.isin(last[last.nimp==1].session_id.unique())].reset_index(drop=True)

In [13]:
# First whitespace-separated token of every recommendation string, i.e. the top-ranked item.
first = [tokens[0] for tokens in result['item_recommendations'].str.split()]

In [14]:
result['f'] = first

In [15]:
merged = pd.merge(result, last[['session_id', 'impressions']], on='session_id')

In [16]:
merged.head()

Unnamed: 0,user_id,session_id,timestamp,step,item_recommendations,f,impressions
0,62CY1P9C44KR,002d39b5d542a,1541609027,8,2195066 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,2195066,2195066
1,TAUTLI293H5K,0056bf7fbecdb,1541619327,1,972673 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...,972673,972673
2,7X92EQ2HHCXC,007946c8ff58a,1541554751,3,2717657 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,2717657,2717657
3,E4EV8507MO7F,009a5cfefbb11,1541590580,7,3435244 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,3435244,3435244
4,CZ0NUCK7YXTM,00c2ab1749758,1541611769,2,6475490 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,6475490,6475490


In [17]:
# Number of single-impression sessions whose top recommendation differs from that one impression.
(merged['f'] != merged.impressions).sum()

0

In [24]:
# Count rows whose top recommendation is the '0' filler token. Column 'f' holds string tokens
# (it comes from str.split), so it has to be compared with the string '0'; a comparison with
# the integer 0 never matches and would always report 0.
(merged.f == '0').sum()

0