In [2]:
from utils import load_data, check_gpu, check_dir
from clean_session import preprocess_sessions
import pandas as pd
import numpy as np
from tqdm import tqdm
import gc
from utils import meta_encoding
from functools import partial
import matplotlib.pyplot as plt
from clean_session import preprocess_sessions
from create_model_inputs import prepare_data, compute_session_fts, create_model_inputs
import datetime
%matplotlib inline
%load_ext autoreload
%autoreload 2
def fprint(df, name):
    print(f'{name} shape: ({df.shape[0]:,}, {df.shape[1]})')
import pprint
pp = pprint.PrettyPrinter(indent=1)

In [45]:
nrows = 1000000
train_raw = prepare_data('train', convert_action_type=False, nrows=nrows, select_cols=False, recompute=False)

[05-23 19:27:50 - utils-107 - load_data - INFO] Loading train using 1,000,000 rows (999,995 trimmed) which is 6.28% out of total train data
[05-23 19:27:50 - create_model_inputs-17 - flogger - INFO] raw train shape: (999,995, 12)
[05-23 19:27:50 - clean_session-56 - preprocess_sessions - INFO] Load from existing file: ./gbm_cache/preprocessed_train_1000000.snappy
[05-23 19:27:51 - create_model_inputs-68 - prepare_data - INFO] Sort df by user_id, session_id, timestamp, step
[05-23 19:27:51 - create_model_inputs-17 - flogger - INFO] Prepared train data shape: (326,695, 12)


In [10]:
# df = prepare_data('train', convert_action_type=False, nrows=nrows, recompute=False)
df = train_raw.copy()

[05-23 18:58:12 - utils-107 - load_data - INFO] Loading train using 1,000,000 rows (999,995 trimmed) which is 6.28% out of total train data
[05-23 18:58:12 - create_model_inputs-17 - flogger - INFO] raw train shape: (999,995, 12)
[05-23 18:58:12 - clean_session-56 - preprocess_sessions - INFO] Load from existing file: ./gbm_cache/preprocessed_train_1000000.snappy
[05-23 18:58:13 - create_model_inputs-67 - prepare_data - INFO] Sort df by user_id, session_id, timestamp, step
[05-23 18:58:13 - create_model_inputs-17 - flogger - INFO] Prepared train data shape: (326,695, 9)


In [11]:
df = df.groupby('session_id').last().reset_index()
df['imps'] = df['impressions'].str.split('|')
df['imps'] = df['imps'].apply(lambda x: [int(i) for i in x])
# df.loc[padding_mask, 'impressions'] = (df.loc[padding_mask, 'impressions']
#                                          .apply(lambda x: np.pad(x, (0, 25-len(x)),mode='constant')))
df['reference'] = df['reference'].astype(int)
# filter out nan rows with reference_id not in impressions list, since if the true target in test
# is not in the impression list then it would not get evaluated
def assign_target(row):
    ref = row['reference']
    imp = list(row['imps'])
    if ref in imp:
        return imp.index(ref)
    else:
        return np.nan

df['target'] = df.apply(assign_target, axis=1)
# drop the ones whose reference is not in the impression list
df = df[df['target'].notna()].reset_index(drop=True)
df['target'] = df['target'].astype(int)

In [None]:
meta_mapping = meta_encoding()

In [97]:
pd.options.display.max_colwidth = 1000
pd.options.display.max_columns = 1000

### take a look

In [36]:
display_cols = ['step', 'action_type', 'current_filters', 'reference', 'timestamp',
                'impressions', 'prices', 'target']
raw_display_cols = [c for c in train_raw.columns if c in display_cols]

In [172]:
sids = df.session_id.unique()

In [204]:
# select random session
rid = np.random.choice(sids, 1)[0]

In [205]:
# the train input
df_eg = df[df.session_id==rid][display_cols].reset_index(drop=True)
df_eg

Unnamed: 0,step,action_type,current_filters,reference,timestamp,impressions,prices,target
0,4,clickout item,Free WiFi (Combined)|Focus on Distance,1309013,2018-11-05 18:42:20,82017|1309013|1362762|5048896|86915|2188006|1832285|2443040|2826096|2137914|5041310|6457444|115743|1403243|129435|82010|126047|2738756|2730992|127694,280|168|129|152|181|192|125|142|151|188|58|160|211|179|161|1021|107|200|150|142,1


In [206]:
pp.pprint(df_eg.to_dict(orient='list'))

{'action_type': ['clickout item'],
 'current_filters': ['Free WiFi (Combined)|Focus on Distance'],
 'impressions': ['82017|1309013|1362762|5048896|86915|2188006|1832285|2443040|2826096|2137914|5041310|6457444|115743|1403243|129435|82010|126047|2738756|2730992|127694'],
 'prices': ['280|168|129|152|181|192|125|142|151|188|58|160|211|179|161|1021|107|200|150|142'],
 'reference': [1309013],
 'step': [4],
 'target': [1],
 'timestamp': [Timestamp('2018-11-05 18:42:20')]}


---
### raw

In [207]:
# the train input
raw_eg = train_raw[train_raw.session_id==rid][raw_display_cols+['city', 'platform', 'device']].reset_index(drop=True)
raw_eg

Unnamed: 0,timestamp,step,action_type,current_filters,reference,impressions,prices,city,platform,device
0,2018-11-05 18:39:48,1,search for destination,,"Marne-la-Vallée, France",,,"Marne-la-Vallée, France",BE,mobile
1,2018-11-05 18:40:39,2,search for destination,,"Benidorm, Spain",,,"Benidorm, Spain",BE,mobile
2,2018-11-05 18:41:25,3,filter selection,Free WiFi (Combined),Free WiFi (Combined),,,"Benidorm, Spain",BE,mobile
3,2018-11-05 18:42:20,4,clickout item,Free WiFi (Combined)|Focus on Distance,1309013,82017|1309013|1362762|5048896|86915|2188006|1832285|2443040|2826096|2137914|5041310|6457444|115743|1403243|129435|82010|126047|2738756|2730992|127694,280|168|129|152|181|192|125|142|151|188|58|160|211|179|161|1021|107|200|150|142,"Benidorm, Spain",BE,mobile


In [208]:
pp.pprint(raw_eg.to_dict(orient='list'))

{'action_type': ['search for destination',
                 'search for destination',
                 'filter selection',
                 'clickout item'],
 'city': ['Marne-la-Vallée, France',
          'Benidorm, Spain',
          'Benidorm, Spain',
          'Benidorm, Spain'],
 'current_filters': [None,
                     None,
                     'Free WiFi (Combined)',
                     'Free WiFi (Combined)|Focus on Distance'],
 'device': ['mobile', 'mobile', 'mobile', 'mobile'],
 'impressions': [None,
                 None,
                 None,
                 '82017|1309013|1362762|5048896|86915|2188006|1832285|2443040|2826096|2137914|5041310|6457444|115743|1403243|129435|82010|126047|2738756|2730992|127694'],
 'platform': ['BE', 'BE', 'BE', 'BE'],
 'prices': [None,
            None,
            None,
            '280|168|129|152|181|192|125|142|151|188|58|160|211|179|161|1021|107|200|150|142'],
 'reference': ['Marne-la-Vallée, France',
               'Benidorm, Spai

In [209]:
# meta_mapping.shape

In [210]:
refs = raw_eg.impressions.dropna().str.split('|')
refs = list([int(j) for i in refs for j in i])

In [211]:
refs_meta = meta_mapping[meta_mapping.item_id.isin(refs)]
s = refs_meta.sum(axis=0)
refs_meta_useful = refs_meta[s[s!=0].index].reset_index(drop=True)
ss = refs_meta_useful.sum(axis=0)
refs_meta_useful = refs_meta_useful[ss[ss!=24].index]
refs_meta_useful = refs_meta_useful.set_index('item_id')
refs_meta_useful = refs_meta_useful.reindex(refs)
refs_meta_useful

Unnamed: 0_level_0,on-site boutique shopping,free wifi (combined),from 4 stars,accessible parking,porter,conference rooms,fridge,central heating,swimming pool (outdoor),openable windows,balcony,hotel,fitness,cable tv,large groups,car park,steam room,from 3 stars,business centre,concierge,wheelchair accessible,surfing,satellite tv,tennis court (indoor),massage,sitting area (rooms),bowling,luxury hotel,free wifi (rooms),air conditioning,swimming pool (bar),good rating,electric kettle,shower,hairdresser,room service,airport shuttle,spa (wellness facility),swimming pool (indoor),solarium,honeymoon,computer with internet,terrace (hotel),bathtub,hairdryer,free wifi (public areas),non-smoking rooms,romantic,business hotel,family friendly,self catering,gay-friendly,wifi (rooms),bike rental,excellent rating,spa hotel,4 star,playground,house / apartment,hiking trail,washing machine,horse riding,design hotel,towels,hotel bar,microwave,singles,jacuzzi (hotel),direct beach access,desk,very good rating,organised activities,deck chairs,table tennis,television,safe (rooms),restaurant,tennis court,pool table,beach,cosmetic mirror,golf course,volleyball,sauna,telephone,lift,serviced apartment,flatscreen tv,fan,minigolf,boat rental,convenience store,all inclusive (upon inquiry),wifi (public areas),from 2 stars,sailing,satisfactory rating,gym,safe (hotel),sun umbrellas,express check-in / check-out,reception (24/7),diving,nightclub,convention hotel,3 star,ironing board,laundry service,pet friendly,swimming pool (combined filter),cot
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1
82017,1,1,0,0,0,1,0,1,1,1,0,1,1,0,1,1,0,1,1,1,1,1,1,0,0,1,0,1,1,1,0,0,1,0,0,0,0,0,1,0,0,1,1,1,0,1,0,1,1,1,0,0,1,1,0,0,0,1,0,1,0,1,0,0,1,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,0,1,1,0,1,1,0,1,0,1,0,0,1,1,1,1,0,1,1,0,0,1,1,0,0,1,0,1,0,1,1
1309013,0,1,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,1,0,0,1,0,0,1,1,1,1,0,0,0,0,0,0,1,1,1,0,1,0,0,1,1,1,0,1,1,0,0,0,0,0,1,1,0,0,0,0,1,1,0,0,1,0,0,1,0,1,1,0,1,1,1,0,0,0,0,0,1,1,0,0,0,1,0,0,1,1,1,1,0,0,1,0,1,1,0,0,0,0,0,1,1,1
1362762,0,1,0,0,0,0,1,0,1,1,1,0,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,1,1,0,0,1,0,0,1,1,1,0,1,0,0,0,1,0,1,1,0,0,1,0,0,0,0,0,0,0,1,0,1,1
5048896,0,1,0,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,1,1,1,0,0,0,0,0,0,0,0,0,0,1,1,1,0,1,0,1,0,1,0,1,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,1,0,0,0,0,1,0,0,1,0,0,0,1,1,0,0,0,0,0,0,1,0,1
86915,0,1,0,0,0,0,1,1,0,1,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,1,1,0,1,1,0,1,1,1,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,1,1,0,1,1,0,0,0,0,1,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,1,0,1,0,1,0,0,1,1,0,0,0,1,1,1,0,1,0,1,1,0,0,0,0,1,1,0,0,0,0,1,0,0,1
2188006,0,1,0,0,0,0,1,0,1,1,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,1,0,0,1,1,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,1,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,1,1,0,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0,0,1,1,0,0,1,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0
1832285,0,1,0,0,0,0,1,1,0,1,1,0,0,0,0,0,0,0,0,1,1,0,1,0,0,1,0,0,1,0,0,1,1,1,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,1,1,1,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,1,0,1,0,0,0,1,0,1,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,1,1,0,0
2443040,0,1,0,1,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,1,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,1,1,0,0,0,0,0,0,1,1,0,0,0,1,0,1,1,0,0,0,0,0,0,0,1,1
2826096,0,1,0,0,0,0,1,0,1,0,1,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,1,0,1,0,1,0,1,0,1,0,0,0,1,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,1,0,0,0,0,0,1,1,1
2137914,0,1,0,0,0,1,1,1,1,1,0,0,0,0,0,1,0,0,0,1,1,1,1,0,0,1,0,0,1,1,0,1,1,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,1,1,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,1,1,0,0,0,1,1,0,1,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,1,1,0,0,0,0,0,1,0,0,0,1,0,0,1,1


In [159]:
refs_meta_useful.columns[(refs_meta_useful.iloc[-12]-refs_meta_useful.iloc[2])>0]

Index(['conference rooms', 'room service (24/7)', 'from 3 stars',
       'business centre', 'wheelchair accessible', 'satellite tv',
       'sitting area (rooms)', 'room service', 'swimming pool (indoor)',
       'computer with internet', 'terrace (hotel)', 'romantic', 'bike rental',
       'spa hotel', 'playground', 'very good rating', 'deck chairs',
       'safe (rooms)', 'tennis court', 'pool table', 'golf course', 'sauna',
       'telephone', 'fan', 'radio', 'from 2 stars', 'gym', 'safe (hotel)',
       'laundry service', 'pet friendly', 'swimming pool (combined filter)'],
      dtype='object')

In [151]:
meta_mapping.head()

Unnamed: 0,item_id,on-site boutique shopping,free wifi (combined),from 4 stars,accessible parking,porter,conference rooms,theme hotel,fridge,room service (24/7),bungalows,central heating,swimming pool (outdoor),openable windows,balcony,hotel,fitness,cable tv,large groups,car park,steam room,resort,from 3 stars,water slide,senior travellers,2 star,business centre,concierge,hypoallergenic bedding,wheelchair accessible,farmstay,surfing,satellite tv,tennis court (indoor),pousada (br),adults only,massage,eco-friendly hotel,hypoallergenic rooms,sitting area (rooms),bowling,luxury hotel,free wifi (rooms),air conditioning,swimming pool (bar),good rating,electric kettle,beach bar,shower,hairdresser,bed & breakfast,room service,airport shuttle,spa (wellness facility),swimming pool (indoor),solarium,hydrotherapy,health retreat,honeymoon,computer with internet,terrace (hotel),bathtub,hairdryer,free wifi (public areas),non-smoking rooms,hammam,romantic,boutique hotel,business hotel,family friendly,kosher food,self catering,hostal (es),casino (hotel),gay-friendly,wifi (rooms),kids' club,bike rental,beauty salon,excellent rating,club hotel,spa hotel,4 star,playground,house / apartment,hiking trail,washing machine,horse riding,design hotel,childcare,towels,hotel bar,hot stone massage,microwave,halal food,singles,ski resort,jacuzzi (hotel),motel,direct beach access,desk,very good rating,organised activities,deck chairs,hostel,table tennis,country hotel,television,safe (rooms),restaurant,tennis court,pool table,beach,cosmetic mirror,airport hotel,golf course,volleyball,sauna,skiing,telephone,lift,serviced apartment,flatscreen tv,casa rural (es),shooting sports,fan,minigolf,boat rental,body treatments,convenience store,teleprinter,all inclusive (upon inquiry),radio,5 star,wifi (public areas),from 2 stars,sailing,satisfactory rating,gym,safe (hotel),sun umbrellas,express check-in / check-out,reception (24/7),szep kartya,diving,nightclub,1 star,convention hotel,3 star,guest house,ironing board,doctor on-site,laundry service,pet friendly,swimming pool (combined filter),camping site,accessible hotel,cot
0,5101,0,0,1,1,1,1,0,0,0,0,1,0,1,1,1,1,1,0,1,0,0,1,0,0,0,1,0,1,0,0,0,1,1,0,0,0,0,1,1,1,1,0,1,0,1,1,0,1,0,0,1,1,0,0,0,0,0,0,1,1,1,1,0,1,0,0,0,1,1,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,1,1,1,0,1,0,1,0,0,0,1,1,0,1,0,0,1,1,0,0,0,0,0,1,0,1,1,0,1,1,1,0,1,1,0,0,0,0,0,0,0,1,0,1,0,0,0,1,1
1,5416,0,1,1,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,1,0,1,1,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,1,1,1,1,1,1,0,0,0,1,1,0,0,0,0,0,1,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,1,1,0,1,0,0,0,0,0,0,0,0,0,1,0,1,1,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1
2,5834,0,1,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,1,1,1,0,0,0,1,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,1,1,0,1,0,0,0,1,1,0,0,0,0,0,0,1,1,0,1,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1
3,5910,0,1,1,0,0,1,0,0,0,0,1,0,1,0,1,0,1,0,1,0,0,1,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,1,1,1,0,1,1,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0
4,6066,0,1,1,1,0,1,0,0,0,0,1,1,1,1,1,1,1,0,1,1,0,1,0,0,0,1,0,1,1,0,1,1,1,0,0,1,0,1,1,1,1,1,0,0,1,0,0,1,0,0,1,0,1,1,1,0,0,1,1,1,1,1,1,1,1,1,0,1,1,0,0,0,0,0,1,0,1,1,0,0,1,1,1,0,0,0,0,0,0,1,1,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0,1,1,1,1,1,1,1,0,0,1,1,0,1,1,0,1,0,0,0,0,1,1,0,0,0,1,0,1,1,1,1,1,1,0,0,1,0,1,0,0,1,0,0,1,0,1,1,1,0,1,1


In [75]:
# dics = raw_eg.to_dict(orient='list')
# inds = []
# refs = dics['reference']
# for i in range(len(refs)):
#     imps = dics['impressions'][i].split('|')
#     if refs[i] in imps:
#         inds.append(imps.index(refs[i]))
#     else:
#         inds.append(-1)
# for k,v in enumerate(inds:
#     if i not != -1:
        

In [124]:
t1 = pd.read_csv('./data/train.csv', usecols=['impressions'])

In [125]:
t1 = t1.dropna()

In [128]:
t1_imps = list(set(np.concatenate(t1['impressions'].str.split('|').values)))

In [133]:
t1_imps = [int(i) for i in t1_imps]

In [134]:
meta_mapping.item_id.isin(t1_imps).sum()

852742

In [130]:
meta_mapping.shape

(927142, 158)

In [135]:
len(set(t1_imps) - set(meta_mapping.item_id.unique()))

798

In [136]:
len(t1_imps)

853540

In [137]:
798/853540

0.000934929821683811

In [53]:
train_raw['country'] = train_raw.city.str.extract(', (.+)')

In [55]:
train_raw['ps'] = train_raw.prices.str.split('|')

In [66]:
nona_price = train_raw.dropna(subset=['ps']).reset_index(drop=True)

In [68]:
nona_price['ps_avg'] = nona_price.ps.apply(lambda x: np.mean([int(i) for i in x]))

In [70]:
nona_price.groupby('country')['ps_avg'].mean()

country
Albania                      44.514286
Algeria                      79.506571
Andorra                     155.645227
Antigua and Barbuda         334.920000
Argentina                    76.644868
Armenia                      59.480000
Aruba                       247.728035
Australia                   158.054048
Austria                     196.096451
Azerbaijan                   66.899487
BES Islands                 115.520000
Bahamas                     274.195569
Bahrain                     130.617778
Bangladesh                  104.055714
Barbados                    360.847778
Belarus                      70.746422
Belgium                     129.410807
Belize                      115.721714
Benin                        48.300000
Bhutan                       71.029524
Bolivia                      51.216147
Bosnia and Herzegovina       45.344074
Botswana                    122.835833
Brazil                       82.710434
Bulgaria                     55.583195
Cambodia         

In [None]:
pd.options.display.max_colwidth = 100

In [None]:
one_rid = np.random.choice(oids, 1)[0]
print(one_rid)
df[df.session_id==one_rid][display_cols].reset_index(drop=True)

In [None]:
# df[df.session_id==one_rid][display_cols].reset_index(drop=True).to_dict(orient='list')

In [None]:
from colorama import Fore
dic = df[df.session_id==one_rid][display_cols].reset_index(drop=True).to_dict(orient='list')
prices = dic['prices'][0].split('|')
ref = str(dic['reference'][0])
imps = dic['impressions'][0].split('|')
# prices[imps.index(ref)] = f'{Fore.RED} {prices[imps.index(ref)]}' #{test_id}
prices[imps.index(ref)] = f'==={prices[imps.index(ref)]}===' #{test_id}
dic['prices'] = '|'.join(prices)
pp.pprint(dic)
# print(dic['prices'])

In [None]:
print(f'{Fore.RED}{prices[imps.index(ref)]}')

In [None]:
# # and the raw
# train[train.session_id==one_rid][raw_display_cols].reset_index(drop=True)

In [None]:
# pp.pprint(train[train.session_id==one_rid][raw_display_cols].reset_index(drop=True).to_dict(orient='list'))

---
### Longer

In [None]:
mid_rid = np.random.choice(mids, 1)[0]
print(mid_rid)
# df[df.session_id==mid_rid][display_cols].reset_index(drop=True)

In [None]:
np.sort([int(i) for i in df[df.session_id==mid_rid]['prices'].str.split('|').values[0]])

In [None]:
# pp.pprint(df[df.session_id==mid_rid][display_cols].reset_index(drop=True).to_dict(orient='list'))

In [None]:
# the raw
train[train.session_id==mid_rid][raw_display_cols].reset_index(drop=True)

In [None]:
pp.pprint(train[train.session_id==mid_rid][raw_display_cols].reset_index(drop=True).to_dict(orient='list'))

In [None]:
# dup_mask = train[[c for c in train.columns if c not in ['timestamp', 'step']]].duplicated(keep=False)
# dup_mask.sum()

In [None]:
# dup_cols = [c for c in train.columns if c not in ['timestamp', 'step']]
# # train[dup_cols].loc[train[dup_cols].shift(-1) != train[dup_cols]]
# train.groupby(dup_cols).apply(lambda x: )

In [None]:
train.shape

In [None]:
99508/train.shape[0]

### Look at test set
---

In [None]:
test = load_data('tedt')

---
### Click-encodings

In [None]:
from create_model_inputs import click_view_encoding
_ = click_view_encoding(sids=None, fold='testing', m=5)

In [None]:
cv_enc = pd.read_csv('./cache/foldtesting_clickview_encodings.csv')

In [None]:
cv_enc.shape

In [None]:
cv_enc['clicked'].hist(bins=100)

---
### clickouts

In [None]:
%%time
df = load_data('train', usecols=['session_id', 'action_type', 'impressions', 'reference'])

In [None]:
df.shape

In [None]:
df = df.drop_duplicates(subset=[c for c in df.columns if c != 'step']).reset_index(drop=True)

In [None]:
df.shape

In [None]:
%%time
# filter away sessions do not have clickout
def filter_away(df):
    return (df.action_type=='clickout item').sum() > 0
fids = df.groupby('session_id').apply(filter_away)

In [None]:
df_has = df[df.session_id.isin(fids[fids].index)]

In [None]:
df_has.shape

In [None]:
# number of clickout rows
(df_has.action_type == 'clickout item').sum()

In [None]:
last_df_has = df_has.groupby('session_id').last().reset_index()

In [None]:
last_df_has.shape

In [None]:
# average number of clickout per session
ctns = df_has[df_has.action_type=='clickout item'].groupby('session_id')['session_id'].size()

In [None]:
ctns.head()

In [None]:
s = df_has.groupby('session_id')['session_id'].size()

In [None]:
s.head()

In [None]:
# avg_clickout = ctns/s

In [None]:
# avg_clickout.hist(bins=100)

In [None]:
np.log(ctns).hist(bins=100)

In [None]:
ctn