In [13]:
from utils import load_data, check_gpu, check_dir

import pandas as pd
import numpy as np
from tqdm import tqdm
import gc
from utils import meta_encoding
from functools import partial
import matplotlib.pyplot as plt
from scipy.stats import rankdata
import datetime
pd.options.display.max_colwidth = 1000
pd.options.display.max_columns = 1000

%matplotlib inline
%load_ext autoreload
%autoreload 2
def fprint(df, name):
    """Print ``name`` together with the row and column counts of ``df``.

    The row count is rendered with thousands separators, e.g.
    ``train shape: (1,000,000, 12)``. ``df`` only needs a two-element
    ``shape`` attribute.
    """
    n_rows, n_cols = df.shape[0], df.shape[1]
    print(f'{name} shape: ({n_rows:,}, {n_cols})')
import pprint
# Shared pretty-printer used later to dump a model-input row as a dict.
pp = pprint.PrettyPrinter(indent=1)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Load the raw training log, limited to 1,000,000 rows via `nrows`.
train_raw = load_data('train', nrows=1000000)
# Convert POSIX timestamps (seconds) to naive UTC datetimes with one vectorized
# call instead of a per-row Python lambda. This also avoids
# datetime.utcfromtimestamp, which is deprecated since Python 3.12.
train_raw['timestamp'] = pd.to_datetime(train_raw['timestamp'], unit='s')


[06-13 15:32:34 - utils-113 - load_data - INFO] Loading train using 1,000,000 rows (999,995 trimmed) which is 6.28% out of total train data


In [3]:
# train_ids = train_raw.session_id.unique()

### Check clean data

In [None]:
# Load the cached preprocessing output for the 1,000,000-row train sample
# (the "no_test_added" variant, per the file name).
train_cleaned = pd.read_parquet('gbm_cache/preprocess_train_1000000_no_test_added.snappy')

In [None]:
# Pick one random session to compare its raw rows against its cleaned rows.
# `train_ids` is only assigned in the feature-generation section further down,
# so sample straight from the raw frame to keep this cell runnable on a fresh
# top-to-bottom execution.
rid = np.random.choice(train_raw.session_id.unique(), 1)[0]
rid

In [None]:
# Raw rows of the sampled session (first five).
train_raw[train_raw.session_id==rid].head()

In [None]:
# Cleaned rows of the same session (first five), for side-by-side comparison.
train_cleaned[train_cleaned.session_id==rid].head()

In [None]:
# Peek at how action_type is represented after cleaning.
train_cleaned.action_type.head()

In [None]:
# Sessions of the cleaned data that have at least one non-null current_filters value.
has_filters = train_cleaned.current_filters.notna()
cfs_ids = train_cleaned.loc[has_filters, 'session_id'].unique()

In [None]:
# Show all cleaned rows of one randomly drawn session that has current_filters set.
train_cleaned[train_cleaned.session_id==np.random.choice(cfs_ids, 1)[0]]

### Check feature generation

In [51]:
# Load the cached model inputs, then drop every current_filters-derived column for now.
train_inputs = pd.read_parquet('./gbm_cache/train_inputs_1000000_test_added.snappy')
cf_cols = [col for col in train_inputs.columns if 'current_filters' in col]
drop_cols = cf_cols  # + ['country', 'platform']
# Rebind instead of mutating in place; the resulting frame is the same.
train_inputs = train_inputs.drop(columns=drop_cols)

In [5]:
# Distinct session ids present in the generated model inputs.
train_ids = train_inputs.session_id.unique()

In [6]:
# Draw one random session to inspect. No seed is set, so every run picks a different one.
# NOTE: the hand-computed checks further down hard-code the prices and impressions
# of session 'c3c1d9f0cfc4e'; a different draw will not match them.
rid = np.random.choice(train_ids, 1)[0]
rid

'c3c1d9f0cfc4e'

In [7]:
# Raw log rows of the sampled session, re-indexed from 0 for display.
is_sampled_raw = train_raw.session_id == rid
raw_demo = train_raw.loc[is_sampled_raw].reset_index(drop=True)
raw_demo

Unnamed: 0,user_id,session_id,timestamp,step,action_type,reference,platform,city,device,current_filters,impressions,prices
0,CEGFN4D49WZI,c3c1d9f0cfc4e,2018-11-06 19:35:56,1,interaction item image,52419,DE,"Erfurt, Germany",desktop,,,
1,CEGFN4D49WZI,c3c1d9f0cfc4e,2018-11-06 19:35:56,2,interaction item image,52419,DE,"Erfurt, Germany",desktop,,,
2,CEGFN4D49WZI,c3c1d9f0cfc4e,2018-11-06 19:36:43,3,clickout item,52419,DE,"Erfurt, Germany",desktop,,14910|52419|53438|15918|15402|128443|772473|626051|158537|1193842|14824|5179324,220|193|310|265|162|119|99|43|51|83|96|65
3,CEGFN4D49WZI,c3c1d9f0cfc4e,2018-11-06 19:54:24,4,interaction item image,52419,DE,"Erfurt, Germany",desktop,,,
4,CEGFN4D49WZI,c3c1d9f0cfc4e,2018-11-06 19:54:37,5,interaction item image,52419,DE,"Erfurt, Germany",desktop,,,
5,CEGFN4D49WZI,c3c1d9f0cfc4e,2018-11-06 19:55:30,6,interaction item image,52419,DE,"Erfurt, Germany",desktop,,,
6,CEGFN4D49WZI,c3c1d9f0cfc4e,2018-11-06 19:55:31,7,interaction item image,52419,DE,"Erfurt, Germany",desktop,,,


Check the model inputs generated for the same session:

In [52]:
# Generated model-input rows of the sampled session, re-indexed from 0 for display.
is_sampled_input = train_inputs.session_id == rid
input_demo = train_inputs.loc[is_sampled_input].reset_index(drop=True)
input_demo

Unnamed: 0,session_id,step,device,imp_changed,session_size,session_duration,last_duration,co,search,inter,fs,cs,last_reference_relative_loc,mean_rating,median_rating,mean_star,median_star,n_imps,target,mean_price,median_price,prices_0,prices_1,prices_2,prices_3,prices_4,prices_5,prices_6,prices_7,prices_8,prices_9,prices_10,prices_11,prices_12,prices_13,prices_14,prices_15,prices_16,prices_17,prices_18,prices_19,prices_20,prices_21,prices_22,prices_23,prices_24,prices_rank_0,prices_rank_1,prices_rank_2,prices_rank_3,prices_rank_4,prices_rank_5,prices_rank_6,prices_rank_7,prices_rank_8,prices_rank_9,prices_rank_10,prices_rank_11,prices_rank_12,prices_rank_13,prices_rank_14,prices_rank_15,prices_rank_16,prices_rank_17,prices_rank_18,prices_rank_19,prices_rank_20,prices_rank_21,prices_rank_22,prices_rank_23,prices_rank_24,prev_click_0,prev_click_1,prev_click_2,prev_click_3,prev_click_4,prev_click_5,prev_click_6,prev_click_7,prev_click_8,prev_click_9,prev_click_10,prev_click_11,prev_click_12,prev_click_13,prev_click_14,prev_click_15,prev_click_16,prev_click_17,prev_click_18,prev_click_19,prev_click_20,prev_click_21,prev_click_22,prev_click_23,prev_click_24,prev_interact_0,prev_interact_1,prev_interact_2,prev_interact_3,prev_interact_4,prev_interact_5,prev_interact_6,prev_interact_7,prev_interact_8,prev_interact_9,prev_interact_10,prev_interact_11,prev_interact_12,prev_interact_13,prev_interact_14,prev_interact_15,prev_interact_16,prev_interact_17,prev_interact_18,prev_interact_19,prev_interact_20,prev_interact_21,prev_interact_22,prev_interact_23,prev_interact_24,half_prices_rank_0,half_prices_rank_1,half_prices_rank_2,half_prices_rank_3,half_prices_rank_4,half_prices_rank_5,half_prices_rank_6,half_prices_rank_7,half_prices_rank_8,half_prices_rank_9,half_prices_rank_10,half_prices_rank_11,ratings_0,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,ratings_6,ratings_7,ratings_8,ratings_9,ratings_10,ratings_11,ratings_12,ratings_13,ratings_14,ratings_15,ratings_16,ratings
_17,ratings_18,ratings_19,ratings_20,ratings_21,ratings_22,ratings_23,ratings_24,stars_0,stars_1,stars_2,stars_3,stars_4,stars_5,stars_6,stars_7,stars_8,stars_9,stars_10,stars_11,stars_12,stars_13,stars_14,stars_15,stars_16,stars_17,stars_18,stars_19,stars_20,stars_21,stars_22,stars_23,stars_24
0,c3c1d9f0cfc4e,3.0,1,,2,47.0,47.0,0.0,0.0,1.0,33,8,0.08,1.916667,2.0,2.75,3.5,12,1,142.166667,109.0,220.0,193.0,310.0,265.0,162.0,119.0,99.0,43.0,51.0,83.0,96.0,65.0,,,,,,,,,,,,,,0.833333,0.75,1.0,0.916667,0.666667,0.583333,0.5,0.083333,0.166667,0.333333,0.416667,0.25,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.833333,0.75,1.0,0.916667,0.666667,0.583333,0.5,0.083333,0.166667,0.333333,0.416667,0.25,0.4,1.0,1.0,0.4,0.8,0.6,0.2,0.2,0.6,0.8,0.8,0.2,,,,,,,,,,,,,,1.0,0.8,1.0,1.0,1.0,1.0,0.2,0.4,0.6,0.8,1.0,0.2,,,,,,,,,,,,,


In [14]:
# Prices of the sampled session's clickout (copied from raw_demo), dense-ranked
# by hand for comparison with the prices_rank_* features.
ps = list(map(float, '220|193|310|265|162|119|99|43|51|83|96|65'.split('|')))
rankdata(ps, method='dense')

array([10,  9, 12, 11,  8,  7,  6,  1,  2,  4,  5,  3])

In [23]:
# Mean and median of the session's prices (nan-aware and plain variants), to
# compare with mean_price / median_price in input_demo.
np.nanmean(ps), np.nanmedian(ps), np.mean(ps), np.median(ps)

(142.16666666666666, 109.0, 142.16666666666666, 109.0)

In [16]:
# Candidate value for last_reference_relative_loc using n_imps = 12 as the
# denominator; the stored feature is 0.08, so this does not match.
# NOTE(review): presumably what this scratch cell was testing - confirm.
1/12

0.08333333333333333

In [19]:
# 2/25 reproduces last_reference_relative_loc (0.08): the last reference 52419 is
# 2nd in the impression list, and the position appears to be normalised by 25
# slots rather than by n_imps = 12 - TODO confirm this is intended.
2/25

0.08

In [53]:
# Dump the model-input row as {column: [values]} so all features can be read vertically.
pp.pprint(input_demo.to_dict(orient='list'))

{'co': [0.0],
 'cs': [8],
 'device': [1],
 'fs': [33],
 'half_prices_rank_0': [0.8333333333333334],
 'half_prices_rank_1': [0.75],
 'half_prices_rank_10': [0.4166666666666667],
 'half_prices_rank_11': [0.25],
 'half_prices_rank_2': [1.0],
 'half_prices_rank_3': [0.9166666666666666],
 'half_prices_rank_4': [0.6666666666666666],
 'half_prices_rank_5': [0.5833333333333334],
 'half_prices_rank_6': [0.5],
 'half_prices_rank_7': [0.08333333333333333],
 'half_prices_rank_8': [0.16666666666666666],
 'half_prices_rank_9': [0.3333333333333333],
 'imp_changed': [nan],
 'inter': [1.0],
 'last_duration': [47.0],
 'last_reference_relative_loc': [0.08],
 'mean_price': [142.16666666666666],
 'mean_rating': [1.9166666666666667],
 'mean_star': [2.75],
 'median_price': [109.0],
 'median_rating': [2.0],
 'median_star': [3.5],
 'n_imps': [12],
 'prev_click_0': [0.0],
 'prev_click_1': [0.0],
 'prev_click_10': [0.0],
 'prev_click_11': [0.0],
 'prev_click_12': [0.0],
 'prev_click_13': [0.0],
 'prev_click_14':

In [24]:
# Load the item-metadata encoding (the log below shows it is read from a cached CSV).
meta_mapping = meta_encoding()

[06-13 15:48:55 - utils-150 - meta_encoding - INFO] Load from existing file: ./gbm_cache/meta_encodings.csv


In [31]:
# Impression list of session c3c1d9f0cfc4e (copied from raw_demo), parsed to integer item ids.
refs = '14910|52419|53438|15918|15402|128443|772473|626051|158537|1193842|14824|5179324'
refs = list(map(int, refs.split('|')))

In [33]:
# Metadata rows of the items shown in the session's impression list.
refs_meta = meta_mapping[meta_mapping.item_id.isin(refs)]

# Keep only the columns whose column sum is non-zero.
s = refs_meta.sum(axis=0)
refs_meta_useful = refs_meta.loc[:, s != 0].reset_index(drop=True)

# Drop columns whose sum equals 24.
# NOTE(review): the meaning of 24 is not evident from this notebook - confirm.
ss = refs_meta_useful.sum(axis=0)
refs_meta_useful = refs_meta_useful.loc[:, ss != 24]

# Index by item and order the rows like the impression list.
refs_meta_useful = refs_meta_useful.set_index('item_id').reindex(refs)

# Show only the star / rating flags that survived the column filters.
cols = ['1 star', '2 star', '3 star', '4 star', '5 star',
        'excellent rating', 'very good rating', 'good rating', 'satisfactory rating']
refs_meta_useful.loc[:, [c for c in cols if c in refs_meta_useful.columns]].loc[refs]

Unnamed: 0_level_0,1 star,2 star,3 star,4 star,excellent rating,very good rating,good rating,satisfactory rating
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
14910,0,0,0,1,0,0,0,1
52419,0,0,1,0,1,1,1,1
53438,0,0,0,1,1,1,1,1
15918,0,0,0,1,0,0,0,1
15402,0,0,0,1,0,1,1,1
128443,0,0,0,1,0,0,1,1
772473,0,0,0,0,0,0,0,0
626051,1,0,0,0,0,0,0,0
158537,0,1,0,0,0,0,1,1
1193842,0,0,1,0,0,1,1,1


In [54]:
# Hand-computed star and rating averages over the 12 impressions, to compare with
# mean_star (2.75) and mean_rating (1.9167) in input_demo.
# NOTE(review): the addends are presumably read off the star/rating table above - confirm.
(6*4+3*2+2+1)/12, (1+4+4+1+3+2+0+0+2+3+3)/12

(2.75, 1.9166666666666667)

### check the meta properties

In [None]:
# Load the item-metadata encoding (duplicates the call made earlier in the notebook).
meta_mapping = meta_encoding()

In [25]:
# Impression list of session c3c1d9f0cfc4e (copied from raw_demo), parsed to integer item ids.
refs = '14910|52419|53438|15918|15402|128443|772473|626051|158537|1193842|14824|5179324'
refs = list(map(int, refs.split('|')))

In [26]:
# Metadata rows of the items shown in the session's impression list.
refs_meta = meta_mapping[meta_mapping.item_id.isin(refs)]

# Keep only the columns whose column sum is non-zero.
s = refs_meta.sum(axis=0)
refs_meta_useful = refs_meta.loc[:, s != 0].reset_index(drop=True)

# Drop columns whose sum equals 24.
# NOTE(review): the meaning of 24 is not evident from this notebook - confirm.
ss = refs_meta_useful.sum(axis=0)
refs_meta_useful = refs_meta_useful.loc[:, ss != 24]

# Index by item and order the rows like the impression list.
refs_meta_useful = refs_meta_useful.set_index('item_id').reindex(refs)
refs_meta_useful

Unnamed: 0_level_0,free wifi (public areas),good rating,convention hotel,ironing board,car park,express check-in / check-out,conference rooms,washing machine,singles,wheelchair accessible,openable windows,solarium,house / apartment,from 2 stars,electric kettle,wifi (rooms),luxury hotel,cable tv,hairdryer,table tennis,concierge,satisfactory rating,business centre,hiking trail,massage,swimming pool (indoor),horse riding,airport shuttle,minigolf,tennis court (indoor),2 star,swimming pool (combined filter),porter,self catering,hotel bar,radio,computer with internet,3 star,free wifi (rooms),bike rental,balcony,pet friendly,hotel,room service (24/7),tennis court,flatscreen tv,restaurant,air conditioning,4 star,wifi (public areas),bowling,beauty salon,airport hotel,laundry service,satellite tv,country hotel,non-smoking rooms,telephone,accessible parking,central heating,very good rating,business hotel,teleprinter,gym,steam room,excellent rating,hammam,from 4 stars,cosmetic mirror,large groups,from 3 stars,cot,safe (rooms),deck chairs,bathtub,free wifi (combined),reception (24/7),sitting area (rooms),fan,boat rental,shower,hypoallergenic bedding,accessible hotel,safe (hotel),pool table,shooting sports,desk,1 star,fitness,lift,room service,golf course,romantic,eco-friendly hotel,terrace (hotel),family friendly,spa (wellness facility),television,hypoallergenic rooms,sauna
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1
14910,1,0,1,1,1,0,1,1,1,1,1,0,0,1,1,1,1,0,1,0,1,1,1,0,1,0,0,1,0,0,0,0,1,0,1,1,1,0,1,1,0,1,1,0,0,1,1,1,1,1,0,0,0,1,0,0,1,1,0,1,0,1,1,1,0,0,1,1,1,1,1,1,0,1,1,1,1,0,1,0,1,0,1,1,0,0,1,0,0,1,1,0,1,1,0,1,1,1,0,1
52419,0,1,0,0,1,1,0,0,0,0,1,0,0,1,0,0,1,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,1,1,1,0,1,0,0,0,1,0,0,1,0,0,1,1,0,1,1,1,0,0,0,1,0,0,1,0,1,1,1,0,0,0,1,0,0,0,1,0,0,1,1,0,1,0,0,1,1,0,0,0,1,1,0,1,0,0
53438,1,1,1,1,1,0,1,0,0,1,1,0,0,1,0,1,1,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,1,1,0,1,1,1,1,1,0,1,1,1,0,1,1,1,0,1,1,1,0,1,1,1,1,1,1,0,0,0,1,0,1,1,1,1,1,1,0,0,1,1,1,1,0,1,1,1,1,0,0,1,0,1,1,1,0,0,0,1,1,0,1,0,0
15918,1,0,1,0,1,0,1,0,0,1,1,0,0,1,0,1,1,1,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,1,0,0,1,1,0,1,0,1,0,1,1,0,0,0,0,0,0,1,1,1,1,0,1,0,0,0,0,0,1,1,1,1,1,0,0,0,1,1,0,0,0,1,1,1,1,0,0,1,0,0,1,1,0,0,0,0,1,0,1,1,0
15402,1,1,0,0,1,1,1,0,0,1,1,0,0,1,0,1,1,1,1,1,0,1,1,0,0,0,1,0,0,1,0,0,0,0,1,1,1,0,1,1,0,1,1,0,1,0,1,0,1,1,1,0,0,1,0,0,1,1,0,1,1,1,0,0,0,0,0,1,1,0,1,1,0,0,1,1,1,1,0,0,1,1,0,1,1,0,1,0,1,1,1,0,0,0,1,1,0,1,0,0
128443,1,1,1,1,1,1,1,0,1,1,1,0,0,1,1,1,0,1,1,0,0,1,1,1,0,0,1,0,1,0,0,0,0,0,1,1,1,0,1,1,0,1,1,0,0,0,1,0,1,1,0,1,0,1,0,1,1,1,1,1,0,1,0,1,0,0,0,1,1,1,1,1,0,0,1,1,1,1,0,0,1,0,1,1,0,0,1,0,0,1,1,1,0,1,1,1,0,1,0,1
772473,0,0,0,0,1,0,1,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1
626051,0,0,0,0,1,1,0,0,0,1,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0
158537,1,1,0,0,1,0,1,0,0,0,1,0,0,1,0,1,1,0,1,1,0,1,0,1,0,0,0,0,0,1,1,0,0,1,0,1,0,0,1,1,0,1,1,0,0,0,1,0,0,1,0,0,0,1,1,0,1,1,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,1,0,1,1,1,0,0,1,0,1,0,1,0,1,0,0,0,1,1,0,1,0,0
1193842,0,1,0,0,1,0,1,0,0,1,1,0,0,1,0,1,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,1,0,1,1,0,1,1,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,1,0,1,1,0,0,1,0,0,0,0,1,1,0,1,0,0


In [9]:
# Properties held by exactly one of the listed items (column sum equal to 1).
cols_sum = refs_meta_useful.sum(axis=0)
one_cols = cols_sum.index[cols_sum == 1]
refs_meta_useful.loc[:, one_cols]

Unnamed: 0_level_0,hypoallergenic bedding,room service (24/7),eco-friendly hotel,business centre,beauty salon,convention hotel,hydrotherapy,health retreat,nightclub,hypoallergenic rooms,direct beach access,shooting sports,cable tv,radio,all inclusive (upon inquiry)
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
130331,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
10280364,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
81752,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
10535822,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
113110,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
320066,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
957499,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0
110702,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
5880418,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7749720,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [29]:
# Encoded metadata row of item 110702, the only item flagged
# 'all inclusive (upon inquiry)' in the table above.
meta_mapping[meta_mapping.item_id==110702]

Unnamed: 0,item_id,hairdryer,airport shuttle,hammam,hostal (es),accessible parking,non-smoking rooms,hot stone massage,deck chairs,flatscreen tv,pet friendly,towels,fan,swimming pool (bar),satellite tv,shower,spa hotel,beach,hypoallergenic bedding,bike rental,diving,halal food,hiking trail,washing machine,kids' club,5 star,safe (hotel),room service (24/7),tennis court (indoor),excellent rating,microwave,spa (wellness facility),airport hotel,cosmetic mirror,fridge,eco-friendly hotel,romantic,doctor on-site,1 star,car park,swimming pool (combined filter),boat rental,satisfactory rating,playground,steam room,convenience store,business hotel,kosher food,2 star,water slide,boutique hotel,laundry service,singles,balcony,horse riding,wheelchair accessible,theme hotel,3 star,porter,casino (hotel),on-site boutique shopping,cot,wifi (public areas),conference rooms,body treatments,electric kettle,surfing,skiing,hairdresser,from 4 stars,childcare,sailing,design hotel,from 2 stars,free wifi (combined),accessible hotel,guest house,country hotel,gym,very good rating,free wifi (rooms),pool table,resort,lift,family friendly,business centre,hostel,bed & breakfast,sauna,fitness,beauty salon,openable windows,convention hotel,farmstay,jacuzzi (hotel),tennis court,hydrotherapy,restaurant,health retreat,room service,sitting area (rooms),nightclub,motel,from 3 stars,organised activities,club hotel,television,serviced apartment,safe (rooms),honeymoon,desk,solarium,good rating,ironing board,concierge,sun umbrellas,szep kartya,hypoallergenic rooms,terrace (hotel),hotel bar,beach bar,luxury hotel,self catering,adults only,computer with internet,direct beach access,ski resort,camping site,shooting sports,volleyball,cable tv,teleprinter,pousada (br),senior travellers,central heating,bathtub,minigolf,hotel,table tennis,golf course,reception (24/7),telephone,massage,casa rural (es),bungalows,bowling,free wifi (public areas),radio,gay-friendly,swimming pool (outdoor),house / apartment,wifi 
(rooms),4 star,swimming pool (indoor),large groups,air conditioning,express check-in / check-out,all inclusive (upon inquiry)
708843,110702,1,0,0,0,0,0,1,1,0,0,0,0,1,1,1,1,1,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,1,1,1,1,1,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,1,0,1,0,1,1,0,0,0,1,1,1,1,0,1,1,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,1,0,1,0,1,0,1,1,1,0,0,1,0,0,1,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,1,1,1,0,0,0,1,0,0,1,0,1,1,1,1,1,0,1


In [30]:
# Raw item metadata (item_id plus a '|'-separated properties string, per the output
# below), loaded to cross-check the encoded table.
m = pd.read_csv('./data/item_metadata.csv')

In [31]:
# Raw properties string of item 110702, to compare with its encoded row above.
m[m.item_id==110702]

Unnamed: 0,item_id,properties
708843,110702,Kids' Club|Body Treatments|Convenience Store|WiFi (Public Areas)|Organised Activities|Deck Chairs|Family Friendly|Laundry Service|Conference Rooms|Hiking Trail|Luxury Hotel|Terrace (Hotel)|Spa (Wellness Facility)|Very Good Rating|Television|Minigolf|Business Hotel|Shower|Steam Room|Telephone|Hotel|Reception (24/7)|From 2 Stars|All Inclusive (Upon Inquiry)|Playground|4 Star|From 4 Stars|Sauna|Restaurant|Openable Windows|Desk|Air Conditioning|Swimming Pool (Bar)|Sun Umbrellas|Massage|Satellite TV|Hot Stone Massage|Solarium|Hotel Bar|Pool Table|Car Park|Sailing|Bathtub|Satisfactory Rating|Free WiFi (Combined)|Romantic|Boat Rental|Free WiFi (Rooms)|Safe (Rooms)|Swimming Pool (Indoor)|Jacuzzi (Hotel)|Large Groups|Computer with Internet|WiFi (Rooms)|Swimming Pool (Combined Filter)|Free WiFi (Public Areas)|Lift|Central Heating|Spa Hotel|Swimming Pool (Outdoor)|Cot|Gym|Hairdryer|Beach|From 3 Stars|Good Rating


In [13]:
# List every encoded property column (lower-cased property names plus item_id).
meta_mapping.columns.values

array(['item_id', 'hairdryer', 'airport shuttle', 'hammam', 'hostal (es)',
       'accessible parking', 'non-smoking rooms', 'hot stone massage',
       'deck chairs', 'flatscreen tv', 'pet friendly', 'towels', 'fan',
       'swimming pool (bar)', 'satellite tv', 'shower', 'spa hotel',
       'beach', 'hypoallergenic bedding', 'bike rental', 'diving',
       'halal food', 'hiking trail', 'washing machine', "kids' club",
       '5 star', 'safe (hotel)', 'room service (24/7)',
       'tennis court (indoor)', 'excellent rating', 'microwave',
       'spa (wellness facility)', 'airport hotel', 'cosmetic mirror',
       'fridge', 'eco-friendly hotel', 'romantic', 'doctor on-site',
       '1 star', 'car park', 'swimming pool (combined filter)',
       'boat rental', 'satisfactory rating', 'playground', 'steam room',
       'convenience store', 'business hotel', 'kosher food', '2 star',
       'water slide', 'boutique hotel', 'laundry service', 'singles',
       'balcony', 'horse riding', 'whe

In [17]:
# Compare the exact 'N star' flags with the 'from N stars' flags.
# In the output, row 5 has '3 star' set but 'from 3 stars' unset, so the two
# families are not perfectly consistent.
meta_mapping[['1 star', '2 star', '3 star', 'from 3 stars', '4 star', 'from 4 stars', '5 star']]

Unnamed: 0,1 star,2 star,3 star,from 3 stars,4 star,from 4 stars,5 star
0,0,0,0,1,1,1,0
1,0,0,0,1,1,1,0
2,0,0,1,1,0,0,0
3,0,0,0,1,1,1,0
4,0,0,0,1,1,1,0
5,0,0,1,0,0,0,0
6,0,0,1,1,0,0,0
7,0,0,0,1,1,1,0
8,0,0,0,1,1,1,0
9,0,0,0,1,1,1,0


In [18]:
# Rating flags side by side, ordered from lowest to highest tier.
meta_mapping[['satisfactory rating', 'good rating', 'very good rating', 'excellent rating']]

Unnamed: 0,satisfactory rating,good rating,very good rating,excellent rating
0,1,1,0,0
1,1,1,1,1
2,1,1,1,0
3,1,1,0,0
4,1,1,0,0
5,1,1,0,0
6,1,1,1,0
7,1,1,1,0
8,1,1,0,0
9,1,1,1,0


In [26]:
# First rows of the full encoded metadata table.
meta_mapping.head()

Unnamed: 0,item_id,hairdryer,airport shuttle,hammam,hostal (es),accessible parking,non-smoking rooms,hot stone massage,deck chairs,flatscreen tv,pet friendly,towels,fan,swimming pool (bar),satellite tv,shower,spa hotel,beach,hypoallergenic bedding,bike rental,diving,halal food,hiking trail,washing machine,kids' club,5 star,safe (hotel),room service (24/7),tennis court (indoor),excellent rating,microwave,spa (wellness facility),airport hotel,cosmetic mirror,fridge,eco-friendly hotel,romantic,doctor on-site,1 star,car park,swimming pool (combined filter),boat rental,satisfactory rating,playground,steam room,convenience store,business hotel,kosher food,2 star,water slide,boutique hotel,laundry service,singles,balcony,horse riding,wheelchair accessible,theme hotel,3 star,porter,casino (hotel),on-site boutique shopping,cot,wifi (public areas),conference rooms,body treatments,electric kettle,surfing,skiing,hairdresser,from 4 stars,childcare,sailing,design hotel,from 2 stars,free wifi (combined),accessible hotel,guest house,country hotel,gym,very good rating,free wifi (rooms),pool table,resort,lift,family friendly,business centre,hostel,bed & breakfast,sauna,fitness,beauty salon,openable windows,convention hotel,farmstay,jacuzzi (hotel),tennis court,hydrotherapy,restaurant,health retreat,room service,sitting area (rooms),nightclub,motel,from 3 stars,organised activities,club hotel,television,serviced apartment,safe (rooms),honeymoon,desk,solarium,good rating,ironing board,concierge,sun umbrellas,szep kartya,hypoallergenic rooms,terrace (hotel),hotel bar,beach bar,luxury hotel,self catering,adults only,computer with internet,direct beach access,ski resort,camping site,shooting sports,volleyball,cable tv,teleprinter,pousada (br),senior travellers,central heating,bathtub,minigolf,hotel,table tennis,golf course,reception (24/7),telephone,massage,casa rural (es),bungalows,bowling,free wifi (public areas),radio,gay-friendly,swimming pool (outdoor),house / apartment,wifi 
(rooms),4 star,swimming pool (indoor),large groups,air conditioning,express check-in / check-out,all inclusive (upon inquiry)
0,5101,1,1,0,0,1,1,0,0,1,0,0,1,0,1,1,0,0,1,1,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,1,0,0,1,1,1,0,1,0,0,0,1,0,0,0,1,0,1,0,0,1,0,0,1,0,1,1,1,0,0,0,1,0,1,0,0,0,1,0,1,0,1,1,0,0,1,0,0,1,0,1,0,1,0,1,1,0,0,0,1,1,1,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,1,1,1,0,1,1,1,0,0,0,1,0,1,0,0,0,1,1,0,0,1,1,0
1,5416,1,0,0,0,0,1,0,0,1,1,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,1,0,1,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,1,1,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,1,1,0,0,1,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,1,0,0,0,0,1,1,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,1,0,0,0,0,1,1,0,0,0,1,1,0,0,0,0,0
2,5834,1,0,0,0,0,1,0,0,1,0,0,0,0,1,1,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,1,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,1,1,0,0,0,1,1,0,1,0,0,1,0,0,0,0,0,0
3,5910,0,0,0,0,0,1,0,0,0,1,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,1,0,1,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,1,0,1,0,1,0,0,0,1,0,0,1,0,1,0,0,1,1,0,1,0,0,0,1,1,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,1,0,1,0,0,0,1,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0
4,6066,1,0,1,0,1,1,1,0,1,1,1,0,0,1,1,1,1,1,1,1,0,0,0,0,0,1,0,1,0,0,1,0,1,0,0,1,0,0,1,1,1,1,1,1,0,1,0,0,0,0,1,0,1,0,1,0,0,0,0,0,1,1,1,1,0,1,0,0,1,0,1,0,1,1,1,0,0,1,0,1,1,0,1,1,1,0,0,1,1,1,1,1,0,1,1,0,1,0,1,1,0,0,1,0,0,1,0,1,1,1,1,1,1,0,0,0,1,1,1,0,1,0,0,1,0,0,0,0,1,1,0,0,0,1,1,0,1,1,0,1,1,1,0,0,1,1,1,0,1,0,1,1,1,0,0,0,0


In [25]:
# Count items flagged 'good rating' but not 'satisfactory rating'.
# A result of 0 means every 'good rating' item also carries 'satisfactory rating'.
(meta_mapping.loc[meta_mapping['good rating']==1, 'satisfactory rating']==0).sum()

0

In [26]:
# 81 is the first price and 187 the largest in the 25-price list ranked further
# down - presumably testing price / max(price) as a normalisation (TODO confirm).
81/187

0.43315508021390375

In [11]:
# One slot out of 25 - presumably the step size of a position feature over a
# 25-item impression list (TODO confirm).
1/25

0.04

In [23]:
# Dense-rank a 25-item price list by hand (presumably to cross-check the
# prices_rank_* features of another session - TODO confirm).
# rankdata is already imported in the notebook's first cell, so the in-cell
# re-import was dropped.
s = '81|120|81|187|70|45|56|130|124|56|58|77|52|40|120|53|65|65|54|30|59|33|45|108|37'
s = [int(price) for price in s.split('|')]
rankdata(s, method='dense')

array([15, 17, 15, 20, 13,  5,  9, 19, 18,  9, 10, 14,  6,  4, 17,  7, 12,
       12,  8,  1, 11,  2,  5, 16,  3])

In [25]:
# Dense rank 17 divided by the largest dense rank (20) from the rankdata output
# above - presumably the normalised prices_rank value being verified (TODO confirm).
17/20

0.85

In [29]:
# input_demo = train_inputs[train_inputs.session_id==rid].reset_index(drop=True)
# input_demo

Unnamed: 0,session_id,step,device,imp_changed,session_size,session_duration,last_duration,co,search,inter,...,prev_interact_15,prev_interact_16,prev_interact_17,prev_interact_18,prev_interact_19,prev_interact_20,prev_interact_21,prev_interact_22,prev_interact_23,prev_interact_24
0,6f19e9c27106b,5.0,0,,5,157.0,14.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# pp.pprint(input_demo.to_dict(orient='list'))

In [17]:
# Display the sort-order name -> integer code mapping defined in create_model_inputs.
from create_model_inputs import change_sort_order_mapping
change_sort_order_mapping()

{'interaction sort button': 0,
 'price only': 1,
 'price and recommended': 2,
 'distance only': 3,
 'distance and recommended': 4,
 'rating and recommended': 5,
 'rating only': 6,
 'our recommendations': 7}

### country related

In [None]:
# Load only the city and platform columns of the training log.
cp = load_data('train', usecols=['city', 'platform'])

In [None]:
# Country = text after the last ', ' in the city string, lower-cased.
cp['country'] = cp['city'].str.split(', ').str[-1].str.lower()

In [None]:
# Distinct lower-cased country names derived from the city column.
countries = cp['country'].unique()

In [None]:
# Distinct platform codes present in the training log.
platforms = cp['platform'].unique()

In [None]:
# The .npy file presumably holds a pickled dict (the 0-d object array is unwrapped
# with .item() and later used via .keys()). NumPy >= 1.16.3 refuses to unpickle
# unless allow_pickle=True is passed explicitly. Only load files you created
# yourself this way: unpickling untrusted data can execute arbitrary code.
country2code = np.load('data/country2code.npy', allow_pickle=True).item()

In [None]:
# The .npy file presumably holds a pickled dict (the 0-d object array is unwrapped
# with .item()). NumPy >= 1.16.3 refuses to unpickle unless allow_pickle=True is
# passed explicitly. Only load files you created yourself this way: unpickling
# untrusted data can execute arbitrary code.
code2country = np.load('data/code2country.npy', allow_pickle=True).item()

In [None]:
# Country names seen in the data that have no entry in country2code.
set(countries).difference(country2code)

In [None]:
# country2code keys that never occur as a country name in the data.
set(country2code).difference(countries)

In [None]:
# Manual country-name corrections as (name, replacement) pairs; None marks names
# left without a replacement.
# NOTE(review): presumably data-side name -> country2code key - confirm direction.
fix_dict = dict([
    ('bermudas', 'bermuda'),
    ('bes islands', None),
    ('brunei', 'brunei darussalam'),
    ('china', "people's republic of china"),
    ('crimea', None),
    ('curacao', 'netherlands antilles'),
    ('democratic republic of congo', 'congo, the democratic republic of'),
    ('east timor', 'timor'),
    ('french antilles', None),
    ('guinea-bissau', None),
    ('iran', 'iran, islamic republic of'),
    ('ivory coast', "côte d'ivoire"),
    ('kosovo', None),
    ('laos', "lao people's democratic republic"),
])

In [56]:
# Index label of the first raw row whose impressions field is not null.
train_raw.index[train_raw.impressions.notna().values][0]

13

In [2]:
# Load the cached preprocessing output of the 5,000,000-row run
# (the "test_added" variant, per the file name).
a = pd.read_parquet('./gbm_cache/preprocess_train_5000000_test_added.snappy')

In [3]:
# Peek at the reference column; the output shows it is stored with object dtype.
a.reference.head()

0    1179940
1    4590810
2    4590810
3    3844380
4    9387530
Name: reference, dtype: object

In [4]:
# Number of rows with a missing reference.
a.reference.isna().sum()

2

In [5]:
# Show the rows whose reference is missing.
a[a.reference.isna()]

Unnamed: 0,session_id,timestamp,step,action_type,current_filters,reference,impressions,prices,device
404026,2a181b2125efe,2018-11-07 12:13:14,6.0,0,,,4622816|3389774|4743270|4090264|4906084|1390332|1949033|7197516|1668209|1857023|9790044|2520372|346171|103993|6402498|513786|9498644|2875112|7038592|1573641|3983502|8119076|2857132|642391|3214070,75|60|50|60|35|30|69|48|56|75|70|32|70|487|54|82|32|30|78|63|92|61|61|50|40,0
1950802,cbe3752713eee,2018-11-07 20:53:57,5.0,0,,,45927|1258844|21154|4719620|873351|21061|21072|21081|21085|21100|21115|21126|21150|45499|45807|1240467|1542573|21057|1666029|7176920|21097|83964|153182|45930|1473027,93|183|240|99|62|166|179|161|94|191|83|65|94|58|74|130|50|60|96|47|511|71|70|124|55,0


In [7]:
# Load the raw test set to trace where the missing references come from.
b = pd.read_csv('./data/test.csv')

In [11]:
# All test.csv rows of session 2a181b2125efe, one of the two sessions with a
# missing reference above.
b[b.session_id=='2a181b2125efe']

Unnamed: 0,user_id,session_id,timestamp,step,action_type,reference,platform,city,device,current_filters,impressions,prices
2955288,7X4FZTVRCDQA,2a181b2125efe,2018-11-08 13:31:16,1,clickout item,45643,IT,"Catania, Italy",mobile,,45643|2609026|20210|20238|2774322|20207|20212|20213|20209|20215|20219|1838087|3134547|1277780|104016|1501099|45885|926493|1331335|3894602|3049100|20236|103708|5835954|20220,110|74|84|103|77|98|132|113|59|61|94|45|81|64|92|93|174|93|60|79|71|57|66|104|55
2955289,7X4FZTVRCDQA,2a181b2125efe,2018-11-08 13:33:14,2,clickout item,104016,IT,"Catania, Italy",mobile,,45643|2609026|20210|20238|2774322|20207|20212|20213|20209|20215|20219|1838087|3134547|1277780|104016|1501099|45885|926493|1331335|3894602|3049100|20236|103708|5835954|20220,110|74|84|103|77|98|132|113|59|61|94|45|81|64|92|93|174|93|60|79|71|57|66|104|55
2955290,7X4FZTVRCDQA,2a181b2125efe,2018-11-08 13:35:31,3,clickout item,1694719,IT,"Aci Castello, Italy",mobile,,103708|5835954|449296|1223484|3049100|3813262|3492814|20219|1233899|16746|973075|995623|1041214|20239|1346229|6240332|2861186|1152752|693311|1869703|8590236|643926|8185994|3134553|1694719,56|100|59|42|84|69|119|95|69|810|52|51|32|30|60|63|57|39|69|68|64|49|74|62|49
2955291,7X4FZTVRCDQA,2a181b2125efe,2018-11-07 12:11:39,1,search for destination,"Giardini-Naxos, Italy",IT,"Giardini-Naxos, Italy",mobile,,,
2955292,7X4FZTVRCDQA,2a181b2125efe,2018-11-07 12:11:53,2,interaction item image,101278,IT,"Giardini-Naxos, Italy",mobile,,,
2955293,7X4FZTVRCDQA,2a181b2125efe,2018-11-07 12:11:53,3,interaction item image,101278,IT,"Giardini-Naxos, Italy",mobile,,,
2955294,7X4FZTVRCDQA,2a181b2125efe,2018-11-07 12:11:57,4,clickout item,101278,IT,"Giardini-Naxos, Italy",mobile,,101278|94907|101279|897473|965647|1269352|2808272|97214|513601|195131|4279974|4062968|1632921|3520100|7974094|1969143|1842355|5755448|3370994|1033342|2516532|3789928|977641|1714237|3125476,55|999|139|158|800|50|50|1056|62|50|30|50|55|50|48|59|38|50|50|46|39|45|200|43|30
2955295,7X4FZTVRCDQA,2a181b2125efe,2018-11-07 12:13:01,5,search for destination,"Falcone, Italy",IT,"Falcone, Italy",mobile,,,
2955296,7X4FZTVRCDQA,2a181b2125efe,2018-11-07 12:13:14,6,clickout item,,IT,"Falcone, Italy",mobile,,4622816|3389774|4743270|4090264|4906084|1390332|1949033|7197516|1668209|1857023|9790044|2520372|346171|103993|6402498|513786|9498644|2875112|7038592|1573641|3983502|8119076|2857132|642391|3214070,75|60|50|60|35|30|69|48|56|75|70|32|70|487|54|82|32|30|78|63|92|61|61|50|40


In [9]:
# Convert POSIX timestamps (seconds) to naive UTC datetimes with one vectorized
# call instead of a per-row lambda around datetime.utcfromtimestamp (deprecated
# since Python 3.12). This also removes the need for the in-cell `import datetime`.
b['timestamp'] = pd.to_datetime(b['timestamp'], unit='s')


In [10]:
# Inspect the converted timestamps.
b.timestamp

0         2018-11-07 01:53:34
1         2018-11-07 01:53:34
2         2018-11-07 01:54:56
3         2018-11-07 01:55:07
4         2018-11-07 01:55:17
5         2018-11-07 01:56:32
6         2018-11-07 01:56:39
7         2018-11-07 06:08:55
8         2018-11-07 06:09:00
9         2018-11-08 04:36:06
10        2018-11-08 04:37:03
11        2018-11-07 07:24:53
12        2018-11-07 07:25:03
13        2018-11-07 07:25:03
14        2018-11-07 07:25:16
15        2018-11-07 07:27:23
16        2018-11-08 12:27:51
17        2018-11-08 12:28:03
18        2018-11-08 12:28:03
19        2018-11-08 12:28:05
20        2018-11-08 12:28:16
21        2018-11-08 12:28:24
22        2018-11-08 12:28:24
23        2018-11-08 12:31:01
24        2018-11-08 12:31:01
25        2018-11-08 12:31:11
26        2018-11-08 12:31:11
27        2018-11-08 12:31:11
28        2018-11-08 12:31:11
29        2018-11-08 12:31:11
                  ...        
3782305   2018-11-08 17:07:26
3782306   2018-11-08 17:07:26
3782307   

In [38]:
# Toy frame for the groupby / map experiments below: A drawn from {0, 1, 2},
# B drawn from {0, ..., 4}, 10 rows, unseeded.
# NOTE: this rebinds `a`, which held the preprocessed parquet frame in an earlier cell.
a = pd.DataFrame({'A': np.random.randint(0, 3 ,10), 'B': np.random.randint(0, 5 ,10)})
# Blank out B at 3 randomly drawn index labels. np.random.choice samples with
# replacement by default, so fewer than 3 distinct rows may end up NaN.
a.loc[np.random.choice(a.index, 3), 'B'] = np.nan
a

Unnamed: 0,A,B
0,0,2.0
1,2,
2,2,2.0
3,1,4.0
4,2,
5,1,0.0
6,1,4.0
7,0,0.0
8,2,
9,0,0.0


In [50]:
# Collect the B values of each A group into a Python list. NaNs are kept inside
# the lists, as the output shows.
c = a.groupby('A')['B'].apply(list).reset_index()
c

Unnamed: 0,A,B
0,0,"[2.0, 0.0, 0.0]"
1,1,"[4.0, 0.0, 4.0]"
2,2,"[nan, 2.0, nan, nan]"


In [48]:
# Per-group B lists as a plain list of lists.
c['B'].tolist()

[[2.0, 0.0, 0.0], [4.0, 0.0, 4.0], [nan, 2.0, nan, nan]]

In [49]:
# Series.map with a dict that has no entry for 2: unmapped values become NaN (see output).
# NOTE: this rebinds `m`, which held the item_metadata.csv frame in an earlier cell.
m = {0: 'E', 1: 'Q'}
a.A.map(m)

0      E
1    NaN
2    NaN
3      Q
4    NaN
5      Q
6      Q
7      E
8    NaN
9      E
Name: A, dtype: object