In [1]:
from utils import load_data, check_gpu, check_dir

import pandas as pd
import numpy as np
from tqdm import tqdm
import gc
from utils import meta_encoding
from functools import partial
import matplotlib.pyplot as plt

import datetime
pd.options.display.max_colwidth = 1000
%matplotlib inline
%load_ext autoreload
%autoreload 2
def fprint(df, name):
    """Print ``name`` followed by the (rows, columns) shape of ``df``.

    The row count is rendered with thousands separators; the column count is
    printed unformatted.
    """
    n_rows = df.shape[0]
    n_cols = df.shape[1]
    print(f'{name} shape: ({n_rows:,}, {n_cols})')
import pprint
# Notebook-wide pretty-printer; indent=1 adds one space per nesting level.
pp = pprint.PrettyPrinter(indent=1)

In [2]:
# Request 1,000,000 rows of the train set; load_data's log line reports how
# many rows remain after trimming.
train_raw = load_data('train', nrows=1_000_000)

[06-09 16:12:51 - utils-111 - load_data - INFO] Loading train using 1,000,000 rows (999,995 trimmed) which is 6.28% out of total train data


In [3]:
# Distinct session identifiers present in the loaded rows.
train_ids = train_raw['session_id'].unique()

### Check cleaned data

In [None]:
# Cached preprocessing output read from parquet.
# NOTE(review): the file name suggests a 1,000,000-row run without test data appended — confirm.
train_cleaned = pd.read_parquet('gbm_cache/preprocess_train_1000000_no_test_added.snappy')

In [None]:
# Draw one session id at random for a spot check. No seed is set in the visible
# cells, so a re-run will generally inspect a different session.
rid = np.random.choice(train_ids, 1)[0]
rid

In [None]:
# Raw rows of the sampled session (first five).
train_raw.loc[train_raw['session_id'] == rid].head()

In [None]:
# Cleaned rows of the same sampled session (first five).
train_cleaned.loc[train_cleaned['session_id'] == rid].head()

In [None]:
# Peek at the action_type column of the cleaned data.
train_cleaned['action_type'].head()

In [None]:
# Sessions that have at least one row with a non-null current_filters value.
cfs_ids = train_cleaned.loc[train_cleaned['current_filters'].notna(), 'session_id'].unique()

In [None]:
# Show every cleaned row of one randomly drawn session from cfs_ids.
train_cleaned[train_cleaned.session_id==np.random.choice(cfs_ids, 1)[0]]

### Check feature generation

In [7]:
train_inputs = pd.read_parquet('./gbm_cache/train_inputs_5000000_test_added.snappy')
# Every column whose name contains 'current_filters' is left out for now.
cf_cols = [col for col in train_inputs.columns if 'current_filters' in col]
drop_cols = cf_cols  # + ['country', 'platform']
train_inputs = train_inputs.drop(columns=drop_cols)

In [22]:
# Draw one session id at random to compare raw rows against generated features.
# No seed is set in the visible cells, so a re-run will generally pick a different session.
rid = np.random.choice(train_ids, 1)[0]
rid

'5d669ff3f4a63'

In [23]:
# All raw events of the sampled session, re-indexed from zero.
raw_demo = train_raw.loc[train_raw['session_id'] == rid].reset_index(drop=True)
raw_demo

Unnamed: 0,user_id,session_id,timestamp,step,action_type,reference,platform,city,device,current_filters,impressions,prices
0,CZJ71SPWQ5B8,5d669ff3f4a63,1541353187,1,clickout item,4130278,BR,"Vitória da Conquista, Brazil",mobile,,8102560|2788292|3956866|4130278|4137472|4138126|2341278|9226022|4786286|7373042|8422934|5056538|9803884|6301998,16|45|38|41|32|40|32|22|57|41|27|36|47|32
1,CZJ71SPWQ5B8,5d669ff3f4a63,1541353430,2,clickout item,4130278,BR,"Vitória da Conquista, Brazil",mobile,,8102560|2788292|3956866|4130278|4137472|4138126|2341278|9226022|4786286|7373042|8422934|5056538|9803884|6301998,16|45|38|41|32|40|32|22|57|41|27|36|47|32


In [24]:
# Generated feature rows for the same session, re-indexed from zero.
input_demo = train_inputs.loc[train_inputs['session_id'] == rid].reset_index(drop=True)
input_demo

Unnamed: 0,session_id,step,device,imp_changed,session_size,session_duration,last_duration,co,search,inter,...,prev_interact_15,prev_interact_16,prev_interact_17,prev_interact_18,prev_interact_19,prev_interact_20,prev_interact_21,prev_interact_22,prev_interact_23,prev_interact_24
0,5d669ff3f4a63,1.0,0,,1,0.0,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5d669ff3f4a63,2.0,0,0.0,2,243.0,243.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
# Dump the features column by column: orient='list' yields {column name: [value per row]}.
pp.pprint(input_demo.to_dict(orient='list'))

{'co': [nan, 1.0],
 'cs': [8.0, 8.0],
 'device': [0, 0],
 'fs': [33.0, 33.0],
 'imp_changed': [nan, 0.0],
 'inter': [nan, 0.0],
 'last_duration': [nan, 243.00000000000003],
 'last_reference_relative_loc': [nan, 0.16],
 'n_imps': [14, 14],
 'prev_click_0': [nan, 0.0],
 'prev_click_1': [nan, 0.0],
 'prev_click_10': [nan, 0.0],
 'prev_click_11': [nan, 0.0],
 'prev_click_12': [nan, 0.0],
 'prev_click_13': [nan, 0.0],
 'prev_click_14': [0.0, 0.0],
 'prev_click_15': [0.0, 0.0],
 'prev_click_16': [0.0, 0.0],
 'prev_click_17': [0.0, 0.0],
 'prev_click_18': [0.0, 0.0],
 'prev_click_19': [0.0, 0.0],
 'prev_click_2': [nan, 0.0],
 'prev_click_20': [0.0, 0.0],
 'prev_click_21': [0.0, 0.0],
 'prev_click_22': [0.0, 0.0],
 'prev_click_23': [0.0, 0.0],
 'prev_click_24': [0.0, 0.0],
 'prev_click_3': [nan, 1.0],
 'prev_click_4': [nan, 0.0],
 'prev_click_5': [nan, 0.0],
 'prev_click_6': [nan, 0.0],
 'prev_click_7': [nan, 0.0],
 'prev_click_8': [nan, 0.0],
 'prev_click_9': [nan, 0.0],
 'prev_interact_0': [

In [17]:
# Display the label -> integer code mapping returned by change_sort_order_mapping
# (see the cell output for the actual entries).
from create_model_inputs import change_sort_order_mapping
change_sort_order_mapping()

{'interaction sort button': 0,
 'price only': 1,
 'price and recommended': 2,
 'distance only': 3,
 'distance and recommended': 4,
 'rating and recommended': 5,
 'rating only': 6,
 'our recommendations': 7}

### Country-related checks

In [None]:
# Load only the city and platform columns of the train set.
cp = load_data('train', usecols=['city', 'platform'])

In [None]:
# Country = the text after the last ', ' in the city string, lower-cased.
cp['country'] = (
    cp['city']
    .str.split(', ')
    .str[-1]
    .str.lower()
)

In [None]:
# Distinct values of the lower-cased country column.
countries = cp['country'].unique()

In [None]:
# Distinct values of the platform column.
platforms = cp['platform'].unique()

In [None]:
# The .npy file holds a Python dict wrapped in a 0-d object array, which NumPy
# stores via pickle. Since NumPy 1.16.3 np.load defaults to allow_pickle=False
# and would raise ValueError here, so the flag is passed explicitly; .item()
# then unwraps the dict.
# NOTE(review): unpickling can execute arbitrary code — only load files
# produced by this project.
country2code = np.load('data/country2code.npy', allow_pickle=True).item()

In [None]:
# The .npy file holds a Python object wrapped in a 0-d object array, which
# NumPy stores via pickle. Since NumPy 1.16.3 np.load defaults to
# allow_pickle=False and would raise ValueError here, so the flag is passed
# explicitly; .item() then unwraps the stored object.
# NOTE(review): unpickling can execute arbitrary code — only load files
# produced by this project.
code2country = np.load('data/code2country.npy', allow_pickle=True).item()

In [None]:
# Country names seen in the data that are not keys of country2code.
set(countries).difference(country2code.keys())

In [None]:
# Keys of country2code that never occur among the data's country names.
set(country2code.keys()).difference(countries)

In [None]:
# Hand-written country-name corrections; a value of None marks a name that was
# given no replacement.
# NOTE(review): presumably keys are spellings derived from the city column and
# values are the matching country2code keys — confirm where fix_dict is consumed.
fix_dict = dict([
    ('bermudas', 'bermuda'),
    ('bes islands', None),
    ('brunei', 'brunei darussalam'),
    ('china', "people's republic of china"),
    ('crimea', None),
    ('curacao', 'netherlands antilles'),
    ('democratic republic of congo', 'congo, the democratic republic of'),
    ('east timor', 'timor'),
    ('french antilles', None),
    ('guinea-bissau', None),
    ('iran', 'iran, islamic republic of'),
    ('ivory coast', "côte d'ivoire"),
    ('kosovo', None),
    ('laos', "lao people's democratic republic"),
])

In [56]:
# Index label of the first row whose impressions value is non-null.
# first_valid_index() avoids building the null mask twice and returns None
# (instead of raising IndexError) when the column has no non-null value.
train_raw['impressions'].first_valid_index()

13

In [2]:
# Cached preprocessing output read from parquet.
# NOTE(review): the file name suggests a 5,000,000-row run with test data appended — confirm.
a = pd.read_parquet('./gbm_cache/preprocess_train_5000000_test_added.snappy')

In [3]:
# Peek at the reference column of the cached frame.
a['reference'].head()

0    1179940
1    4590810
2    4590810
3    3844380
4    9387530
Name: reference, dtype: object

In [4]:
# Number of rows whose reference is null.
a['reference'].isna().sum()

2

In [5]:
# The rows themselves where reference is null.
a.loc[a['reference'].isna()]

Unnamed: 0,session_id,timestamp,step,action_type,current_filters,reference,impressions,prices,device
404026,2a181b2125efe,2018-11-07 12:13:14,6.0,0,,,4622816|3389774|4743270|4090264|4906084|1390332|1949033|7197516|1668209|1857023|9790044|2520372|346171|103993|6402498|513786|9498644|2875112|7038592|1573641|3983502|8119076|2857132|642391|3214070,75|60|50|60|35|30|69|48|56|75|70|32|70|487|54|82|32|30|78|63|92|61|61|50|40,0
1950802,cbe3752713eee,2018-11-07 20:53:57,5.0,0,,,45927|1258844|21154|4719620|873351|21061|21072|21081|21085|21100|21115|21126|21150|45499|45807|1240467|1542573|21057|1666029|7176920|21097|83964|153182|45930|1473027,93|183|240|99|62|166|179|161|94|191|83|65|94|58|74|130|50|60|96|47|511|71|70|124|55,0


In [7]:
# Test-set CSV, read with pandas defaults.
b = pd.read_csv('./data/test.csv')

In [11]:
# All test-file rows for session 2a181b2125efe.
b.loc[b['session_id'] == '2a181b2125efe']

Unnamed: 0,user_id,session_id,timestamp,step,action_type,reference,platform,city,device,current_filters,impressions,prices
2955288,7X4FZTVRCDQA,2a181b2125efe,2018-11-08 13:31:16,1,clickout item,45643,IT,"Catania, Italy",mobile,,45643|2609026|20210|20238|2774322|20207|20212|20213|20209|20215|20219|1838087|3134547|1277780|104016|1501099|45885|926493|1331335|3894602|3049100|20236|103708|5835954|20220,110|74|84|103|77|98|132|113|59|61|94|45|81|64|92|93|174|93|60|79|71|57|66|104|55
2955289,7X4FZTVRCDQA,2a181b2125efe,2018-11-08 13:33:14,2,clickout item,104016,IT,"Catania, Italy",mobile,,45643|2609026|20210|20238|2774322|20207|20212|20213|20209|20215|20219|1838087|3134547|1277780|104016|1501099|45885|926493|1331335|3894602|3049100|20236|103708|5835954|20220,110|74|84|103|77|98|132|113|59|61|94|45|81|64|92|93|174|93|60|79|71|57|66|104|55
2955290,7X4FZTVRCDQA,2a181b2125efe,2018-11-08 13:35:31,3,clickout item,1694719,IT,"Aci Castello, Italy",mobile,,103708|5835954|449296|1223484|3049100|3813262|3492814|20219|1233899|16746|973075|995623|1041214|20239|1346229|6240332|2861186|1152752|693311|1869703|8590236|643926|8185994|3134553|1694719,56|100|59|42|84|69|119|95|69|810|52|51|32|30|60|63|57|39|69|68|64|49|74|62|49
2955291,7X4FZTVRCDQA,2a181b2125efe,2018-11-07 12:11:39,1,search for destination,"Giardini-Naxos, Italy",IT,"Giardini-Naxos, Italy",mobile,,,
2955292,7X4FZTVRCDQA,2a181b2125efe,2018-11-07 12:11:53,2,interaction item image,101278,IT,"Giardini-Naxos, Italy",mobile,,,
2955293,7X4FZTVRCDQA,2a181b2125efe,2018-11-07 12:11:53,3,interaction item image,101278,IT,"Giardini-Naxos, Italy",mobile,,,
2955294,7X4FZTVRCDQA,2a181b2125efe,2018-11-07 12:11:57,4,clickout item,101278,IT,"Giardini-Naxos, Italy",mobile,,101278|94907|101279|897473|965647|1269352|2808272|97214|513601|195131|4279974|4062968|1632921|3520100|7974094|1969143|1842355|5755448|3370994|1033342|2516532|3789928|977641|1714237|3125476,55|999|139|158|800|50|50|1056|62|50|30|50|55|50|48|59|38|50|50|46|39|45|200|43|30
2955295,7X4FZTVRCDQA,2a181b2125efe,2018-11-07 12:13:01,5,search for destination,"Falcone, Italy",IT,"Falcone, Italy",mobile,,,
2955296,7X4FZTVRCDQA,2a181b2125efe,2018-11-07 12:13:14,6,clickout item,,IT,"Falcone, Italy",mobile,,4622816|3389774|4743270|4090264|4906084|1390332|1949033|7197516|1668209|1857023|9790044|2520372|346171|103993|6402498|513786|9498644|2875112|7038592|1573641|3983502|8119076|2857132|642391|3214070,75|60|50|60|35|30|69|48|56|75|70|32|70|487|54|82|32|30|78|63|92|61|61|50|40


In [9]:
# Convert epoch seconds to naive UTC datetimes in one vectorized call rather
# than a per-row Python lambda around datetime.utcfromtimestamp, which is
# deprecated since Python 3.12.
b['timestamp'] = pd.to_datetime(b['timestamp'], unit='s')


In [10]:
# Inspect the converted timestamp column.
b.timestamp

0         2018-11-07 01:53:34
1         2018-11-07 01:53:34
2         2018-11-07 01:54:56
3         2018-11-07 01:55:07
4         2018-11-07 01:55:17
5         2018-11-07 01:56:32
6         2018-11-07 01:56:39
7         2018-11-07 06:08:55
8         2018-11-07 06:09:00
9         2018-11-08 04:36:06
10        2018-11-08 04:37:03
11        2018-11-07 07:24:53
12        2018-11-07 07:25:03
13        2018-11-07 07:25:03
14        2018-11-07 07:25:16
15        2018-11-07 07:27:23
16        2018-11-08 12:27:51
17        2018-11-08 12:28:03
18        2018-11-08 12:28:03
19        2018-11-08 12:28:05
20        2018-11-08 12:28:16
21        2018-11-08 12:28:24
22        2018-11-08 12:28:24
23        2018-11-08 12:31:01
24        2018-11-08 12:31:01
25        2018-11-08 12:31:11
26        2018-11-08 12:31:11
27        2018-11-08 12:31:11
28        2018-11-08 12:31:11
29        2018-11-08 12:31:11
                  ...        
3782305   2018-11-08 17:07:26
3782306   2018-11-08 17:07:26
3782307   