In [1]:
from utils import load_data, check_gpu, check_dir

import pandas as pd
import numpy as np
from tqdm import tqdm
import gc
from utils import meta_encoding
from functools import partial
import matplotlib.pyplot as plt
from scipy.stats import rankdata
import datetime
import lightgbm as lgb
pd.options.display.max_colwidth = 1000
pd.options.display.max_columns = 1000

%matplotlib inline
%load_ext autoreload
%autoreload 2
def fprint(df, name):
    """Print ``name`` followed by the row and column counts of ``df``.

    The row count is rendered with thousands separators; the column count is
    printed unformatted. ``df`` must expose an indexable ``shape`` with at
    least two entries.
    """
    n_rows, n_cols = df.shape[0], df.shape[1]
    print(f'{name} shape: ({n_rows:,}, {n_cols})')
import pprint
pp = pprint.PrettyPrinter(indent=1)

In [2]:
# Load the first 5M rows of the raw training log.
train_raw = load_data('train', nrows=5000000)
# Convert the epoch-second timestamps to naive UTC datetimes in one vectorized
# call. This replaces a row-wise `apply(datetime.datetime.utcfromtimestamp)`,
# which is slow on millions of rows and deprecated since Python 3.12.
train_raw['timestamp'] = pd.to_datetime(train_raw['timestamp'], unit='s')


[15|22:32:20|utils              :114|load_data                |INFO] Loading train using 5,000,000 rows (4,999,976 trimmed) which is 31.38% out of total train data


In [3]:
train_ids = train_raw.session_id.unique()

In [4]:
# look at raw
rid = np.random.choice(train_ids, 1)[0]
rid

'1f6409d88e355'

### Check clean data

In [8]:
train_cleaned = pd.read_parquet('gbm_cache/preprocess_train_100000_no_test_added.snappy')

In [12]:
train_ids = train_cleaned.session_id.unique()

Problematic session: 'b160c4dab0582'

In [13]:
rid = np.random.choice(train_ids, 1)[0]
rid

'517878880bc06'

In [9]:
rid = 'b160c4dab0582'

In [14]:
# raw
raw_demo = train_raw[train_raw.session_id==rid].copy()
raw_demo

Unnamed: 0,user_id,session_id,timestamp,step,action_type,reference,platform,city,device,current_filters,impressions,prices
38351,X4327A0BJV6P,517878880bc06,2018-11-03 12:26:16,1,clickout item,1497671,IN,"Puttaparthi, India",desktop,,1891715|4108080|6059178|4644110|4861306|1497671|1540991|5057838|6544066|4485900|9388226|2701888|4591102|7760844|4649962|8970056|7044990,40|24|16|14|11|9|6|12|18|58|5|27|13|5|7|9|69


In [15]:
# clean
train_cleaned[train_cleaned.session_id==rid]

Unnamed: 0,session_id,timestamp,step,action_type,current_filters,reference,impressions,prices,device
7635,517878880bc06,2018-11-03 12:26:16,1.0,0,,1497671,1891715|4108080|6059178|4644110|4861306|1497671|1540991|5057838|6544066|4485900|9388226|2701888|4591102|7760844|4649962|8970056|7044990,40|24|16|14|11|9|6|12|18|58|5|27|13|5|7|9|69,0


### Check feature generation

In [4]:
# Load the cached model inputs built from the 5M-row training sample.
train_inputs = pd.read_parquet('./gbm_cache/train_inputs_5000000_no_test_added.snappy')
# Every column derived from `current_filters` is excluded for now.
cf_cols = [col for col in train_inputs.columns if 'current_filters' in col]
drop_cols = cf_cols  # + ['country', 'platform']
# Rebind instead of mutating in place so re-running the cell stays predictable.
train_inputs = train_inputs.drop(columns=drop_cols)

In [5]:
# # load model 
# clf = lgb.Booster(model_file='./models/lgb_cv0.model')

In [6]:
train_ids = train_inputs.session_id.unique()

In [7]:
rid = np.random.choice(train_ids, 1)[0]
rid

'ef1d58a8ac2f0'

In [20]:
raw_display_cols = ['session_id', 'timestamp', 'step', 'action_type', 'reference', 'platform', 'city', 'device', 
                    'current_filters', 'impressions', 'prices']

In [21]:
# Raw rows of the sampled session, re-indexed from 0.
raw_demo = train_raw.loc[train_raw['session_id'] == rid].reset_index(drop=True)
click_mask = raw_demo['action_type'].eq('clickout item')
# Keep the true clicked references before they are masked out below.
answers = raw_demo.loc[click_mask, 'reference']
# Hide the clickout targets so the session is shown as a prediction task.
raw_demo.loc[click_mask, 'reference'] = '?'
raw_demo.loc[:, raw_display_cols]

Unnamed: 0,session_id,timestamp,step,action_type,reference,platform,city,device,current_filters,impressions,prices
0,ef1d58a8ac2f0,2018-11-06 22:08:54,1,interaction item image,980873,BR,"Natal, Brazil",mobile,,,
1,ef1d58a8ac2f0,2018-11-06 22:08:54,2,interaction item image,980873,BR,"Natal, Brazil",mobile,,,
2,ef1d58a8ac2f0,2018-11-06 22:08:58,3,clickout item,?,BR,"Natal, Brazil",mobile,,980873|6919074|1031616|326961|477846|623286|1024058|104985|117545|326906|6016004|120773|939394|2097018|1246784|1450593|2418544|5808922|899171|2145840|109326|120774|1710829|1185340|370916,44|65|30|47|43|33|41|102|95|112|6|62|40|78|30|66|66|123|61|33|68|89|177|44|49
3,ef1d58a8ac2f0,2018-11-06 22:15:15,4,clickout item,?,BR,"Natal, Brazil",mobile,,980873|6919074|1031616|326961|477846|623286|1024058|104985|117545|326906|6016004|120773|939394|2097018|1246784|1450593|2418544|5808922|899171|2145840|109326|120774|1710829|1185340|370916,44|65|30|47|43|33|41|102|95|112|6|62|40|78|30|66|66|123|61|33|68|89|177|44|49


In [22]:
answers

2    980873
3    980873
Name: reference, dtype: object

Check the generated model inputs for the same session:

In [23]:
input_demo = train_inputs[train_inputs.session_id==rid].reset_index(drop=True)
input_demo

Unnamed: 0,session_id,step,device,fs,sort_order,imp_changed,session_size,session_duration,last_duration,last_action_type,last_reference_relative_loc,mean_rating,median_rating,mean_star,median_star,n_imps,target,mean_price,median_price,prices_0,prices_1,prices_2,prices_3,prices_4,prices_5,prices_6,prices_7,prices_8,prices_9,prices_10,prices_11,prices_12,prices_13,prices_14,prices_15,prices_16,prices_17,prices_18,prices_19,prices_20,prices_21,prices_22,prices_23,prices_24,prices_rank_0,prices_rank_1,prices_rank_2,prices_rank_3,prices_rank_4,prices_rank_5,prices_rank_6,prices_rank_7,prices_rank_8,prices_rank_9,prices_rank_10,prices_rank_11,prices_rank_12,prices_rank_13,prices_rank_14,prices_rank_15,prices_rank_16,prices_rank_17,prices_rank_18,prices_rank_19,prices_rank_20,prices_rank_21,prices_rank_22,prices_rank_23,prices_rank_24,prev_click_0,prev_click_1,prev_click_2,prev_click_3,prev_click_4,prev_click_5,prev_click_6,prev_click_7,prev_click_8,prev_click_9,prev_click_10,prev_click_11,prev_click_12,prev_click_13,prev_click_14,prev_click_15,prev_click_16,prev_click_17,prev_click_18,prev_click_19,prev_click_20,prev_click_21,prev_click_22,prev_click_23,prev_click_24,prev_interact_0,prev_interact_1,prev_interact_2,prev_interact_3,prev_interact_4,prev_interact_5,prev_interact_6,prev_interact_7,prev_interact_8,prev_interact_9,prev_interact_10,prev_interact_11,prev_interact_12,prev_interact_13,prev_interact_14,prev_interact_15,prev_interact_16,prev_interact_17,prev_interact_18,prev_interact_19,prev_interact_20,prev_interact_21,prev_interact_22,prev_interact_23,prev_interact_24,half_prices_rank_0,half_prices_rank_1,half_prices_rank_2,half_prices_rank_3,half_prices_rank_4,half_prices_rank_5,half_prices_rank_6,half_prices_rank_7,half_prices_rank_8,half_prices_rank_9,half_prices_rank_10,half_prices_rank_11,ratings_0,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,ratings_6,ratings_7,ratings_8,ratings_9,ratings_10,ratings_11,ratings_12,ratings_13,ratings_14,ratings_15,ratings_1
6,ratings_17,ratings_18,ratings_19,ratings_20,ratings_21,ratings_22,ratings_23,ratings_24,stars_0,stars_1,stars_2,stars_3,stars_4,stars_5,stars_6,stars_7,stars_8,stars_9,stars_10,stars_11,stars_12,stars_13,stars_14,stars_15,stars_16,stars_17,stars_18,stars_19,stars_20,stars_21,stars_22,stars_23,stars_24
0,ef1d58a8ac2f0,3.0,0,33,8,,2,4.0,4.0,4,0.04,2.72,3.0,3.04,3.0,25,0,64.16,61.0,44.0,65.0,30.0,47.0,43.0,33.0,41.0,102.0,95.0,112.0,6.0,62.0,40.0,78.0,30.0,66.0,66.0,123.0,61.0,33.0,68.0,89.0,177.0,44.0,49.0,0.333333,0.571429,0.095238,0.380952,0.285714,0.142857,0.238095,0.857143,0.809524,0.904762,0.047619,0.52381,0.190476,0.714286,0.095238,0.619048,0.619048,0.952381,0.47619,0.142857,0.666667,0.761905,1.0,0.333333,0.428571,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.75,0.166667,0.583333,0.416667,0.25,0.333333,0.916667,0.833333,1.0,0.083333,0.666667,0.2,0.8,0.4,0.6,0.4,0.4,0.8,0.6,0.8,1.0,0.8,0.8,0.4,1.0,0.6,1.0,1.0,1.0,0.6,1.0,0.8,1.0,1.0,1.0,0.6,0.5,0.75,0.25,0.75,0.5,0.75,0.5,0.5,1.0,1.0,0.5,0.5,0.5,1.0,0.5,0.25,0.75,0.75,0.75,0.25,0.5,0.75,0.5,0.25,0.5


In [27]:
# Dense-rank the pipe-separated prices of the clickout row shown above and
# report the highest rank, i.e. the number of distinct prices.
s = [float(price) for price in
     '44|65|30|47|43|33|41|102|95|112|6|62|40|78|30|66|66|123|61|33|68|89|177|44|49'.split('|')]
dense_ranks = rankdata(s, method='dense')
dense_ranks, dense_ranks.max()

(array([ 7, 12,  2,  8,  6,  3,  5, 18, 17, 19,  1, 11,  4, 15,  2, 13, 13,
        20, 10,  3, 14, 16, 21,  7,  9]), 21)

In [37]:
train_inputs.sort_order.unique()

array([8, 0, 1])

In [26]:
pp.pprint(input_demo.to_dict())

{'device': {0: 0},
 'fs': {0: 33},
 'half_prices_rank_0': {0: 0.5},
 'half_prices_rank_1': {0: 0.75},
 'half_prices_rank_10': {0: 0.08333333333333333},
 'half_prices_rank_11': {0: 0.6666666666666666},
 'half_prices_rank_2': {0: 0.16666666666666666},
 'half_prices_rank_3': {0: 0.5833333333333334},
 'half_prices_rank_4': {0: 0.4166666666666667},
 'half_prices_rank_5': {0: 0.25},
 'half_prices_rank_6': {0: 0.3333333333333333},
 'half_prices_rank_7': {0: 0.9166666666666666},
 'half_prices_rank_8': {0: 0.8333333333333334},
 'half_prices_rank_9': {0: 1.0},
 'imp_changed': {0: nan},
 'last_action_type': {0: 4},
 'last_duration': {0: 4.0},
 'last_reference_relative_loc': {0: 0.04},
 'mean_price': {0: 64.16},
 'mean_rating': {0: 2.72},
 'mean_star': {0: 3.04},
 'median_price': {0: 61.0},
 'median_rating': {0: 3.0},
 'median_star': {0: 3.0},
 'n_imps': {0: 25},
 'prev_click_0': {0: 0.0},
 'prev_click_1': {0: 0.0},
 'prev_click_10': {0: 0.0},
 'prev_click_11': {0: 0.0},
 'prev_click_12': {0: 0.0}

In [None]:
shap_values[0].shape

In [None]:
# Parse a pipe-separated price string into floats and dense-rank it.
ps = list(map(float, '220|193|310|265|162|119|99|43|51|83|96|65'.split('|')))
rankdata(ps, method='dense')

In [None]:
np.nanmean(ps), np.nanmedian(ps), np.mean(ps), np.median(ps)

In [None]:
1/12

In [None]:
2/25

In [None]:
meta_mapping = meta_encoding()

In [None]:
# refs = raw_eg.impressions.dropna().str.split('|')
# refs = list([int(j) for i in refs for j in i])
# Impression list of one clickout, parsed from its pipe-separated form into ints.
refs = [int(item_id) for item_id in
        '20720|20814|83606|20772|20752|45909|9844958|20861|20681|45379|2085654|20785|152418|20848|20736|20750|20949|20743|917187|20745|20677|905283|83339|20843|20819'.split('|')]

In [None]:
# Metadata rows for the items listed in `refs`.
refs_meta = meta_mapping[meta_mapping.item_id.isin(refs)]
# Drop columns whose sum over these items is zero.
s = refs_meta.sum(axis=0)
refs_meta_useful = refs_meta[s[s!=0].index].reset_index(drop=True)
# Drop columns whose sum equals the number of rows, i.e. (assuming 0/1
# indicator columns -- TODO confirm) properties shared by every item.
# The row count is measured instead of hard-coding 24, so the filter stays
# correct when `refs` changes; `item_id` is always kept because it becomes the
# index below.
ss = refs_meta_useful.sum(axis=0)
n_items = len(refs_meta_useful)
varying_cols = [c for c in ss.index if c == 'item_id' or ss[c] != n_items]
refs_meta_useful = refs_meta_useful[varying_cols]
refs_meta_useful = refs_meta_useful.set_index('item_id')
# Restore the impression order; ids without metadata become all-NaN rows.
refs_meta_useful = refs_meta_useful.reindex(refs)
# cols = ['1 star', '2 star', '3 star', '4 star', '5 star', 
#                   'excellent rating', 'very good rating', 'good rating', 'satisfactory rating']
# refs_meta_useful[[c for c in cols if c in refs_meta_useful.columns]].loc[refs]
refs_meta_useful

In [None]:
(6*4+3*2+2+1)/12, (1+4+4+1+3+2+0+0+2+3+3)/12

### check the meta properties

In [None]:
meta_mapping = meta_encoding()

In [None]:
# refs = raw_eg.impressions.dropna().str.split('|')
# refs = list([int(j) for i in refs for j in i])
# Impression list of one clickout, parsed from its pipe-separated form into ints.
refs = [int(item_id) for item_id in
        '14910|52419|53438|15918|15402|128443|772473|626051|158537|1193842|14824|5179324'.split('|')]

In [None]:
# Metadata rows for the items listed in `refs`.
refs_meta = meta_mapping[meta_mapping.item_id.isin(refs)]
# Drop columns whose sum over these items is zero.
s = refs_meta.sum(axis=0)
refs_meta_useful = refs_meta[s[s!=0].index].reset_index(drop=True)
# Drop columns whose sum equals the number of rows, i.e. (assuming 0/1
# indicator columns -- TODO confirm) properties shared by every item.
# The previous hard-coded 24 was stale here: `refs` holds only 12 ids, so the
# comparison could never match and constant columns were never removed.
# `item_id` is always kept because it becomes the index below.
ss = refs_meta_useful.sum(axis=0)
n_items = len(refs_meta_useful)
varying_cols = [c for c in ss.index if c == 'item_id' or ss[c] != n_items]
refs_meta_useful = refs_meta_useful[varying_cols]
refs_meta_useful = refs_meta_useful.set_index('item_id')
# Restore the impression order; ids without metadata become all-NaN rows.
refs_meta_useful = refs_meta_useful.reindex(refs)
refs_meta_useful

In [None]:
cols_sum = refs_meta_useful.sum(axis=0)
one_cols = cols_sum[cols_sum==1].index
refs_meta_useful[one_cols]

In [None]:
meta_mapping[meta_mapping.item_id==110702]

In [None]:
m = pd.read_csv('./data/item_metadata.csv')

In [None]:
m[m.item_id==110702]

In [None]:
meta_mapping.columns.values

In [None]:
meta_mapping[['1 star', '2 star', '3 star', 'from 3 stars', '4 star', 'from 4 stars', '5 star']]

In [None]:
meta_mapping[['satisfactory rating', 'good rating', 'very good rating', 'excellent rating']]

In [None]:
meta_mapping.head()

In [None]:
(meta_mapping.loc[meta_mapping['good rating']==1, 'satisfactory rating']==0).sum()

In [None]:
81/187

In [None]:
1/25

In [None]:
# Parse a pipe-separated price string into ints and dense-rank it.
# `rankdata` comes from the notebook's top import cell; the duplicate
# mid-notebook import was removed.
s = '81|120|81|187|70|45|56|130|124|56|58|77|52|40|120|53|65|65|54|30|59|33|45|108|37'
s = [int(price) for price in s.split('|')]
rankdata(s, method='dense')

In [None]:
17/20

In [None]:
# input_demo = train_inputs[train_inputs.session_id==rid].reset_index(drop=True)
# input_demo

In [None]:
# pp.pprint(input_demo.to_dict(orient='list'))

In [None]:
from create_model_inputs import change_sort_order_mapping
change_sort_order_mapping()

### look at feature distribution

In [32]:
train_inputs.columns.values

array(['session_id', 'step', 'device', 'fs', 'sort_order', 'imp_changed',
       'session_size', 'session_duration', 'last_duration',
       'last_action_type', 'last_reference_relative_loc', 'mean_rating',
       'median_rating', 'mean_star', 'median_star', 'n_imps', 'target',
       'mean_price', 'median_price', 'prices_0', 'prices_1', 'prices_2',
       'prices_3', 'prices_4', 'prices_5', 'prices_6', 'prices_7',
       'prices_8', 'prices_9', 'prices_10', 'prices_11', 'prices_12',
       'prices_13', 'prices_14', 'prices_15', 'prices_16', 'prices_17',
       'prices_18', 'prices_19', 'prices_20', 'prices_21', 'prices_22',
       'prices_23', 'prices_24', 'prices_rank_0', 'prices_rank_1',
       'prices_rank_2', 'prices_rank_3', 'prices_rank_4', 'prices_rank_5',
       'prices_rank_6', 'prices_rank_7', 'prices_rank_8', 'prices_rank_9',
       'prices_rank_10', 'prices_rank_11', 'prices_rank_12',
       'prices_rank_13', 'prices_rank_14', 'prices_rank_15',
       'prices_rank_16', 'pr

In [35]:
nans = train_inputs.isna().sum()
nans[nans>0].to_dict()

{'imp_changed': 259463,
 'last_duration': 97186,
 'last_reference_relative_loc': 210298,
 'mean_rating': 2,
 'median_rating': 2,
 'mean_star': 2,
 'median_star': 2,
 'prices_1': 1185,
 'prices_2': 2703,
 'prices_3': 4331,
 'prices_4': 6054,
 'prices_5': 7955,
 'prices_6': 10027,
 'prices_7': 12033,
 'prices_8': 14251,
 'prices_9': 16759,
 'prices_10': 19536,
 'prices_11': 25847,
 'prices_12': 31699,
 'prices_13': 37384,
 'prices_14': 42905,
 'prices_15': 47967,
 'prices_16': 53124,
 'prices_17': 57973,
 'prices_18': 62607,
 'prices_19': 67045,
 'prices_20': 71446,
 'prices_21': 75468,
 'prices_22': 79503,
 'prices_23': 83572,
 'prices_24': 101427,
 'prices_rank_1': 1185,
 'prices_rank_2': 2703,
 'prices_rank_3': 4331,
 'prices_rank_4': 6054,
 'prices_rank_5': 7955,
 'prices_rank_6': 10027,
 'prices_rank_7': 12033,
 'prices_rank_8': 14251,
 'prices_rank_9': 16759,
 'prices_rank_10': 19536,
 'prices_rank_11': 25847,
 'prices_rank_12': 31699,
 'prices_rank_13': 37384,
 'prices_rank_14': 4

### country related

In [None]:
cp = load_data('train', usecols=['city', 'platform'])

In [None]:
# The country is the text after the last ', ' in `city`, lower-cased.
cp['country'] = cp['city'].str.split(', ').str[-1].str.lower()

In [None]:
countries = cp['country'].unique()

In [None]:
platforms = cp['platform'].unique()

In [None]:
# `.item()` unwraps a 0-d object array, which NumPy stores with pickle.
# NumPy >= 1.16.3 refuses to load pickled data unless allow_pickle=True.
# NOTE(review): unpickling executes arbitrary code -- only load trusted files.
country2code = np.load('data/country2code.npy', allow_pickle=True).item()

In [None]:
# `.item()` unwraps a 0-d object array, which NumPy stores with pickle.
# NumPy >= 1.16.3 refuses to load pickled data unless allow_pickle=True.
# NOTE(review): unpickling executes arbitrary code -- only load trusted files.
code2country = np.load('data/code2country.npy', allow_pickle=True).item()

In [None]:
set(countries) - set(list(country2code.keys()))

In [None]:
set(list(country2code.keys())) - set(countries)

In [None]:
fix_dict = {'bermudas': 'bermuda', 
            'bes islands': None,
            'brunei':'brunei darussalam',
            'china': "people's republic of china",
            'crimea': None,
            'curacao': 'netherlands antilles',
            'democratic republic of congo': 'congo, the democratic republic of',
            'east timor': 'timor',
            'french antilles': None,
            'guinea-bissau': None,
            'iran': 'iran, islamic republic of',
            'ivory coast': "côte d'ivoire",
            'kosovo': None,
            'laos': "lao people's democratic republic"
           }

In [None]:
train_raw.impressions.notna()[train_raw.impressions.notna()].index[0]

In [None]:
a = pd.read_parquet('./gbm_cache/preprocess_train_5000000_test_added.snappy')

In [None]:
a.reference.head()

In [None]:
a.reference.isna().sum()

In [None]:
a[a.reference.isna()]

In [None]:
b = pd.read_csv('./data/test.csv')

In [None]:
b[b.session_id=='2a181b2125efe']

In [None]:
# Convert the epoch-second timestamps to naive UTC datetimes in one vectorized
# call. This replaces a row-wise `apply(datetime.datetime.utcfromtimestamp)`
# (deprecated since Python 3.12) and a duplicate mid-notebook `import datetime`.
b['timestamp'] = pd.to_datetime(b['timestamp'], unit='s')


In [None]:
b.timestamp

In [None]:
a = pd.DataFrame({'A': np.random.randint(0, 3 ,10), 'B': np.random.randint(0, 5 ,10)})
a.loc[np.random.choice(a.index, 3), 'B'] = np.nan
a

In [None]:
c = a.groupby('A')['B'].apply(list).reset_index()
c

In [None]:
# Per-group lists of B as a plain Python list of lists.
c['B'].tolist()

In [None]:
m = {0: 'E', 1: 'Q'}
a.A.map(m)