In [1]:
import pandas as pd
import numpy as np
import datetime
import time 
import os
import gc
import sys
from functools import partial

from utils import ignore_warnings, load_data

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
# ignore_warnings()

In [3]:
%%time
# Row cap for quick experiments; presumably None makes load_data read the whole file — confirm in utils.
# nrows = 10000
nrows = None
# NOTE(review): load_data comes from the project-local `utils` module; later cells use the result as a pandas DataFrame.
train = load_data('train', nrows=nrows)#, verbose=True)

CPU times: user 21.7 s, sys: 1.45 s, total: 23.1 s
Wall time: 23.1 s


In [None]:
# Load the test split with the same row cap as train.
# NOTE(review): this cell has no execution count and `test` is not referenced by any later visible cell.
test = load_data('test', nrows=nrows)

In [4]:
# Item metadata; later cells read and re-encode its `properties` column.
meta = load_data('item_metadata', verbose=True)

Memory usage: 295.55 mb


### Convert all string columns to lower case

In [5]:
%%time
# Normalise casing before any encoding: `current_filters` (train) and
# `properties` (meta) are later merged into one shared mapper, so both sides
# must use the same casing for identical tokens to receive the same id.
cols_lower = ['action_type', 'reference', 'platform', 'city', 'device', 'current_filters']
for c in cols_lower:
    print('lowering ', c)
    train[c] = train[c].str.lower()

# Same normalisation for the metadata property strings.
meta['properties'] = meta['properties'].str.lower()


lowering  action_type
lowering  reference
lowering  platform
lowering  city
lowering  device
lowering  current_filters
CPU times: user 16.7 s, sys: 1.51 s, total: 18.2 s
Wall time: 18.2 s


### reduce train df size

In [6]:
%%time
def create_mapper(unique_values):
    """Return a dict mapping each value to its position in ``unique_values``.

    If a value occurs more than once, the position of its last occurrence wins.
    """
    return {value: position for position, value in enumerate(unique_values)}
def convert(df, col, mapper):
    """Replace the non-null values of ``df[col]`` in place using ``mapper``.

    Null entries are left untouched. Nothing is returned; ``df`` is mutated.
    """
    present = df[col].notna()
    encoded = df.loc[present, col].map(mapper)
    df.loc[present, col] = encoded

def replace(str_list, replace_dict):
    """Translate every token of ``str_list`` via ``replace_dict`` and join with '|'.

    Raises ``KeyError`` when a token has no entry in ``replace_dict``.
    """
    translated = []
    for token in str_list:
        translated.append(replace_dict[token])
    return '|'.join(translated)

# Reduce all string-valued columns to integer codes and keep the mapping used
# for each one. Every step mutates `train` / `meta` in place, so this cell is
# not idempotent: re-running it without reloading the data would re-encode
# already encoded values.
t_int = time.time()
# Progress logger: prints the message left-aligned in 40 chars followed by the
# minutes elapsed since `t_int`. Later cells reuse it.
fprint = lambda msg: print(f"{msg:<40} {'='*20} time elapsed = {(time.time()-t_int)/60:.2f} mins")
# ===========================================================================================
# 0) all session_ids and user_ids
unique_user_ids = train.user_id.unique()
user_id_mapper = create_mapper(unique_user_ids)
convert(train, 'user_id', user_id_mapper)
fprint('done user_id')

unique_session_ids = train.session_id.unique()
session_id_mapper = create_mapper(unique_session_ids)
convert(train, 'session_id', session_id_mapper)
fprint('done session_id')
# ===========================================================================================
# 1) get timestamp range and subtract the min
# NOTE(review): `min_ts` is not stored alongside the mappers; keep it if the
# original timestamps ever need to be recovered.
min_ts = train['timestamp'].min()
train['timestamp'] -= min_ts
fprint('done timestamp')
# ===========================================================================================
# 2) action_type
unique_action_types = train.action_type.dropna().unique()
action_mapper = create_mapper(unique_action_types)
convert(train, 'action_type', action_mapper)
fprint('done action_type')
# ===========================================================================================
# 3) all item_ids and reference
# One shared mapper covers the values of `reference` and every id that appears
# inside the pipe-separated `impressions` strings.
impression_lists = train[train.impressions.notna()].impressions.str.split('|')
unique_impressions = list(set([j for i in impression_lists for j in i]))
# NOTE(review): the two lists are concatenated without de-duplication, so a value
# present in both is enumerated twice and keeps the later index. The resulting
# codes are unique per value but not contiguous.
complete_item_ids = list(train[train.reference.notna()].reference.unique()) + unique_impressions
reference_item_id_mapper = create_mapper(complete_item_ids)
convert(train, 'reference', reference_item_id_mapper)
fprint('done reference')

# String version of the codes so encoded impressions can be re-joined with '|'.
reference_item_id_mapper_str = {k: str(v) for k, v in reference_item_id_mapper.items()}
replace_impression = partial(replace, replace_dict=reference_item_id_mapper_str)
# Build the encoded copy in a temporary `imps` column; it is renamed back to
# `impressions` at the end of the cell.
train['imps'] = train['impressions'].str.split('|')
train.loc[train['impressions'].notna(), 'imps'] = train[train['impressions'].notna()]['imps'].apply(replace_impression)
del train['impressions']
fprint('done item_id')
# import re
# # convert the digit value to int instead of string
# complete_item_ids = [i if re.search('[a-zA-Z]', i) else int(i) for i in complete_item_ids]
# item_id_mapper = {v: k for k, v in enumerate(complete_item_ids)}

# ===========================================================================================
# 4) all platform
unique_platform = train.platform.dropna().unique()
platform_mapper = create_mapper(unique_platform)
convert(train, 'platform', platform_mapper)
fprint('done platform')
# ===========================================================================================

# 5) all cities
unique_cities = train.city.dropna().unique()
city_mapper = create_mapper(unique_cities)
convert(train, 'city', city_mapper)
fprint('done city')
# ===========================================================================================

# 6) all device
unique_device = train.device.dropna().unique()
device_mapper = create_mapper(unique_device)
convert(train, 'device', device_mapper)
fprint('done device')
# ===========================================================================================

# 7) filters/properties
# unique filters from filters
filter_lists = train[train.current_filters.notna()].current_filters.str.split('|')
unique_filters = list(set([j for i in filter_lists for j in i]))
# unique properties from meta
properties_lists = meta.properties.str.split('|')
unique_properties = list(set([j for i in properties_lists for j in i]))
# Filters and properties share one vocabulary, hence a single mapper.
# NOTE(review): the order of a list built from a set of strings is not stable
# across interpreter runs, so the property codes may differ between runs.
all_properties = list(set(unique_filters + unique_properties))
properties_mapper = create_mapper(all_properties)
properties_mapper_str = {k: str(v) for k, v in properties_mapper.items()}

replace_properties = partial(replace, replace_dict=properties_mapper_str)
# Temporary `cf` column, renamed back to `current_filters` below.
train['cf'] = train['current_filters'].str.split('|')
train.loc[train['cf'].notna(), 'cf'] = train[train['cf'].notna()]['cf'].apply(replace_properties)
del train['current_filters']
fprint('done cf')

# NOTE(review): unlike the train columns there is no NaN guard here; this
# assumes every meta row has a `properties` string — confirm.
meta['ps_list'] = meta.properties.str.split('|')
meta['ps'] = meta.ps_list.apply(replace_properties)
del meta['ps_list'], meta['properties']
fprint('done meta properties')

# rename the temporary encoded columns back to their original names
train.rename(columns={'imps': 'impressions', 'cf': 'current_filters'}, inplace=True)
meta.rename(columns={'ps': 'properties'}, inplace=True)

CPU times: user 1min 50s, sys: 15.2 s, total: 2min 5s
Wall time: 2min 5s


In [7]:
# Collect all the mappings in one dict, keyed by the column each one encodes.
# 'reference_item' is the shared mapper used for both the `reference` column
# and the ids inside `impressions`.
mapper_dict = {
    'user_id': user_id_mapper,
    'session_id': session_id_mapper,
    'action_type': action_mapper,
    'reference_item': reference_item_id_mapper,
    'platform': platform_mapper,
    # Misspelt key from the original cell, kept as an alias so that any
    # existing lookup of 'platfrom' keeps working.
    'platfrom': platform_mapper,
    'city': city_mapper,
    'device': device_mapper,
    'properties': properties_mapper,
}

In [8]:
# Deep memory footprint of `train` (object contents included), in MiB.
train.memory_usage(deep=True).sum() / 1024 ** 2

3032.7572088241577

In [9]:
# Peek at the first rows of the integer-encoded frame.
train.head()

Unnamed: 0,user_id,session_id,timestamp,step,action_type,reference,platform,city,device,prices,impressions,current_filters
0,0,0,7052,1,0,0,0,0,0,,,
1,0,0,7114,2,1,597985,0,0,0,,,
2,0,0,7114,3,1,597985,0,0,0,,,
3,0,0,7124,4,1,597985,0,0,0,,,
4,0,0,7124,5,1,944093,0,0,0,,,


### preprocessing

In [10]:
# Show the action_type codes; the preprocessing and aggregation helpers below
# rely on 'clickout item' being encoded as 2.
mapper_dict['action_type']

{'search for poi': 0,
 'interaction item image': 1,
 'clickout item': 2,
 'interaction item info': 3,
 'interaction item deals': 4,
 'search for destination': 5,
 'filter selection': 6,
 'interaction item rating': 7,
 'search for item': 8,
 'change of sort order': 9}

In [11]:
# Work on a small prefix of train while developing the preprocessing steps.
# NOTE(review): a positional cut at 10,000 rows may truncate the last session — confirm that is acceptable.
df = train.iloc[:10000].copy()

In [12]:
# Peek at the development sample.
df.head()

Unnamed: 0,user_id,session_id,timestamp,step,action_type,reference,platform,city,device,prices,impressions,current_filters
0,0,0,7052,1,0,0,0,0,0,,,
1,0,0,7114,2,1,597985,0,0,0,,,
2,0,0,7114,3,1,597985,0,0,0,,,
3,0,0,7124,4,1,597985,0,0,0,,,
4,0,0,7124,5,1,944093,0,0,0,,,


In [13]:
# Which split is being processed; passed as `mode` to filter_clickout and used in its log message.
data_source = 'train'
# 1) Clipping sessions up to last clickout (if there is clickout)
def clip_last_click(grp, clickout_code=2):
    """Trim one session so that it ends on its last clickout row.

    Parameters
    ----------
    grp : pandas.DataFrame
        Rows of a single session with an ``action_type`` column. Row order is
        taken as given (assumed to be the session's step order — confirm).
    clickout_code : int, default 2
        Integer code of the 'clickout item' action. The default matches the
        action_type mapping printed earlier in this notebook; pass the value
        from the mapping when it differs.

    Returns
    -------
    pandas.DataFrame
        The rows up to and including the last clickout, or ``grp`` unchanged
        when the session contains no clickout.
    """
    click_positions = np.flatnonzero(grp.action_type.values == clickout_code)
    if click_positions.size == 0:
        # no clickout at all: nothing to clip
        return grp
    return grp.iloc[:click_positions[-1] + 1]

fprint('cliping sessions off up to last clickout')
# Clip every session independently; reset_index(drop=True) discards the index
# produced by groupby/apply. `df` is overwritten, so re-running the cell
# operates on the already clipped frame.
df = df.groupby('session_id').apply(clip_last_click).reset_index(drop=True)



In [14]:
# Row/column count after clipping.
df.shape

(8462, 12)

In [15]:
# Spot-check one randomly chosen session. The draw is unseeded, so the
# session displayed differs between runs.
sids = df.session_id.unique()
df[df.session_id==np.random.choice(sids, 1)[0]]

Unnamed: 0,user_id,session_id,timestamp,step,action_type,reference,platform,city,device,prices,impressions,current_filters
8395,569,571,33189,1,1,1178303,23,499,1,,,
8396,569,571,33189,2,1,1178303,23,499,1,,,
8397,569,571,33199,3,1,1178303,23,499,1,,,
8398,569,571,33199,4,1,1178303,23,499,1,,,
8399,569,571,33199,5,1,1178303,23,499,1,,,
8400,569,571,33202,6,2,792053,23,499,1,203|145|99|175|168|130|104|105|263|112|142|121...,707355|567643|1178303|671025|792053|822392|101...,67.0


In [16]:
# 2) Only select sessions that have a click out
def filter_clickout(grp, mode, clickout_code=2):
    """Tell whether a session ends on a usable clickout row.

    Parameters
    ----------
    grp : pandas.DataFrame
        Rows of a single session with ``action_type``, ``reference``,
        ``impressions`` and ``prices`` columns.
    mode : str
        ``'train'``: the last row must be a clickout whose ``impressions``,
        ``reference`` and ``prices`` are all present.
        Any other value (test): the last row must be a clickout whose
        ``reference`` is missing, i.e. the row to be predicted.
    clickout_code : int, default 2
        Integer code of the 'clickout item' action.

    Returns
    -------
    bool
        A plain boolean in both modes. The previous test branch evaluated
        ``.isna()`` on a one-element selection and therefore returned a Series.
    """
    if len(grp) == 0:
        return False

    last_row = grp.iloc[-1]
    # A session whose last action is a clickout necessarily contains a
    # clickout, so no separate scan of the whole session is needed.
    if last_row['action_type'] != clickout_code:
        return False

    if mode == 'train':
        # last row has reference (and impressions/prices) and none is nan
        return bool(last_row[['impressions', 'reference', 'prices']].notna().all())
    # test should have the last reference as nan for clickout
    return bool(pd.isna(last_row['reference']))

fprint('filtering out sessions without clickout and reference, or clickout is not valid')
fprint(f'length before filtering: {len(df):,}')
# Bind the split name so the function can be applied per session.
filter_clickout_ = partial(filter_clickout, mode=data_source)
# One flag per session_id; the truthy entries are the sessions to keep.
valid_clicked = df.groupby('session_id').apply(filter_clickout_)
click_session_ids = valid_clicked[valid_clicked].index
# filter
# `df` is overwritten here, so re-running the cell filters the already filtered frame.
df = df[df.session_id.isin(click_session_ids)].reset_index(drop=True)
# del valid_clicked, click_session_ids
gc.collect()
fprint(f'{data_source} length after filtering: {len(df):,}')



In [17]:
# Row/column count after filtering.
df.shape

(7432, 12)

### Create session features

In [18]:
# some custom funcs used in aggregation
def mean_dwell_time(x):
    """Mean gap between consecutive sorted values of ``x``; NaN for a single value."""
    if len(x) != 1:
        gaps = np.diff(np.sort(x))
        return np.mean(gaps)
    return np.nan
    
def median_dwell_time(x):
    """Median gap between consecutive sorted values of ``x``; NaN for a single value."""
    if len(x) != 1:
        gaps = np.diff(np.sort(x))
        return np.median(gaps)
    return np.nan

def dwell_time_before_last(x):
    """Gap between the two largest values of ``x``.

    Returns NaN when fewer than two values are available. Empty input used to
    raise ``IndexError`` because only the single-value case was guarded.
    """
    if len(x) < 2:
        return np.nan
    sorted_x = np.sort(x)
    return sorted_x[-1] - sorted_x[-2]
    
def var_dwell_time(x):
    """Population variance of the gaps between consecutive sorted values of ``x``.

    NaN is returned for a single value.
    """
    if len(x) != 1:
        gaps = np.diff(np.sort(x))
        return np.var(gaps)
    return np.nan

def get_first(x):
    """Return the element at the first position of ``x``."""
    head = x.iloc[0]
    return head

def get_last(x):
    """Return the element at the last position of ``x``."""
    tail = x.iloc[-1]
    return tail

def n_clickouts(x):
    """Count the entries of ``x`` equal to 2, the code of 'clickout item'."""
    is_clickout = x.values == 2
    return is_clickout.sum()

def click_rel_pos_avg(x):
    """Average 1-based position of the clickouts in ``x``, divided by ``len(x)``.

    A clickout is an entry equal to 2 ('clickout item'). Returns NaN when
    ``x`` holds no clickout (including empty input). Previously the no-clickout
    case took the mean of an empty array, which emits a RuntimeWarning, and
    empty input additionally divided by zero.
    """
    click_positions = np.flatnonzero(x.values == 2) + 1
    if click_positions.size == 0:
        return np.nan
    return np.mean(click_positions) / len(x)

def span(x):
    """Difference between the largest and the smallest value of ``x``."""
    highest = x.max()
    lowest = x.min()
    return highest - lowest

def second_last(x):
    """Return the second-to-last element of ``x``.

    Returns NaN when fewer than two elements are available. Empty input used
    to raise ``IndexError`` because only the single-element case was guarded.
    """
    if len(x) < 2:
        return np.nan
    return x.iloc[-2]

# Aggregations computed per session, keyed by the source column. Custom
# functions contribute their own name to the resulting column name.
session_aggs = {
    'timestamp': [span, mean_dwell_time, var_dwell_time, median_dwell_time, dwell_time_before_last],
    'step': ['max'],
    'action_type': ['nunique', n_clickouts, click_rel_pos_avg, second_last],
    'reference': [second_last],
    'city': ['nunique', get_last],
    'platform': [get_last],
    'device': [get_last],
    'n_imps': [get_last],
    'n_filters': [get_last],
}

# Temporary per-row counts of impressions / active filters; they only exist
# for the aggregation and are removed from `df` again afterwards.
df['n_imps'] = df.impressions.str.split('|').str.len()
gc.collect()
df['n_filters'] = df.current_filters.str.split('|').str.len()
gc.collect()

session_grp = df.groupby('session_id')
session_fts = session_grp.agg(session_aggs)
# Flatten the (column, aggregation) pairs into single '<column>_<aggregation>' names.
session_fts.columns = ['_'.join(pair).strip() for pair in session_fts.columns.values]
for helper_col in ('n_imps', 'n_filters'):
    del df[helper_col]
gc.collect()


63

In [19]:
# Peek at the per-session feature table (indexed by session_id).
session_fts.head()

Unnamed: 0_level_0,timestamp_span,timestamp_mean_dwell_time,timestamp_var_dwell_time,timestamp_median_dwell_time,timestamp_dwell_time_before_last,step_max,action_type_nunique,action_type_n_clickouts,action_type_click_rel_pos_avg,action_type_second_last,reference_second_last,city_nunique,city_get_last,platform_get_last,device_get_last,n_imps_get_last,n_filters_get_last
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0,1025,68.333333,52782.355556,0.0,16.0,16,3,2,0.9375,0.0,3.0,1,0,0,0,25.0,
2,134,3.941176,61.702422,0.0,1.0,35,3,1,1.0,3.0,594497.0,1,2,2,0,25.0,
3,614,102.333333,12221.555556,77.5,330.0,7,4,2,0.928571,2.0,1077990.0,1,3,3,1,25.0,
4,253,4.685185,167.512003,0.0,38.0,55,4,2,0.990909,2.0,779011.0,1,4,4,1,25.0,8.0
5,79,39.5,1056.25,39.5,72.0,3,3,1,1.0,8.0,1028527.0,2,6,0,1,25.0,


### create meta features

In [None]:
def create_rating_colum(meta):
    """Add one count column per rating label to ``meta``, in place.

    For each of 'good rating', 'satisfactory rating' and 'excellent rating' a
    column named with underscores (e.g. ``good_rating``) is added. It holds
    the number of times the label starts a token of the pipe-separated
    ``properties`` string. Nothing is returned.

    The label must sit at the start of the string or directly after a '|', so
    'good rating' is not counted inside 'very good rating'. The previous
    pattern required a preceding '|' and therefore missed a label that was the
    first property; it was also written in a non-raw string, where ``\\|`` is
    an invalid escape sequence.

    NOTE(review): this matches on the property text, so it presumably has to
    run on ``properties`` before they are replaced by integer codes — confirm.
    """
    # create rating columns
    ratings = ['good rating', 'satisfactory rating', 'excellent rating']
    for r in ratings:
        pattern = rf'(?:^|\|){r}'
        meta[r.replace(' ', '_')] = meta.properties.str.findall(pattern).str.len()

In [None]:
def create_meta_fts(meta, df=None):
    """Build per-item features from the item metadata and the action encodings.

    Parameters
    ----------
    meta : pandas.DataFrame
        Needs an ``item_id`` column and a pipe-separated string column
        ``properties``. It is modified in place: the helper and feature
        columns are added to it before the merge.
    df : pandas.DataFrame, optional
        Interaction log forwarded to ``action_encoding``. When omitted,
        ``action_encoding`` is called without arguments exactly as before;
        note that the definition visible in this notebook requires ``df``.

    Returns
    -------
    pandas.DataFrame
        The selected feature columns indexed by ``item_id``. The same frame is
        also written to ``meta_file``.

    Notes
    -----
    NOTE(review): ``meta_file`` is read from the notebook namespace and is not
    defined in the visible cells — confirm it is set before calling.
    NOTE(review): 'n_clicks' is selected below but not created here;
    presumably it is supplied by ``action_encoding`` — verify.
    NOTE(review): the text patterns assume ``properties`` still holds the
    property names, not their integer codes — confirm.
    """
    meta['ps'] = meta['properties'].str.split('|')
    # number of properties
    meta['nprop'] = meta.ps.str.len()
    # star ratings: the digit of an '<n> star' token at the start of the string
    # or right after a '|'. Raw string avoids invalid escapes; expand=False
    # yields a Series rather than a one-column frame.
    meta['star'] = meta.properties.str.extract(r'(?:^|\|)(\d) star', expand=False)
    meta['star'] = meta['star'].astype(float)
    # add ratings
    create_rating_colum(meta)
    # action encodings
    action_encodings = action_encoding() if df is None else action_encoding(df)
    # default inner join: items without any encoded action are dropped
    meta = pd.merge(meta, action_encodings, left_on='item_id', right_on='reference')
    # choose columns
    act_cols = [c for c in action_encodings.columns if c != 'reference']

    use_cols = ['item_id', 'nprop', 'n_clicks', 'star', 'good_rating', 'satisfactory_rating',
                'excellent_rating', 'ps']
    use_cols += act_cols
    meta = meta[use_cols].set_index('item_id')
    meta.to_csv(meta_file)

    return meta

In [29]:
def action_encoding(df):
    # now group on reference ids
    action_grp = df.groupby('reference')['action_type']
    # get value counts for each action type
    action_ctn = action_grp.value_counts()
    action_ctn_df = action_ctn.reset_index(name='ctn')

    # list of all unique action type
    # actions = list(np.sort(df.action_type.unique()))
    action_mapper = mapper_dict['action_type']
    # actions_id = action_mapper.values()
    actions_name = list(action_mapper.keys())

    # create ohe
    ohe = pd.DataFrame(np.eye(len(actions_name), dtype=int)[action_ctn_df.action_type.values], 
                       columns=actions_name)
    ohe = ohe.mul(action_ctn_df['ctn'], axis=0)
    action_ctn_df = pd.concat([action_ctn_df, ohe], axis=1)

    action_encoding = action_ctn_df.groupby('reference')[actions_name].sum()
    # also add normalized percentage over count of each actions over total
    normalized = action_encoding.div(action_encoding.sum(axis=1)+1, axis=0) # +1 for smoothing avoiding leakage
    action_encoding = action_encoding.join(normalized, lsuffix='_ctn', rsuffix='_per')
    # set the popularity (i.e the number of clickout) counts to rank, avoid leakage
    # but even if we convert them to rank it still leaks (e.g. 0 clicks out will always stay behind a rank threshold)
    # so for now we drop it (try to use embeddings)
    del action_encoding['clickout item_ctn']
