In [1]:
import pandas as pd, numpy as np
from tqdm.notebook import tqdm
import os, sys, pickle, glob, gc
from collections import Counter
import itertools
pd.set_option('display.max_columns', None)
from otto_utils import *

In [2]:
# Dataset selector: 1 -> "otto-chunk-data-inparquet-format", 0 -> "otto-validation".
# The value is also embedded in the output file name (feats/FE_sessions_{MODE}.pqt).
MODE = 1

In [3]:
# Integer codes used to encode the string event types of the 'type' column.
type_labels = {'clicks':0, 'carts':1, 'orders':2}

# Input dataset folder for every supported MODE value.
_DATASET_BY_MODE = {
    1: "otto-chunk-data-inparquet-format",
    0: "otto-validation",
}

# Fail fast on an unsupported MODE: otherwise `dataset` would stay undefined and
# the notebook would only crash later with a NameError while loading the data.
if MODE not in _DATASET_BY_MODE:
    raise ValueError(
        f"Unsupported MODE={MODE!r}; expected one of {sorted(_DATASET_BY_MODE)}"
    )
dataset = _DATASET_BY_MODE[MODE]

def load_test():
    """Load every test parquet chunk of the selected dataset into one DataFrame.

    Reads all files matching ``input/{dataset}/test_parquet/*``. For each chunk,
    ``ts`` is divided by 1000 and stored as int32, and ``type`` is mapped through
    ``type_labels`` and stored as int8.

    Returns:
        pandas.DataFrame: all chunks concatenated, with a fresh 0..n-1 index.

    Raises:
        FileNotFoundError: if no file matches the chunk pattern.
    """
    pattern = f'input/{dataset}/test_parquet/*'
    # glob yields files in arbitrary, OS-dependent order; sorting makes the row
    # order of the concatenated frame reproducible across runs and machines.
    chunk_files = sorted(glob.glob(pattern))
    if not chunk_files:
        raise FileNotFoundError(f'No test parquet chunks found matching {pattern!r}')

    dfs = []
    for chunk_file in chunk_files:
        chunk = pd.read_parquet(chunk_file)
        # presumably ts is stored in milliseconds, so this yields seconds — TODO confirm
        chunk['ts'] = (chunk['ts'] / 1000).astype('int32')
        chunk['type'] = chunk['type'].map(type_labels).astype('int8')
        dfs.append(chunk)
    return pd.concat(dfs).reset_index(drop=True)

# Load the complete test set once; every feature below is derived from this frame.
test_df = load_test()
# test_df = pd.read_parquet("input/otto-shimacos-validation/shimacos_test_0.parquet")
frame_shape = test_df.shape
session_count = test_df['session'].nunique()
print('Test data has shape', frame_shape)
print(f'{session_count} unique sessions')

Test data has shape (6928123, 4)
1671803 unique sessions


In [4]:
# Order events by session and then by timestamp; later cells rely on this order
# (per-session ts.diff() and keep='first'/'last' de-duplication).
test_df = test_df.sort_values(by=['session', 'ts'], ignore_index=True)
# One row per session: the frame every feature column is attached to.
session_features = test_df[['session']].drop_duplicates(ignore_index=True)

In [5]:
def add_new_col(session_features, fromdf, new_col, group_col, fillna=0, dtype='float'):
    """Attach a per-session value from ``fromdf`` to ``session_features`` in place.

    Args:
        session_features: frame with a 'session' column; receives the new column.
        fromdf: frame with a 'session' column and the column ``group_col``.
        new_col: name of the column created (or overwritten) in ``session_features``.
        group_col: column of ``fromdf`` holding the values to copy over.
        fillna: value used for sessions that have no entry in ``fromdf``.
        dtype: dtype the resulting column is cast to.

    Returns:
        None. ``session_features`` is modified in place.
    """
    # session -> value lookup; if a session appears more than once in fromdf,
    # the last occurrence wins.
    keys = fromdf['session']
    values = fromdf[group_col]
    lookup = dict(zip(keys, values))

    mapped = session_features['session'].map(lookup)
    session_features[new_col] = mapped.fillna(fillna).astype(dtype)

### Actions and Items per Session

In [6]:
# Per session: total number of events ('actions') and number of distinct aids ('items').
items = (
    test_df.groupby('session')['aid']
    .agg(actions='count', items='nunique')
    .reset_index()
)

add_new_col(session_features, items, 'session_actions', 'actions', fillna=0, dtype='int16')
add_new_col(session_features, items, 'session_items', 'items', fillna=0, dtype='int16')

del items
gc_clear()

### Actions and Items per Session and Type

In [7]:
# Event count and distinct-aid count for every (session, type) pair.
items_by_action = (
    test_df.groupby(['session', 'type'])['aid']
    .agg(actions='count', items='nunique')
    .reset_index()
)

for type_name, type_code in type_labels.items():
    # Rows of the current event type only; sessions without it get the fill value 0.
    of_type = items_by_action[items_by_action['type'] == type_code]

    add_new_col(
        session_features,
        of_type[['session', 'actions']],
        new_col=f'session_{type_name}',
        group_col='actions',
        fillna=0,
        dtype='int16',
    )
    # type_name[:-1] + 'ed' turns clicks/carts/orders into clicked/carted/ordered.
    add_new_col(
        session_features,
        of_type[['session', 'items']],
        new_col=f'session_items_{type_name[:-1]}ed',
        group_col='items',
        fillna=0,
        dtype='int16',
    )

del items_by_action
gc_clear()

### Session Length

In [8]:
# First and last timestamp of every session, plus the span between them.
session_bounds = test_df.groupby('session')['ts'].agg(['min', 'max']).reset_index()
session_bounds['length'] = session_bounds['max'] - session_bounds['min']

add_new_col(session_features, session_bounds, 'session_full_length', 'length', fillna=0, dtype='int')
add_new_col(session_features, session_bounds, 'session_first_ts', 'min', fillna=-1, dtype='int')
add_new_col(session_features, session_bounds, 'session_last_ts', 'max', fillna=-1, dtype='int')

del session_bounds
gc_clear()

### Real Sessions (sub-sessions split at gaps longer than 2 hours)

In [9]:
# A gap larger than this between two consecutive events of a session starts a new
# "real" session (2 hours, assuming ts is expressed in seconds).
REAL_SESSION_GAP_SECONDS = 2 * 60 * 60

# 1 where the gap to the previous event of the same session exceeds the threshold.
# The first event of a session has a NaN gap; NaN > x is False, so it is 0 and no
# fillna is needed. Relies on test_df being sorted by session and ts.
test_df['d'] = (
    test_df.groupby('session')['ts'].diff() > REAL_SESSION_GAP_SECONDS
).astype('int8')
# Running count of such gaps = 0-based index of the real session of each event.
test_df['d'] = test_df.groupby('session')['d'].cumsum()

# Use the built-in groupby max reduction: passing the Python builtin `max` to agg
# is deprecated in recent pandas and emits a FutureWarning.
real_sessions = (test_df.groupby('session')['d'].max() + 1).reset_index()

add_new_col(session_features, real_sessions, 'session_real_num', 'd', fillna=0, dtype='int8')

del real_sessions
gc_clear()

In [10]:
# First and last timestamp of every real session (session, d). The aggregations
# are passed as strings: handing the Python builtins min/max to agg is deprecated
# in recent pandas and emits a FutureWarning.
real_sessions_length = (
    test_df.groupby(['session', 'd'])['ts'].agg(['min', 'max']).reset_index()
)
# Duration of each real session, computed directly instead of via diff(axis=1)
# and a throw-away placeholder column.
real_sessions_length['sess_len'] = real_sessions_length['max'] - real_sessions_length['min']
# Average real-session duration per session.
real_sessions_length = (
    real_sessions_length.groupby('session')['sess_len'].mean().reset_index()
)

add_new_col(session_features, real_sessions_length, 'session_avg_real_length', 'sess_len', fillna=0)

del real_sessions_length
gc_clear()

In [11]:
# Distinct aids inside every real session (session, d), then averaged per session.
unique_per_real = test_df.groupby(['session', 'd'])['aid'].nunique()
items_per_real_session = (
    unique_per_real.groupby(level='session').mean()
    .rename('aid_avg')
    .reset_index()
)

add_new_col(session_features, items_per_real_session, 'session_avg_real_items_num', 'aid_avg', fillna=0)

del unique_per_real, items_per_real_session
gc_clear()

In [12]:
# Average per real session: each per-session count divided by the number of real sessions.
for col in ('actions', 'clicks', 'carts', 'orders'):
    per_session_count = session_features[f'session_{col}']
    session_features[f'session_{col}_avg_real'] = per_session_count.div(
        session_features['session_real_num']
    )

### Average Hour of Actions per Type

In [13]:
# Time-of-day and day index are derived from ts arithmetic alone; no timezone
# conversion is applied.
SECONDS_PER_HOUR = 3600
SECONDS_PER_DAY = 24 * SECONDS_PER_HOUR

# Fractional hour within the day, in [0, 24).
test_df['hour'] = (test_df['ts'] % SECONDS_PER_DAY) / SECONDS_PER_HOUR
# Day index within a 7-day cycle, in 0..6.
# NOTE(review): if ts is Unix-epoch seconds, index 0 is a Thursday (1970-01-01) — confirm.
test_df['dow'] = (test_df['ts'] % (7 * SECONDS_PER_DAY)) // SECONDS_PER_DAY

In [14]:
# Arithmetic mean of the event hour for every (session, type) pair.
hour_by_action = (
    test_df.groupby(['session', 'type'])['hour']
    .mean()
    .rename('avg_hour')
    .reset_index()
)

for type_name, type_code in type_labels.items():
    # Sessions without any event of this type are marked with -1.
    of_type = hour_by_action.loc[hour_by_action['type'] == type_code, ['session', 'avg_hour']]
    add_new_col(
        session_features,
        of_type,
        new_col=f'session_{type_name}_avg_hour',
        group_col='avg_hour',
        fillna=-1,
    )

del hour_by_action
gc_clear()

### Time Difference Between Consecutive Actions

In [15]:
# Gap to the previous event inside the same real session (NaN for its first event).
# No filter on 'type' is applied, so the gaps cover all event types even though
# the resulting features are named "click_diff".
test_df['tdiff'] = test_df.groupby(['session', 'd'])['ts'].diff()
clicks_diff = (
    test_df.groupby('session')['tdiff']
    .agg(click_diff_mean='mean', click_diff_median='median')
    .reset_index()
)

# Sessions whose gaps are all NaN are marked with -1.
for stat in ('mean', 'median'):
    add_new_col(session_features, clicks_diff, f'session_click_diff_{stat}', f'click_diff_{stat}', fillna=-1)

del clicks_diff, test_df['tdiff']
gc_clear()

### Day of Week (DOW)

In [16]:
# Day index of the first and of the last row of every session in test_df.
session_dow = test_df[['session', 'dow']]
first_df = session_dow.drop_duplicates(subset='session', keep='first')
last_df = session_dow.drop_duplicates(subset='session', keep='last')

add_new_col(session_features, first_df, 'session_start_dow', 'dow', fillna=-1, dtype='int8')
add_new_col(session_features, last_df, 'session_end_dow', 'dow', fillna=-1, dtype='int8')

del session_dow, first_df, last_df
gc_clear()

In [17]:
# Columns of session_features that are exported and displayed below, in output
# order. 'session' is the key column; the other computed features are not kept.
selected_feats = [
    'session',
    'session_actions',
    'session_avg_real_items_num',
    'session_avg_real_length',
    'session_carts',
    'session_carts_avg_hour',
    'session_carts_avg_real',
    'session_click_diff_mean',
    'session_clicks',
    'session_orders',
    'session_full_length',
    'session_items',
    'session_items_carted',
    'session_items_clicked',
    'session_last_ts'
]

In [18]:
# Create the output folder if needed so the export does not fail on a fresh
# checkout where feats/ is missing.
os.makedirs("feats", exist_ok=True)
# Export the selected per-session features; MODE is part of the file name.
session_features[selected_feats].to_parquet(f"feats/FE_sessions_{MODE}.pqt",index=False)

In [19]:
# Show the exported feature columns as the cell output.
session_features[selected_feats]

Unnamed: 0,session,session_actions,session_avg_real_items_num,session_avg_real_length,session_carts,session_carts_avg_hour,session_carts_avg_real,session_click_diff_mean,session_clicks,session_orders,session_full_length,session_items,session_items_carted,session_items_clicked,session_last_ts
0,12899779,1,1.00,0.0,0,-1.000000,0.00,-1.000000,1,0,0,1,0,1,1661724000
1,12899780,5,4.00,155.0,0,-1.000000,0.00,38.750000,5,0,155,4,0,4,1661724155
2,12899781,11,2.00,291.0,1,13.946944,0.25,166.285714,10,0,336160,5,1,5,1662060160
3,12899782,70,11.25,929.0,16,17.024531,4.00,56.303030,46,8,79953,38,14,32,1661803953
4,12899783,11,3.00,2178.0,0,-1.000000,0.00,816.750000,11,0,317140,9,0,9,1662041140
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1671798,14571577,1,1.00,0.0,0,-1.000000,0.00,-1.000000,1,0,0,1,0,1,1662328774
1671799,14571578,1,1.00,0.0,0,-1.000000,0.00,-1.000000,1,0,0,1,0,1,1662328775
1671800,14571579,1,1.00,0.0,0,-1.000000,0.00,-1.000000,1,0,0,1,0,1,1662328775
1671801,14571580,1,1.00,0.0,0,-1.000000,0.00,-1.000000,1,0,0,1,0,1,1662328781
