In [1]:
import pandas as pd, numpy as np
from tqdm.notebook import tqdm
import os, sys, pickle, glob, gc
from collections import Counter
import itertools
pd.set_option('display.max_columns', None)
from otto_utils import *

In [2]:
# Run switch: 1 selects the "otto-chunk-data-inparquet-format" dataset,
# 0 selects "otto-validation". The value is also embedded in the name of the
# feature file written at the end of the notebook.
MODE = 1

In [3]:
# Integer code assigned to each event type when the raw chunks are loaded.
type_labels = {'clicks':0, 'carts':1, 'orders':2}

# Input dataset directory (under input/) for each supported MODE value.
_DATASET_BY_MODE = {
    1: "otto-chunk-data-inparquet-format",
    0: "otto-validation",
}

# Fail immediately on an unsupported MODE instead of leaving `dataset`
# undefined (or silently reusing a stale value from an earlier kernel run).
if MODE not in _DATASET_BY_MODE:
    raise ValueError(
        f"Unsupported MODE={MODE!r}; expected one of {sorted(_DATASET_BY_MODE)}"
    )
dataset = _DATASET_BY_MODE[MODE]

def load_test():
    """Read every test parquet chunk and return them as one DataFrame.

    Files are taken from ``input/{dataset}/test_parquet/`` in sorted path
    order so the row order of the result is reproducible (plain ``glob``
    order depends on the file system).

    For each chunk:
      * ``ts`` is divided by 1000 and cast to int32
        (presumably milliseconds -> seconds; confirm against the dataset);
      * ``type`` is mapped through ``type_labels`` and cast to int8.

    Returns:
        pandas.DataFrame: all chunks concatenated, with a fresh RangeIndex.

    Raises:
        ValueError: if no chunk files are found.
    """
    pattern = f'input/{dataset}/test_parquet/*'
    chunk_files = sorted(glob.glob(pattern))
    if not chunk_files:
        # pd.concat([]) would also raise ValueError, but without saying why.
        raise ValueError(f"No test parquet chunks found matching {pattern!r}")

    dfs = []
    for chunk_file in chunk_files:
        chunk = pd.read_parquet(chunk_file)
        chunk['ts'] = (chunk['ts'] / 1000).astype('int32')
        chunk['type'] = chunk['type'].map(type_labels).astype('int8')
        dfs.append(chunk)
    return pd.concat(dfs).reset_index(drop=True)

# Load the full test set once and report its size.
test_df = load_test()
print('Test data has shape', test_df.shape)
n_sessions = test_df['session'].nunique()
print(f'{n_sessions} unique sessions')

Test data has shape (6928123, 4)
1671803 unique sessions


In [4]:
# Order events by session, newest first inside each session, then add `n`:
# the 0-based position of the event within its session in that order
# (n == 0 is the session's most recent event).
test_df = (
    test_df
    .sort_values(by=['session', 'ts'], ascending=[True, False], ignore_index=True)
    .assign(n=lambda events: events.groupby('session').cumcount())
)

In [5]:
# Build one row per (session, aid) pair. For each event type the pair gets:
#   a2s_<type>s_num       - number of events of that type
#   a2s_last_<type>_ts    - largest ts among those events
#   a2s_last_<type>_index - smallest `n` among those events
# Per-type tables are outer-merged, so a pair that lacks some event type has
# NaN in that type's columns at this point.
feats_df = None  # explicit start state; do not rely on which type comes first
for type_name, type_code in type_labels.items():
    print(f"handling {type_name} ...")
    singular = type_name[:-1]  # 'clicks' -> 'click', 'carts' -> 'cart', ...

    # Named aggregation with string reducers: output columns are named
    # directly (no positional renaming of a MultiIndex) and the deprecated
    # use of the builtins max/min as aggregators is avoided.
    per_type = (
        test_df[test_df['type'] == type_code]
        .groupby(['session', 'aid'], as_index=False)
        .agg(**{
            f'a2s_{type_name}_num': ('ts', 'count'),
            f'a2s_last_{singular}_ts': ('ts', 'max'),
            f'a2s_last_{singular}_index': ('n', 'min'),
        })
    )

    if feats_df is None:
        feats_df = per_type
    else:
        feats_df = feats_df.merge(per_type, on=['session', 'aid'], how='outer')

del per_type
# gc_clear is not defined in this notebook; presumably it comes from the
# `otto_utils` star import and frees memory - confirm.
gc_clear()

# Column groups produced by the per-type aggregation.
num_columns = ['a2s_clicks_num','a2s_carts_num','a2s_orders_num']
index_columns = ['a2s_last_click_index','a2s_last_cart_index','a2s_last_order_index']
ts_columns = ['a2s_last_click_ts','a2s_last_cart_ts','a2s_last_order_ts']

# Fill the gaps left by the outer merges and settle the dtypes.
# Each entry: (columns, value used where the pair has no such event, dtype).
fill_plan = (
    (num_columns, 0, 'int16'),      # no events of that type -> count 0
    (index_columns, 999, 'int16'),  # 999 marks "no such event" for positions
    (ts_columns, -1, 'int'),        # -1 marks "no such event" for timestamps
)
for group_columns, missing_value, target_dtype in fill_plan:
    for col in group_columns:
        feats_df[col] = feats_df[col].fillna(missing_value).astype(target_dtype)

# Aggregates across the three event types.
feats_df['a2s_actions_num'] = feats_df[num_columns].sum(axis='columns')
feats_df['a2s_last_action_index'] = feats_df[index_columns].min(axis='columns')
feats_df['a2s_last_action_ts'] = feats_df[ts_columns].max(axis='columns')

# Highest event type seen for the pair: 2 if any order, else 1 if any cart,
# else 0. np.select takes the first matching condition, so orders win.
feats_df['a2s_best_action_type'] = np.select(
    [feats_df['a2s_orders_num'] > 0, feats_df['a2s_carts_num'] > 0],
    [2, 1],
    default=0,
).astype('int8')

feats_df

handling clicks ...
handling carts ...
handling orders ...


Unnamed: 0,session,aid,a2s_clicks_num,a2s_last_click_ts,a2s_last_click_index,a2s_carts_num,a2s_last_cart_ts,a2s_last_cart_index,a2s_orders_num,a2s_last_order_ts,a2s_last_order_index,a2s_actions_num,a2s_last_action_index,a2s_last_action_ts,a2s_best_action_type
0,12899779,59625,1,1661724000,0,0,-1,999,0,-1,999,1,0,1661724000,0
1,12899780,582732,1,1661724058,3,0,-1,999,0,-1,999,1,3,1661724058,0
2,12899780,736515,1,1661724136,1,0,-1,999,0,-1,999,1,1,1661724136,0
3,12899780,973453,1,1661724109,2,0,-1,999,0,-1,999,1,2,1661724109,0
4,12899780,1142000,2,1661724155,0,0,-1,999,0,-1,999,2,0,1661724155,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5006552,14567528,172423,0,-1,999,0,-1,999,1,1662326425,8,1,8,1662326425,2
5006553,14567528,1708491,0,-1,999,0,-1,999,1,1662326425,9,1,9,1662326425,2
5006554,14568250,422075,0,-1,999,0,-1,999,1,1662326667,0,1,0,1662326667,2
5006555,14568250,471339,0,-1,999,0,-1,999,1,1662326667,1,1,1,1662326667,2


In [6]:
# Columns written to the feature file, in output order.
# NOTE(review): 'a2s_last_order_index' is computed earlier but is not part of
# this selection, unlike the click/cart index columns - confirm this is intended.
selected_feats = (
    # join keys
    ['session', 'aid']
    # overall summary of the pair
    + ['a2s_actions_num', 'a2s_best_action_type']
    # per-type event counts
    + ['a2s_carts_num', 'a2s_clicks_num', 'a2s_orders_num']
    # positions of the most recent events
    + ['a2s_last_action_index', 'a2s_last_cart_index', 'a2s_last_click_index']
    # timestamps of the most recent events
    + ['a2s_last_action_ts', 'a2s_last_click_ts', 'a2s_last_cart_ts', 'a2s_last_order_ts']
)

In [7]:
# Persist the selected features; MODE is part of the file name so outputs of
# different runs do not overwrite each other.
feats_path = f"feats/FE_aids2sessions_{MODE}.pqt"
# Create the output directory up front so the write does not fail, after all
# the feature computation, just because the directory is missing.
os.makedirs(os.path.dirname(feats_path), exist_ok=True)
feats_df[selected_feats].to_parquet(feats_path, index=False)

In [8]:
# Display the columns that were selected for the feature file.
feats_df.loc[:, selected_feats]

Unnamed: 0,session,aid,a2s_actions_num,a2s_best_action_type,a2s_carts_num,a2s_clicks_num,a2s_orders_num,a2s_last_action_index,a2s_last_cart_index,a2s_last_click_index,a2s_last_action_ts,a2s_last_click_ts,a2s_last_cart_ts,a2s_last_order_ts
0,12899779,59625,1,0,0,1,0,0,999,0,1661724000,1661724000,-1,-1
1,12899780,582732,1,0,0,1,0,3,999,3,1661724058,1661724058,-1,-1
2,12899780,736515,1,0,0,1,0,1,999,1,1661724136,1661724136,-1,-1
3,12899780,973453,1,0,0,1,0,2,999,2,1661724109,1661724109,-1,-1
4,12899780,1142000,2,0,0,2,0,0,999,0,1661724155,1661724155,-1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5006552,14567528,172423,1,2,0,0,1,8,999,999,1662326425,-1,-1,1662326425
5006553,14567528,1708491,1,2,0,0,1,9,999,999,1662326425,-1,-1,1662326425
5006554,14568250,422075,1,2,0,0,1,0,999,999,1662326667,-1,-1,1662326667
5006555,14568250,471339,1,2,0,0,1,1,999,999,1662326667,-1,-1,1662326667
