In [1]:
import pandas as pd, numpy as np
from tqdm.notebook import tqdm
import os, sys, pickle, glob, gc
from collections import Counter
import itertools
pd.set_option('display.max_columns', None)
from otto_utils import *

In [2]:
# Run mode switch: 1 selects the "otto-chunk-data-inparquet-format" input dataset,
# 0 selects "otto-validation". The value is also embedded in the output file name.
MODE = 1

In [3]:
# Choose the input dataset directory for the configured MODE.
# Any other value fails immediately instead of leaving `dataset` undefined
# (or silently reusing a value left over from a previous kernel run).
if MODE == 1:
    dataset = "otto-chunk-data-inparquet-format"
elif MODE == 0:
    dataset = "otto-validation"
else:
    raise ValueError(f"Unsupported MODE {MODE!r}; expected 0 or 1")

# Integer codes used for the event 'type' column.
type_labels = {'clicks':0, 'carts':1, 'orders':2}

def load_df(name='test'):
    """Load every parquet chunk of one split into a single DataFrame.

    Files matching ``input/{dataset}/{name}_parquet/*`` are read in sorted
    path order, so the row order of the result is reproducible across
    machines (``glob.glob`` itself returns files in arbitrary order).

    For each chunk, ``ts`` is divided by 1000 and stored as int32
    (presumably milliseconds -> seconds; confirm against the dataset), and
    ``type`` is mapped through ``type_labels`` and stored as int8.

    Parameters
    ----------
    name : str
        Split name used in the directory pattern (e.g. 'test' or 'train').

    Returns
    -------
    pandas.DataFrame
        All chunks concatenated, with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If no chunk file matches the pattern.
    """
    pattern = f'input/{dataset}/{name}_parquet/*'
    chunk_files = sorted(glob.glob(pattern))
    if not chunk_files:
        # pd.concat([]) would raise a ValueError too, but without saying which path was empty.
        raise ValueError(f'No parquet chunks found for pattern {pattern!r}')

    dfs = []
    for chunk_file in chunk_files:
        chunk = pd.read_parquet(chunk_file)
        chunk['ts'] = (chunk['ts'] / 1000).astype('int32')
        chunk['type'] = chunk['type'].map(type_labels).astype('int8')
        dfs.append(chunk)
    return pd.concat(dfs, ignore_index=True)

# Load both splits and report their size.
# nunique() counts distinct sessions in vectorised code instead of building
# a Python set with one object per row.
test_df = load_df("test")
print('Test data has shape',test_df.shape)
print(f'{test_df["session"].nunique()} unique sessions')

train_df = load_df("train")
print('Train data has shape',train_df.shape)
print(f'{train_df["session"].nunique()} unique sessions')

Test data has shape (6928123, 4)
1671803 unique sessions
Train data has shape (216716096, 4)
12899779 unique sessions


In [4]:
def get_fe_1(df, sfx, unique=True):
    """Build per-aid conversion and popularity features.

    Parameters
    ----------
    df : pandas.DataFrame
        Events with 'session', 'aid' and 'type' columns, where 'type' is
        coded 0 (CL), 1 (CA), 2 (OR). The frame is not modified.
    sfx : str
        Suffix appended to every feature column name.
    unique : bool, default True
        When True, each (session, aid, type) combination is counted once.

    Returns
    -------
    pandas.DataFrame
        One row per aid with columns (each suffixed with ``_{sfx}``):
        aid_CL2CA, aid_CA2OR, aid_CL2OR (count ratios) and, for each of
        CL/CA/OR, aid_<T>_vs_mean, aid_<T>_rank_int, aid_<T>_rank_pct.
        Ratios are NaN for 0/0 and inf for a non-zero count divided by zero.
    """
    dct = {0:'CL', 1:'CA', 2:'OR'}

    if unique:
        # Rebind instead of inplace=True so the caller's DataFrame is left untouched.
        df = df.drop_duplicates(subset=['session','aid','type'])

    # Event count per (aid, type), spread into one column per type.
    # reindex guarantees all three type columns exist even when a type never
    # occurs in df (the positional rename below would otherwise fail).
    t = (
        df.groupby(['aid','type'])['session'].count()
        .unstack(fill_value=0)
        .reindex(columns=[0, 1, 2], fill_value=0)
    )
    t.columns = ['num_0','num_1','num_2']
    t = t.reset_index()

    t['aid_CL2CA'] = t['num_1']/t['num_0']
    t['aid_CA2OR'] = t['num_2']/t['num_1']
    t['aid_CL2OR'] = t['num_2']/t['num_0']

    for i in [0,1,2]:
        # Popularity relative to the average aid, then rank (1 = most frequent).
        t[f'aid_{dct[i]}_vs_mean'] = t[f'num_{i}'] / t[f'num_{i}'].mean()
        for pct in [False,True]:
            t[
                f'aid_{dct[i]}_rank_{"pct" if pct else "int"}'
            ] = t[f'num_{i}'].rank(method='average', ascending=False, pct=pct)
        # Raw counts are only intermediates; drop them from the output.
        del t[f'num_{i}']

    t.columns = ["aid"] + [c+"_"+sfx for c in t.columns if c!='aid']

    return t

In [5]:
# Per-aid conversion / popularity features, computed separately on test and train.
# Copies are passed so the loaded frames are never altered by the feature builder.
tst = get_fe_1(test_df.copy(), sfx='tst', unique=True)
trn = get_fe_1(train_df.copy(), sfx='trn', unique=True)
feats_df = trn.merge(tst,on=['aid'],how='outer')

# Ratio of every test-side feature to its train-side counterpart.
for c in tst.columns:
    if c!='aid':
        prefix = c[:-3]  # column name without the trailing "tst"
        feats_df[prefix+"tst_vs_trn"] = feats_df[prefix+"tst"] / feats_df[prefix+"trn"]

del trn, tst
gc_clear()

# Replace missing values with out-of-range sentinels.
# The result is assigned back to the column: a chained
# `feats_df[col].fillna(..., inplace=True)` acts on a temporary object and
# does not update feats_df under pandas Copy-on-Write.
for sfx in ['trn','tst','tst_vs_trn']:
    for act in ['CL2CA','CA2OR','CL2OR']:
        col = f'aid_{act}_{sfx}'
        feats_df[col] = feats_df[col].fillna(-1)
    for act in ['CL','CA','OR']:
        col = f'aid_{act}_vs_mean_{sfx}'
        feats_df[col] = feats_df[col].fillna(-1)
        col = f'aid_{act}_rank_int_{sfx}'
        feats_df[col] = feats_df[col].fillna(2e6)
        col = f'aid_{act}_rank_pct_{sfx}'
        feats_df[col] = feats_df[col].fillna(2)

feats_df

Unnamed: 0,aid,aid_CL2CA_trn,aid_CA2OR_trn,aid_CL2OR_trn,aid_CL_vs_mean_trn,aid_CL_rank_int_trn,aid_CL_rank_pct_trn,aid_CA_vs_mean_trn,aid_CA_rank_int_trn,aid_CA_rank_pct_trn,aid_OR_vs_mean_trn,aid_OR_rank_int_trn,aid_OR_rank_pct_trn,aid_CL2CA_tst,aid_CA2OR_tst,aid_CL2OR_tst,aid_CL_vs_mean_tst,aid_CL_rank_int_tst,aid_CL_rank_pct_tst,aid_CA_vs_mean_tst,aid_CA_rank_int_tst,aid_CA_rank_pct_tst,aid_OR_vs_mean_tst,aid_OR_rank_int_tst,aid_OR_rank_pct_tst,aid_CL2CA_tst_vs_trn,aid_CA2OR_tst_vs_trn,aid_CL2OR_tst_vs_trn,aid_CL_vs_mean_tst_vs_trn,aid_CL_rank_int_tst_vs_trn,aid_CL_rank_pct_tst_vs_trn,aid_CA_vs_mean_tst_vs_trn,aid_CA_rank_int_tst_vs_trn,aid_CA_rank_pct_tst_vs_trn,aid_OR_vs_mean_tst_vs_trn,aid_OR_rank_int_tst_vs_trn,aid_OR_rank_pct_tst_vs_trn
0,0,0.000000,-1.000000,0.000000,0.506732,460602.0,0.248222,0.000000,1545169.5,0.832705,0.000000,1256772.0,0.677285,0.000000,-1.000000,0.000000,0.636198,199543.5,0.254687,0.000000,496451.0,0.633644,0.000000,413098.0,0.527256,-1.000000,-1.000000,-1.000000,1.255491,4.332233e-01,1.026043,-1.000000,3.212923e-01,0.760946,-1.000000,3.286976e-01,0.778485
1,1,0.034483,0.000000,0.000000,0.408201,539951.5,0.290984,0.130085,1035279.0,0.557921,0.000000,1256772.0,0.677285,-1.000000,-1.000000,-1.000000,-1.000000,2000000.0,2.000000,-1.000000,2000000.0,2.000000,-1.000000,2000000.0,2.000000,-1.000000,-1.000000,-1.000000,-1.000000,2.000000e+06,2.000000,-1.000000,2.000000e+06,2.000000,-1.000000,2.000000e+06,2.000000
2,2,0.000000,-1.000000,0.000000,0.225214,813738.5,0.438530,0.000000,1545169.5,0.832705,0.000000,1256772.0,0.677285,-1.000000,-1.000000,-1.000000,-1.000000,2000000.0,2.000000,-1.000000,2000000.0,2.000000,-1.000000,2000000.0,2.000000,-1.000000,-1.000000,-1.000000,-1.000000,2.000000e+06,2.000000,-1.000000,2.000000e+06,2.000000,-1.000000,2.000000e+06,2.000000
3,3,0.122161,0.217949,0.026625,17.974922,13652.0,0.007357,20.293184,11903.0,0.006415,13.444350,23275.5,0.012543,0.147826,0.117647,0.017391,18.290694,4520.5,0.005770,26.793551,2626.0,0.003352,25.352252,6047.5,0.007719,1.210089,0.539792,0.653197,1.017567,3.311236e-01,0.784231,1.320323,2.206167e-01,0.522507,1.885718,2.598226e-01,0.615362
4,4,0.065217,0.000000,0.000000,1.942474,156592.0,0.084389,1.170761,246221.0,0.132691,0.000000,1256772.0,0.677285,0.000000,-1.000000,0.000000,0.795248,162864.5,0.207872,0.000000,496451.0,0.633644,0.000000,413098.0,0.527256,0.000000,-1.000000,-1.000000,0.409399,1.040056e+00,2.463262,0.000000,2.016282e+00,4.775349,-1.000000,3.286976e-01,0.778485
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1855598,1855598,0.000000,-1.000000,0.000000,0.098531,1334178.5,0.719000,0.000000,1545169.5,0.832705,0.000000,1256772.0,0.677285,-1.000000,-1.000000,-1.000000,-1.000000,2000000.0,2.000000,-1.000000,2000000.0,2.000000,-1.000000,2000000.0,2.000000,-1.000000,-1.000000,-1.000000,-1.000000,2.000000e+06,2.000000,-1.000000,2.000000e+06,2.000000,-1.000000,2.000000e+06,2.000000
1855599,1855599,0.000000,-1.000000,0.000000,0.154835,1030696.0,0.555451,0.000000,1545169.5,0.832705,0.000000,1256772.0,0.677285,0.000000,-1.000000,0.000000,0.159050,605034.0,0.772233,0.000000,496451.0,0.633644,0.000000,413098.0,0.527256,-1.000000,-1.000000,-1.000000,1.027220,5.870150e-01,1.390282,-1.000000,3.212923e-01,0.760946,-1.000000,3.286976e-01,0.778485
1855600,1855600,0.107143,0.166667,0.017857,0.788250,329754.5,0.177707,0.780507,337408.5,0.181832,0.395422,510666.5,0.275202,1.000000,0.000000,0.000000,0.159050,605034.0,0.772233,1.576091,142235.0,0.181541,0.000000,413098.0,0.527256,9.333333,0.000000,0.000000,0.201775,1.834801e+00,4.345531,2.019317,4.215513e-01,0.998399,0.000000,8.089389e-01,1.915885
1855601,1855601,0.116667,0.000000,0.000000,0.844554,312364.0,0.168336,0.910592,299774.5,0.161551,0.000000,1256772.0,0.677285,-1.000000,-1.000000,-1.000000,-1.000000,2000000.0,2.000000,-1.000000,2000000.0,2.000000,-1.000000,2000000.0,2.000000,-1.000000,-1.000000,-1.000000,-1.000000,2.000000e+06,2.000000,-1.000000,2.000000e+06,2.000000,-1.000000,2.000000e+06,2.000000


In [6]:
def get_ratings(df,sfx):
    """Rank aids by how often they occur for each event type.

    For every entry of ``type_labels`` the aids of that type are counted with
    ``value_counts``; an aid's rating is its 0-based position in that result
    (0 = first row of the value_counts output). The per-type tables are
    outer-merged on 'aid', so an aid missing for some type gets NaN there.

    Returns a DataFrame with 'aid' plus one '<type>_rating_<sfx>' column per type.
    """
    for type_name, type_code in type_labels.items():
        rating_col = f'{type_name}_rating_{sfx}'

        aid_counts = df.loc[df['type']==type_code,'aid'].value_counts()
        ranked = aid_counts.reset_index()
        ranked.columns = ['aid',type_name]
        # Row position in the value_counts ordering is the rating.
        ranked[rating_col] = ranked.index
        ranked = ranked[['aid',rating_col]]

        if type_code==0:
            out = ranked
        else:
            out = out.merge(ranked, on=['aid'], how='outer')
    del ranked
    gc_clear()
    return out

In [7]:
# Popularity ratings computed on test, train and train+test, joined on aid.
ratings_test = get_ratings(test_df,"test")
ratings_train = get_ratings(train_df,"train")
ratings = ratings_test.merge(ratings_train, on=['aid'], how='outer')
del ratings_test, ratings_train

ratings_full = get_ratings(pd.concat([train_df,test_df]),"full")
ratings = ratings.merge(ratings_full, on=['aid'], how='outer')
del ratings_full


feats_df = feats_df.merge(ratings, on=['aid'], how='outer')

del ratings
gc_clear()

# An aid with no rating for a split/type gets the sentinel 2000000, stored as int.
for sfx, act in itertools.product(['test','train','full'], ['clicks','carts','orders']):
    rating_col = f"{act}_rating_{sfx}"
    feats_df[rating_col] = feats_df[rating_col].fillna(2000000).astype(int)

feats_df

Unnamed: 0,aid,aid_CL2CA_trn,aid_CA2OR_trn,aid_CL2OR_trn,aid_CL_vs_mean_trn,aid_CL_rank_int_trn,aid_CL_rank_pct_trn,aid_CA_vs_mean_trn,aid_CA_rank_int_trn,aid_CA_rank_pct_trn,aid_OR_vs_mean_trn,aid_OR_rank_int_trn,aid_OR_rank_pct_trn,aid_CL2CA_tst,aid_CA2OR_tst,aid_CL2OR_tst,aid_CL_vs_mean_tst,aid_CL_rank_int_tst,aid_CL_rank_pct_tst,aid_CA_vs_mean_tst,aid_CA_rank_int_tst,aid_CA_rank_pct_tst,aid_OR_vs_mean_tst,aid_OR_rank_int_tst,aid_OR_rank_pct_tst,aid_CL2CA_tst_vs_trn,aid_CA2OR_tst_vs_trn,aid_CL2OR_tst_vs_trn,aid_CL_vs_mean_tst_vs_trn,aid_CL_rank_int_tst_vs_trn,aid_CL_rank_pct_tst_vs_trn,aid_CA_vs_mean_tst_vs_trn,aid_CA_rank_int_tst_vs_trn,aid_CA_rank_pct_tst_vs_trn,aid_OR_vs_mean_tst_vs_trn,aid_OR_rank_int_tst_vs_trn,aid_OR_rank_pct_tst_vs_trn,clicks_rating_test,carts_rating_test,orders_rating_test,clicks_rating_train,carts_rating_train,orders_rating_train,clicks_rating_full,carts_rating_full,orders_rating_full
0,0,0.000000,-1.000000,0.000000,0.506732,460602.0,0.248222,0.000000,1545169.5,0.832705,0.000000,1256772.0,0.677285,0.000000,-1.000000,0.000000,0.636198,199543.5,0.254687,0.000000,496451.0,0.633644,0.000000,413098.0,0.527256,-1.000000,-1.000000,-1.000000,1.255491,4.332233e-01,1.026043,-1.000000,3.212923e-01,0.760946,-1.000000,3.286976e-01,0.778485,252137,2000000,2000000,508078,2000000,2000000,484443,2000000,2000000
1,1,0.034483,0.000000,0.000000,0.408201,539951.5,0.290984,0.130085,1035279.0,0.557921,0.000000,1256772.0,0.677285,-1.000000,-1.000000,-1.000000,-1.000000,2000000.0,2.000000,-1.000000,2000000.0,2.000000,-1.000000,2000000.0,2.000000,-1.000000,-1.000000,-1.000000,-1.000000,2.000000e+06,2.000000,-1.000000,2.000000e+06,2.000000,-1.000000,2.000000e+06,2.000000,2000000,2000000,2000000,623157,1207610,2000000,627638,1210785,2000000
2,2,0.000000,-1.000000,0.000000,0.225214,813738.5,0.438530,0.000000,1545169.5,0.832705,0.000000,1256772.0,0.677285,-1.000000,-1.000000,-1.000000,-1.000000,2000000.0,2.000000,-1.000000,2000000.0,2.000000,-1.000000,2000000.0,2.000000,-1.000000,-1.000000,-1.000000,-1.000000,2.000000e+06,2.000000,-1.000000,2.000000e+06,2.000000,-1.000000,2.000000e+06,2.000000,2000000,2000000,2000000,950394,2000000,2000000,978377,2000000,2000000
3,3,0.122161,0.217949,0.026625,17.974922,13652.0,0.007357,20.293184,11903.0,0.006415,13.444350,23275.5,0.012543,0.147826,0.117647,0.017391,18.290694,4520.5,0.005770,26.793551,2626.0,0.003352,25.352252,6047.5,0.007719,1.210089,0.539792,0.653197,1.017567,3.311236e-01,0.784231,1.320323,2.206167e-01,0.522507,1.885718,2.598226e-01,0.615362,3461,2773,2553,10455,11920,22313,9857,10904,20955
4,4,0.065217,0.000000,0.000000,1.942474,156592.0,0.084389,1.170761,246221.0,0.132691,0.000000,1256772.0,0.677285,0.000000,-1.000000,0.000000,0.795248,162864.5,0.207872,0.000000,496451.0,0.633644,0.000000,413098.0,0.527256,0.000000,-1.000000,-1.000000,0.409399,1.040056e+00,2.463262,0.000000,2.016282e+00,4.775349,-1.000000,3.286976e-01,0.778485,134592,2000000,2000000,152737,287711,2000000,151982,273680,2000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1855598,1855598,0.000000,-1.000000,0.000000,0.098531,1334178.5,0.719000,0.000000,1545169.5,0.832705,0.000000,1256772.0,0.677285,-1.000000,-1.000000,-1.000000,-1.000000,2000000.0,2.000000,-1.000000,2000000.0,2.000000,-1.000000,2000000.0,2.000000,-1.000000,-1.000000,-1.000000,-1.000000,2.000000e+06,2.000000,-1.000000,2.000000e+06,2.000000,-1.000000,2.000000e+06,2.000000,2000000,2000000,2000000,1535283,2000000,2000000,1559615,2000000,2000000
1855599,1855599,0.000000,-1.000000,0.000000,0.154835,1030696.0,0.555451,0.000000,1545169.5,0.832705,0.000000,1256772.0,0.677285,0.000000,-1.000000,0.000000,0.159050,605034.0,0.772233,0.000000,496451.0,0.633644,0.000000,413098.0,0.527256,-1.000000,-1.000000,-1.000000,1.027220,5.870150e-01,1.390282,-1.000000,3.212923e-01,0.760946,-1.000000,3.286976e-01,0.778485,487974,2000000,2000000,1092786,2000000,2000000,1083148,2000000,2000000
1855600,1855600,0.107143,0.166667,0.017857,0.788250,329754.5,0.177707,0.780507,337408.5,0.181832,0.395422,510666.5,0.275202,1.000000,0.000000,0.000000,0.159050,605034.0,0.772233,1.576091,142235.0,0.181541,0.000000,413098.0,0.527256,9.333333,0.000000,0.000000,0.201775,1.834801e+00,4.345531,2.019317,4.215513e-01,0.998399,0.000000,8.089389e-01,1.915885,490240,111421,2000000,319179,385181,582056,326098,355774,653152
1855601,1855601,0.116667,0.000000,0.000000,0.844554,312364.0,0.168336,0.910592,299774.5,0.161551,0.000000,1256772.0,0.677285,-1.000000,-1.000000,-1.000000,-1.000000,2000000.0,2.000000,-1.000000,2000000.0,2.000000,-1.000000,2000000.0,2.000000,-1.000000,-1.000000,-1.000000,-1.000000,2.000000e+06,2.000000,-1.000000,2.000000e+06,2.000000,-1.000000,2.000000e+06,2.000000,2000000,2000000,2000000,311866,351470,2000000,318330,337946,2000000


In [8]:
def get_multi_percent(df,sfx):
    """Share of sessions that interacted with an aid more than once, per event type.

    For every entry of ``type_labels``: count events per (aid, session), flag
    the pairs with more than one event, and divide the number of flagged
    sessions by the number of sessions for each aid. The per-type results are
    outer-merged on 'aid'.

    Returns a DataFrame with 'aid' plus one 'aid_multi_<type>_percent_<sfx>'
    column per type.
    """
    for type_name, type_code in type_labels.items():
        pct_col = f'aid_multi_{type_name}_percent_{sfx}'

        events = df[df['type']==type_code]
        per_session = events.groupby(['aid','session'],as_index=False).agg({'ts':'count'})
        # 1 when the session has more than one event of this type for the aid, else 0.
        per_session['ts'] = (per_session['ts']>1).astype('int8')

        per_aid = per_session.groupby(['aid'],as_index=False).agg(
            users=('ts','count'),
            multi_users=('ts','sum'),
        )
        per_aid[pct_col] = per_aid['multi_users']/per_aid['users']
        per_aid = per_aid[['aid',pct_col]]

        if type_code==0:
            out = per_aid
        else:
            out = out.merge(per_aid,on=['aid'],how='outer')
    del per_aid
    gc_clear()
    return out

In [9]:
# Repeat-interaction shares computed on test, train and train+test, joined on aid.
multi_test = get_multi_percent(test_df,"test")
multi_train = get_multi_percent(train_df,"train")
multi = multi_test.merge(multi_train, on=['aid'], how='outer')
del multi_test, multi_train

multi_full = get_multi_percent(pd.concat([train_df,test_df]),"full")
multi = multi.merge(multi_full, on=['aid'], how='outer')
del multi_full


feats_df = feats_df.merge(multi, on=['aid'], how='outer')

del multi
gc_clear()

# Aids with no value for a split/type get the sentinel -1.
multi_cols = [
    f"aid_multi_{act}_percent_{sfx}"
    for sfx in ['test','train','full']
    for act in ['clicks','carts','orders']
]
feats_df[multi_cols] = feats_df[multi_cols].fillna(-1)


feats_df

Unnamed: 0,aid,aid_CL2CA_trn,aid_CA2OR_trn,aid_CL2OR_trn,aid_CL_vs_mean_trn,aid_CL_rank_int_trn,aid_CL_rank_pct_trn,aid_CA_vs_mean_trn,aid_CA_rank_int_trn,aid_CA_rank_pct_trn,aid_OR_vs_mean_trn,aid_OR_rank_int_trn,aid_OR_rank_pct_trn,aid_CL2CA_tst,aid_CA2OR_tst,aid_CL2OR_tst,aid_CL_vs_mean_tst,aid_CL_rank_int_tst,aid_CL_rank_pct_tst,aid_CA_vs_mean_tst,aid_CA_rank_int_tst,aid_CA_rank_pct_tst,aid_OR_vs_mean_tst,aid_OR_rank_int_tst,aid_OR_rank_pct_tst,aid_CL2CA_tst_vs_trn,aid_CA2OR_tst_vs_trn,aid_CL2OR_tst_vs_trn,aid_CL_vs_mean_tst_vs_trn,aid_CL_rank_int_tst_vs_trn,aid_CL_rank_pct_tst_vs_trn,aid_CA_vs_mean_tst_vs_trn,aid_CA_rank_int_tst_vs_trn,aid_CA_rank_pct_tst_vs_trn,aid_OR_vs_mean_tst_vs_trn,aid_OR_rank_int_tst_vs_trn,aid_OR_rank_pct_tst_vs_trn,clicks_rating_test,carts_rating_test,orders_rating_test,clicks_rating_train,carts_rating_train,orders_rating_train,clicks_rating_full,carts_rating_full,orders_rating_full,aid_multi_clicks_percent_test,aid_multi_carts_percent_test,aid_multi_orders_percent_test,aid_multi_clicks_percent_train,aid_multi_carts_percent_train,aid_multi_orders_percent_train,aid_multi_clicks_percent_full,aid_multi_carts_percent_full,aid_multi_orders_percent_full
0,0,0.000000,-1.000000,0.000000,0.506732,460602.0,0.248222,0.000000,1545169.5,0.832705,0.000000,1256772.0,0.677285,0.000000,-1.000000,0.000000,0.636198,199543.5,0.254687,0.000000,496451.0,0.633644,0.000000,413098.0,0.527256,-1.000000,-1.000000,-1.000000,1.255491,4.332233e-01,1.026043,-1.000000,3.212923e-01,0.760946,-1.000000,3.286976e-01,0.778485,252137,2000000,2000000,508078,2000000,2000000,484443,2000000,2000000,0.000000,-1.000000,-1.0,0.194444,-1.000000,-1.000000,0.175000,-1.000000,-1.000000
1,1,0.034483,0.000000,0.000000,0.408201,539951.5,0.290984,0.130085,1035279.0,0.557921,0.000000,1256772.0,0.677285,-1.000000,-1.000000,-1.000000,-1.000000,2000000.0,2.000000,-1.000000,2000000.0,2.000000,-1.000000,2000000.0,2.000000,-1.000000,-1.000000,-1.000000,-1.000000,2.000000e+06,2.000000,-1.000000,2.000000e+06,2.000000,-1.000000,2.000000e+06,2.000000,2000000,2000000,2000000,623157,1207610,2000000,627638,1210785,2000000,-1.000000,-1.000000,-1.0,0.137931,0.000000,-1.000000,0.137931,0.000000,-1.000000
2,2,0.000000,-1.000000,0.000000,0.225214,813738.5,0.438530,0.000000,1545169.5,0.832705,0.000000,1256772.0,0.677285,-1.000000,-1.000000,-1.000000,-1.000000,2000000.0,2.000000,-1.000000,2000000.0,2.000000,-1.000000,2000000.0,2.000000,-1.000000,-1.000000,-1.000000,-1.000000,2.000000e+06,2.000000,-1.000000,2.000000e+06,2.000000,-1.000000,2.000000e+06,2.000000,2000000,2000000,2000000,950394,2000000,2000000,978377,2000000,2000000,-1.000000,-1.000000,-1.0,0.062500,-1.000000,-1.000000,0.062500,-1.000000,-1.000000
3,3,0.122161,0.217949,0.026625,17.974922,13652.0,0.007357,20.293184,11903.0,0.006415,13.444350,23275.5,0.012543,0.147826,0.117647,0.017391,18.290694,4520.5,0.005770,26.793551,2626.0,0.003352,25.352252,6047.5,0.007719,1.210089,0.539792,0.653197,1.017567,3.311236e-01,0.784231,1.320323,2.206167e-01,0.522507,1.885718,2.598226e-01,0.615362,3461,2773,2553,10455,11920,22313,9857,10904,20955,0.295652,0.117647,0.5,0.398590,0.115385,0.117647,0.390086,0.115607,0.138889
4,4,0.065217,0.000000,0.000000,1.942474,156592.0,0.084389,1.170761,246221.0,0.132691,0.000000,1256772.0,0.677285,0.000000,-1.000000,0.000000,0.795248,162864.5,0.207872,0.000000,496451.0,0.633644,0.000000,413098.0,0.527256,0.000000,-1.000000,-1.000000,0.409399,1.040056e+00,2.463262,0.000000,2.016282e+00,4.775349,-1.000000,3.286976e-01,0.778485,134592,2000000,2000000,152737,287711,2000000,151982,273680,2000000,0.400000,-1.000000,-1.0,0.260870,0.000000,-1.000000,0.265734,0.000000,-1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1855598,1855598,0.000000,-1.000000,0.000000,0.098531,1334178.5,0.719000,0.000000,1545169.5,0.832705,0.000000,1256772.0,0.677285,-1.000000,-1.000000,-1.000000,-1.000000,2000000.0,2.000000,-1.000000,2000000.0,2.000000,-1.000000,2000000.0,2.000000,-1.000000,-1.000000,-1.000000,-1.000000,2.000000e+06,2.000000,-1.000000,2.000000e+06,2.000000,-1.000000,2.000000e+06,2.000000,2000000,2000000,2000000,1535283,2000000,2000000,1559615,2000000,2000000,-1.000000,-1.000000,-1.0,0.000000,-1.000000,-1.000000,0.000000,-1.000000,-1.000000
1855599,1855599,0.000000,-1.000000,0.000000,0.154835,1030696.0,0.555451,0.000000,1545169.5,0.832705,0.000000,1256772.0,0.677285,0.000000,-1.000000,0.000000,0.159050,605034.0,0.772233,0.000000,496451.0,0.633644,0.000000,413098.0,0.527256,-1.000000,-1.000000,-1.000000,1.027220,5.870150e-01,1.390282,-1.000000,3.212923e-01,0.760946,-1.000000,3.286976e-01,0.778485,487974,2000000,2000000,1092786,2000000,2000000,1083148,2000000,2000000,0.000000,-1.000000,-1.0,0.181818,-1.000000,-1.000000,0.166667,-1.000000,-1.000000
1855600,1855600,0.107143,0.166667,0.017857,0.788250,329754.5,0.177707,0.780507,337408.5,0.181832,0.395422,510666.5,0.275202,1.000000,0.000000,0.000000,0.159050,605034.0,0.772233,1.576091,142235.0,0.181541,0.000000,413098.0,0.527256,9.333333,0.000000,0.000000,0.201775,1.834801e+00,4.345531,2.019317,4.215513e-01,0.998399,0.000000,8.089389e-01,1.915885,490240,111421,2000000,319179,385181,582056,326098,355774,653152,0.000000,0.000000,-1.0,0.232143,0.000000,0.000000,0.228070,0.000000,0.000000
1855601,1855601,0.116667,0.000000,0.000000,0.844554,312364.0,0.168336,0.910592,299774.5,0.161551,0.000000,1256772.0,0.677285,-1.000000,-1.000000,-1.000000,-1.000000,2000000.0,2.000000,-1.000000,2000000.0,2.000000,-1.000000,2000000.0,2.000000,-1.000000,-1.000000,-1.000000,-1.000000,2.000000e+06,2.000000,-1.000000,2.000000e+06,2.000000,-1.000000,2.000000e+06,2.000000,2000000,2000000,2000000,311866,351470,2000000,318330,337946,2000000,-1.000000,-1.000000,-1.0,0.233333,0.000000,-1.000000,0.233333,0.000000,-1.000000


In [12]:
def get_favourite_dows(df,sfx):
    """Find, per aid and event type, the day-of-week bucket with the most events.

    The bucket is ``(ts % one_week) // one_day`` on the 'ts' column, giving
    values 0..6. (If 'ts' is Unix seconds, bucket 0 is a Thursday in UTC,
    because the epoch started on a Thursday; confirm the unit of 'ts'.)
    When several buckets tie for an aid, the lowest bucket is kept.

    The bucket is computed in a separate array, so ``df`` is never modified,
    not even temporarily, and nothing is left behind if an error occurs.

    Parameters
    ----------
    df : pandas.DataFrame
        Events with 'aid', 'session', 'type' and 'ts' columns.
    sfx : str
        Suffix for the output column names.

    Returns
    -------
    pandas.DataFrame
        'aid' plus one 'aid_<type>_favourite_dow_<sfx>' column per entry of
        ``type_labels``, outer-merged on 'aid'.
    """
    seconds_per_day = 24*60*60
    dow = ((df['ts']%(7*seconds_per_day))//seconds_per_day).astype('int8').to_numpy()

    for type_name, type_code in type_labels.items():
        # Positional (numpy) mask and values: no index alignment is involved,
        # so frames with duplicate index labels (e.g. from pd.concat) are fine.
        mask = (df['type']==type_code).to_numpy()
        events = df.loc[mask, ['aid','session']].assign(dow=dow[mask])

        fav = events.groupby(
            ['aid','dow'],
            as_index=False
        ).agg(
            {'session': 'count'}
        ).sort_values(
            by=['aid','session'],
            ascending=[True,False]
        ).drop_duplicates(['aid'])  # keeps the top-count row of each aid
        del fav['session']
        fav.columns = ['aid',f'aid_{type_name}_favourite_dow_{sfx}']
        if type_code==0:
            out = fav
        else:
            out = out.merge(fav,on=['aid'],how='outer')
    del dow, events, fav
    gc_clear()
    return out

In [13]:
# Favourite day-of-week per aid computed on test, train and train+test, joined on aid.
# No fill is applied afterwards, so aids absent from a split keep NaN in its columns.
fav_test = get_favourite_dows(test_df,"test")
fav_train = get_favourite_dows(train_df,"train")
fav = fav_test.merge(fav_train, on=['aid'], how='outer')
del fav_test, fav_train

fav_full = get_favourite_dows(pd.concat([train_df,test_df]),"full")
fav = fav.merge(fav_full, on=['aid'], how='outer')
del fav_full

feats_df = feats_df.merge(fav, on=['aid'], how='outer')

del fav
gc_clear()

In [14]:
# Columns of feats_df that are kept for export: the 'aid' key plus a
# hand-picked subset of the per-aid features built above.
selected_feats = [
    'aid',
    'aid_CA2OR_trn',
    'aid_CA_rank_int_tst_vs_trn',
    'aid_CA_vs_mean_trn',
    'aid_CA_vs_mean_tst',
    'aid_CA_vs_mean_tst_vs_trn',
    'aid_CL2CA_trn',
    'aid_CL2CA_tst',
    'aid_CL2OR_trn',
    'aid_CL_rank_int_trn',
    'aid_CL_rank_int_tst_vs_trn',
    'aid_CL_rank_pct_tst_vs_trn',
    'aid_CL_vs_mean_trn',
    'aid_CL_vs_mean_tst_vs_trn',
    'aid_multi_clicks_percent_full',
    'aid_multi_orders_percent_train',
    'aid_clicks_favourite_dow_test',
    'carts_rating_full',
    'carts_rating_train',
    'clicks_rating_full',
    'clicks_rating_train',
    'orders_rating_full'
]

In [15]:
# Persist the selected features. The output directory is created first so the
# write cannot fail on a missing folder after the long feature computation.
os.makedirs('feats', exist_ok=True)
feats_df[selected_feats].to_parquet(f'feats/FE_aids_{MODE}.pqt',index=False)

In [16]:
# Preview of the exported feature table.
feats_df.loc[:, selected_feats]

Unnamed: 0,aid,aid_CA2OR_trn,aid_CA_rank_int_tst_vs_trn,aid_CA_vs_mean_trn,aid_CA_vs_mean_tst,aid_CA_vs_mean_tst_vs_trn,aid_CL2CA_trn,aid_CL2CA_tst,aid_CL2OR_trn,aid_CL_rank_int_trn,aid_CL_rank_int_tst_vs_trn,aid_CL_rank_pct_tst_vs_trn,aid_CL_vs_mean_trn,aid_CL_vs_mean_tst_vs_trn,aid_multi_clicks_percent_full,aid_multi_orders_percent_train,aid_clicks_favourite_dow_test,carts_rating_full,carts_rating_train,clicks_rating_full,clicks_rating_train,orders_rating_full
0,0,-1.000000,3.212923e-01,0.000000,0.000000,-1.000000,0.000000,0.000000,0.000000,460602.0,4.332233e-01,1.026043,0.506732,1.255491,0.175000,-1.000000,3.0,2000000,2000000,484443,508078,2000000
1,1,0.000000,2.000000e+06,0.130085,-1.000000,-1.000000,0.034483,-1.000000,0.000000,539951.5,2.000000e+06,2.000000,0.408201,-1.000000,0.137931,-1.000000,,1210785,1207610,627638,623157,2000000
2,2,-1.000000,2.000000e+06,0.000000,-1.000000,-1.000000,0.000000,-1.000000,0.000000,813738.5,2.000000e+06,2.000000,0.225214,-1.000000,0.062500,-1.000000,,2000000,2000000,978377,950394,2000000
3,3,0.217949,2.206167e-01,20.293184,26.793551,1.320323,0.122161,0.147826,0.026625,13652.0,3.311236e-01,0.784231,17.974922,1.017567,0.390086,0.117647,4.0,10904,11920,9857,10455,20955
4,4,0.000000,2.016282e+00,1.170761,0.000000,0.000000,0.065217,0.000000,0.000000,156592.0,1.040056e+00,2.463262,1.942474,0.409399,0.265734,-1.000000,2.0,273680,287711,151982,152737,2000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1855598,1855598,-1.000000,2.000000e+06,0.000000,-1.000000,-1.000000,0.000000,-1.000000,0.000000,1334178.5,2.000000e+06,2.000000,0.098531,-1.000000,0.000000,-1.000000,,2000000,2000000,1559615,1535283,2000000
1855599,1855599,-1.000000,3.212923e-01,0.000000,0.000000,-1.000000,0.000000,0.000000,0.000000,1030696.0,5.870150e-01,1.390282,0.154835,1.027220,0.166667,-1.000000,4.0,2000000,2000000,1083148,1092786,2000000
1855600,1855600,0.166667,4.215513e-01,0.780507,1.576091,2.019317,0.107143,1.000000,0.017857,329754.5,1.834801e+00,4.345531,0.788250,0.201775,0.228070,0.000000,0.0,355774,385181,326098,319179,653152
1855601,1855601,0.000000,2.000000e+06,0.910592,-1.000000,-1.000000,0.116667,-1.000000,0.000000,312364.0,2.000000e+06,2.000000,0.844554,-1.000000,0.233333,-1.000000,,337946,351470,318330,311866,2000000
