In [3]:
%%capture
try:
    import transformers4rec
except:
    print("Install packages\n\n")
    !pip install transformers4rec[pytorch,nvtabular]
    !pip install -U nvtabular==1.3.3
    !pip install beartype
    !pip install -U pytorch_lightning

In [6]:
import os
import ast
import json
import glob
import torch
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import nvtabular as nvt
import matplotlib.pyplot as plt
import gc
import cudf

from tqdm import tqdm
from datetime import datetime
from collections import Counter
from sklearn import preprocessing

warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

from nvtabular.ops import *
from merlin.schema.tags import Tags
from merlin_standard_lib import Schema
#from transformers4rec import torch as tr
from transformers4rec.torch import Trainer
from transformers4rec.torch.ranking_metric import RecallAt
from nvtabular.loader.torch import TorchAsyncItr, DLDataLoader
from transformers4rec.config.trainer import T4RecTrainingArguments

TypeError: __new__() missing 1 required positional argument: 'task'

In [None]:
# Root directory for every artefact this notebook writes.
OUTPUT_PATH =  '/kaggle/working/'
# Create the folder for the transformed dataset under OUTPUT_PATH instead of
# relative to the current working directory, so it always lands where the
# later cells write to.
os.makedirs(os.path.join(OUTPUT_PATH, 'output'), exist_ok=True)

In [None]:
def _read_parquet_parts(pattern):
    """Read every parquet file matching ``pattern`` into a single DataFrame.

    The matching paths are sorted before reading so that the row order of the
    result does not depend on the order in which the filesystem lists them.

    Raises:
        FileNotFoundError: if no file matches ``pattern``.
    """
    paths = sorted(glob.glob(pattern))
    if not paths:
        raise FileNotFoundError(f"no parquet files match {pattern!r}")
    return pd.concat([pd.read_parquet(path) for path in paths])


# NOTE(review): the files are named *_w0_* while the variables are *_w1 -
# confirm which week is intended.
train_w1 = _read_parquet_parts("/kaggle/input/otto-prep-4-weeks/train_w0_part*.parquet")
label_w1 = _read_parquet_parts("/kaggle/input/otto-prep-4-weeks/label_w0_part*.parquet")

# Preprocessing

In [None]:
def prepare_target(label_df):
    """Collapse label events into one row per (session, type).

    Groups ``label_df`` by ``session`` and ``type`` and gathers each group's
    ``aid`` values into a list stored in a ``labels`` column. For rows whose
    ``type`` is 0 the list is cut down to its first element.

    Returns:
        DataFrame with the columns ``session``, ``type`` and ``labels``.
    """
    aid_lists = label_df.groupby(['session', 'type'])['aid'].apply(list)
    target = aid_lists.reset_index(name='labels')

    # Rows of type 0 keep only the first aid of their list.
    type_zero = target['type'] == 0
    target.loc[type_zero, 'labels'] = target.loc[type_zero, 'labels'].str[:1]
    return target

In [None]:
# Label-encode aids.
#
# The encoder is fitted on the aids of the full training set. Only the 'aid'
# column is read, because nothing else from that file is used here and the
# frame is deleted right after fitting.
temp_all_data = pd.read_parquet(
    '/kaggle/input/otto-full-optimized-memory-footprint/train.parquet',
    columns=['aid'],
)

le = preprocessing.LabelEncoder()
le.fit(temp_all_data['aid'])
del temp_all_data

# Shift the encoded ids by one so they start at 1 and 0 stays reserved for nan.
# NOTE: 'aid' is overwritten in place, so this cell is not idempotent -
# re-running it would encode the already encoded ids again.
train_w1['aid'] = le.transform(train_w1['aid']) + 1
label_w1['aid'] = le.transform(label_w1['aid']) + 1



In [None]:
# Collapse the encoded label events into per-(session, type) label lists and
# store the result under OUTPUT_PATH.
label_w1 = label_w1.pipe(prepare_target)

label_w1.to_parquet(f'{OUTPUT_PATH}label_w1.parquet')

# Feature Engineering 

In [None]:
def add_days_types(df):
    """Add one aid column per (event type, weekday) pair to ``df`` in place.

    For every combination of event type (0 -> ``clicks``, 1 -> ``carts``,
    2 -> ``order``) and weekday (Monday .. Sunday) a column named
    ``<prefix>_<Weekday>`` is created. A row carries its ``aid`` in the column
    matching its own ``type`` and the weekday of its ``ts``; all other new
    columns hold 0. A ``weekday`` column (``ts.dt.weekday``) is added as well.

    Args:
        df: frame with at least the columns ``ts`` (datetime-like, since the
            ``.dt`` accessor is used), ``type`` and ``aid``.

    Returns:
        The same ``df`` object, modified in place.
    """
    weekday_names = ('Monday', 'Tuesday', 'Wednesday', 'Thursday',
                     'Friday', 'Saturday', 'Sunday')
    # Position in this tuple is the event type code (0, 1, 2).
    type_prefixes = ('clicks', 'carts', 'order')

    # 0 is the default value for nan
    df[[f'{prefix}_{day}' for prefix in type_prefixes for day in weekday_names]] = 0
    df['weekday'] = df['ts'].dt.weekday

    for type_code, prefix in enumerate(type_prefixes):
        has_type = df['type'] == type_code
        for weekday_number, day in enumerate(weekday_names):
            selected = (df['weekday'] == weekday_number) & has_type
            df.loc[selected, f'{prefix}_{day}'] = df.loc[selected, 'aid']

    return df
            


# Add the per-weekday / per-type aid columns, write the frame under
# OUTPUT_PATH, then drop it from the kernel to free memory.
train_w1 = train_w1.pipe(add_days_types)
train_w1.to_parquet(f'{OUTPUT_PATH}train_w1.parquet')
del train_w1

In [None]:
# Define Groupby Workflow
#
# Every selected column is aggregated into one list per session. The columns
# are split into seven groups, each with its own Groupby op, and the op
# outputs are combined into a single workflow.
features = ['clicks_Monday', 'clicks_Tuesday', 'clicks_Wednesday', 'clicks_Thursday', 'clicks_Friday', 'clicks_Saturday', 'clicks_Sunday',
            'carts_Monday', 'carts_Tuesday', 'carts_Wednesday', 'carts_Thursday', 'carts_Friday', 'carts_Saturday', 'carts_Sunday',
            'order_Monday', 'order_Tuesday', 'order_Wednesday', 'order_Thursday', 'order_Friday', 'order_Saturday', 'order_Sunday',
            'session', 'aid', 'type']

# Column groups; each one carries the grouping key 'session'.
feature_clicks_1 = ['clicks_Monday', 'clicks_Tuesday', 'clicks_Wednesday', 'session']
feature_clicks_2 = ['clicks_Thursday', 'clicks_Friday', 'clicks_Saturday', 'clicks_Sunday', 'session']
feature_carts_1 = ['carts_Monday', 'carts_Tuesday', 'carts_Wednesday', 'session']
feature_carts_2 = ['carts_Thursday', 'carts_Friday', 'carts_Saturday', 'carts_Sunday', 'session']
feature_order_1 = ['order_Monday', 'order_Tuesday', 'order_Wednesday', 'session']
feature_order_2 = ['order_Thursday', 'order_Friday', 'order_Saturday', 'order_Sunday', 'session']
feature_aid_type = ['session', 'aid', 'type']

# One shared aggregation spec: a 'list' aggregation for every name in
# `features`, handed unchanged to every Groupby op below.
# NOTE(review): each op only selects a subset of these columns, and the spec
# also names the key 'session'; presumably Groupby ignores the entries that
# do not apply - confirm against the installed NVTabular version.
agg = {column: ['list'] for column in features}


def _lists_per_session(columns):
    """Feed ``columns`` into a Groupby op that aggregates them per session."""
    # NOTE(review): no sort_cols is passed, so the order inside each list
    # presumably follows the input row order - confirm rows are time-ordered.
    return columns >> nvt.ops.Groupby(
        groupby_cols=["session"],
        aggs=agg,
        name_sep="-")


# Group interaction features by session
feature_clicks_1 = _lists_per_session(feature_clicks_1)
feature_clicks_2 = _lists_per_session(feature_clicks_2)
feature_carts_1 = _lists_per_session(feature_carts_1)
feature_carts_2 = _lists_per_session(feature_carts_2)
feature_order_1 = _lists_per_session(feature_order_1)
feature_order_2 = _lists_per_session(feature_order_2)
feature_aid_type = _lists_per_session(feature_aid_type)

# From here on `features` is the combined workflow graph, not the name list.
features = (
    feature_clicks_1 + feature_clicks_2
    + feature_carts_1 + feature_carts_2
    + feature_order_1 + feature_order_2
    + feature_aid_type
)

workflow = nvt.Workflow(features)
print('workflow complete')

# Read the frame written by the feature-engineering cell. The path is derived
# from OUTPUT_PATH so it cannot drift from where that cell wrote it.
dataset = nvt.Dataset(f'{OUTPUT_PATH}train_w1.parquet')
print('ds complete')

workflow.fit(dataset)
print('fit complete')

# The same Dataset is reused for the transform instead of building a second
# one from the same file.
workflow.transform(dataset).to_parquet(os.path.join(OUTPUT_PATH, 'output'))