In [None]:
!nvidia-smi

In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=False)

In [None]:
import os
# Run from the submission directory so the relative paths used further down
# ("../util" for the helper modules, "./checkpoints/rnn/" for weights) resolve.
# NOTE(review): absolute Google Drive path; only valid with Drive mounted
# at /content/drive.
cwd = '/content/drive/MyDrive/Competitions/SIGIR/SIGIR-ecom-data-challenge-main/submission/008_after_stage2/'
os.chdir(cwd)

In [None]:
import os
os.getcwd()

In [None]:
import os, sys

# Expose only GPU 0 to this process. This cell runs before the cell that
# imports TensorFlow; presumably that ordering is needed for the setting to
# take effect, so keep it ahead of the import.
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [None]:
import gc
import pickle as pkl
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import GroupKFold

sys.path.append("../util")
from lstm import lstm_model
from utils import reduce_df, save_checkpoint


In [None]:
# NOTE(review): TF_MEMORY_ALLOCATION does not look like a variable TensorFlow
# itself reads; presumably a helper library consumes it. Confirm or drop.
os.environ['TF_MEMORY_ALLOCATION'] = "0.6"

# Cap the memory TensorFlow may claim on the first visible GPU.
# memory_limit is expressed in MB, so 1024*50 asks for 50 GB.
# NOTE(review): confirm the attached GPU actually has that much memory.
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        tf.config.experimental.set_virtual_device_configuration(
            gpus[0],
            [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1024*50)]
        )
    except RuntimeError as err:
        # Raised when the GPU has already been initialised (e.g. the cell is
        # re-run); report it instead of aborting the notebook.
        print(err)
else:
    # Without this guard gpus[0] raises IndexError on a CPU-only runtime.
    print('No GPU visible to TensorFlow; skipping GPU memory configuration.')

In [None]:
input_path = Path('/content/drive/MyDrive/Data_Competitions/SIGIR2021/')
#input_path = Path('./')

In [None]:
pd.set_option('display.max_columns', None)

# Load data

In [None]:
%%time
# Read input_df_meta_rnn_stage2.csv from input_path into all_df and print
# its (rows, columns) shape.
all_df = pd.read_csv(input_path / 'input_df_meta_rnn_stage2.csv')
print(all_df.shape)

In [None]:
all_df

In [None]:
# NOTE(review): the return value is discarded, so this call only has an effect
# if reduce_df (imported from utils) modifies all_df in place. TODO confirm.
# If it returns a new frame instead, the result must be assigned back.
reduce_df(all_df)

In [None]:
all_df.columns

In [None]:
gc.collect()

In [None]:
LAGS = 19

# Product lag column names, from lag LAGS down to lag 0 (LAGS + 1 entries).
lag_targets = [f'product_hash_lag{step}' for step in reversed(range(LAGS + 1))]

lag_targets

In [None]:
LAGS_CATEGORY = 19

# Category lag column names, from lag LAGS_CATEGORY down to lag 0.
lag_categories = [
    f'category_lag{step}' for step in reversed(range(LAGS_CATEGORY + 1))
]

lag_categories

In [None]:
# Disabled cell: the code below is wrapped in a string literal, so nothing in
# it runs and lag_urls is never defined. It would build the names
# url_lag19 .. url_lag0; FEATURES currently leaves `+ lag_urls` commented out.
"""LAGS_URL = 19
lag_urls = []

for i in range(LAGS_URL, -1, -1):
    lag_urls.append(f'url_lag{i}')

lag_urls"""

In [None]:
# Non-lag input columns, in the order they are handed to the model.
base_features = [
    'product_action_id',
    'cum_pageview_id',
    'cum_product_id',
    'cum_search_id',
    'cum_event_id',
    'num_following_search_id',
    'num_following_pageview_id',
    'price_bucket_id',
    'price_null_id',
    'dayofweek_id',
    'hour_id',
    'first_url',
    'weekend_id',
    'num_search_id',
    'num_pageview_id',
    'first_product',
    'first_category',
]

# Full model input: base columns, then product lags, then category lags.
# URL lags are intentionally left out.
FEATURES = base_features + lag_targets + lag_categories  # + lag_urls

# Label column, kept as a one-element list.
TARGET = ['next_product']

In [None]:
# Give two existing columns their lag-0 names so they line up with the
# generated lag lists: 'category_lag0' is the last entry of lag_categories
# (and therefore part of FEATURES); 'url_lag0' matches the disabled URL lags.
# rename returns a new frame, which replaces all_df.
all_df = all_df.rename(columns={'category_hash_id': 'category_lag0', 'hashed_url_id': 'url_lag0'})

In [None]:
len(all_df.columns)

In [None]:
set(all_df.columns) - set(FEATURES)

In [None]:
for c in all_df.columns:
    print(c)

# Embeddings

In [None]:
len(FEATURES)

In [None]:
# EMBEDDING INPUT SIZES
value_counts = []
for c in FEATURES:
    max_ = all_df[c].max()
    value_counts.append(max_+1)
    print(c, max_+1)

In [None]:
MAX_EMBED_DIM = 256

# Embedding width per feature: a quarter of (input size + 2), truncated to an
# int and clamped to the range [1, MAX_EMBED_DIM].
# int(n) keeps the arithmetic on Python ints, so `n + 2` cannot overflow when
# value_counts holds fixed-width NumPy integers.
embed_dims = [
    max(1, min(int((int(n) + 2) * 0.25), MAX_EMBED_DIM))
    for n in value_counts
]
embed_dims

In [None]:
# feature name -> (embedding input size, embedding width)
embedding_map = dict(zip(FEATURES, zip(value_counts, embed_dims)))
embedding_map

In [None]:
# Number of output classes: the largest next_product value plus one.
# Converted to a Python int first so the +1 cannot wrap around in a narrow
# NumPy integer dtype and so the model builder receives a plain int.
target_count = int(all_df['next_product'].max()) + 1
target_count

# Training

In [None]:
def epoch2lr(epoch):
    """Return the learning rate for the given epoch index.

    The schedule steps down by a factor of ten per epoch:
    1e-3, 1e-4, 1e-5, 1e-6. Epoch indices past the end of the schedule
    reuse the last rate; the training cell runs 5 epochs, which would
    otherwise raise IndexError on epoch index 4.

    Args:
        epoch: Epoch index, as passed by the LearningRateScheduler callback.

    Returns:
        The learning rate (float) to use for that epoch.
    """
    rates = [1e-3, 1e-4, 1e-5, 1e-6]
    # Clamp so any epoch beyond the schedule keeps the final (smallest) rate.
    return rates[min(epoch, len(rates) - 1)]
    
# Keras callback that obtains each epoch's learning rate from epoch2lr.
# It is passed to model.fit in the training loop; verbose is enabled.
lr = tf.keras.callbacks.LearningRateScheduler(epoch2lr, verbose = True)

In [None]:
HIDDEN_DIM = 512

In [None]:
WEIGHT_PATH = './checkpoints/rnn/'
N_FOLD = 5

In [None]:
all_df.columns

In [None]:
all_df

In [None]:
fold = 0

In [None]:
len(all_df.loc[(all_df.fold == fold) & (all_df.is_test == 0) & (all_df['cum_product_r'] > 0) & (all_df.SessionId % 38 == 0)])

In [None]:
!ls ./checkpoints/rnn/

In [None]:
gc.collect()

In [None]:
# First fold index the training loop runs (it iterates
# range(start_fold, N_FOLD)). With 1, fold 0 is skipped, presumably because
# its checkpoint already exists in ./checkpoints/rnn/.
# NOTE(review): set to 0 when training every fold from scratch.
start_fold = 1

In [None]:
# When True, rows flagged is_test are kept in the training split.
TRAIN_WITH_TEST = True

# Make sure the checkpoint directory exists before the first save.
os.makedirs(WEIGHT_PATH, exist_ok=True)

# Train one model per fold, checkpointing the best weights of each fold.
model = None
for fold in range(start_fold, N_FOLD):
    print('-'*50)
    print('FOLD %i'%(fold))

    # Drop the reference to the previous fold's model and reset Keras' global
    # state so state from earlier folds does not pile up while looping.
    # The model of the last fold stays bound to `model` after the loop.
    model = None
    gc.collect()
    tf.keras.backend.clear_session()

    # Training rows: all folds except the current one, cum_product_r > 0.
    # Without TRAIN_WITH_TEST the is_test rows are excluded as well.
    if TRAIN_WITH_TEST:
        train = all_df.loc[(all_df.fold != fold) & (all_df.cum_product_r > 0)].copy()
    else:
        train = all_df.loc[(all_df.is_test == 0) & (all_df.fold != fold) & (all_df.cum_product_r > 0)].copy()

    # Validation rows: the held-out fold, non-test rows only, cum_product_r > 0,
    # restricted to sessions whose SessionId is divisible by 38.
    valid = all_df.loc[(all_df.fold == fold) & (all_df.is_test == 0) & (all_df['cum_product_r'] > 0) & (all_df.SessionId % 38 == 0)].copy()

    print('train shape',train.shape, 'valid shape', valid.shape)

    # Save weights only, once per epoch, whenever the monitored validation
    # metric improves (mode='max').
    sv = tf.keras.callbacks.ModelCheckpoint(
        f'{WEIGHT_PATH}/LSTM_fold{fold}.h5', monitor='val_sparse_top_k_categorical_accuracy', verbose=1,
        save_best_only=True, save_weights_only=True, mode='max', save_freq='epoch'
    )

    model = lstm_model(FEATURES, embedding_map, MAX_EMBED_DIM, HIDDEN_DIM, target_count)
    print('built', flush=True)
    model.fit(train[FEATURES], train[TARGET],
              validation_data = (valid[FEATURES], valid[TARGET]),
              epochs=5, verbose=1,
              batch_size=512, callbacks=[sv, lr])

    # Free the per-fold copies before the next fold's frames are built.
    del train, valid
    gc.collect()