We would first like to note that this notebook demonstrates both PySpark and LSTM on Kaggle, which is unfortunately not a distributed system and cannot handle the training data in its entirety.

Therefore, this notebook uses only part of the data, which we believe is sufficient for a demonstration. Note that the results mentioned in the report use the entire training data and were obtained on a paid distributed service on AWS, which we cannot demonstrate here.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
import shutil

# Print the full path of every file under the read-only input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Start from an empty working directory. Only the *contents* are removed:
# /kaggle/working is the notebook's current directory, so deleting the
# directory itself would break the relative paths used by later cells.
working_dir = "/kaggle/working"
if os.path.isdir(working_dir):
    for entry_name in os.listdir(working_dir):
        entry_path = os.path.join(working_dir, entry_name)
        if os.path.isdir(entry_path) and not os.path.islink(entry_path):
            shutil.rmtree(entry_path)
        else:
            # Regular files and symlinks (including symlinks to directories).
            os.remove(entry_path)

In [None]:
# Install the two third-party packages imported by the cells below:
# pyspark (used to prepare the interaction file) and recbole (model/trainer).
!pip install pyspark
!pip install recbole

In [None]:
import glob
import os
import pyspark

from pyspark.sql import SparkSession

# Extra Spark settings applied to the session builder, keyed by option name.
spark_options = {
    "spark.executor.memory": "70g",
    "spark.driver.memory": "50g",
    "spark.memory.offHeap.enabled": True,
    "spark.memory.offHeap.size": "16g",
}

# Local-mode session using all available cores ("local[*]").
session_builder = SparkSession.builder.master("local[*]")
for option_name, option_value in spark_options.items():
    session_builder = session_builder.config(option_name, option_value)
spark = session_builder.appName("sampleCodeForReference").getOrCreate()

# NOTE(review): SparkSession's constructor is documented as taking a
# SparkContext, but it is handed the session itself here. Kept as-is because
# later cells may depend on the object this produces - confirm before
# simplifying.
spark = SparkSession(spark)

In [None]:
import numpy

from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

To save some time, we have already prepared the input inter file; just run the cell below.

In [None]:
# Copy the pre-built dataset directory from the read-only input area into
# the working directory ...
!cp -r ../input/pyspark-output /kaggle/working
# ... and rename it to recbox_data, the dataset name passed to RecBole's
# Config later in this notebook.
!mv /kaggle/working/pyspark-output /kaggle/working/recbox_data


Alternatively, you may run the following two cells to see how the inter file was obtained using PySpark.

In [None]:
# Build the interaction table from the raw transactions CSV:
#   1. read the CSV using its header row for column names,
#   2. cast the t_dat column to a date and convert it to Unix seconds,
#   3. keep the customer, article and timestamp columns, renamed to the
#      "name:type" headers written to the .inter file,
#   4. keep only rows after Unix time 1585620000 (2020-03-31 02:00:00 UTC).
t_df = (
    spark.read.option("header", True)
    .csv("../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv")
    .withColumn("time_stamp", col("t_dat").cast(DateType()))
    .withColumn("time_hash", unix_timestamp(col("time_stamp")))
    .select(
        col("customer_id"),
        col("article_id"),
        col("price"),
        col("sales_channel_id"),
        col("time_hash"),
    )
    .select(
        col("customer_id").alias("cid:token"),
        col("article_id").alias("aid:token"),
        col("time_hash").alias("t_hash:float"),
    )
    .filter(col("t_hash:float") > 1585620000)
)

# Preview the first rows of the result.
t_df.show()


In [None]:
import os
import shutil

import pandas as pd

# Recreate an empty output directory. Unlike `rm -r` / `mkdir`, this is
# idempotent: it does not complain when the directory is missing (e.g. the
# copy cell above was skipped) and works outside IPython as well.
inter_dir = '/kaggle/working/recbox_data'
shutil.rmtree(inter_dir, ignore_errors=True)
os.makedirs(inter_dir, exist_ok=True)

# Write the interactions as one tab-separated file with a header row.
# toPandas() collects every row onto the driver, so this only suits data
# that fits in local memory.
t_df.toPandas().to_csv(os.path.join(inter_dir, 'recbox_data.inter'), index=False, sep='\t')

In [None]:
import logging
from logging import getLogger
from recbole.config import Config
from recbole.data import create_dataset, data_preparation
from recbole.model.sequential_recommender import GRU4Rec
from recbole.trainer import Trainer
from recbole.utils import init_seed, init_logger

In [None]:
# Overrides passed to RecBole's Config via config_dict.
parameters = {
    # Interaction-count intervals used by RecBole to filter users and items.
    'user_inter_num_interval': "[30,inf)",
    'item_inter_num_interval': "[40,inf)",
    'epochs': 50,
    # Directory containing the recbox_data/ dataset folder.
    'data_path': '/kaggle/working',
    # Field names match the "name:type" headers of the .inter file.
    'USER_ID_FIELD': 'cid',
    'ITEM_ID_FIELD': 'aid',
    'TIME_FIELD': 't_hash',
    'load_col': {'inter': ['cid', 'aid', 't_hash']},
    # Disable training-time negative sampling. The notebook installs RecBole
    # unpinned, so both spellings are set: 'neg_sampling' is the key this
    # notebook was written against, while newer releases read
    # 'train_neg_sample_args' instead (NOTE(review): confirm against the
    # installed RecBole version's configuration docs).
    'neg_sampling': None,
    'train_neg_sample_args': None,
    'eval_args': {
        # Ratio split 9:0:1 (train:valid:test), i.e. no validation part.
        'split': {'RS': [9, 0, 1]},
        'group_by': 'user',
        'order': 'TO',
        'mode': 'full',
    },
}


In [None]:
# Combine the overrides in `parameters` with RecBole's settings for the
# GRU4Rec model and the recbox_data dataset.
new_config = Config(
    model='GRU4Rec',
    dataset='recbox_data',
    config_dict=parameters,
)

# Seed the random number generators from the config, then let RecBole set
# up its own logging.
seed_value = new_config['seed']
reproducibility_flag = new_config['reproducibility']
init_seed(seed_value, reproducibility_flag)
init_logger(new_config)

# Attach an additional INFO-level stream handler to the root logger and
# log the resolved configuration through it.
log = logging.getLogger()
stream_handler = logging.StreamHandler()
stream_handler.setLevel(logging.INFO)
log.addHandler(stream_handler)
log.info(new_config)

In [None]:
# Build the RecBole dataset described by new_config.
dataset = create_dataset(new_config)
# Log the dataset's printable summary.
log.info(dataset)

In [None]:
import shutil

# Remove the output of earlier runs (presumably RecBole's checkpoint
# directory - confirm). ignore_errors keeps this quiet on a first run, when
# the directory does not exist yet.
shutil.rmtree('/kaggle/working/saved', ignore_errors=True)

In [None]:
# Split the dataset into the train / valid / test data objects.
train, valid, test = data_preparation(new_config, dataset)

# Instantiate GRU4Rec on the training dataset and move it to the configured
# device.
model = GRU4Rec(new_config, train.dataset)
model = model.to(new_config['device'])
log.info(model)

# Train on the training data only; no validation data is passed to fit().
trainer = Trainer(new_config, model)
fit_outcome = trainer.fit(train)
best_valid_score, best_valid_result = fit_outcome

In [None]:
from recbole.utils.case_study import full_sort_topk

# External (original) user tokens for internal ids 1..user_num-1. Internal
# id 0 is skipped (presumably RecBole's padding entry - confirm).
ex_users = dataset.id2token(dataset.uid_field, list(range(1, dataset.user_num)))

In [None]:
# For every internal user id from 1 upward, ask the model for its 12
# top-ranked items and store them as external item tokens.
topk_items = []
target_device = new_config['device']
for internal_uid in range(1, dataset.user_num):
    _, ranked_iids = full_sort_topk(
        [internal_uid], model, test, k=12, device=target_device
    )
    # One user was queried, so the last row is that user's ranking.
    user_ranking = ranked_iids[-1]
    item_tokens = dataset.id2token(dataset.iid_field, user_ranking.cpu())
    topk_items.append(item_tokens.tolist())
print(len(topk_items))

The cell below shows the output of the model

In [None]:
import pandas as pd
from pyspark.sql import SparkSession

# One row per user: the external customer id and its recommended article
# ids joined into a single space-separated string.
external_item_str = [' '.join(x) for x in topk_items]
result = pd.DataFrame(ex_users, columns=['customer_id'])
result['prediction'] = external_item_str

# createDataFrame is a SparkSession method, not a SparkContext one, so it
# must not be reached through `spark.sparkContext`. The active session is
# looked up via the builder rather than through the `spark` variable, so
# this cell does not depend on how `spark` was constructed earlier.
result_df = SparkSession.builder.getOrCreate().createDataFrame(result)
result_df.show()
