# H&M: Which Algorithm is the Best? Ask RecBole!
There are many recommendation algorithms out there, and it is difficult to tell which one is the best. In fact, an algorithm that is optimal for one dataset often does not work well for another. In such cases, [RecBole](https://recbole.io/), a set of standardized recommendation algorithms, is very useful for comparing different algorithms. So, in the following, I use RecBole to quickly get an idea of which algorithm is the best for H&M recommendations.

## Acknowledgements
This work is inspired by: https://techlife.cookpad.com/entry/2021/11/04/090000

## Setups

In [None]:
! pip install recbole

In [None]:
import os
from datetime import datetime
import time

import numpy as np
import pandas as pd
from recbole.quick_start import run_recbole
from tqdm import tqdm

## Create Atomic Files
First, you need to transform the CSV files to a specific file format called Atomic Files (see [user guide](https://recbole.io/docs/user_guide/usage/running_new_dataset.html)). But if you use [this pre-computed dataset](https://www.kaggle.com/shionhonda/hm-recbole-atomic-files), you can skip running the following cells.

NOTE: I use roughly the last two weeks of data for the validation and test sets and the month before that for training. I discard the rest of the data to reduce computation. I further downsample the users to 1/6 to avoid out-of-memory (OOM) errors.

NOTE: For simplicity, I only use columns `customer_id`, `article_id`, and `t_dat`. You can include other columns by editing `HMDataset`.

In [None]:
# https://github.com/RUCAIBox/RecSysDatasets/blob/master/conversion_tools/src/base_dataset.py


class BaseDataset:
    """Base class for converting a raw dataset into atomic files.

    A subclass is expected to set ``dataset_name``, point ``inter_file`` /
    ``item_file`` / ``user_file`` at its raw inputs, fill the ``*_fields``
    mappings and implement the ``load_*_data`` hooks it supports.

    Each ``*_fields`` mapping goes from a positional column index of the
    loaded DataFrame to the header written for that column in the output
    file (e.g. ``"customer_id:token"``).
    """

    def __init__(self, input_path, output_path):
        """Store the paths, create ``output_path`` and set default file names.

        Args:
            input_path: Directory containing the raw input files.
            output_path: Directory the converted files are written to; it is
                created if it does not exist.
        """
        super().__init__()

        self.dataset_name = ''
        self.input_path = input_path
        self.output_path = output_path
        self.check_output_path()

        # input file
        self.inter_file = os.path.join(self.input_path, 'inters.dat')
        self.item_file = os.path.join(self.input_path, 'items.dat')
        self.user_file = os.path.join(self.input_path, 'users.dat')
        self.sep = '\t'

        # output file
        # NOTE: computed while ``dataset_name`` is still empty; a subclass
        # that changes ``dataset_name`` must call get_output_files() again.
        self.output_inter_file, self.output_item_file, self.output_user_file = self.get_output_files()

        # selected feature fields
        self.inter_fields = {}
        self.item_fields = {}
        self.user_fields = {}

    def check_output_path(self):
        """Create ``self.output_path`` (including parents) if it is missing."""
        os.makedirs(self.output_path, exist_ok=True)

    def get_output_files(self):
        """Return the ``(.inter, .item, .user)`` output paths.

        The paths are built from the current ``output_path`` and
        ``dataset_name``.
        """
        output_inter_file = os.path.join(self.output_path, self.dataset_name + '.inter')
        output_item_file = os.path.join(self.output_path, self.dataset_name + '.item')
        output_user_file = os.path.join(self.output_path, self.dataset_name + '.user')
        return output_inter_file, output_item_file, output_user_file

    def load_inter_data(self) -> pd.DataFrame:
        """Load the raw interaction data; subclasses override this."""
        raise NotImplementedError

    def load_item_data(self) -> pd.DataFrame:
        """Load the raw item data; subclasses override this."""
        raise NotImplementedError

    def load_user_data(self) -> pd.DataFrame:
        """Load the raw user data; subclasses override this."""
        raise NotImplementedError

    def convert_inter(self):
        """Write the interaction file, or print a notice if unsupported."""
        try:
            input_inter_data = self.load_inter_data()
        except NotImplementedError:
            print('This dataset can\'t be converted to inter file\n')
        else:
            self.convert(input_inter_data, self.inter_fields, self.output_inter_file)

    def convert_item(self):
        """Write the item file, or print a notice if unsupported."""
        try:
            input_item_data = self.load_item_data()
        except NotImplementedError:
            print('This dataset can\'t be converted to item file\n')
        else:
            self.convert(input_item_data, self.item_fields, self.output_item_file)

    def convert_user(self):
        """Write the user file, or print a notice if unsupported."""
        try:
            input_user_data = self.load_user_data()
        except NotImplementedError:
            print('This dataset can\'t be converted to user file\n')
        else:
            self.convert(input_user_data, self.user_fields, self.output_user_file)

    @staticmethod
    def convert(input_data, selected_fields, output_file):
        """Write the selected columns of ``input_data`` as a tab-separated file.

        Args:
            input_data: DataFrame holding the raw records.
            selected_fields: Mapping from positional column index in
                ``input_data`` to the header string for that column. Columns
                are written in the mapping's iteration order.
            output_file: Path of the file to (over)write.

        The first line is the tab-joined header; every following line is one
        record with each value formatted by ``str()``.
        """
        positions = list(selected_fields)
        header = '\t'.join(selected_fields[pos] for pos in positions)
        # Pull each selected column out once and walk the columns in lockstep;
        # this avoids a DataFrame.iloc[i, j] lookup for every single cell.
        columns = [input_data.iloc[:, pos].array for pos in positions]
        n_rows = len(input_data) if positions else 0
        # Explicit encoding so the output does not depend on the platform locale.
        with open(output_file, 'w', encoding='utf-8') as fp:
            fp.write(header + '\n')
            for row in tqdm(zip(*columns), total=n_rows):
                fp.write('\t'.join(map(str, row)) + '\n')

    def parse_json(self, data_path):
        """Yield one evaluated Python object per line of ``data_path``."""
        # SECURITY: eval() executes whatever expression each line contains.
        # Only use this on trusted files; for untrusted data switch to
        # json.loads or ast.literal_eval (check the input format first).
        with open(data_path, 'rb') as g:
            for line in g:
                yield eval(line)

    def getDF(self, data_path):
        """Load a line-per-record file into a DataFrame.

        Rows are indexed 0..n-1 in file order; each record parsed by
        ``parse_json`` becomes one row.
        """
        records = dict(enumerate(self.parse_json(data_path)))
        return pd.DataFrame.from_dict(records, orient='index')

In [None]:
class HMDataset(BaseDataset):
    """Converter for the H&M transactions / articles / customers CSV files.

    Only the id columns and the transaction date are exported; edit the
    ``*_fields`` mappings to include more columns.
    """

    def __init__(self, input_path, output_path):
        """Configure input CSV paths, output paths and exported columns.

        Args:
            input_path: Directory containing ``transactions_train.csv``,
                ``articles.csv`` and ``customers.csv``.
            output_path: Directory the ``hm.*`` files are written to.
        """
        super().__init__(input_path, output_path)
        self.dataset_name = "hm"

        self.inter_file = os.path.join(self.input_path, "transactions_train.csv")
        self.item_file = os.path.join(self.input_path, "articles.csv")
        self.user_file = os.path.join(self.input_path, "customers.csv")

        self.sep = ","

        # output_path
        # Recomputed because the base class built these names before
        # ``dataset_name`` was set to "hm".
        output_files = self.get_output_files()
        self.output_inter_file = output_files[0]
        self.output_item_file = output_files[1]
        self.output_user_file = output_files[2]

        # selected feature fields (positional column index -> output header)
        self.inter_fields = {
            0: "t_dat:float",
            1: "customer_id:token",
            2: "article_id:token",
        }

        self.item_fields = {
            0: "article_id:token",
        }

        self.user_fields = {
            0: "customer_id:token",
        }

    def load_inter_data(self):
        """Load a recent, user-downsampled slice of the transactions.

        Returns:
            DataFrame with the CSV's columns, where ``t_dat`` has been
            replaced by a float timestamp (seconds, as returned by
            ``datetime.timestamp`` for the naive parsed date).

        The user sample is random; it follows NumPy's global random state.
        """
        df = pd.read_csv(
            self.inter_file,
            dtype={
                "t_dat": "object",
                "customer_id": "object",
                "article_id": "object",
                "price": float,
                "sales_channel_id": int,
            },
        )
        # Keep the last 3/48 of the rows: approx. 1 month + 2 weeks
        # (assumes the file is ordered by date -- TODO confirm).
        df = df.iloc[-len(df) * 3 // 48:].reset_index(drop=True)
        # Further downsampling to avoid OOM. replace=False so that exactly
        # len(uus) // 6 distinct users are kept; sampling with replacement
        # would draw duplicates and keep fewer users than intended.
        uus = df["customer_id"].unique()
        sampled_users = np.random.choice(uus, len(uus) // 6, replace=False)
        # .copy() so the column assignment below works on an independent
        # frame rather than on a filtered view of ``df``.
        df = df[df["customer_id"].isin(sampled_users)].copy()
        # Parse each distinct date string once instead of once per row.
        timestamps = {
            day: datetime.timestamp(datetime.strptime(day, "%Y-%m-%d"))
            for day in df["t_dat"].unique()
        }
        df["t_dat"] = df["t_dat"].map(timestamps)
        return df

    def load_item_data(self):
        """Load the articles CSV, keeping ``article_id`` as a string."""
        # Read the id as text, like load_inter_data does; letting pandas
        # infer an integer dtype would drop any leading zeros and the ids
        # would no longer match those in the interaction file.
        return pd.read_csv(
            self.item_file,
            delimiter=self.sep,
            engine="python",
            dtype={"article_id": "object"},
        )

    def load_user_data(self):
        """Load the customers CSV, keeping ``customer_id`` as a string."""
        return pd.read_csv(
            self.user_file,
            delimiter=self.sep,
            engine="python",
            dtype={"customer_id": "object"},
        )

In [None]:
%%time
# Build the converter for the competition CSVs and write the three atomic
# files (interactions, users, items) into ./hm.
converter = HMDataset(
    input_path="../input/h-and-m-personalized-fashion-recommendations",
    output_path="./hm",
)
converter.convert_inter()
converter.convert_user()
converter.convert_item()
# Drop the converter once the files are written.
del converter

## Configurations
NOTE: Here I train each model for only 5 epochs to save time. If you train a model for submission, you should train for hundreds of epochs (RecBole's default is 300) with a lower learning rate (the default is 0.001).

NOTE: The `MAP@12` that I specify here is different from the competition's evaluation metric. To define the same MAP@12 as the competition's evaluation metric, see [here](https://recbole.io/docs/developer_guide/customize_metrics.html). I'd appreciate it if you could share the code for the correct MAP@12.

NOTE: Of course you can change the following configurations, but doing so can cause errors that are hard to understand (the most common one is: "some feat is empty, please check the filtering settings"). It took me a lot of trial and error to arrive at the following configurations.

In [None]:
# RecBole configuration, written out as YAML below. The "\\t" keeps a literal
# backslash-t in the file so that YAML itself decodes it to a tab.
cfg_str = """
data_path: ./
dataset: hm
field_separator: "\\t"
USER_ID_FIELD: customer_id
ITEM_ID_FIELD: article_id
RATING_FIELD: ~
TIME_FIELD: t_dat
show_progress: false

load_col:
    inter: [customer_id, article_id, t_dat]
    user: [customer_id]
    item: [article_id]

epochs: 5
learning_rate: 0.01
user_inter_num_interval: "[0,inf)"
item_inter_num_interval: "[0,inf)"
filter_inter_by_user_or_item: false
neg_sampling:
    uniform: 1
eval_args:
    split: {'RS': [4, 1, 1]}
    group_by: None
    order: TO
    mode: uni50
metrics: ['Recall', 'MRR', 'NDCG', 'Hit', 'Precision', 'MAP']
topk: 12
valid_metric: MAP@12
"""


# The hm/ directory only exists if the conversion cells were run; create it
# here so writing the config does not fail when they were skipped.
os.makedirs("hm", exist_ok=True)
with open("hm/config.yaml", "w") as f:
    f.write(cfg_str)

## Run Experiments
I selected 10 major algorithms from general, context-aware, and sequential recommenders. `Pop` means popularity ranking.

In [None]:
def run(model_name):
    """Run one RecBole experiment on the ``hm`` dataset.

    The run uses ``hm/config.yaml``. For the models listed below, the
    config is overridden with ``neg_sampling: None`` (presumably because
    these models are not trained with sampled negatives -- confirm against
    the RecBole model docs).

    Args:
        model_name: Name of the RecBole model to train and evaluate.

    Returns:
        Whatever ``run_recbole`` returns for this run.
    """
    # Models that are run with negative sampling switched off.
    without_neg_sampling = (
        "MultiVAE",
        "MultiDAE",
        "MacridVAE",
        "RecVAE",
        "GRU4Rec",
        "NARM",
        "STAMP",
        "NextItNet",
        "TransRec",
        "SASRec",
        "BERT4Rec",
        "SRGNN",
        "GCSAN",
        "GRU4RecF",
        "FOSSIL",
        "SHAN",
        "RepeatNet",
        "HRM",
        "NPE",
    )

    call_kwargs = {
        "model": model_name,
        "dataset": 'hm',
        "config_file_list": ['hm/config.yaml'],
    }
    # Only pass config_dict when there is something to override.
    if model_name in without_neg_sampling:
        call_kwargs["config_dict"] = {"neg_sampling": None}
    return run_recbole(**call_kwargs)


In [None]:
%%time
# Models to benchmark, grouped by recommender family.
model_list = [
    # General
    "Pop", "ItemKNN", "BPR", "NeuMF", "RecVAE", "LightGCN",
    # Context-aware
    "FFM", "DeepFM",
    # Sequential
    "GRU4Rec", "SHAN",
]

# Train/evaluate each model in turn, reporting wall-clock time and the result.
for model_name in model_list:
    print(f"running {model_name}...")
    start = time.time()
    result = run(model_name)
    t = time.time() - start
    print(f"It took {t/60:.2f} mins")
    print(result)