In [None]:
!pip install recbole -q

In [None]:
import numpy as np
import pandas as pd

from path import Path
import gc
from tqdm import tqdm

In [None]:
# Directory holding the competition CSV files read below.
path = Path("../input/h-and-m-personalized-fashion-recommendations")
# Directory the RecBole atomic files (hm.inter / hm.user / hm.item) are written to.
output_path = Path("/kaggle/working/hm")

In [None]:
!mkdir {str(output_path)}

## creating "hm.inter" from "transactions_train.csv"

In [None]:
%%time
df_inter = pd.read_csv(
    path/"transactions_train.csv", 
    dtype={"article_id": str}
)
df_inter.head()

In [None]:
# (rows, columns) of the transactions frame.
df_inter.shape

In [None]:
# Column dtypes of the transactions frame.
df_inter.dtypes

In [None]:
# The 12 most frequently purchased articles among transactions whose month is
# August or September. Characters 5:7 of `t_dat` ("YYYY-MM-DD") are the month,
# so the filter matches those months in every year present in the data.
# Used later as the fallback prediction.
is_aug_or_sep = df_inter["t_dat"].str[5:7].isin(["08", "09"])
popular_items = (
    df_inter.loc[is_aug_or_sep, "article_id"]
    .value_counts()
    .head(12)
    .index
    .tolist()
)
popular_items

In [None]:
# Keep only the last 4,000,000 rows of the transactions frame.
# NOTE(review): presumably the CSV is ordered by date so these are the most
# recent purchases — confirm.
df_inter = df_inter.tail(4_000_000)
df_inter.head()

In [None]:
%%time
# t_dat's type is object(=str), but what I want is timestamp
# so convert str like "2018-09-20" to datetime64[s]
# and then, convert it to timestamp like 1537401600
df_inter["timestamp"] = df_inter.t_dat.values.astype('datetime64[s]').astype(np.int64)
df_inter["timestamp"].head()

In [None]:
# Keep only the three columns written to the interaction file.
inter_columns = ["customer_id", "article_id", "timestamp"]
df_inter = df_inter[inter_columns]

In [None]:
# Rename the columns to the "<field>:<type>" headers used in the .inter file.
# RecBole can work with implicit feedback, so no "rating" column is added.
inter_header = {
    "customer_id": "user_id:token",
    "article_id": "item_id:token",
    "timestamp": "timestamp:float",
}
df_inter = df_inter.rename(columns=inter_header)
df_inter.head()

In [None]:
# Write the interactions as a tab-separated file without the DataFrame index.
inter_file = output_path / "hm.inter"
df_inter.to_csv(inter_file, sep="\t", index=False)

In [None]:
# Show the first lines of the file just written to check header and separator.
!head {output_path/"hm.inter"}

## creating "hm.user" from "customers.csv"

In [None]:
%%time
df_user = pd.read_csv(path/"customers.csv")
df_user.head()

In [None]:
# (rows, columns) of the customer frame.
df_user.shape

In [None]:
# Column dtypes of the customer frame.
df_user.dtypes

In [None]:
# Number of distinct postal codes (cardinality check before using it as a token feature).
df_user.postal_code.nunique()

In [None]:
# Keep only the customer columns written to the user file.
user_columns = ["customer_id", "age", "postal_code"]
df_user = df_user[user_columns]

In [None]:
# Rename the columns to the "<field>:<type>" headers used in the .user file.
user_header = {
    "customer_id": "user_id:token",
    "age": "age:float",
    "postal_code": "postal_code:token",
}
df_user = df_user.rename(columns=user_header)
df_user.head()

In [None]:
# Write the user features as a tab-separated file without the DataFrame index.
user_file = output_path / "hm.user"
df_user.to_csv(user_file, sep="\t", index=False)

In [None]:
# Show the first lines of the file just written to check header and separator.
!head {output_path/"hm.user"}

## creating "hm.item" from "articles.csv"

In [None]:
%%time
df_item = pd.read_csv(path/"articles.csv", dtype={"article_id": str})
df_item.head()

In [None]:
# Display the first article vertically so every column is visible.
df_item.iloc[0]

In [None]:
# (rows, columns) of the article frame.
df_item.shape

In [None]:
# Column dtypes of the article frame.
df_item.dtypes

In [None]:
# Keep the article id plus the five attributes exported as item features.
item_columns = [
    "article_id",
    "product_type_no",
    "product_group_name",
    "colour_group_code",
    "perceived_colour_value_id",
    "index_group_no",
]
df_item = df_item[item_columns]

In [None]:
# Rename the columns to the "<field>:<type>" headers used in the .item file.
# NOTE(review): `product_type_no` looks like a category code but is declared
# as `float` here — confirm this is intended.
item_header = {
    "article_id": "item_id:token",
    "product_type_no": "product_type_no:float",
    "product_group_name": "product_group_name:token_seq",
    "colour_group_code": "colour_group_code:token",
    "perceived_colour_value_id": "perceived_colour_value_id:token",
    "index_group_no": "index_group_no:token",
}
df_item = df_item.rename(columns=item_header)
df_item.head()

In [None]:
# Write the item features as a tab-separated file without the DataFrame index.
item_file = output_path / "hm.item"
df_item.to_csv(item_file, sep="\t", index=False)

In [None]:
# Show the first lines of the file just written to check header and separator.
!head {output_path/"hm.item"}

## delete dataframes

In [None]:
# The three frames are already saved to disk; drop the references and run a
# garbage-collection pass to free memory before training.
del df_inter, df_user, df_item

gc.collect()

## using RecBole

In [None]:
import logging
from logging import getLogger

from recbole.config import Config
from recbole.data import create_dataset, data_preparation
from recbole.model.sequential_recommender import SASRecF
from recbole.trainer import Trainer
from recbole.utils import init_seed, init_logger
from recbole.utils.case_study import full_sort_topk

In [None]:
# Item attribute columns: loaded from hm.item and also listed under
# "selected_features" below.
item_feature_fields = [
    "product_type_no",
    "product_group_name",
    "colour_group_code",
    "perceived_colour_value_id",
    "index_group_no",
]

# Columns to load from each of the hm.inter / hm.user / hm.item files.
load_col = {
    "inter": ["user_id", "item_id", "timestamp"],
    "user": ["user_id", "age", "postal_code"],
    "item": ["item_id", *item_feature_fields],
}

# Evaluation protocol used while training: 95/3/2 ratio split, grouped by user.
# NOTE(review): "TO" / "full" are presumably RecBole's time-ordered split and
# full-ranking evaluation modes — confirm against the RecBole config docs.
eval_args = {
    "split": {"RS": [0.95, 0.03, 0.02]},
    "group_by": "user",
    "order": "TO",
    "mode": "full",
}

# Settings handed to recbole.config.Config for the "hm" dataset.
config_dict = {
    "data_path": "/kaggle/working",
    "USER_ID_FIELD": "user_id",
    "ITEM_ID_FIELD": "item_id",
    "TIME_FIELD": "timestamp",
    "user_inter_num_interval": "[2, inf)",
    "item_inter_num_interval": "[50, inf)",
    "load_col": load_col,
    # Separate list object so the two entries never alias each other.
    "selected_features": list(item_feature_fields),
    "neg_sampling": None,
    "eval_args": eval_args,
    "metrics": ["MRR", "Hit", "MAP"],
    "topk": 12,
    "epochs": 30,
    "show_progress": True,
    "valid_metric": "MAP@12",
    "learning_rate": 0.003,
}

In [None]:
# Name of the RecBole model to train; passed to Config and used to pick the model class.
model_name = "SASRecF"

In [None]:
%%time

print(f"training {model_name}")

config = Config(model=model_name, dataset='hm', config_dict=config_dict)
init_seed(config['seed'], config['reproducibility'])

dataset = create_dataset(config)
train_data, valid_data, test_data = data_preparation(config, dataset)

model = eval(model_name)(config, train_data.dataset).to(config['device'])
trainer = Trainer(config, model)
_, best_valid_result = trainer.fit(
    train_data, 
    valid_data, 
    callback_fn=lambda epoch_idx, valid_score: print(f"{config['valid_metric']} at epoch {epoch_idx + 1}: {valid_score}")
)

print(f"result: {best_valid_result}")

In [None]:
# Evaluate the trained model on the held-out test split and display the metrics.
trainer.evaluate(test_data)

In [None]:
# Drop the training-time dataset and its three splits, then run a
# garbage-collection pass; they are rebuilt with a different split below.
del dataset, train_data, valid_data, test_data

gc.collect()

# making recommendations
See the RecBole case-study guide: https://recbole.io/docs/user_guide/usage/case_study.html  
Its section "Get the top ranked item for each user" is the approach used below.

In [None]:
# Load the sample submission; it provides the customer ids to predict for.
sample_submission_csv = path / "sample_submission.csv"
submission_df = pd.read_csv(sample_submission_csv)
submission_df.head()

In [None]:
# Discard the placeholder predictions; only `customer_id` is kept so the
# model's predictions can be merged in later.
submission_df = submission_df.drop(columns="prediction")
submission_df.head()

In [None]:
# Rebuild the dataset with a 0/0/1 ratio split so that every interaction ends
# up in `test_data`, which is what the top-k lookup below is run against.
#
# The training settings are copied rather than edited in place: mutating
# `config_dict` here would make a later re-run of the training cell silently
# use this inference-only split.
inference_config_dict = {
    **config_dict,
    "eval_args": {
        "split": {"RS": [0.0, 0.0, 1.0]},
        "group_by": "user",
        "order": "TO",
        "mode": "full"
    },
}
config = Config(model=model_name, dataset='hm', config_dict=inference_config_dict)
dataset = create_dataset(config)
train_data, valid_data, test_data = data_preparation(config, dataset)
test_data.dataset

In [None]:
# Translate every internal user id back to its original customer id, then drop
# the entry for internal id 0.
# NOTE(review): id 0 is presumably RecBole's padding entry — confirm.
all_internal_user_ids = list(range(dataset.user_num))
all_user_tokens = dataset.id2token(dataset.uid_field, all_internal_user_ids)
external_user_ids = all_user_tokens[1:]

In [None]:
# Query the trained model for the 12 highest-scored items of every user, one
# user at a time. Internal id 0 is skipped, matching `external_user_ids`.
topk_items = []
for internal_user_id in tqdm(range(1, dataset.user_num)):
    _, ranked_iids = full_sort_topk(
        [internal_user_id], model, test_data, k=12, device=config['device']
    )
    # Only the last row of the returned top-k result is used.
    # NOTE(review): presumably that row belongs to the user's most recent
    # sequence — confirm against full_sort_topk.
    newest_row = ranked_iids[-1]
    item_tokens = dataset.id2token(dataset.iid_field, newest_row.cpu())
    topk_items.append(item_tokens.tolist())
print(len(topk_items))

In [None]:
# One row per user: the customer id and its recommended article ids joined by
# single spaces.
prediction_strings = list(map(" ".join, topk_items))
df_recommendation = pd.DataFrame(
    {"customer_id": external_user_ids, "prediction": prediction_strings}
)
del topk_items
df_recommendation.head()

In [None]:
# (rows, columns) of the recommendation frame.
df_recommendation.shape

In [None]:
# Left-join the predictions onto the full customer list; customers without a
# row in `df_recommendation` get NaN in `prediction`.
submission_df = pd.merge(
    submission_df, df_recommendation, how="left", on="customer_id"
)
del df_recommendation

submission_df.head()

In [None]:
# (rows, columns) of the submission frame after the merge.
submission_df.shape

In [None]:
# Count missing values per column, i.e. customers that received no prediction from the merge.
submission_df.isna().sum()

In [None]:
# Customers without a prediction fall back to the most popular
# August/September items computed at the top of the notebook.
fallback_prediction = " ".join(popular_items)
submission_df["prediction"] = submission_df["prediction"].fillna(fallback_prediction)
submission_df.isna().sum()

In [None]:
# Final look at the submission frame before it is written out.
submission_df.head()

In [None]:
# Write the submission file without the DataFrame index.
submission_df.to_csv("submission.csv", index=False)