* Borrowed some of the preprocessing and EDA code from Ahmet Erdem's [H&M Pure Pytorch Baseline](https://www.kaggle.com/code/aerdem4/h-m-pure-pytorch-baseline/notebook) notebook.
* Employed some tricks from [this thread by Chris Deotte](https://www.kaggle.com/competitions/h-and-m-personalized-fashion-recommendations/discussion/308635) to reduce the memory footprint.

In [None]:
import gc
import sys
from itertools import chain

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from sklearn.preprocessing import OrdinalEncoder

## Reading and preprocessing data

In [None]:
# Load the transaction log. `article_id` is parsed directly as int32 instead of
# being read as strings and cast afterwards, so the per-row string objects for
# that column are never created.
df = pd.read_csv(
    "../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv",
    dtype={"article_id": "int32"},
    parse_dates=["t_dat"],
)
# Replace the customer id string with an integer built from its last 16 hex
# digits, then drop the string column to cut memory use.
# NOTE(review): 16 hex digits can exceed the signed 64-bit range, so the int64
# cast presumably wraps for large values -- confirm on the pandas version in use.
df['customer_id_int'] = df['customer_id'].apply(lambda x: int(x[-16:], 16)).astype('int64')
del df['customer_id']
print(df.shape)
df.head()

In [None]:
# The customer list comes from the sample submission; its placeholder
# `prediction` column is discarded right after loading.
test_df = (
    pd.read_csv('../input/h-and-m-personalized-fashion-recommendations/sample_submission.csv')
    .drop(columns="prediction")
)
# Integer customer key: the last 16 hex digits of the id string, parsed base 16.
test_df['customer_id_int'] = (
    test_df['customer_id']
    .apply(lambda customer: int(customer[-16:], 16))
    .astype('int64')
)

In [None]:
print("Max `t_dat`:", df["t_dat"].max())
# Most recent purchase date of every article.
active_articles = df.groupby("article_id")["t_dat"].max().reset_index()
# Keep only articles whose last purchase is on or after the cutoff date.
# drop=True discards the old row labels instead of adding them back as a
# stray "index" column.
active_articles = active_articles[active_articles["t_dat"] >= "2019-09-01"].reset_index(drop=True)
# One class per active article plus one extra slot
# (presumably reserved for padding / unknown items -- TODO confirm).
n_classes = active_articles.shape[0] + 1
active_articles.shape, n_classes

In [None]:
# Keep only the transactions whose article is listed in `active_articles`,
# then renumber the rows from zero.
df = (
    df.loc[df["article_id"].isin(active_articles["article_id"])]
    .reset_index(drop=True)
)
df.shape

In [None]:
# Week index counted backwards from the latest transaction date:
# 0 covers the final 7 days of data, 1 the 7 days before that, and so on.
last_day = df["t_dat"].max()
df["week"] = (last_day - df["t_dat"]).dt.days.floordiv(7)
print(df["week"].nunique())

## Most-Common-Items Baseline

In [None]:
# Purchase count per article over the two most recent weeks (week 0 and 1).
item_counts = df.loc[df["week"] < 2, "article_id"].value_counts()
item_counts.head(12)

In [None]:
# Ids of the first 12 entries of `item_counts`, and the same ids rendered as
# zero-padded 10-digit strings separated by single spaces.
most_frequent_items = item_counts.index[:12].to_numpy()
prediction_str = " ".join("{:010d}".format(article) for article in most_frequent_items)
prediction_str

In [None]:
# Every customer receives the same most-common-items prediction string.
test_df["prediction"] = prediction_str
# Write only the two submission columns; the helper `customer_id_int` key is an
# internal column and is left out of the file.
test_df[["customer_id", "prediction"]].to_csv(
    "submission_most_common_items.csv.gz", compression="gzip", index=False
)

## Recently-Purchased-Items Baseline

In [None]:
# Transactions from weeks 0..7, sorted descending on both keys so that each
# customer's newest purchases come first.
df_tmp = df.loc[df["week"] <= 7].sort_values(
    by=["customer_id_int", "t_dat"], ascending=[False, False]
)

In [None]:
def keep_latest_k(articles, k=12, fallback=None):
    """Return up to ``k`` distinct items from ``articles``, padded from ``fallback``.

    Items are taken in the order given and duplicates are skipped. Once
    ``articles`` is exhausted, the remaining spots are filled from
    ``fallback`` under the same no-duplicates rule.

    Args:
        articles: Iterable of article ids in priority order.
        k: Maximum number of items to return. Values <= 0 yield an empty list.
        fallback: Iterable of items used to fill the empty spots. When omitted,
            the module-level ``most_frequent_items`` is used.

    Returns:
        A list of at most ``k`` unique items.
    """
    if k <= 0:
        # Without this guard the `len(result) == k` stop condition is never
        # met and every distinct item would be returned.
        return []
    if fallback is None:
        # Use most commonly bought items to fill the empty spots
        fallback = most_frequent_items
    result = []
    for item in chain(articles, fallback):
        if item in result:
            continue
        result.append(item)
        if len(result) == k:
            break
    return result

In [None]:
# One row per customer: the article list produced by `keep_latest_k`.
purchases_by_customer = df_tmp.groupby("customer_id_int")
df_latest_items = purchases_by_customer.agg({"article_id": keep_latest_k}).reset_index()
# Render each list as space-separated, zero-padded 10-digit article ids.
df_latest_items["prediction"] = df_latest_items["article_id"].apply(
    lambda articles: " ".join("{:010d}".format(article) for article in articles)
)
df_latest_items.head()

In [None]:
# Swap the existing `prediction` column for the per-customer one. The left join
# keeps every test customer, including those absent from `df_latest_items`.
test_df = pd.merge(
    test_df.drop(columns='prediction'),
    df_latest_items[["customer_id_int", "prediction"]],
    how="left",
    on="customer_id_int",
)
test_df.head()

In [None]:
# Customers with no match in the merge have a missing prediction;
# give them the most-bought-items string instead.
test_df.loc[test_df["prediction"].isna(), "prediction"] = prediction_str
test_df.head()

In [None]:
# Write the submission file with just the customer id and its prediction.
test_df.to_csv(
    "submission_recently_purchased.csv.gz",
    columns=["customer_id", "prediction"],
    index=False,
    compression="gzip",
)