<i>Copyright (c) Recommenders contributors.</i>

<i>Licensed under the MIT License.</i>

# SASRec 

Self-Attentive Sequential Recommendation (SASRec) [1] is a sequential recommendation model that uses self-attention mechanisms [2] to capture the sequential patterns in user-item interactions. It is designed to predict the next item a user is likely to interact with, based on their previous interactions.

In [None]:
import os
import sys
import torch
import pandas as pd

from recommenders.datasets import movielens
from recommenders.datasets.pandas_df_utils import filter_k_interactions
from recommenders.models.unirec.data.dataset.movielens_utils import merge_category
from recommenders.models.unirec.model.sequential.sasrec import SASRec


# Report the interpreter and library versions in use, so that results from this
# notebook can be tied to a specific environment.
print(f"System version: {sys.version}")
print(f"Pandas version: {pd.__version__}")
print(f"PyTorch version: {torch.__version__}")

In [2]:
# top k items to recommend
TOP_K = 10

# Select MovieLens data size: 100k, 1m, 10m, or 20m
MOVIELENS_DATA_SIZE = "100k"

# Column names shared by every DataFrame built in this notebook.
USER_COL = "userId"
ITEM_COL = "movieId"
ITEM_SEQ_COL = "item_seq"  # per-user item IDs joined into one comma-separated string
RATING_COL = "rating"
TIMESTAMP_COL = "timestamp"
GENRE_COL = "genre"  # raw "|"-separated genre string of a movie
CATEGORY_COL = "cateId"  # numeric category IDs derived from the genre string

# Working directory: passed as the MovieLens local cache path and used as the
# location of the generated files.
OUTPATH = "."
# Destination of the per-user item-sequence table. NOTE: despite the .csv
# extension, the file is written tab-separated.
FULL_USER_HISTORY_PATH = os.path.join(OUTPATH, "full_user_history.csv")

In [None]:
# Load the MovieLens ratings of the selected size. The four interaction columns
# are named with the notebook's column constants, a genre column is requested
# in addition, and OUTPATH is passed as the local cache path.
df = movielens.load_pandas_df(
    size=MOVIELENS_DATA_SIZE,
    header=[USER_COL, ITEM_COL, RATING_COL, TIMESTAMP_COL],
    genres_col=GENRE_COL,
    local_cache_path=OUTPATH,
)
df.head(5)

In [None]:
# The ratings table repeats a movie's genre string on every interaction row;
# keep a single row per distinct (movie, genre string) pair.
cate_df = df.loc[:, [ITEM_COL, GENRE_COL]].drop_duplicates()
print(cate_df.shape)
cate_df.head(5)

In [None]:
# Extract all unique genres from the data; each movie stores its genres as a
# single "|"-separated string.
all_genres = {
    genre
    for genre_string in cate_df[GENRE_COL]
    for genre in genre_string.split("|")
}

# Create a mapping from genre to ID (1-based index). The genres are sorted
# first so that the IDs are identical on every run: iterating the set directly
# follows string-hash order, which Python randomizes per interpreter process.
genre_to_id = {genre: idx for idx, genre in enumerate(sorted(all_genres), start=1)}

# Replace each genre string with the list of its genre IDs. Every label was
# collected from this same column, so each lookup is guaranteed to succeed.
cate_df[CATEGORY_COL] = cate_df[GENRE_COL].apply(
    lambda genre_string: [genre_to_id[genre] for genre in genre_string.split("|")]
)

print("Genre to ID Mapping:", genre_to_id)
print("Number of unique genres:", len(all_genres))
# The raw genre strings are no longer needed once the IDs are in place.
cate_df.drop(columns=[GENRE_COL], inplace=True)
cate_df.head(5)

In [None]:
# The genre information is carried by cate_df from here on, so remove the raw
# genre column from the ratings and attach each movie's category list instead.
df.drop(columns=[GENRE_COL], inplace=True)
rating_df = df.merge(cate_df, how="inner", on=[ITEM_COL])

# Fold every category that contains fewer than min_item_in_cate items into a
# single category, and retrieve the resulting mappings.
cate2idx, item2cate, num_cates = merge_category(rating_df, min_item_in_cate=50)

print("New genre to ID Mapping:", dict(cate2idx.items()))
# print(item2cate)
print("Number of unique genres:", num_cates)
rating_df.head()

In [None]:
# Get only positive interactions. Sorting by user and then timestamp puts each
# user's interactions in chronological order.
data = rating_df.sort_values(by=[USER_COL, TIMESTAMP_COL], ignore_index=True)
print("original dataset size: {0}".format(data.shape))
# Filter on RATING_COL rather than a hard-coded "rating" so that this step
# follows the column name configured for the dataset, like every other step.
data = data[data[RATING_COL] >= 3].reset_index(drop=True)
# When a user has several rows for the same item, keep only the last one,
# i.e. the most recent given the sort order above.
data = data.drop_duplicates(subset=[USER_COL, ITEM_COL], keep="last").reset_index(drop=True)
print("filter by rating>=3 dataset size: {0}".format(data.shape))

# Filter out users and items with less than k interactions
data = filter_k_interactions(data, user_k=10, item_k=10, user_col=USER_COL, item_col=ITEM_COL)
data = data.reset_index(drop=True)
print("k filtered dataset size: {0}".format(data.shape))
data.head()

In [None]:
# Map the original user and item IDs to contiguous 1-based IDs, numbered in
# order of first appearance in the filtered data.
users = data[USER_COL].unique()
items = data[ITEM_COL].unique()
num_users, num_items = len(users), len(items)
user_id_map = {raw_id: idx for idx, raw_id in enumerate(users, start=1)}
item_id_map = {raw_id: idx for idx, raw_id in enumerate(items, start=1)}
# Copy of every mapping with its keys converted to strings.
map_info = {
    "user": {str(k): v for k, v in user_id_map.items()},
    "item": {str(k): v for k, v in item_id_map.items()},
    "cate": {str(k): v for k, v in cate2idx.items()},
}
print(f"Number of users: {num_users}, Number of items: {num_items}, Number of categories: {num_cates}")

# Resolve the categories while ITEM_COL still holds the original item IDs:
# item2cate was produced by merge_category(rating_df) before item_id_map
# existed, so it cannot be keyed by the remapped IDs. Looking it up after the
# remap would silently attach another item's category.
# NOTE(review): presumably item2cate is keyed by the original item IDs found
# in rating_df — confirm against merge_category.
data[CATEGORY_COL] = data[ITEM_COL].map(lambda raw_id: item2cate[raw_id])
data[USER_COL] = data[USER_COL].map(user_id_map)
data[ITEM_COL] = data[ITEM_COL].map(item_id_map)
# Remapped item ID -> category, taken from the rows just updated.
itemid2cate = data.set_index(ITEM_COL)[CATEGORY_COL].to_dict()
data.head()

In [None]:

# Build one row per user holding all of that user's item IDs, in row order,
# joined into a single comma-separated string.
data = data[[USER_COL, ITEM_COL, CATEGORY_COL]]
full_user_history = (
    data.groupby(USER_COL)[ITEM_COL]
    .agg(lambda item_ids: ",".join(map(str, item_ids)))
    .rename(ITEM_SEQ_COL)
    .reset_index()
)
# Persist the sequences as a tab-separated file without the index column.
full_user_history.to_csv(FULL_USER_HISTORY_PATH, sep='\t', index=False)
full_user_history.head()

## References

\[1\] Wang-Cheng Kang and Julian McAuley, *Self-Attentive Sequential Recommendation*, arXiv preprint arXiv:1808.09781, 2018. <br>

\[2\] Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, Łukasz Kaiser, and Illia Polosukhin, *Attention is all you need*, in Advances in Neural Information Processing Systems, 5998–6008, 2017. <br>