<i>Copyright (c) Recommenders contributors.</i>

<i>Licensed under the MIT License.</i>

# SASRec 

Self-Attentive Sequential Recommendation (SASRec) [1], is a sequential recommendation system model that uses self-attention mechanisms to capture the sequential patterns in user-item interactions. It is designed to predict the next item a user is likely to interact with based on their previous interactions.

In [1]:
import os
import sys
import torch
import pandas as pd

from recommenders.datasets import movielens
from recommenders.datasets.pandas_df_utils import filter_k_interactions
from recommenders.datasets.python_splitters import python_leave_one_out_split
from recommenders.models.unirec.data.dataset.movielens_utils import merge_category
from recommenders.models.unirec.model.sequential.sasrec import SASRec


print(f"System version: {sys.version}")
print(f"Pandas version: {pd.__version__}")
print(f"PyTorch version: {torch.__version__}")

System version: 3.11.9 (main, Apr 19 2024, 16:48:06) [GCC 11.2.0]
Pandas version: 2.2.2
PyTorch version: 2.3.1+cu121


In [2]:
# top k items to recommend
TOP_K = 10

# Select MovieLens data size: 100k, 1m, 10m, or 20m
MOVIELENS_DATA_SIZE = "100k"

USER_COL = "userId"
ITEM_COL = "movieId"
ITEM_SEQ_COL = "item_seq"
RATING_COL = "rating"
TIMESTAMP_COL = "timestamp"
GENRE_COL = "genre"
CATEGORY_COL = "cateId"

OUTPATH = "."
FULL_USER_HISTORY_PATH = os.path.join(OUTPATH, "full_user_history.csv")
USER_HISTORY_PATH = os.path.join(OUTPATH, "user_history.csv")
TRAIN_PATH = os.path.join(OUTPATH, "train.csv")
VALID_PATH = os.path.join(OUTPATH, "valid.csv")
TEST_PATH = os.path.join(OUTPATH, "test.csv")


In [3]:
df = movielens.load_pandas_df(
    size=MOVIELENS_DATA_SIZE,
    header=[USER_COL, ITEM_COL, RATING_COL, TIMESTAMP_COL],
    genres_col=GENRE_COL,
    local_cache_path=OUTPATH,
)
df.head(5)

Unnamed: 0,userId,movieId,rating,timestamp,genre
0,196,242,3.0,881250949,Comedy
1,186,302,3.0,891717742,Crime|Film-Noir|Mystery|Thriller
2,22,377,1.0,878887116,Children's|Comedy
3,244,51,2.0,880606923,Drama|Romance|War|Western
4,166,346,1.0,886397596,Crime|Drama


In [4]:
cate_df = df[[ITEM_COL, GENRE_COL]].drop_duplicates()
print(cate_df.shape)
cate_df.head(5)

(1682, 2)


Unnamed: 0,movieId,genre
0,242,Comedy
1,302,Crime|Film-Noir|Mystery|Thriller
2,377,Children's|Comedy
3,51,Drama|Romance|War|Western
4,346,Crime|Drama


In [5]:
# Extract all unique genres from the data
all_genres = set(genre for genre_string in cate_df[GENRE_COL] for genre in genre_string.split("|"))

# Create a mapping from genre to ID (1-based index)
genre_to_id = {genre: idx + 1 for idx, genre in enumerate(all_genres)}

# Map genres to IDs using the dynamic mapping
cate_df[CATEGORY_COL] = cate_df[GENRE_COL].apply(
    lambda x: [genre_to_id[genre] for genre in x.split("|") if genre in genre_to_id]
)

print("Genre to ID Mapping:", {genre: id for genre, id in genre_to_id.items()})
print("Number of unique genres:", len(all_genres))
cate_df.drop(columns=[GENRE_COL], inplace=True)
cate_df.head(5)

Genre to ID Mapping: {'Adventure': 1, 'Thriller': 2, 'Romance': 3, 'Horror': 4, 'Mystery': 5, "Children's": 6, 'Animation': 7, 'Musical': 8, 'War': 9, 'Crime': 10, 'Comedy': 11, 'unknown': 12, 'Sci-Fi': 13, 'Film-Noir': 14, 'Fantasy': 15, 'Western': 16, 'Drama': 17, 'Documentary': 18, 'Action': 19}
Number of unique genres: 19


Unnamed: 0,movieId,cateId
0,242,[11]
1,302,"[10, 14, 5, 2]"
2,377,"[6, 11]"
3,51,"[17, 3, 9, 16]"
4,346,"[10, 17]"


In [6]:
df.drop(columns=[GENRE_COL], inplace=True)
rating_df = pd.merge(df, cate_df, how="inner", on=[ITEM_COL])

# Merge categories containing a small number of items (lower than min_item_in_cate) into one category, and get the new mappings
cate2idx, item2cate, num_cates = merge_category(rating_df, min_item_in_cate=50)

print("New genre to ID Mapping:", {genre: id for genre, id in cate2idx.items()})
# print(item2cate)
print("Number of unique genres:", num_cates)
rating_df.head()

get cate2items: 100000it [00:00, 1771011.40it/s]
get item2cate: 100000it [00:00, 1933393.57it/s]

New genre to ID Mapping: {11: 1, 10: 2, 5: 3, 2: 4, 6: 5, 17: 6, 3: 7, 9: 8, 13: 9, 19: 10, 1: 11, 8: 12, 4: 13, 14: 14, 16: 14, 18: 14, 7: 14, 15: 14, 12: 14}
Number of unique genres: 14





Unnamed: 0,userId,movieId,rating,timestamp,cateId
0,196,242,3.0,881250949,[11]
1,186,302,3.0,891717742,"[10, 14, 5, 2]"
2,22,377,1.0,878887116,"[6, 11]"
3,244,51,2.0,880606923,"[17, 3, 9, 16]"
4,166,346,1.0,886397596,"[10, 17]"


In [7]:
# Get only positive interactions
data = rating_df.sort_values(by=[USER_COL, TIMESTAMP_COL], ignore_index=True)
print("original dataset size: {0}".format(data.shape))
data = data[data["rating"] >= 3].reset_index(drop=True)
data = data.drop_duplicates(subset=[USER_COL, ITEM_COL], keep="last").reset_index(drop=True)
print("filter by rating>=3 dataset size: {0}".format(data.shape))

# Filter out users and items with less than k interactions
data = filter_k_interactions(data, user_k=10, item_k=10, user_col=USER_COL, item_col=ITEM_COL)
data = data.reset_index(drop=True)
print("k filtered dataset size: {0}".format(data.shape))
data.head()

original dataset size: (100000, 5)
filter by rating>=3 dataset size: (82520, 5)
k filtered dataset size: (80393, 5)


Unnamed: 0,userId,movieId,rating,timestamp,cateId
0,1,168,5.0,874965478,[11]
1,1,172,5.0,874965478,"[19, 1, 17, 3, 13, 9]"
2,1,165,5.0,874965518,[17]
3,1,156,4.0,874965556,"[10, 2]"
4,1,196,5.0,874965677,[17]


In [8]:
# Map
users = data[USER_COL].unique()
items = data[ITEM_COL].unique()
num_users, num_items = len(users), len(items)
user_id_map = {id: i+1 for i, id in enumerate(users)}
item_id_map = {id: i+1 for i, id in enumerate(items)}
map_info = {"user": {str(k): v for k, v in user_id_map.items()}, 
            "item": {str(k): v for k, v in item_id_map.items()}, 
            "cate": {str(k): v for k, v in cate2idx.items()}}
print(f"Number of users: {num_users}, Number of items: {num_items}, Number of categories: {num_cates}")

data[USER_COL] = data[USER_COL].apply(lambda x: user_id_map[x])
data[ITEM_COL] = data[ITEM_COL].apply(lambda x: item_id_map[x])
data[CATEGORY_COL] = data[ITEM_COL].apply(lambda x: item2cate[x])
itemid2cate = data.set_index(ITEM_COL)[CATEGORY_COL].to_dict()
data = data[[USER_COL, ITEM_COL, CATEGORY_COL]]
print(f"Size: {data.shape}")
data.head()

Number of users: 939, Number of items: 1016, Number of categories: 14
Size: (80393, 3)


Unnamed: 0,userId,movieId,cateId
0,1,1,"[14, 5, 1]"
1,1,2,"[10, 11, 4]"
2,1,3,[4]
3,1,4,"[10, 1, 6]"
4,1,5,"[2, 6, 4]"


In [9]:

# Generate full user item sequence
full_user_history = data.groupby(by=USER_COL, as_index=False).agg(list).reset_index(drop=True)
full_user_history[ITEM_SEQ_COL] = full_user_history[ITEM_COL].apply(lambda x: ",".join(map(str,x)))
full_user_history = full_user_history[[USER_COL, ITEM_SEQ_COL]]
full_user_history.to_csv(FULL_USER_HISTORY_PATH, index=False, sep="\t")
full_user_history.head()

Unnamed: 0,userId,item_seq
0,1,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1..."
1,2,"213,198,214,215,216,217,218,219,220,221,222,14..."
2,3,"254,106,220,255,256,257,258,259,260,261,221,21..."
3,4,"198,216,248,268,262,220,273,270,279,280,256,28..."
4,5,"197,287,34,288,289,17,9,32,290,190,13,29,291,2..."


In [11]:
df_train0, df_test = python_leave_one_out_split(data, col_name=USER_COL)
df_train, df_valid = python_leave_one_out_split(df_train0, col_name=USER_COL)
# FIXME: Check this
    # df_train = df_train[columns_to_keep]  
    # df_test = df_test[columns_to_keep]  
    # df_valid = df_valid[columns_to_keep]


df_train.to_csv(TRAIN_PATH, index=False, sep="\t")
df_valid.to_csv(VALID_PATH, index=False, sep="\t")
df_test.to_csv(TEST_PATH, index=False, sep="\t")
print(df_train.shape)
print(df_valid.shape)
print(df_test.shape)

In [None]:
user_history = df_train0.groupby(by=USER_COL, as_index=False).agg(list).reset_index(drop=True)
user_history[ITEM_SEQ_COL] = user_history[ITEM_COL].apply(lambda x: ",".join(map(str,x)))
user_history = user_history[[USER_COL, ITEM_SEQ_COL]]
user_history.to_csv(USER_HISTORY_PATH, index=False, sep="\t")
user_history.head()

In [None]:
# FIXME: Check this
# with open(os.path.join(outpath, "map.json"), "w", encoding="utf-8") as jf:
#     json.dump(map_info, jf)

# with open(os.path.join(outpath, "item2cate.json"), "w", encoding="utf-8") as jf:
#     json.dump(itemid2cate, jf)

## Reference

\[1\] Wang-Cheng Kang, and Julian McAuley, *Self-Attentive Sequential Recommendation*, arXiv preprint arXiv:1808.09781, 2018. <br>

\[2\] Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, Łukasz Kaiser, and Illia Polosukhin, *Attention is all you need*, in Advances in Neural Information Processing Systems, 5998–6008, 2017. <br>