In [1]:
import math
import os
import random
import zipfile
from io import BytesIO

import lightning as L
import numpy as np
import pandas as pd
import requests
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

In [2]:
URL = "https://files.grouplens.org/datasets/movielens/ml-1m.zip"
BASE_DIR = "../dataset"


def download_and_extract_zip(url, extract_path):
    response = requests.get(url)
    zip_file = zipfile.ZipFile(BytesIO(response.content))
    zip_file.extractall(extract_path)


if not os.path.exists(BASE_DIR):
    os.makedirs(BASE_DIR)
    download_and_extract_zip(URL, BASE_DIR)

In [3]:
DATASET_DIR = os.path.join(BASE_DIR, "ml-1m")

user_filepath = os.path.join(DATASET_DIR, "users.dat")
movie_filepath = os.path.join(DATASET_DIR, "movies.dat")
rating_filepath = os.path.join(DATASET_DIR, "ratings.dat")

users = pd.read_csv(
    user_filepath,
    sep="::",
    names=["user_id", "sex", "age_group", "occupation", "zip_code"],
    dtype={
        "user_id": int,
        "sex": str,
        "age_group": int,
        "occupation": int,
        "zip_code": str,
    },
)

movies = pd.read_csv(
    movie_filepath,
    sep="::",
    names=["movie_id", "title", "genres"],
    dtype={"movie_id": int, "title": str, "genres": str},
    encoding="ISO-8859-1",
)
ratings = pd.read_csv(
    rating_filepath,
    sep="::",
    names=["user_id", "movie_id", "rating", "unix_timestamp"],
    dtype={"user_id": int, "movie_id": int, "rating": int, "unix_timestamp": int},
)

  users = pd.read_csv(
  movies = pd.read_csv(
  ratings = pd.read_csv(


In [4]:
# user feat
unique_user_ids = users["user_id"].unique()
unique_user_ids.sort()

unique_sex = users["sex"].unique()
unique_sex.sort()

unique_age_group = users["age_group"].unique()
unique_age_group.sort()

unique_ocuupation = users["occupation"].unique()
unique_ocuupation.sort()

# movie feat
unique_movie_ids = movies["movie_id"].unique()
unique_movie_ids.sort()

# tokenization
user_id_mapping = {user_id: i for i, user_id in enumerate(unique_user_ids)}
sex_mapping = {sex: i for i, sex in enumerate(unique_sex)}
age_group_mapping = {age_group: i for i, age_group in enumerate(unique_age_group)}
occupation_mapping = {occupation: i for i, occupation in enumerate(unique_ocuupation)}
movie_id_mapping = {movie_id: i for i, movie_id in enumerate(unique_movie_ids)}

In [5]:
ratings["user_id"] = ratings["user_id"].map(user_id_mapping)
ratings["movie_id"] = ratings["movie_id"].map(movie_id_mapping)

users["user_id"] = users["user_id"].map(user_id_mapping)
users["sex"] = users["sex"].map(sex_mapping)
users["age_group"] = users["age_group"].map(age_group_mapping)
users["occupation"] = users["occupation"].map(occupation_mapping)

movies["movie_id"] = movies["movie_id"].map(movie_id_mapping)

In [6]:
ratings_group = ratings.sort_values(by=["unix_timestamp"]).groupby("user_id")

ratings_data = pd.DataFrame(
    data={
        "user_id": list(ratings_group.groups.keys()),
        "movie_ids": list(ratings_group.movie_id.apply(list)),
        "ratings": list(ratings_group.rating.apply(list)),
        "timestamps": list(ratings_group.unix_timestamp.apply(list)),
    }
)
ratings_data

In [7]:
user_id_data = []
movie_seq_data = []
rating_seq_data = []

sequence_length = 5
window_size = 1


for i in range(len(ratings_data)):
    row = ratings_data.iloc[i]

    movie_id_history = torch.tensor(row.movie_ids)
    rating_history = torch.tensor(row.ratings)
    movie_ids_seq = (
        movie_id_history.ravel().unfold(0, sequence_length, window_size).to(torch.int32)
    )
    ratings_seq = (
        rating_history.ravel().unfold(0, sequence_length, window_size).to(torch.int32)
    )

    user_id_data += [row.user_id] * len(movie_ids_seq)
    movie_seq_data += movie_ids_seq.tolist()
    rating_seq_data += ratings_seq.tolist()

In [8]:
sequencd_data_df = pd.DataFrame(
    {
        "user_id": user_id_data,
        "movie_seq": movie_seq_data,
        "rating_seq": rating_seq_data,
    }
)
sequencd_data_df

Unnamed: 0,user_id,movie_seq,rating_seq
0,0,"[3117, 1250, 1009, 1672, 2271]","[4, 5, 5, 4, 3]"
1,0,"[1250, 1009, 1672, 2271, 1768]","[5, 5, 4, 3, 5]"
2,0,"[1009, 1672, 2271, 1768, 3339]","[5, 4, 3, 5, 4]"
3,0,"[1672, 2271, 1768, 3339, 1189]","[4, 3, 5, 4, 4]"
4,0,"[2271, 1768, 3339, 1189, 2735]","[3, 5, 4, 4, 5]"
...,...,...,...
976044,6039,"[1627, 453, 3602, 229, 2848]","[4, 4, 4, 5, 4]"
976045,6039,"[453, 3602, 229, 2848, 1726]","[4, 4, 5, 4, 3]"
976046,6039,"[3602, 229, 2848, 1726, 1852]","[4, 5, 4, 3, 4]"
976047,6039,"[229, 2848, 1726, 1852, 159]","[5, 4, 3, 4, 3]"


In [9]:
all_movies_set = set(movies.movie_id)


def add_target_item(x):
    # Add negative items for CTR prediction
    movie_seq_data = x["movie_seq"]
    input_seq_len = len(movie_seq_data) - 1
    input_seq = movie_seq_data[:input_seq_len]

    if np.random.random() <= 0.5:
        is_clicked = True
        target_item = movie_seq_data[-1]
    else:
        is_clicked = False
        negative_items = list(all_movies_set - set(movie_seq_data))
        target_item = random.choice(negative_items)
    return input_seq, is_clicked, target_item

In [10]:
# Add negative target item by add_target_item()
sequencd_data_df[["input_seq", "is_clicked", "target_item"]] = sequencd_data_df.apply(
    lambda x: add_target_item(x), axis=1, result_type="expand"
)

In [11]:
sequencd_data_df = sequencd_data_df.join(users.set_index("user_id"), on="user_id")
sequencd_data_df

Unnamed: 0,user_id,movie_seq,rating_seq,input_seq,is_clicked,target_item,sex,age_group,occupation,zip_code
0,0,"[3117, 1250, 1009, 1672, 2271]","[4, 5, 5, 4, 3]","[3117, 1250, 1009, 1672]",False,232,0,0,10,48067
1,0,"[1250, 1009, 1672, 2271, 1768]","[5, 5, 4, 3, 5]","[1250, 1009, 1672, 2271]",True,1768,0,0,10,48067
2,0,"[1009, 1672, 2271, 1768, 3339]","[5, 4, 3, 5, 4]","[1009, 1672, 2271, 1768]",True,3339,0,0,10,48067
3,0,"[1672, 2271, 1768, 3339, 1189]","[4, 3, 5, 4, 4]","[1672, 2271, 1768, 3339]",False,1700,0,0,10,48067
4,0,"[2271, 1768, 3339, 1189, 2735]","[3, 5, 4, 4, 5]","[2271, 1768, 3339, 1189]",True,2735,0,0,10,48067
...,...,...,...,...,...,...,...,...,...,...
976044,6039,"[1627, 453, 3602, 229, 2848]","[4, 4, 4, 5, 4]","[1627, 453, 3602, 229]",False,645,1,2,6,11106
976045,6039,"[453, 3602, 229, 2848, 1726]","[4, 4, 5, 4, 3]","[453, 3602, 229, 2848]",True,1726,1,2,6,11106
976046,6039,"[3602, 229, 2848, 1726, 1852]","[4, 5, 4, 3, 4]","[3602, 229, 2848, 1726]",False,1574,1,2,6,11106
976047,6039,"[229, 2848, 1726, 1852, 159]","[5, 4, 3, 4, 3]","[229, 2848, 1726, 1852]",True,159,1,2,6,11106


In [12]:
random_selection = np.random.rand(len(sequencd_data_df)) <= 0.85
train_df = sequencd_data_df[random_selection]
valid_df = sequencd_data_df[~random_selection]

In [13]:
import torch
from torch.utils.data import Dataset


class MovieLensDataset(Dataset):
    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        user_id = row.user_id
        sex = row.sex
        age = row.age_group
        occupation = row.occupation
        input_seq = row.input_seq
        target_item = row.target_item
        label = row.is_clicked

        user_feat = (user_id, sex, age, occupation)
        seq_item = torch.tensor(input_seq)

        return (user_feat, seq_item, target_item, label)

In [14]:
class BST(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.embedding_layer = EmbeddingLayer(cfg)
        self.transformer_layer = TransformerLayer(cfg)
        self.mlp_layer = MLPLayer(cfg)

    def forward(self, user_feat, seq_item, target_item):
        user_emb, seq_item_emb, target_item_emb = self.embedding_layer(
            user_feat, seq_item, target_item
        )

        transformer_output = self.transformer_layer(seq_item_emb)

        concat_feat = torch.concat(
            [user_emb, transformer_output, target_item_emb],
            dim=-1,
        )

        p_ctr = self.mlp_layer(concat_feat)
        return p_ctr


class EmbeddingLayer(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg

        self.user_embedding = nn.Embedding(cfg.num_user, cfg.user_emb_dim)
        self.sex_embedding = nn.Embedding(cfg.num_sex, cfg.sex_emb_dim)
        self.age_embedding = nn.Embedding(cfg.num_age_group, cfg.age_group_emb_dim)
        self.occupation_embedding = nn.Embedding(
            cfg.num_occupation, cfg.occupation_emb_dim
        )
        self.movie_embedding = nn.Embedding(cfg.num_movie, cfg.movie_emb_dim)

    def forward(
        self,
        user_feat,
        seq_item,
        target_item,
    ):
        # Get user embeddings
        user_id, sex, age, occupation = user_feat
        user_emb = self.user_embedding(user_id)
        sex_emb = self.sex_embedding(sex)
        age_emb = self.age_embedding(age)
        occupation_emb = self.occupation_embedding(occupation)

        # Get movie embedding
        seq_item_emb = self.movie_embedding(seq_item)
        target_item_emb = self.movie_embedding(target_item)

        user_feat = torch.concat([user_emb, sex_emb, age_emb, occupation_emb], dim=-1)

        return user_feat, seq_item_emb, target_item_emb


class PositionalEncoding(nn.Module):
    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 100):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model)
        )
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer("pe", pe.transpose(0, 1))

    def forward(self, x):
        """
        x:Tensor, shape(batch size, seq_len, emb_dim)
        """
        seq_len = x.size(1)
        x = x + self.pe[:, :seq_len, :]
        return self.dropout(x)


class TransformerLayer(nn.Module):
    def __init__(self, cfg):
        super().__init__()

        self.pe = PositionalEncoding(
            d_model=cfg.d_model,
            dropout=cfg.dropout_rate,
            max_len=cfg.seq_len,
        )

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=cfg.d_model,
            nhead=cfg.nhead,
            dim_feedforward=cfg.dim_feedforward,
            dropout=cfg.dropout_rate,
            batch_first=True,
        )
        self.transformer_encoder = nn.TransformerEncoder(
            encoder_layer=encoder_layer, num_layers=2
        )

    def forward(self, movie_seq_emb):
        x = self.pe(movie_seq_emb)
        enc_out = self.transformer_encoder(x)[:, -1, :]
        return enc_out


class MLPLayer(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(cfg.mlp_dim, cfg.mlp_hidden_1_dim),
            nn.LeakyReLU(),
            nn.Linear(cfg.mlp_hidden_1_dim, cfg.mlp_hidden_2_dim),
            nn.LeakyReLU(),
            nn.Linear(cfg.mlp_hidden_2_dim, cfg.mlp_hidden_3_dim),
            nn.LeakyReLU(),
            nn.Linear(cfg.mlp_hidden_3_dim, cfg.mlp_hidden_4_dim),
            nn.Sigmoid(),
        )

    def forward(self, x):
        output = self.mlp(x)
        return output

In [15]:
class LightningModule(L.LightningModule):
    def __init__(self, CFG):
        super().__init__()
        self.CFG = CFG
        self.model = BST(CFG)
        self.loss_fn = nn.BCELoss()
        self.validation_step_outputs = []
        self.validation_step_labels = []

    def forward(self, user_feat, input_seq, target_item):
        return self.model(user_feat, input_seq, target_item)

    def training_step(self, batch, batch_idx):
        user_feat, input_seq, target_item, label = batch

        output = self(user_feat, input_seq, target_item)
        output = output.squeeze(-1)
        label = label.type(torch.float32)

        loss = self.loss_fn(output, label)
        self.log("train_loss", loss)

        return loss

    def validation_step(self, batch, batch_idx):

        user_feat, input_seq, target_item, label = batch

        output = self(user_feat, input_seq, target_item)
        output = output.squeeze(-1)
        label = label.type(torch.float32)

        self.validation_step_outputs.append(output)
        self.validation_step_labels.append(label)

        loss = self.loss_fn(output, label)
        self.log("valid_loss", loss)
        return loss

    def on_validation_epoch_end(self):
        all_output = torch.concat(self.validation_step_outputs)
        all_label = torch.concat(self.validation_step_labels)
        hit_count = ((all_output > 0.5) == all_label).sum()
        accuracy = hit_count / len(all_output)
        self.log("accuracy", accuracy)

        self.validation_step_outputs.clear()
        self.validation_step_labels.clear()

    def configure_optimizers(self):
        optimizer = optim.Adam(self.parameters(), lr=self.CFG.lr)
        return optimizer

In [19]:
class CFG:
    device = "cuda" if torch.cuda.is_available() else "cpu"
    lr = 5e-4
    n_epoch = 2

    num_user = 6040
    num_movie = 3883
    num_sex = 2
    num_age_group = 7
    num_occupation = 21

    seq_len = 4
    num_layers = 2

    user_emb_dim = 32
    movie_emb_dim = 32
    sex_emb_dim = 2
    age_group_emb_dim = 4
    occupation_emb_dim = 4

    d_model = 32
    nhead = 2
    dim_feedforward = 64

    mlp_dim = (
        d_model
        + user_emb_dim
        + sex_emb_dim
        + age_group_emb_dim
        + occupation_emb_dim
        + movie_emb_dim
    )
    mlp_hidden_1_dim = 128
    mlp_hidden_2_dim = 64
    mlp_hidden_3_dim = 32
    mlp_hidden_4_dim = 1

    dropout_rate = 0.1

In [22]:
print(f"Train Size : {len(train_df)}")
print(f"Valid Size : {len(valid_df)}")

train_dataset = MovieLensDataset(train_df)
valid_dataset = MovieLensDataset(valid_df)

train_dataloader = DataLoader(
    train_dataset,
    batch_size=256,
    shuffle=True,
    # num_workers=2,
)

valid_dataloader = DataLoader(
    valid_dataset,
    batch_size=256,
    shuffle=True,
    # num_workers=2,
)

Train Size : 829323
Valid Size : 146726


In [23]:
model = LightningModule(CFG=CFG)
trainer = L.Trainer(
    max_epochs=CFG.n_epoch,
)
trainer.fit(model, train_dataloader, valid_dataloader)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name    | Type    | Params | Mode 
--------------------------------------------
0 | model   | BST     | 358 K  | train
1 | loss_fn | BCELoss | 0      | train
--------------------------------------------
358 K     Trainable params
0         Non-trainable params
358 K     Total params
1.435     Total estimated model params size (MB)
43        Modules in train mode
0         Modules in eval mode


Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]

/Users/ailab/work/behavior-sequence-transformer-pytorch/.venv/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


                                                                           

/Users/ailab/work/behavior-sequence-transformer-pytorch/.venv/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Epoch 1: 100%|██████████| 3240/3240 [05:02<00:00, 10.72it/s, v_num=1]

`Trainer.fit` stopped: `max_epochs=2` reached.


Epoch 1: 100%|██████████| 3240/3240 [05:02<00:00, 10.72it/s, v_num=1]


In [24]:
trainer.validate(model, valid_dataloader)

/Users/ailab/work/behavior-sequence-transformer-pytorch/.venv/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:476: Your `val_dataloader`'s sampler has shuffling enabled, it is strongly recommended that you turn shuffling off for val/test dataloaders.
/Users/ailab/work/behavior-sequence-transformer-pytorch/.venv/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Validation DataLoader 0: 100%|██████████| 574/574 [00:45<00:00, 12.61it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
     Validate metric           DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        accuracy            0.7609898447990417
       valid_loss           0.48789283633232117
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'valid_loss': 0.48789283633232117, 'accuracy': 0.7609898447990417}]