In [1]:
import os
from pathlib import Path 

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from collections import defaultdict


In [11]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [7]:

# ====================
# 1. データ前処理
# ====================
class MovieLensDataset(Dataset):
    def __init__(self, user_seq, max_len):
        self.user_seq = user_seq
        self.max_len = max_len

    def __len__(self):
        return len(self.user_seq)

    def __getitem__(self, idx):
        seq = self.user_seq[idx]
        # Padding
        seq = seq[-self.max_len:]
        seq = [0] * (self.max_len - len(seq)) + seq
        target = seq[-1]
        input_seq = seq[:-1]
        return torch.tensor(input_seq, dtype=torch.long), torch.tensor(target, dtype=torch.long)


def preprocess_movielens(max_len=50):
    ratings = pd.read_csv(
        os.path.join(Path().resolve(), "datasets/ml-1m/ratings.dat"),
        sep="::",
        engine="python",
        header=None,
        names=["uu_id", "movie_id", "rating", "timestamp"],
    )
    ratings = ratings.sort_values(by=["uu_id", "timestamp"])
    user_seq = defaultdict(list)

    for _, row in ratings.iterrows():
        user_seq[row["uu_id"]].append(row["movie_id"])

    user_seq = list(user_seq.values())

    return user_seq


In [8]:

# ====================
# 2. SASRec モデル
# ====================
class SASRec(nn.Module):
    def __init__(self, num_items, max_len, embed_dim, num_heads, num_layers, dropout):
        super(SASRec, self).__init__()
        self.num_items = num_items
        self.max_len = max_len
        self.embedding = nn.Embedding(num_items, embed_dim)
        self.position_embedding = nn.Embedding(max_len, embed_dim)
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(embed_dim, num_heads, dim_feedforward=embed_dim * 4, dropout=dropout),
            num_layers=num_layers,
        )
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(embed_dim, num_items)

    def forward(self, input_seq):
        seq_len = input_seq.size(1)
        positions = torch.arange(seq_len, device=input_seq.device).unsqueeze(0)
        item_embed = self.embedding(input_seq)
        pos_embed = self.position_embedding(positions)
        x = self.dropout(item_embed + pos_embed)
        x = self.transformer(x)
        logits = self.fc(x)
        return logits



In [9]:

# ====================
# 3. モデル学習と評価
# ====================
def train(model, dataloader, criterion, optimizer, device):
    model.train()
    total_loss = 0
    for input_seq, target in tqdm(dataloader, desc="Training"):
        input_seq, target = input_seq.to(device), target.to(device)
        optimizer.zero_grad()
        logits = model(input_seq)
        logits = logits[:, -1, :]  # 最後の出力のみ
        loss = criterion(logits, target)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)


def evaluate(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for input_seq, target in tqdm(dataloader, desc="Evaluating"):
            input_seq, target = input_seq.to(device), target.to(device)
            logits = model(input_seq)
            logits = logits[:, -1, :]  # 最後の出力のみ
            loss = criterion(logits, target)
            total_loss += loss.item()
    return total_loss / len(dataloader)



In [10]:

# ====================
# 4. ハイパーパラメータと実行
# ====================
def main():
    # ハイパーパラメータ
    data_path =  os.path.join(Path().resolve(), "datasets/ml-1m/ratings.dat")  # MovieLensデータのパス
    
    max_len = 50
    embed_dim = 64
    num_heads = 4
    num_layers = 2
    dropout = 0.2
    batch_size = 128
    num_epochs = 10
    learning_rate = 0.001
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # データ準備
    user_seq = preprocess_movielens(data_path, max_len=max_len)
    train_seq, test_seq = train_test_split(user_seq, test_size=0.2, random_state=42)

    train_dataset = MovieLensDataset(train_seq, max_len=max_len)
    test_dataset = MovieLensDataset(test_seq, max_len=max_len)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    num_items = max(max(seq) for seq in user_seq) + 1

    # モデル定義
    model = SASRec(num_items, max_len, embed_dim, num_heads, num_layers, dropout).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # 学習と評価
    for epoch in range(num_epochs):
        print(f"Epoch {epoch + 1}/{num_epochs}")
        train_loss = train(model, train_loader, criterion, optimizer, device)
        test_loss = evaluate(model, test_loader, criterion, device)
        print(f"Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}")


if __name__ == "__main__":
    main()


TypeError: preprocess_movielens() got multiple values for argument 'max_len'