# Setup

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [15]:
# get and format game data
games_df = pd.read_csv("../../bgg_data/overall_games.csv")
names = games_df[["Name", "BGGId"]]
games_df = games_df.drop(columns=["Name"], errors='ignore')

# get and format user ratings data
ratings_df = pd.read_csv("../../bgg_data/user_ratings.csv")
ratings_df = ratings_df[ratings_df["BGGId"].isin(set(games_df["BGGId"]))]
# train with just the first 50k users (bc 411k is taking wayyyyy too long)
unique_users = ratings_df['Username'].unique()[:15000]
ratings_df = ratings_df[ratings_df["Username"].isin(unique_users)]
user_id_map = {uid: idx for idx, uid in enumerate(unique_users)}

train_ratings_df, test_ratings_df = train_test_split(ratings_df, test_size=0.2, random_state=42)

# Make Datasets and Dataloaders

In [17]:
class RatingsDataset(Dataset):
    def __init__(self, ratings_df, games_df, user_id_map, transform=None):
        self.ratings_df = ratings_df.reset_index(drop=True)
        self.games_df = games_df
        self.user_id_map = user_id_map
        self.transform = transform

    def __len__(self):
        return len(self.ratings_df)

    def __getitem__(self, idx):
        row = self.ratings_df.iloc[idx]
        user_id = row['Username']
        bggid = row['BGGId']
        rating = row['Rating']
        # Map the UserId to an index.
        user_index = self.user_id_map[user_id]
        # Look up game features for this BGGId.
        game_features = self.games_df[self.games_df["BGGId"] == bggid]
        if self.transform:
            game_features = self.transform(game_features)
        # Convert to tensors.
        game_features = torch.tensor(game_features.values.squeeze(), dtype=torch.float32)
        user_index = torch.tensor(user_index, dtype=torch.long)
        rating = torch.tensor(rating, dtype=torch.float32)
        return game_features, user_index, rating

In [18]:
train_dataset = RatingsDataset(train_ratings_df, games_df, user_id_map)
test_dataset = RatingsDataset(test_ratings_df, games_df, user_id_map)

batch_size = 256
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=8)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=8)

# Make Game Encoder

In [19]:
class GameEncoder(nn.Module):
    def __init__(self, input_dim, latent_dim=64):
        super(GameEncoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 1024),
            nn.BatchNorm1d(1024),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(1024, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(512, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Linear(256, latent_dim)
        )
    def forward(self, x):
        return self.encoder(x)

# Make Overall Ratings Predictor

In [20]:
class RatingPredictor(nn.Module):
    def __init__(self, game_encoder, num_users, latent_dim=64, user_emb_dim=32, mlp_hidden_dim=128):
        super(RatingPredictor, self).__init__()
        self.game_encoder = game_encoder
        self.user_embedding = nn.Embedding(num_users, user_emb_dim)
        self.mlp = nn.Sequential(
            nn.Linear(latent_dim + user_emb_dim, mlp_hidden_dim),
            nn.ReLU(),
            nn.Linear(mlp_hidden_dim, mlp_hidden_dim // 2),
            nn.ReLU(),
            nn.Linear(mlp_hidden_dim // 2, 1)
        )
    def forward(self, game_features, user_ids):
        game_emb = self.game_encoder(game_features)
        user_emb = self.user_embedding(user_ids)
        x = torch.cat([game_emb, user_emb], dim=1)
        rating = self.mlp(x)
        return rating.squeeze(1)

# Make Model, Hyperparameters

In [21]:
latent_dim = 64
input_dim = len(train_dataset.games_df.columns)
num_users = len(unique_users)
game_encoder = GameEncoder(input_dim, latent_dim=latent_dim)
model = RatingPredictor(game_encoder, num_users, latent_dim=latent_dim, user_emb_dim=32, mlp_hidden_dim=128)
device = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'
model.to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)
num_epochs = 10

# Define Training Procedure

In [22]:
def train_epoch(model, loader, optimizer, criterion, device):
    model.train()
    total_loss = 0.0
    for game_features, user_ids, ratings in loader:
        game_features = game_features.to(device)
        user_ids = user_ids.to(device)
        ratings = ratings.to(device)
        optimizer.zero_grad()
        outputs = model(game_features, user_ids)
        loss = criterion(outputs, ratings)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * game_features.size(0)
    return total_loss / len(loader.dataset)

def evaluate(model, loader, criterion, device):
    model.eval()
    total_loss = 0.0
    with torch.no_grad():
        for game_features, user_ids, ratings in loader:
            game_features = game_features.to(device)
            user_ids = user_ids.to(device)
            ratings = ratings.to(device)
            outputs = model(game_features, user_ids)
            loss = criterion(outputs, ratings)
            total_loss += loss.item() * game_features.size(0)
    return total_loss / len(loader.dataset)

def train_model(model, num_epochs, train_loader, test_loader, optimizer, criterion, device='cpu'):
    train_losses = []
    test_losses = []
    for epoch in range(num_epochs):
        train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
        test_loss = evaluate(model, test_loader, criterion, device)
        print(f"Epoch {epoch+1}/{num_epochs} - Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}")
        train_losses.append(train_loss)
        test_losses.append(test_loss)
    return train_losses, test_losses

# Train the Model

In [23]:
train_loss, test_loss = train_model(
    model,
    num_epochs,
    train_loader, 
    test_loader, 
    optimizer,
    criterion, 
    device=device)

Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/Users/claudia/opt/anaconda3/lib/python3.8/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "/Users/claudia/opt/anaconda3/lib/python3.8/multiprocessing/spawn.py", line 126, in _main
    self = reduction.pickle.load(from_parent)
AttributeError: Can't get attribute 'RatingsDataset' on <module '__main__' (built-in)>


KeyboardInterrupt: 

# Evaluate the Model

In [None]:
def plot_losses(train_losses, test_losses, title="Training and Test Loss Over Epochs"):
    epochs = range(1, len(train_losses) + 1)
    plt.figure(figsize=(8, 5))
    plt.plot(epochs, train_losses, label='Train Loss')
    plt.plot(epochs, test_losses, label='Test Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title(title)
    plt.legend()
    plt.grid(True)
    plt.show()

In [None]:
plot_losses(train_loss, test_loss)

In [16]:
len(ratings_df)

4424663