In [None]:

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# Seed NumPy's and PyTorch's global random generators with the same fixed
# value so that runs of the notebook are repeatable.
np.random.seed(42)
torch.manual_seed(42)

# Use the GPU when PyTorch reports one as available; otherwise use the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

### Loading & Preprocessing Ratings

In [None]:

# Read the "::"-delimited ratings file. The file has no header row, so the
# column names are supplied explicitly; a multi-character separator is parsed
# with the Python engine.
ratings = pd.read_csv(
    "data/ratings.dat",
    sep="::",
    engine="python",
    names=["user", "item", "rating", "timestamp"],
)

# Keep only the (user, item, rating) columns.
ratings = ratings.drop(columns="timestamp")

# Map the raw ids onto contiguous 0-based indices, numbered in order of first
# appearance, so they can be used directly as matrix row/column positions.
user_ids = ratings["user"].unique()
item_ids = ratings["item"].unique()
user2idx = dict(zip(user_ids, range(len(user_ids))))
item2idx = dict(zip(item_ids, range(len(item_ids))))

ratings = ratings.assign(
    user=ratings["user"].map(user2idx),
    item=ratings["item"].map(item2idx),
)

# Number of distinct users / items.
n_users, n_items = len(user2idx), len(item2idx)

### Train/Test Split

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the individual ratings as a test set; the fixed random_state
# makes the split repeatable.
train_df, test_df = train_test_split(
    ratings,
    test_size=0.2,
    random_state=42,
)

### User–Item Matrix

In [None]:
# Dense user-item matrix, initialised to zero. Entries that receive no
# training rating stay at 0.
R = np.zeros((n_users, n_items), dtype=np.float32)

# Write every training rating in a single vectorised assignment instead of
# looping over the rows in Python. The integer arrays index rows (users) and
# columns (items) pairwise.
train_users = train_df["user"].to_numpy()
train_items = train_df["item"].to_numpy()
R[train_users, train_items] = train_df["rating"].to_numpy()

print(f"Built R matrix of shape {R.shape}, "
      f"with {np.count_nonzero(R)} known entries.")

Built R matrix of shape (6040, 3706), with 800167 known entries.


### UserDataset and DataLoader

In [None]:
from torch.utils.data import Dataset, DataLoader
import torch

class UserDataset(Dataset):
    """Dataset whose samples are the rows of a rating matrix (one row per user)."""

    def __init__(self, R_matrix):
        """Store ``R_matrix`` as a tensor.

        Args:
            R_matrix: NumPy array of shape (n_users, n_items). It is converted
                with ``torch.from_numpy``.
        """
        self.R = torch.from_numpy(R_matrix)

    def __len__(self):
        """Return the number of rows, i.e. the number of users."""
        return self.R.shape[0]

    def __getitem__(self, idx):
        """Return the rating row stored at position ``idx``."""
        return self.R[idx]

# Wrap the rating matrix in a dataset and serve it in shuffled mini-batches
# of users.
batch_size = 64
user_dataset = UserDataset(R)
user_loader = DataLoader(
    dataset=user_dataset,
    batch_size=batch_size,
    shuffle=True,
)

# Sanity check: pull a single batch and show its shape,
# expected to be (batch_size, n_items).
batch = next(iter(user_loader))
print(batch.shape)

torch.Size([64, 3706])


### AutoRec Model

In [6]:
import torch.nn as nn

class AutoRec(nn.Module):
    """Single-hidden-layer autoencoder over rating vectors.

    The encoder is a linear layer followed by ReLU (n_items -> hidden_dim);
    the decoder is a linear layer back to n_items with no output activation.
    """

    def __init__(self, n_items, hidden_dim=512):
        """Create the encoder and decoder layers.

        Args:
            n_items: Length of each input/output rating vector.
            hidden_dim: Width of the hidden representation.
        """
        super().__init__()
        encoder_layers = [nn.Linear(n_items, hidden_dim), nn.ReLU()]
        self.encoder = nn.Sequential(*encoder_layers)
        self.decoder = nn.Linear(hidden_dim, n_items)

    def forward(self, x):
        """Reconstruct ``x``.

        Args:
            x: Tensor of shape (batch_size, n_items).

        Returns:
            Tensor of shape (batch_size, n_items).
        """
        hidden = self.encoder(x)
        reconstruction = self.decoder(hidden)
        return reconstruction

### Instantiating Model, Loss, and Optimizer

In [7]:
# Hyperparameters
hidden_dim = 512        # width of the autoencoder's hidden layer
learning_rate = 1e-3    # Adam learning rate
weight_decay = 1e-5     # weight-decay coefficient passed to Adam

# Build the autoencoder, then move its parameters to the selected device.
ae_model = AutoRec(n_items, hidden_dim=hidden_dim)
ae_model = ae_model.to(device)

# Optimizer only: no loss object is created here.
optimizer_ae = torch.optim.Adam(
    params=ae_model.parameters(),
    weight_decay=weight_decay,
    lr=learning_rate,
)

### Training Loop

In [8]:

num_epochs = 10

for epoch in range(1, num_epochs + 1):
    ae_model.train()
    total_loss = 0.0

    for batch in user_loader:
        batch = batch.to(device)                  # shape: (batch_size, n_items)
        output = ae_model(batch)                  # reconstruction

        # Only score the reconstruction on entries holding a rating; entries
        # equal to 0 are treated as unobserved and contribute no error.
        mask = batch > 0                          # boolean mask
        diff = (output - batch)[mask]             # errors on known entries

        # Clamp the denominator so a batch with no observed entries yields a
        # zero loss instead of 0/0 = NaN, which would corrupt the weights on
        # the optimizer step. Batches with ratings are unaffected.
        n_known = mask.sum().clamp(min=1)
        loss = (diff * diff).sum() / n_known      # MSE over known entries

        optimizer_ae.zero_grad()
        loss.backward()
        optimizer_ae.step()

        total_loss += loss.item()

    # Mean of the per-batch losses (each batch weighted equally).
    avg_loss = total_loss / len(user_loader)
    print(f"Epoch {epoch}/{num_epochs}, Train Loss: {avg_loss:.4f}")

Epoch 1/10, Train Loss: 3.6141
Epoch 2/10, Train Loss: 1.8205
Epoch 3/10, Train Loss: 1.2312
Epoch 4/10, Train Loss: 1.0697
Epoch 5/10, Train Loss: 1.0855
Epoch 6/10, Train Loss: 1.0718
Epoch 7/10, Train Loss: 0.9895
Epoch 8/10, Train Loss: 0.9207
Epoch 9/10, Train Loss: 0.7629
Epoch 10/10, Train Loss: 0.6849


### Reconstructing & Evaluating on Test Set

In [9]:

# Reconstruct every user's row from the training matrix, without tracking
# gradients, and bring the result back to NumPy.
ae_model.eval()
with torch.no_grad():
    R_tensor = torch.from_numpy(R).to(device)
    reconstructed = ae_model(R_tensor).cpu().numpy()

# Gather the prediction for each held-out (user, item) pair in one vectorised
# lookup rather than a Python loop over the test rows.
preds = reconstructed[test_df["user"].to_numpy(), test_df["item"].to_numpy()]
actuals = test_df["rating"].to_numpy()

# Root-mean-squared error and mean absolute error over the test ratings.
rmse_ae = np.sqrt(np.mean((preds - actuals) ** 2))
mae_ae  = np.mean(np.abs(preds - actuals))
print(f"AutoRec Test RMSE: {rmse_ae:.4f}")
print(f"AutoRec Test MAE:  {mae_ae:.4f}")

AutoRec Test RMSE: 1.0590
AutoRec Test MAE:  0.8379


### Full Reconstructed Matrix (Computed Once)

In [18]:
# Same forward pass as in the evaluation cell, repeated here so that this
# section produces R_pred on its own: feed the whole training matrix through
# the trained model with gradient tracking disabled.
ae_model.eval()
R_tensor = torch.from_numpy(R).to(device)
with torch.no_grad():
    R_pred = ae_model(R_tensor).cpu().numpy()   # shape: (n_users, n_items)

### Top-N function for AutoRec

In [15]:
def recommend_autoRec(user_id, R_pred, train_df, idx2title, n=10):
    """Return the titles of the ``n`` highest-scored items a user has not rated.

    Args:
        user_id: Row index of the user in ``R_pred``; also matched against the
            ``'user'`` column of ``train_df``.
        R_pred: 2-D array of predicted scores indexed as ``[user, item]``.
        train_df: DataFrame with ``'user'`` and ``'item'`` columns; items this
            user has a row for are excluded from the result.
        idx2title: Mapping from item index to title.
        n: Maximum number of titles to return.

    Returns:
        List of titles ordered by descending predicted score.
    """
    # Items this user already has a row for in the training data.
    user_rows = train_df['user'] == user_id
    already_rated = set(train_df.loc[user_rows, 'item'].values)

    # Every remaining item index is a candidate.
    unseen_items = [
        item_idx
        for item_idx in range(R_pred.shape[1])
        if item_idx not in already_rated
    ]

    # Rank candidates by predicted score, highest first. The sort is stable,
    # so tied items keep ascending index order.
    ranked = sorted(
        unseen_items,
        key=lambda item_idx: R_pred[user_id, item_idx],
        reverse=True,
    )

    # Translate the best n indices into titles.
    return [idx2title[item_idx] for item_idx in ranked[:n]]


In [None]:
# Read the "::"-delimited movie metadata file (no header row; latin-1 encoded).
movies = pd.read_csv(
    "data/movies.dat",
    sep="::",
    engine="python",
    names=["item", "title", "genres"],
    encoding="latin-1"
)

# Invert item2idx: contiguous item index -> original movie id.
idx2item = {new_idx: orig_id for orig_id, new_idx in item2idx.items()}

# Build the original-id -> title lookup once, instead of scanning the whole
# movies frame with a boolean mask for every item. drop_duplicates keeps the
# first row per id, matching the previous `.values[0]` choice.
title_by_item = (
    movies.drop_duplicates(subset="item")
    .set_index("item")["title"]
    .to_dict()
)

# Contiguous item index -> title.
idx2title = {
    new_idx: title_by_item[orig_id]
    for new_idx, orig_id in idx2item.items()
}

# Quick sanity check
list(idx2title.items())[:5]

[(0, "One Flew Over the Cuckoo's Nest (1975)"),
 (1, 'James and the Giant Peach (1996)'),
 (2, 'My Fair Lady (1964)'),
 (3, 'Erin Brockovich (2000)'),
 (4, "Bug's Life, A (1998)")]

### Recommendation

In [17]:
# Produce and print the ten best-scored unrated titles for one user
# (user_id is the contiguous 0-based index, not the raw id from the file).
user_id = 0
top10_autoRec = recommend_autoRec(
    user_id,
    R_pred,
    train_df,
    idx2title,
    n=10,
)

print(f"AutoRec Top-10 for user {user_id}:")
for rank, title in enumerate(top10_autoRec, start=1):
    print(f"{rank}. {title}")

AutoRec Top-10 for user 0:
1. Beyond Rangoon (1995)
2. Castle, The (1997)
3. Heaven's Burning (1997)
4. Umbrellas of Cherbourg, The (Parapluies de Cherbourg, Les) (1964)
5. Trust (1990)
6. Kundun (1997)
7. Man for All Seasons, A (1966)
8. 400 Blows, The (Les Quatre cents coups) (1959)
9. Soldier's Daughter Never Cries, A (1998)
10. Dersu Uzala (1974)
