# Milestone 1: Modeling v1

In [14]:
import numpy as np  
import pandas as pd 
import re
import ast
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.data import Data
import time
import os 
import psutil
from sklearn.metrics import mean_squared_error, mean_absolute_error


### Clean & Load Data

In [15]:
# READ USER RATING DATA
# Column names are supplied explicitly through `names`.

rating_columns = ["timestamp", "user_id", "movie", "rating"]
user_rating = pd.read_csv('data/user_rate_short_data.csv', names=rating_columns)
user_rating

Unnamed: 0,timestamp,user_id,movie,rating
0,2025-02-12T05:11:55,13539,the+lion+king+2+simbas+pride+1998,4
1,2025-02-12T05:11:55,35499,howls+moving+castle+2004,5
2,2025-02-12T05:11:57,58700,so+close+2002,4
3,2025-02-12T05:11:57,87247,much+ado+about+nothing+1993,4
4,2025-02-12T05:12,62758,the+arrival+of+a+train+at+la+ciotat+1896,4
...,...,...,...,...
14995,2025-01-26T01:05:49,51815,ronin+1998,3
14996,2025-01-26T01:20:39,17929,the+mask+of+zorro+1998,1
14997,2025-01-26T04:04:57,2575,stargate+1994,5
14998,2025-01-26T04:50:33,92847,shrek+2001,3


In [16]:
# READ USER WATCH HISTORY DATA
# Column names are supplied explicitly through `names`.

watch_columns = ["timestamp", "user_id", "movie", "file"]
watch_history = pd.read_csv('data/user_watch_short_data.csv', names=watch_columns)
watch_history

Unnamed: 0,timestamp,user_id,movie,file
0,2025-02-12T05:11:55,54831,konga+1961,22.mpg
1,2025-02-12T05:11:55,31542,avatar+2009,48.mpg
2,2025-02-12T05:11:55,45759,up+2009,11.mpg
3,2025-02-12T05:11:55,23048,love+is+all+you+need+2012,104.mpg
4,2025-02-12T05:11:55,95221,about+a+boy+2002,45.mpg
...,...,...,...,...
29995,2025-02-10T04:55:12,21435,demolition+man+1993,68.mpg
29996,2025-02-10T04:55:26,15384,crouching+tiger_+hidden+dragon+2000,67.mpg
29997,2025-02-10T04:56:12,21435,demolition+man+1993,69.mpg
29998,2025-02-10T04:56:22,15384,crouching+tiger_+hidden+dragon+2000,68.mpg


In [17]:
# READ USER DATA
# Reading the raw file directly errors because some lines have more than 4 columns.
# Copy only the lines with exactly 4 comma-separated fields into a cleaned file,
# then load that cleaned file.

with open('data/user_data.csv', 'r') as f, open('data/user_data_clean.csv', 'w') as fout:
    # Three commas <=> exactly four fields after a plain comma split.
    fout.writelines(line for line in f if line.strip().count(',') == 3)

user_columns = ["user_id", "age", "occupation", "gender"]
user = pd.read_csv('data/user_data_clean.csv', names=user_columns)
user

Unnamed: 0,user_id,age,occupation,gender
0,89163,29,college/grad student,M
1,81280,34,homemaker,M
2,105683,33,college/grad student,M
3,89163,29,college/grad student,M
4,81280,34,homemaker,M
...,...,...,...,...
104363,10900,33,sales/marketing,M
104364,88576,34,executive/managerial,M
104365,106888,28,sales/marketing,M
104366,106888,28,sales/marketing,M


In [18]:
# READ MOVIE DATA
# A plain CSV parse errors because the column count doesn't align for some rows:
# the parser splits on every comma, and the bracketed genre column itself contains commas.
# Match each line against a regex instead, capturing the bracketed genre list as one field.

pattern = re.compile(r'^([^,]+),([^,]+),(\[.*?\]),([^,]+),([^,]+)$')
column_names = ['title', 'flag', 'genres', 'release_date', 'language']
rows = []

with open('data/movie_data.csv', 'r') as f:
    for raw_line in f:
        line = raw_line.strip()
        match = pattern.match(line)
        if match is None:
            # Report (but skip) lines that do not have the expected five-field layout.
            print("Line didn't match the pattern:", line)
            continue
        rows.append(match.groups())

# One DataFrame row per successfully matched line.
movie = pd.DataFrame(rows, columns=column_names)

# Parse the genre literal and keep only each entry's 'name', giving a list of strings.
movie['genres'] = movie['genres'].apply(
    lambda raw: [item['name'] for item in ast.literal_eval(raw)]
)
movie

Unnamed: 0,title,flag,genres,release_date,language
0,terror+is+a+man+1959,False,[Horror],1959-11-01,en
1,the+decline+of+western+civilization+part+ii+th...,False,"[Documentary, Music]",1988-06-17,en
2,crooklyn+1994,False,"[Comedy, Drama]",1994-05-13,en
3,saturday+night+and+sunday+morning+1960,False,[Drama],1960-08-29,en
4,eastwest+1999,False,[Drama],1999-09-01,fr
...,...,...,...,...,...
17186,4+2005,False,[Drama],2005-01-28,ru
17187,the+great+sacrifice+1944,False,[Drama],1944-12-08,de
17188,the+great+sacrifice+1944,False,[Drama],1944-12-08,de
17189,the+first+star+2009,False,[Comedy],2009-01-01,fr


### Build a GCN Model

In [19]:
# Assign every user and every movie in `user_rating` a dense integer index.

# Sorted distinct user ids -> positions 0 .. num_users - 1
unique_users = np.sort(user_rating['user_id'].unique())
num_users = len(unique_users)
user2index = dict(zip(unique_users, range(num_users)))

# Sorted distinct movie identifiers -> positions 0 .. num_movies - 1
unique_movies = np.sort(user_rating['movie'].unique())
num_movies = len(unique_movies)
movie2index = dict(zip(unique_movies, range(num_movies)))

# The graph holds one node per user plus one node per movie.
num_nodes = num_users + num_movies

In [20]:
# Build the bidirectional user <-> movie edges of the rating graph.
#
# Node numbering: users occupy 0 .. num_users - 1 and movie indices are offset
# by num_users. Every rating row contributes two directed edges
# (user -> movie and movie -> user), interleaved in row order.

# Vectorised lookup instead of a Python-level DataFrame.iterrows() loop.
# Ids that are absent from the lookup tables map to NaN and are dropped,
# which mirrors a per-row membership check.
rated_pairs = pd.DataFrame({
    'user_node': user_rating['user_id'].map(user2index),
    'movie_node': user_rating['movie'].map(movie2index),
}).dropna()

edge_users = rated_pairs['user_node'].to_numpy(dtype=np.int64)
edge_movies = rated_pairs['movie_node'].to_numpy(dtype=np.int64) + num_users  # offset

# Fill a [2, 2 * num_pairs] array directly: even columns hold user -> movie,
# odd columns hold the reverse edge, i.e. (u0, m0), (m0, u0), (u1, m1), ...
edge_array = np.empty((2, 2 * len(edge_users)), dtype=np.int64)
edge_array[0, 0::2] = edge_users
edge_array[1, 0::2] = edge_movies
edge_array[0, 1::2] = edge_movies
edge_array[1, 1::2] = edge_users

# Kept as a list of [source, target] pairs for any code that inspects edge_list.
edge_list = edge_array.T.tolist()

# (Optional) If you also have watch_history, repeat similarly for those edges.

# Convert to PyTorch tensor, shape = [2, num_edges]. Building the array in this
# layout (rather than transposing an [E, 2] tensor) gives a contiguous tensor
# and keeps the [2, 0] shape even when there are no edges.
edge_index = torch.tensor(edge_array, dtype=torch.long)
print("edge_index shape:", edge_index.shape)  # [2, E]

edge_index shape: torch.Size([2, 30000])


In [21]:
# Wrap the graph connectivity in a torch_geometric Data object. Only the edge
# index is attached (no node feature matrix), so num_nodes is passed explicitly.
data = Data(edge_index=edge_index, num_nodes=num_nodes)
print(data)

Data(edge_index=[2, 30000], num_nodes=4633)


In [22]:
import torch.nn as nn

class GCNRecommender(nn.Module):
    """Two-layer GCN on top of a learnable embedding table.

    One embedding row is learned per graph node (users and movies share the
    table); two ``GCNConv`` layers then transform those rows using the graph
    connectivity.

    Args:
        num_nodes: Number of rows in the embedding table.
        embed_dim: Width of the initial node embeddings.
        hidden_dim: Output width of the first GCN layer.
        out_dim: Output width of the second GCN layer (final embedding size).
    """

    def __init__(self, num_nodes, embed_dim, hidden_dim, out_dim):
        super().__init__()

        # Trainable starting representation for every node.
        self.node_embeddings = nn.Embedding(num_nodes, embed_dim)

        # embed_dim -> hidden_dim -> out_dim
        self.conv1 = GCNConv(embed_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, out_dim)

    def forward(self, edge_index):
        """Compute embeddings for all nodes.

        Args:
            edge_index: Edge tensor passed unchanged to both GCN layers.

        Returns:
            The output of the second GCN layer, one row per node.
        """
        initial = self.node_embeddings.weight  # shape [num_nodes, embed_dim]
        hidden = F.relu(self.conv1(initial, edge_index))
        return self.conv2(hidden, edge_index)

# Instantiate the model
# Seed PyTorch's global RNG first: the model's parameters are randomly
# initialised, so without a fixed seed the losses and metrics reported below
# change on every Restart & Run All.
MODEL_SEED = 42
torch.manual_seed(MODEL_SEED)

model = GCNRecommender(
    num_nodes=num_nodes,
    embed_dim=16,    # initial embedding dimension
    hidden_dim=32,
    out_dim=16       # final embedding dimension
)

print(model)

GCNRecommender(
  (node_embeddings): Embedding(4633, 16)
  (conv1): GCNConv(16, 32)
  (conv2): GCNConv(32, 16)
)


In [23]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Build aligned (user_node, movie_node, rating) triples.
# The node lookups and the rating come from the same filtered frame, so
# ratings[i] always belongs to (user_nodes[i], movie_nodes[i]) even if a row
# has an id missing from the lookup tables and is dropped. Vectorised
# Series.map replaces a Python-level iterrows() loop.
interactions = pd.DataFrame({
    'user_node': user_rating['user_id'].map(user2index),
    'movie_node': user_rating['movie'].map(movie2index),
    'rating': user_rating['rating'],
}).dropna(subset=['user_node', 'movie_node'])

# Convert rating to a tensor
ratings = torch.tensor(interactions['rating'].to_numpy(), dtype=torch.float)

# (user_node, movie_node) pairs; movie indices are offset past the user nodes
user_nodes = torch.tensor(
    interactions['user_node'].to_numpy(dtype=np.int64), dtype=torch.long
)
movie_nodes = torch.tensor(
    interactions['movie_node'].to_numpy(dtype=np.int64) + num_users, dtype=torch.long
)


# 1) Train/Test Split (Simple)
# A dedicated, seeded generator makes the 80/20 split reproducible without
# touching PyTorch's global RNG state.
# NOTE(review): data.edge_index was built from every row of user_rating, so the
# held-out interactions still appear as edges in the graph used for message
# passing -- confirm whether that is intended for this evaluation.
SPLIT_SEED = 42
num_interactions = len(ratings)
train_size = int(0.8 * num_interactions)
split_generator = torch.Generator().manual_seed(SPLIT_SEED)
indices = torch.randperm(num_interactions, generator=split_generator)

train_indices = indices[:train_size]
test_indices = indices[train_size:]

train_user_nodes = user_nodes[train_indices]
train_movie_nodes = movie_nodes[train_indices]
train_ratings = ratings[train_indices]

test_user_nodes = user_nodes[test_indices]
test_movie_nodes = movie_nodes[test_indices]
test_ratings = ratings[test_indices]

# 2) Define Loss Function
mse_loss = nn.MSELoss()

# 3) Measure Training Cost
# Full-batch training: each epoch is one forward/backward pass over the whole
# graph and all training pairs.
EPOCHS = 5
train_times = []

model.train()
for epoch in range(EPOCHS):
    # perf_counter() is the high-resolution clock intended for measuring short
    # intervals; wall-clock time.time() can be too coarse for sub-second epochs.
    start_time = time.perf_counter()

    optimizer.zero_grad()

    # Forward pass
    node_emb = model(data.edge_index)       # shape [num_nodes, out_dim]
    user_emb = node_emb[train_user_nodes]   # shape [num_train, out_dim]
    movie_emb = node_emb[train_movie_nodes] # shape [num_train, out_dim]

    # Predict ratings via dot product
    pred = (user_emb * movie_emb).sum(dim=1)

    # Compute loss
    loss = mse_loss(pred, train_ratings)
    loss.backward()
    optimizer.step()

    end_time = time.perf_counter()
    train_times.append(end_time - start_time)

    print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")

avg_train_time = sum(train_times) / len(train_times)
print(f"\nTraining Cost:")
print(f"Moderate – Fit time: {avg_train_time:.2f}s (per epoch)\n")

# 4) Measure Inference Cost
# Times one gradient-free forward pass over the whole graph plus the dot-product
# scoring of every held-out (user, movie) pair.
model.eval()
# perf_counter() is the high-resolution clock intended for measuring short
# intervals; wall-clock time.time() can be too coarse for a pass this fast.
start_time = time.perf_counter()
with torch.no_grad():
    final_emb = model(data.edge_index)  # single forward pass for all nodes

    # Inference on test set
    test_user_emb = final_emb[test_user_nodes]
    test_movie_emb = final_emb[test_movie_nodes]
    test_preds = (test_user_emb * test_movie_emb).sum(dim=1)

end_time = time.perf_counter()
inference_time = end_time - start_time
print(f"Inference Cost:")
print(f"Low – Test time: {inference_time:.2f}s (fast inference time)\n")

# 5) Prediction Accuracy (RMSE & MAE)
# The scikit-learn metrics are fed NumPy arrays, so convert both tensors first.
test_ratings_np = test_ratings.cpu().numpy()
test_preds_np = test_preds.cpu().numpy()

mae = mean_absolute_error(test_ratings_np, test_preds_np)
test_mse = mean_squared_error(test_ratings_np, test_preds_np)
rmse = np.sqrt(test_mse)  # root of the mean squared error

print(f"Prediction Accuracy:")
print(f"RMSE: {rmse:.4f}, MAE: {mae:.4f}\n")

# 6) Measure Disk/Memory Size
# Disk size: serialise the state dict to a scratch file and read its size in MB.
# The try/finally guarantees the scratch file is removed even if saving or
# stat-ing it raises, so a failed run does not leave a stray checkpoint behind.
temp_model_path = "temp_gcn_model.pth"
try:
    torch.save(model.state_dict(), temp_model_path)
    disk_size = os.path.getsize(temp_model_path) / (1024 * 1024)
finally:
    if os.path.exists(temp_model_path):
        os.remove(temp_model_path)

# Memory: resident set size of the whole current process in MB (not just the model).
process = psutil.Process(os.getpid())
memory_usage = process.memory_info().rss / (1024 * 1024)

print(f"Disk/Memory Size:")
print(f"disk size ({disk_size:.2f} MB)/Memory Size: ({memory_usage:.2f} MB)")


Epoch 1, Loss: 12.8155
Epoch 2, Loss: 10.8197
Epoch 3, Loss: 8.0435
Epoch 4, Loss: 5.2710
Epoch 5, Loss: 4.4113

Training Cost:
Moderate – Fit time: 0.02s (per epoch)

Inference Cost:
Low – Test time: 0.01s (fast inference time)

Prediction Accuracy:
RMSE: 2.3629, MAE: 1.6767

Disk/Memory Size:
disk size (0.29 MB)/Memory Size: (710.70 MB)


In [24]:
def recommend_movies_for_user(user_id, top_k=5):
    """Return up to ``top_k`` movies ranked by embedding dot-product score.

    Reads the notebook-level ``final_emb``, ``user2index``, ``num_users`` and
    ``unique_movies``.

    Args:
        user_id: Raw user id as it appears as a key of ``user2index``.
        top_k: Maximum number of movies to return. Values larger than the
            number of movies are clamped; values <= 0 give an empty list.

    Returns:
        A list of entries from ``unique_movies``, best score first, or ``[]``
        when the user id is unknown.

    Note:
        Every movie is scored, including ones the user has already rated.
    """
    if user_id not in user2index:
        return []

    # Rows from num_users onward are the movie nodes.
    movie_emb = final_emb[num_users:]  # [num_movies, out_dim]

    # torch.topk raises when k exceeds the number of candidates, so clamp it.
    k = min(int(top_k), movie_emb.shape[0])
    if k <= 0:
        return []

    user_vec = final_emb[user2index[user_id]]  # shape [out_dim]

    # Dot product with all movie embeddings
    scores = (movie_emb * user_vec).sum(dim=1)  # [num_movies]

    # Get top-K (returned in descending score order)
    top_k_indices = torch.topk(scores, k=k).indices.tolist()

    # Map back to movie titles/IDs
    return [unique_movies[i] for i in top_k_indices]

# Example usage: recommend for the first user in sorted id order
test_user_id = unique_users[0]
example_recommendations = recommend_movies_for_user(test_user_id, top_k=5)
print("Recommendations for user:", test_user_id)
print(example_recommendations)

Recommendations for user: 10
['the+return+of+the+living+dead+1985', 'help+im+a+fish+2000', 'the+baby-sitters+club+1995', 'the+resurrected+1991', 'instinct+1999']
