In [16]:
import random
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import roc_auc_score
from torch_geometric.data import HeteroData
from torch_geometric.nn import SAGEConv  # Changed from GCNConv to SAGEConv
import torch.nn as nn

# ──────────────── 1. LOAD DATA (Same as before) ────────────────
df_ratings = pd.read_csv("u.data", sep="\t", header=None, 
                        names=["user_id", "item_id", "rating", "timestamp"])
df_users = pd.read_csv("u.user", sep="|", header=None, 
                      names=["user_id", "age", "gender", "occupation", "zip_code"])

item_cols = ["movie_id", "movie_title", "release_date", "video_release_date", "IMDb_URL",
            "unknown", "Action", "Adventure", "Animation", "Children's", "Comedy",
            "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror",
            "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"]
df_items = pd.read_csv("u.item", sep="|", header=None, names=item_cols, encoding="latin-1")

# ──────────────── 2. SIMPLE FEATURE IMPROVEMENTS (Same as before) ────────────────
user_id_map = {raw: idx for idx, raw in enumerate(df_users["user_id"])}
movie_id_map = {raw: idx for idx, raw in enumerate(df_items["movie_id"])}

df_ratings["user_id_mapped"] = df_ratings["user_id"].map(user_id_map)
df_ratings["movie_id_mapped"] = df_ratings["item_id"].map(movie_id_map)

num_users = len(user_id_map)
num_movies = len(movie_id_map)

def create_simple_user_features(df_users, df_ratings):
    age_scaled = MinMaxScaler().fit_transform(df_users[["age"]])
    occ_enc = LabelEncoder().fit(df_users["occupation"])
    occ_encoded = occ_enc.transform(df_users["occupation"])[:, None]
    
    user_stats = df_ratings.groupby('user_id').agg({
        'rating': ['mean', 'count']
    }).fillna(0)
    user_stats.columns = ['avg_rating', 'num_ratings']
    
    stats_scaled = MinMaxScaler().fit_transform(user_stats.values)
    features = np.hstack([age_scaled, occ_encoded, stats_scaled])
    return torch.tensor(features, dtype=torch.float)

def create_simple_movie_features(df_items, df_ratings):
    genre_features = df_items[item_cols[5:]].values
    
    movie_stats = df_ratings.groupby('item_id').agg({
        'rating': ['mean', 'count']
    }).fillna(0)
    movie_stats.columns = ['avg_rating', 'num_ratings']
    
    stats_scaled = MinMaxScaler().fit_transform(movie_stats.values)
    features = np.hstack([genre_features, stats_scaled])
    return torch.tensor(features, dtype=torch.float)

u_feats = create_simple_user_features(df_users, df_ratings)
m_feats = create_simple_movie_features(df_items, df_ratings)

print(f"User features: {u_feats.shape[1]} dimensions")
print(f"Movie features: {m_feats.shape[1]} dimensions")

# ──────────────── 3. BUILD GRAPH (Same as before) ────────────────
hetero = HeteroData()
hetero["user"].x = u_feats
hetero["movie"].x = m_feats

edge_index = torch.tensor([df_ratings["user_id_mapped"].values,
                          df_ratings["movie_id_mapped"].values], dtype=torch.long)
hetero["user", "rates", "movie"].edge_index = edge_index

data = hetero.to_homogeneous(node_attrs=["x"], edge_attrs=None)

# ──────────────── 4. DATA SPLITTING (Same as before) ────────────────
pos_df = df_ratings[["user_id_mapped", "movie_id_mapped"]]
train_val, test_df = train_test_split(pos_df, test_size=0.10, random_state=42)
train_df, val_df = train_test_split(train_val, test_size=0.1111, random_state=42)

all_pos = set(zip(pos_df.user_id_mapped, pos_df.movie_id_mapped))

def sample_neg(n):
    negs = set()
    while len(negs) < n:
        u = random.randrange(num_users)
        v = random.randrange(num_movies)
        if (u, v) not in all_pos:
            negs.add((u, v))
    return pd.DataFrame(list(negs), columns=["user_id_mapped","movie_id_mapped"])

neg_train = sample_neg(len(train_df))
neg_val = sample_neg(len(val_df))
neg_test = sample_neg(len(test_df))

def build_edges(pos, neg):
    u_list = list(pos.user_id_mapped) + list(neg.user_id_mapped)
    m_list = [m + num_users for m in list(pos.movie_id_mapped) + list(neg.movie_id_mapped)]
    ei = torch.tensor([u_list, m_list], dtype=torch.long)
    lbl = torch.tensor([1]*len(pos) + [0]*len(neg), dtype=torch.float)
    return ei, lbl

train_ei, train_lbl = build_edges(train_df, neg_train)
val_ei, val_lbl = build_edges(val_df, neg_val)
test_ei, test_lbl = build_edges(test_df, neg_test)

# ──────────────── 5. GRAPHSAGE MODEL (MAIN CHANGE HERE) ────────────────
class SimpleGraphSAGE(torch.nn.Module):
    def __init__(self, in_feats, hidden_dim=128):
        super().__init__()
        # CHANGED: Using SAGEConv instead of GCNConv
        self.conv1 = SAGEConv(in_feats, hidden_dim)
        self.conv2 = SAGEConv(hidden_dim, hidden_dim)
        self.conv3 = SAGEConv(hidden_dim, hidden_dim)
        
        # Same batch normalization as before
        self.bn1 = nn.BatchNorm1d(hidden_dim)
        self.bn2 = nn.BatchNorm1d(hidden_dim)
        
        # Same decoder as before
        self.decoder = nn.Sequential(
            nn.Linear(2 * hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(hidden_dim, 1)
        )

    def encode(self, x, edge_index):
        # Layer 1
        h1 = self.conv1(x, edge_index)
        h1 = self.bn1(h1)
        h1 = F.relu(h1)
        h1 = F.dropout(h1, p=0.3, training=self.training)
        
        # Layer 2
        h2 = self.conv2(h1, edge_index)
        h2 = self.bn2(h2)
        h2 = F.relu(h2)
        h2 = F.dropout(h2, p=0.3, training=self.training)
        
        # Layer 3
        h3 = self.conv3(h2, edge_index)
        return h3

    def decode(self, z, edge_label_index):
        src, dst = edge_label_index
        h = torch.cat([z[src], z[dst]], dim=1)
        return self.decoder(h).view(-1)

    def forward(self, x, edge_index, edge_label_index):
        z = self.encode(x, edge_index)
        return self.decode(z, edge_label_index)

# ──────────────── 6. TRAINING (Same as before) ────────────────
def train_graphsage_model():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = SimpleGraphSAGE(data.num_features, hidden_dim=128).to(device)  # Changed model name
    
    optimizer = torch.optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-4)
    loss_fn = torch.nn.BCEWithLogitsLoss()

    x = data.x.to(device)
    ei = data.edge_index.to(device)
    
    best_val_auc = 0
    patience_counter = 0
    patience = 20
    
    print("Training GraphSAGE model...")
    
    for epoch in range(1, 1000):
        # Training
        model.train()
        optimizer.zero_grad()
        out = model(x, ei, train_ei.to(device))
        loss = loss_fn(out, train_lbl.to(device))
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        # Validation every 10 epochs
        if epoch % 10 == 0:
            model.eval()
            with torch.no_grad():
                val_out = model(x, ei, val_ei.to(device))
                val_loss = loss_fn(val_out, val_lbl.to(device))
                
                val_pred = torch.sigmoid(val_out).cpu().numpy()
                val_auc = roc_auc_score(val_lbl.numpy(), val_pred)
                
                print(f"Epoch {epoch:03d} | Train Loss: {loss:.4f} | Val Loss: {val_loss:.4f} | Val AUC: {val_auc:.4f}")
                
                if val_auc > best_val_auc:
                    best_val_auc = val_auc
                    patience_counter = 0
                    best_model = model.state_dict().copy()
                else:
                    patience_counter += 1
                    
                if patience_counter >= patience:
                    print(f"Early stopping at epoch {epoch}")
                    break
    
    model.load_state_dict(best_model)
    return model

# ──────────────── 7. TRAIN AND EVALUATE ────────────────
if __name__ == "__main__":
    # Train the GraphSAGE model
    model = train_graphsage_model()
    
    # Final evaluation
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    x = data.x.to(device)
    ei = data.edge_index.to(device)
    loss_fn = torch.nn.BCEWithLogitsLoss()
    
    model.eval()
    with torch.no_grad():
        test_out = model(x, ei, test_ei.to(device))
        test_loss = loss_fn(test_out, test_lbl.to(device))
        test_pred = torch.sigmoid(test_out).cpu().numpy()
        test_auc = roc_auc_score(test_lbl.numpy(), test_pred)
        
        print(f"\n{'='*50}")
        print("GRAPHSAGE RESULTS")
        print(f"{'='*50}")
        print(f"Test Loss: {test_loss:.4f}")
        print(f"Test AUC: {test_auc:.4f}")
        
        


User features: 4 dimensions
Movie features: 21 dimensions
Training GraphSAGE model...
Epoch 010 | Train Loss: 0.6129 | Val Loss: 0.6802 | Val AUC: 0.6183
Epoch 020 | Train Loss: 0.5729 | Val Loss: 0.5842 | Val AUC: 0.7539
Epoch 030 | Train Loss: 0.5361 | Val Loss: 0.5476 | Val AUC: 0.8337
Epoch 040 | Train Loss: 0.5075 | Val Loss: 0.4913 | Val AUC: 0.8574
Epoch 050 | Train Loss: 0.4854 | Val Loss: 0.4619 | Val AUC: 0.8751
Epoch 060 | Train Loss: 0.4680 | Val Loss: 0.4417 | Val AUC: 0.8795
Epoch 070 | Train Loss: 0.4545 | Val Loss: 0.4402 | Val AUC: 0.8855
Epoch 080 | Train Loss: 0.4426 | Val Loss: 0.4235 | Val AUC: 0.8894
Epoch 090 | Train Loss: 0.4352 | Val Loss: 0.4152 | Val AUC: 0.8925
Epoch 100 | Train Loss: 0.4304 | Val Loss: 0.4097 | Val AUC: 0.8955
Epoch 110 | Train Loss: 0.4234 | Val Loss: 0.4043 | Val AUC: 0.8978
Epoch 120 | Train Loss: 0.4153 | Val Loss: 0.3972 | Val AUC: 0.9007
Epoch 130 | Train Loss: 0.4085 | Val Loss: 0.3925 | Val AUC: 0.9021
Epoch 140 | Train Loss: 0.4063