# Data Preprocessing for Top-K Restaurant Recommendation

## 1. Import Libraries and download Dataset

In [2]:
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from collections import defaultdict
import pickle
from scipy.sparse import csr_matrix
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import scipy.sparse as sp
import math

In [3]:
!wget https://mcauleylab.ucsd.edu/public_datasets/gdrive/googlelocal_restaurants/image_review_all.json

--2025-12-02 10:15:59--  https://mcauleylab.ucsd.edu/public_datasets/gdrive/googlelocal_restaurants/image_review_all.json
Resolving mcauleylab.ucsd.edu (mcauleylab.ucsd.edu)... 169.228.63.88
Connecting to mcauleylab.ucsd.edu (mcauleylab.ucsd.edu)|169.228.63.88|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1087842416 (1.0G) [application/json]
Saving to: ‘image_review_all.json.1’


2025-12-02 10:16:10 (94.4 MB/s) - ‘image_review_all.json.1’ saved [1087842416/1087842416]



## 2. Load Dataset

Load the raw Google restaurant review data from JSON Lines format.

In [4]:
print("\n[1/7] Loading dataset...")
data = []
with open('image_review_all.json', 'r') as f:
    for line in f:
        data.append(json.loads(line.strip()))

df = pd.DataFrame(data)
print(f"   Total reviews: {len(df):,}")
print(f"   Unique users: {df['user_id'].nunique():,}")
print(f"   Unique businesses: {df['business_id'].nunique():,}")


[1/7] Loading dataset...
   Total reviews: 1,487,747
   Unique users: 868,937
   Unique businesses: 64,527


## 3. Clean Data

Remove duplicates and handle missing values.

In [5]:
print("\n[2/7] Cleaning data...")
# Remove any duplicates
df = df.drop_duplicates(subset=['user_id', 'business_id'])
print(f"   After removing duplicates: {len(df):,} reviews")

# Check for missing values
missing = df[['user_id', 'business_id', 'rating']].isnull().sum()
if missing.sum() > 0:
    print(f"   Found missing values: {missing[missing > 0]}")
    df = df.dropna(subset=['user_id', 'business_id', 'rating'])
    print(f"   After removing missing: {len(df):,} reviews")
else:
    print("   No missing values found")


[2/7] Cleaning data...
   After removing duplicates: 1,487,747 reviews
   No missing values found


## 4. Filter Sparse Users and Businesses

To improve recommendation quality, we filter out:
- Users with fewer than 3 reviews
- Businesses with fewer than 5 reviews

This is done iteratively until convergence, as filtering one affects the other.

In [6]:
print("\n[3/7] Filtering sparse users and businesses...")
min_user_reviews = 3  # Users must have at least 3 reviews
min_business_reviews = 5  # Businesses must have at least 5 reviews

# Iteratively filter (because filtering businesses affects users and vice versa)
prev_size = 0
iteration = 0
while len(df) != prev_size and iteration < 10:
    prev_size = len(df)
    iteration += 1
    
    user_counts = df['user_id'].value_counts()
    business_counts = df['business_id'].value_counts()
    
    valid_users = user_counts[user_counts >= min_user_reviews].index
    valid_businesses = business_counts[business_counts >= min_business_reviews].index
    
    df = df[df['user_id'].isin(valid_users) & df['business_id'].isin(valid_businesses)]
    
    if iteration > 1:
        print(f"   Iteration {iteration}: {len(df):,} reviews")

print(f"\n   Final dataset after filtering:")
print(f"   Reviews: {len(df):,}")
print(f"   Users: {df['user_id'].nunique():,}")
print(f"   Businesses: {df['business_id'].nunique():,}")
print(f"   Sparsity: {(1 - len(df)/(df['user_id'].nunique() * df['business_id'].nunique()))*100:.4f}%")


[3/7] Filtering sparse users and businesses...
   Iteration 2: 544,416 reviews
   Iteration 3: 523,230 reviews
   Iteration 4: 517,958 reviews
   Iteration 5: 515,268 reviews
   Iteration 6: 514,479 reviews
   Iteration 7: 514,046 reviews
   Iteration 8: 513,936 reviews
   Iteration 9: 513,872 reviews
   Iteration 10: 513,868 reviews

   Final dataset after filtering:
   Reviews: 513,868
   Users: 98,975
   Businesses: 28,274
   Sparsity: 99.9816%


## 5. Create ID Mappings

Map user and business IDs to integer indices for efficient matrix operations.

In [7]:
print("\n[4/7] Creating ID mappings...")
unique_users = df['user_id'].unique()
unique_businesses = df['business_id'].unique()

user_to_idx = {uid: idx for idx, uid in enumerate(unique_users)}
idx_to_user = {idx: uid for uid, idx in user_to_idx.items()}

business_to_idx = {bid: idx for idx, bid in enumerate(unique_businesses)}
idx_to_business = {idx: bid for bid, idx in business_to_idx.items()}

# Add mapped indices to dataframe
df['user_idx'] = df['user_id'].map(user_to_idx)
df['business_idx'] = df['business_id'].map(business_to_idx)

print(f"   Created mappings for {len(user_to_idx):,} users and {len(business_to_idx):,} businesses")


[4/7] Creating ID mappings...
   Created mappings for 98,975 users and 28,274 businesses


## 6. Split into Train/Validation/Test Sets

We use per-user splitting to ensure:
- Each user appears in the training set
- We can evaluate on held-out user interactions

**Split Strategy:**
- Users with ≥5 reviews: 70% train, 15% val, 15% test
- Users with <5 reviews: All but 1 in train, 1 in test

In [8]:
print("\n[5/7] Splitting into train/validation/test sets...")

train_rows = []
val_rows = []
test_rows = []

grouped = df.groupby("user_id", sort=False)

for user_id, group in grouped:
    group = group.sample(frac=1, random_state=42)  # shuffle once
    n = len(group)

    if n >= 5:
        train_size = int(0.7 * n)
        val_size = int(0.15 * n)

        train_rows.append(group.iloc[:train_size])
        val_rows.append(group.iloc[train_size:train_size + val_size])
        test_rows.append(group.iloc[train_size + val_size:])
    else:
        train_rows.append(group.iloc[:-1])
        test_rows.append(group.iloc[-1:])

train_df = pd.concat(train_rows, ignore_index=True)
val_df = pd.concat(val_rows, ignore_index=True)
test_df = pd.concat(test_rows, ignore_index=True)

print(f"Train: {len(train_df):,}")
print(f"Val: {len(val_df):,}")
print(f"Test: {len(test_df):,}")



[5/7] Splitting into train/validation/test sets...
Train: 342,665
Val: 24,950
Test: 146,253


## 7. Create Sparse Interaction Matrices

Create user-item interaction matrices in sparse format for memory efficiency.

In [9]:
print("\n[6/7] Creating interaction matrices...")

def create_interaction_matrix(df_subset, n_users, n_businesses):
    """Create sparse user-item interaction matrix"""
    rows = df_subset['user_idx'].values
    cols = df_subset['business_idx'].values
    ratings = df_subset['rating'].values
    
    matrix = csr_matrix((ratings, (rows, cols)), shape=(n_users, n_businesses))
    return matrix

n_users = len(user_to_idx)
n_businesses = len(business_to_idx)

train_matrix = create_interaction_matrix(train_df, n_users, n_businesses)
val_matrix = create_interaction_matrix(val_df, n_users, n_businesses) if len(val_df) > 0 else None
test_matrix = create_interaction_matrix(test_df, n_users, n_businesses)

print(f"   Train matrix shape: {train_matrix.shape}")
print(f"   Train matrix density: {train_matrix.nnz / (train_matrix.shape[0] * train_matrix.shape[1]) * 100:.4f}%")
if val_matrix is not None:
    print(f"   Val matrix shape: {val_matrix.shape}")
print(f"   Test matrix shape: {test_matrix.shape}")


[6/7] Creating interaction matrices...
   Train matrix shape: (98975, 28274)
   Train matrix density: 0.0122%
   Val matrix shape: (98975, 28274)
   Test matrix shape: (98975, 28274)


## 8. Compute Business Statistics

Calculate statistics for each business to handle cold-start scenarios.

In [10]:
print("\n[7/7] Computing business statistics...")
business_stats = train_df.groupby('business_id').agg({
    'rating': ['mean', 'count', 'std']
}).reset_index()
business_stats.columns = ['business_id', 'avg_rating', 'num_ratings', 'std_rating']
business_stats['std_rating'] = business_stats['std_rating'].fillna(0)

# Add business index
business_stats['business_idx'] = business_stats['business_id'].map(business_to_idx)

print(f"   Computed statistics for {len(business_stats):,} businesses")
print(f"   Average rating across all businesses: {business_stats['avg_rating'].mean():.3f}")


[7/7] Computing business statistics...
   Computed statistics for 28,223 businesses
   Average rating across all businesses: 4.434


## 9. Save Preprocessed Data

Save all preprocessed data structures for model training.

In [11]:
print("\n" + "=" * 60)
print("SAVING PREPROCESSED DATA")
print("=" * 60)

# Save as pickle for fast loading
save_data = {
    'train_df': train_df,
    'val_df': val_df,
    'test_df': test_df,
    'train_matrix': train_matrix,
    'val_matrix': val_matrix,
    'test_matrix': test_matrix,
    'user_to_idx': user_to_idx,
    'idx_to_user': idx_to_user,
    'business_to_idx': business_to_idx,
    'idx_to_business': idx_to_business,
    'business_stats': business_stats,
    'n_users': n_users,
    'n_businesses': n_businesses,
}

with open('preprocessed_data.pkl', 'wb') as f:
    pickle.dump(save_data, f)
print("✓ Saved preprocessed_data.pkl")

# Also save CSV versions for easy inspection
train_df.to_csv('train_data.csv', index=False)
if len(val_df) > 0:
    val_df.to_csv('val_data.csv', index=False)
test_df.to_csv('test_data.csv', index=False)
business_stats.to_csv('business_stats.csv', index=False)
print("✓ Saved CSV files (train_data.csv, val_data.csv, test_data.csv, business_stats.csv)")

# Save mappings as JSON for readability
mappings = {
    'user_to_idx': user_to_idx,
    'business_to_idx': business_to_idx,
}
with open('id_mappings.json', 'w') as f:
    json.dump(mappings, f, indent=2)
print("✓ Saved id_mappings.json")


SAVING PREPROCESSED DATA


✓ Saved preprocessed_data.pkl
✓ Saved CSV files (train_data.csv, val_data.csv, test_data.csv, business_stats.csv)
✓ Saved id_mappings.json


## 10. Preprocessing Summary

In [12]:
print("\n" + "=" * 60)
print("PREPROCESSING SUMMARY")
print("=" * 60)
print(f"Total users: {n_users:,}")
print(f"Total businesses: {n_businesses:,}")
print(f"Total interactions: {len(df):,}")
print(f"")
print(f"Train interactions: {len(train_df):,}")
print(f"Val interactions: {len(val_df):,}")
print(f"Test interactions: {len(test_df):,}")
print(f"")
print(f"Avg reviews per user: {len(train_df)/n_users:.2f}")
print(f"Avg reviews per business: {len(train_df)/n_businesses:.2f}")
print(f"")
print(f"Rating distribution (train):")
for rating in sorted(train_df['rating'].unique()):
    count = (train_df['rating'] == rating).sum()
    pct = count / len(train_df) * 100
    print(f"  {rating} stars: {count:,} ({pct:.1f}%)")

print("\n✓ Preprocessing complete!")
print("\nTo load the preprocessed data:")
print("  import pickle")
print("  with open('preprocessed_data.pkl', 'rb') as f:")
print("      data = pickle.load(f)")


PREPROCESSING SUMMARY
Total users: 98,975
Total businesses: 28,274
Total interactions: 513,868

Train interactions: 342,665
Val interactions: 24,950
Test interactions: 146,253

Avg reviews per user: 3.46
Avg reviews per business: 12.12

Rating distribution (train):
  1 stars: 4,671 (1.4%)
  2 stars: 7,455 (2.2%)
  3 stars: 25,654 (7.5%)
  4 stars: 84,660 (24.7%)
  5 stars: 220,225 (64.3%)

✓ Preprocessing complete!

To load the preprocessed data:
  import pickle
  with open('preprocessed_data.pkl', 'rb') as f:
      data = pickle.load(f)


## Model training and evaluation

## 1. Load Data

In [13]:
with open("preprocessed_data.pkl", "rb") as f:
    data = pickle.load(f)

train_df = data["train_df"][["user_idx", "business_idx", "rating"]]
test_df = data["test_df"][["user_idx", "business_idx", "rating"]]
n_users = data["n_users"]
n_items = data["n_businesses"]

# Use likes (rating >= 4) for implicit feedback
train_likes = train_df[train_df["rating"] >= 4]
test_likes = test_df[test_df["rating"] >= 4]

print(f"Users: {n_users:,}, Items: {n_items:,}")
print(f"Train likes: {len(train_likes):,}")
print(f"Test likes: {len(test_likes):,}")

Users: 98,975, Items: 28,274
Train likes: 304,885
Test likes: 130,465


## 2. Build User-Item Graph

In [14]:
def build_sparse_graph(train_df, n_users, n_items):
    """
    Build normalized adjacency matrix for LightGCN
    Graph structure:
    [  0      R  ]
    [ R^T     0  ]
    
    Where R is user-item interaction matrix
    """
    users = train_df['user_idx'].values
    items = train_df['business_idx'].values
    
    # Build bipartite graph directly as COO (faster than DOK)
    print("  Building graph structure...")
    # User -> Item edges
    row_ui = users
    col_ui = items + n_users  # Offset item indices
    # Item -> User edges  
    row_iu = items + n_users
    col_iu = users
    # Combine both directions
    row = np.concatenate([row_ui, row_iu])
    col = np.concatenate([col_ui, col_iu])
    data = np.ones(len(row), dtype=np.float32)
    
    adj_mat = sp.coo_matrix((data, (row, col)), 
                            shape=(n_users + n_items, n_users + n_items),
                            dtype=np.float32)
    
    # Normalize: D^(-1/2) * A * D^(-1/2)
    rowsum = np.array(adj_mat.sum(1)).flatten()
    d_inv_sqrt = np.power(rowsum, -0.5)
    d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0.
    
    # Multiply by diagonal matrix efficiently
    row_normalized = d_inv_sqrt[row] * data * d_inv_sqrt[col]
    
    norm_adj = sp.coo_matrix((row_normalized, (row, col)), 
                             shape=(n_users + n_items, n_users + n_items),
                             dtype=np.float32)
    
    # Convert to torch sparse tensor
    indices = torch.LongTensor(np.vstack([norm_adj.row, norm_adj.col]))
    values = torch.FloatTensor(norm_adj.data)
    shape = torch.Size(norm_adj.shape)
    
    return torch.sparse_coo_tensor(indices, values, shape)

graph = build_sparse_graph(train_likes, n_users, n_items)
print(f"✓ Built graph with {graph._nnz()} edges")


  Building graph structure...
✓ Built graph with 609770 edges


  d_inv_sqrt = np.power(rowsum, -0.5)


## 3. LightGCN Model

In [None]:
class LightGCN(nn.Module):
    """
    LightGCN: Simplified Graph Convolutional Network
    """
    
    def __init__(self, n_users, n_items, embedding_dim=64, n_layers=3, reg_weight=1e-4):
        super().__init__()
        
        self.n_users = n_users
        self.n_items = n_items
        self.embedding_dim = embedding_dim
        self.n_layers = n_layers
        self.reg_weight = reg_weight
        
        # Initialize embeddings
        self.user_embedding = nn.Embedding(n_users, embedding_dim)
        self.item_embedding = nn.Embedding(n_items, embedding_dim)
        
        # Xavier initialization
        nn.init.xavier_uniform_(self.user_embedding.weight)
        nn.init.xavier_uniform_(self.item_embedding.weight)
        
        print(f"  Embedding dim: {embedding_dim}")
        print(f"  Layers: {n_layers}")
        print(f"  Total parameters: {(n_users + n_items) * embedding_dim:,}")
    
    def get_ego_embeddings(self):
        """Get initial embeddings (layer 0)"""
        user_emb = self.user_embedding.weight
        item_emb = self.item_embedding.weight
        ego_emb = torch.cat([user_emb, item_emb], dim=0)
        return ego_emb
    
    def forward(self, graph):
        """
        Graph convolution to get final embeddings
        """
        all_embeddings = [self.get_ego_embeddings()]
        
        # Multi-layer propagation
        for layer in range(self.n_layers):
            # Graph convolution: aggregate from neighbors
            ego_emb = all_embeddings[-1]
            if graph.device.type == 'cpu':
                side_emb = torch.sparse.mm(graph, ego_emb.cpu()).to(ego_emb.device)
            else:
                side_emb = torch.sparse.mm(graph, ego_emb)
            all_embeddings.append(side_emb)
        
        # Layer aggregation (mean of all layers)
        final_emb = torch.stack(all_embeddings, dim=1).mean(dim=1)
        
        # Split back to users and items
        users_emb = final_emb[:self.n_users]
        items_emb = final_emb[self.n_users:]
        
        return users_emb, items_emb
    
    def bpr_loss(self, users, pos_items, neg_items, user_emb, item_emb):
        """
        BPR loss: maximize difference between positive and negative items
        """
        # Get embeddings
        u_emb = user_emb[users]
        pos_emb = item_emb[pos_items]
        neg_emb = item_emb[neg_items]
        
        # Compute scores
        pos_scores = (u_emb * pos_emb).sum(dim=1)
        neg_scores = (u_emb * neg_emb).sum(dim=1)
        
        # BPR loss
        bpr_loss = -torch.log(torch.sigmoid(pos_scores - neg_scores) + 1e-10).mean()
        
        # L2 regularization
        reg_loss = self.reg_weight * (
            u_emb.norm(2).pow(2) + 
            pos_emb.norm(2).pow(2) + 
            neg_emb.norm(2).pow(2)
        ) / len(users)
        
        return bpr_loss + reg_loss
    
    def predict(self, users, items, user_emb, item_emb):
        """Predict scores for user-item pairs"""
        u_emb = user_emb[users]
        i_emb = item_emb[items]
        scores = (u_emb * i_emb).sum(dim=1)
        return scores


## 4. Training Dataset

In [16]:
class BPRDataset(Dataset):
    def __init__(self, train_df, n_items, num_negatives=1):
        self.n_items = n_items
        self.num_negatives = num_negatives
        
        # Build user positive items dict
        from collections import defaultdict
        self.user_pos_items = defaultdict(set)
        for _, row in train_df.iterrows():
            self.user_pos_items[int(row.user_idx)].add(int(row.business_idx))
        
        # Create training pairs
        self.users = []
        self.pos_items = []
        
        for user, items in self.user_pos_items.items():
            for item in items:
                self.users.append(user)
                self.pos_items.append(item)
        
        self.all_items = set(range(n_items))
        print(f"  Training pairs: {len(self.users):,}")
    
    def __len__(self):
        return len(self.users)
    
    def __getitem__(self, idx):
        user = self.users[idx]
        pos_item = self.pos_items[idx]
        
        # Sample negative items
        neg_items = []
        pos_set = self.user_pos_items[user]
        candidates = self.all_items - pos_set
        
        if len(candidates) > 0:
            neg_items = np.random.choice(list(candidates), 
                                        size=min(self.num_negatives, len(candidates)), 
                                        replace=False)
        else:
            neg_items = [np.random.randint(0, self.n_items)]
        
        return user, pos_item, neg_items[0]

train_dataset = BPRDataset(train_likes, n_items, num_negatives=1)

  Training pairs: 304,885


## 5. Training Loop

In [None]:
EMBEDDING_DIM = 64
N_LAYERS = 3
LEARNING_RATE = 0.001
REG_WEIGHT = 1e-4
BATCH_SIZE = 2048
EPOCHS = 20

# Setup
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(f"  Device: {device}")


# Initialize model
model = LightGCN(n_users, n_items, EMBEDDING_DIM, N_LAYERS, REG_WEIGHT)

model = model.to(device)
graph = graph.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, 
                          num_workers=4, pin_memory=True)

# Training
print("\nStarting training...")
best_loss = float('inf')

for epoch in range(1, EPOCHS + 1):
    model.train()
    total_loss = 0
    
    pbar = tqdm(train_loader, desc=f"Epoch {epoch}/{EPOCHS}")
    for batch_idx, (users, pos_items, neg_items) in enumerate(pbar):
        users = users.to(device)
        pos_items = pos_items.to(device)
        neg_items = neg_items.to(device)
        
        optimizer.zero_grad()
        
        # Get fresh embeddings for each batch (recompute graph convolution)
        user_emb, item_emb = model.forward(graph)
        
        # Compute loss
        loss = model.bpr_loss(users, pos_items, neg_items, user_emb, item_emb)
        
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        pbar.set_postfix({'loss': f'{loss.item():.4f}'})
    
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch}: Average Loss = {avg_loss:.4f}")
    
    if avg_loss < best_loss:
        best_loss = avg_loss
        torch.save(model.state_dict(), 'best_lightgcn.pt')

  Device: cuda:0
  Embedding dim: 64
  Layers: 3
  Total parameters: 8,143,936

Starting training...


Epoch 1/20: 100%|██████████| 38/38 [02:39<00:00,  4.20s/it, loss=0.6914]


Epoch 1: Average Loss = 0.6926


Epoch 2/20: 100%|██████████| 38/38 [02:40<00:00,  4.21s/it, loss=0.6743]


Epoch 2: Average Loss = 0.6849


Epoch 3/20: 100%|██████████| 38/38 [02:39<00:00,  4.21s/it, loss=0.6243]


Epoch 3: Average Loss = 0.6510


Epoch 4/20: 100%|██████████| 38/38 [02:40<00:00,  4.23s/it, loss=0.5468]


Epoch 4: Average Loss = 0.5844


Epoch 5/20: 100%|██████████| 38/38 [02:38<00:00,  4.17s/it, loss=0.4644]


Epoch 5: Average Loss = 0.5021


Epoch 6/20: 100%|██████████| 38/38 [02:38<00:00,  4.16s/it, loss=0.4015]


Epoch 6: Average Loss = 0.4251


Epoch 7/20: 100%|██████████| 38/38 [02:40<00:00,  4.22s/it, loss=0.3311]


Epoch 7: Average Loss = 0.3621


Epoch 8/20: 100%|██████████| 38/38 [02:37<00:00,  4.15s/it, loss=0.2909]


Epoch 8: Average Loss = 0.3128


Epoch 9/20: 100%|██████████| 38/38 [02:38<00:00,  4.16s/it, loss=0.2583]


Epoch 9: Average Loss = 0.2754


Epoch 10/20: 100%|██████████| 38/38 [02:40<00:00,  4.23s/it, loss=0.2300]


Epoch 10: Average Loss = 0.2458


Epoch 11/20: 100%|██████████| 38/38 [02:36<00:00,  4.12s/it, loss=0.2097]


Epoch 11: Average Loss = 0.2216


Epoch 12/20: 100%|██████████| 38/38 [02:37<00:00,  4.13s/it, loss=0.1970]


Epoch 12: Average Loss = 0.2026


Epoch 13/20: 100%|██████████| 38/38 [02:35<00:00,  4.10s/it, loss=0.1782]


Epoch 13: Average Loss = 0.1866


Epoch 14/20: 100%|██████████| 38/38 [02:38<00:00,  4.18s/it, loss=0.1644]


Epoch 14: Average Loss = 0.1727


Epoch 15/20: 100%|██████████| 38/38 [02:37<00:00,  4.15s/it, loss=0.1560]


Epoch 15: Average Loss = 0.1608


Epoch 16/20: 100%|██████████| 38/38 [02:39<00:00,  4.20s/it, loss=0.1498]


Epoch 16: Average Loss = 0.1512


Epoch 17/20: 100%|██████████| 38/38 [02:36<00:00,  4.12s/it, loss=0.1348]


Epoch 17: Average Loss = 0.1413


Epoch 18/20: 100%|██████████| 38/38 [02:37<00:00,  4.15s/it, loss=0.1314]


Epoch 18: Average Loss = 0.1331


Epoch 19/20: 100%|██████████| 38/38 [02:37<00:00,  4.15s/it, loss=0.1311]


Epoch 19: Average Loss = 0.1265


Epoch 20/20: 100%|██████████| 38/38 [02:36<00:00,  4.11s/it, loss=0.1135]

Epoch 20: Average Loss = 0.1198





## 6.Evaluation

In [24]:
def recall_at_k(recommended, ground_truth, K):
    if len(ground_truth) == 0:
        return None
    hit = len(set(recommended[:K]) & set(ground_truth))
    return hit / len(ground_truth)

def ndcg_at_k(recommended, ground_truth, K):
    dcg = 0.0
    for rank, item in enumerate(recommended[:K], start=1):
        if item in ground_truth:
            dcg += 1 / math.log2(rank + 1)
    max_rel = min(K, len(ground_truth))
    idcg = sum(1 / math.log2(rank + 1) for rank in range(1, max_rel + 1))
    return dcg / idcg if idcg > 0 else None

def evaluate_lightgcn(model, graph, test_df, train_likes, K=10, 
                      sample_users=2000, device='cuda:0', batch_size=8192):
    """Evaluate LightGCN on test set"""
    model.eval()
    
    # Get final embeddings
    user_emb, item_emb = model.forward(graph)
    
    # Build user seen items
    user_seen = train_likes.groupby("user_idx")["business_idx"].apply(set).to_dict()
    
    # Build test ground truth (rating >= 4)
    test_filtered = test_df[test_df['rating'] >= 4]
    test_items_by_user = test_filtered.groupby('user_idx')['business_idx'].apply(set).to_dict()
    test_users = list(test_items_by_user.keys())
    
    # Sample users
    if sample_users and sample_users < len(test_users):
        np.random.seed(42)
        test_users = np.random.choice(test_users, size=sample_users, replace=False)
    
    recalls, ndcgs = [], []
    all_items = torch.arange(n_items, device=device)
    
    print(f"Evaluating {len(test_users)} users...")
    
    for i in tqdm(range(0, len(test_users), batch_size)):
        batch_users = test_users[i:i+batch_size]
        batch_user_tensor = torch.LongTensor(batch_users).to(device)
        
        # Get user embeddings
        batch_user_emb = user_emb[batch_user_tensor]
        
        # Compute scores for all items
        scores = torch.matmul(batch_user_emb, item_emb.T)
        
        # Mask seen items
        for j, user in enumerate(batch_users):
            if user in user_seen:
                seen_items = list(user_seen[user])
                scores[j, seen_items] = -float('inf')
        
        # Get top-K
        _, top_k_items = torch.topk(scores, k=K, dim=1)
        top_k_items = top_k_items.cpu().numpy()
        
        # Compute metrics
        for j, user in enumerate(batch_users):
            recs = top_k_items[j].tolist()
            truth = list(test_items_by_user.get(user, set()))
            
            r = recall_at_k(recs, truth, K)
            n = ndcg_at_k(recs, truth, K)
            
            if r is not None:
                recalls.append(r)
            if n is not None:
                ndcgs.append(n)
    
    return np.mean(recalls), np.mean(ndcgs)

# Load best model
model.load_state_dict(torch.load('best_lightgcn.pt'))

# Evaluate
recall, ndcg = evaluate_lightgcn(model, graph, test_df, train_likes, 
                                 K=10, sample_users=2000, device=device)

print(f"\nLightGCN Recall@10: {recall:.4f}")
print(f"LightGCN NDCG@10: {ndcg:.4f}")

  model.load_state_dict(torch.load('best_lightgcn.pt'))


Evaluating 2000 users...


100%|██████████| 1/1 [00:00<00:00,  5.52it/s]



LightGCN Recall@10: 0.0452
LightGCN NDCG@10: 0.0278
