In [None]:
# Mount Google Drive (Colab only) so the dataset folders under /content/drive can be read.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# import torch_geometric
# from torch_geometric.data import HeteroData

In [None]:
import pandas as pd
import os

def get_full_df(dir_name, base_dir="/content/drive/MyDrive/updated_chckpt2_data"):
  """Load every ``.parquet`` part file of a dataset folder into one DataFrame.

  Args:
    dir_name: Name of the folder (inside ``base_dir``) that holds the part files.
    base_dir: Directory containing the dataset folders. Defaults to the Drive
      location this notebook was written against.

  Returns:
    One DataFrame with the rows of all part files, concatenated in file-name
    order and re-indexed from 0.

  Raises:
    FileNotFoundError: If the folder does not exist (raised by ``os.listdir``).
    ValueError: If the folder contains no ``.parquet`` file.
  """
  folder_path = f"{base_dir}/{dir_name}"

  # os.listdir() returns entries in arbitrary order; sorting keeps the row
  # order of the result identical from run to run.
  parquet_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.parquet'))
  if not parquet_files:
    raise ValueError(f"No .parquet files found in {folder_path}")

  frames = [pd.read_parquet(os.path.join(folder_path, name)) for name in parquet_files]
  return pd.concat(frames, ignore_index=True)

# Load the three source tables; each one is stored as a folder of parquet part files.
history_df, items_df, users_df = (
    get_full_df(folder)
    for folder in ("history.parquet", "items.parquet", "users.parquet")
)

In [None]:
# Per-user aggregates derived from the interaction log.
# NOTE(review): both aggregates are computed on the full history, i.e. before
# the train/test split made later in the notebook, so held-out relevance
# labels contribute to `positive_rate` -- confirm this is intended.
user_interactions = history_df.groupby('user_idx').size().rename('total_interactions')
pos_rate = history_df.groupby('user_idx')['relevance'].mean().rename('positive_rate')

# Drop copies left behind by a previous run of this cell first: merging a
# second time would otherwise produce `*_x` / `*_y` duplicates instead of the
# plain column names that later cells select.
users_df = (
    users_df
    .drop(columns=['total_interactions', 'positive_rate'], errors='ignore')
    .merge(user_interactions, on='user_idx', how='left')
    .merge(pos_rate, on='user_idx', how='left')
    .fillna(0)  # users without history get 0; this also zero-fills any other NaN in the frame
)

In [None]:
# Popularity = number of logged interactions per item.
item_popularity = history_df.groupby('item_idx').size().rename('popularity')

# Remove a `popularity` column from an earlier run before merging so that
# re-running the cell does not create `popularity_x` / `popularity_y`.
items_df = (
    items_df
    .drop(columns=['popularity'], errors='ignore')
    .merge(item_popularity, on='item_idx', how='left')
    .fillna(0)  # items without history get 0; this also zero-fills any other NaN in the frame
)

In [None]:
# Feature normalisation. The attr_ columns already look normalised, but they
# are standardised as well as a sanity measure.
from sklearn.preprocessing import StandardScaler

# Numeric feature columns only: index columns and categorical columns are left out.
user_feat_cols = list(filter(lambda name: name.startswith('user_attr_'), users_df.columns))
user_feat_cols = user_feat_cols + ["total_interactions", "positive_rate"]

item_feat_cols = list(filter(lambda name: name.startswith('item_attr_'), items_df.columns))
item_feat_cols = item_feat_cols + ['price', 'popularity']

# One scaler per table; both are kept so the learned statistics stay available.
user_scaler, item_scaler = StandardScaler(), StandardScaler()

# The scaled values overwrite the original columns in place.
users_df[user_feat_cols] = user_scaler.fit_transform(users_df[user_feat_cols])
items_df[item_feat_cols] = item_scaler.fit_transform(items_df[item_feat_cols])

In [None]:
# One row per logged interaction, widened with the user's and the item's columns.
data = (
    history_df
    .merge(users_df, on='user_idx', how='left')
    .merge(items_df, on='item_idx', how='left')
)

In [None]:
# One-hot encode every string / categorical column of the merged table;
# drop_first leaves out one level per encoded column.
categorical_cols = list(data.select_dtypes(include=['object', 'category']).columns)
data = pd.get_dummies(data, drop_first=True, columns=categorical_cols)

# Apply the same encoding to the stand-alone user and item tables.
users_df = pd.get_dummies(users_df, drop_first=True, columns=['segment'])
items_df = pd.get_dummies(items_df, drop_first=True, columns=['category'])

In [None]:
# Inspect the column names of the merged, one-hot encoded interaction table.
data.columns

Index(['user_idx', 'item_idx', 'relevance', 'user_attr_0', 'user_attr_1',
       'user_attr_2', 'user_attr_3', 'user_attr_4', 'user_attr_5',
       'user_attr_6', 'user_attr_7', 'user_attr_8', 'user_attr_9',
       'user_attr_10', 'user_attr_11', 'user_attr_12', 'user_attr_13',
       'user_attr_14', 'user_attr_15', 'user_attr_16', 'user_attr_17',
       'user_attr_18', 'user_attr_19', 'total_interactions', 'positive_rate',
       'item_attr_0', 'item_attr_1', 'item_attr_2', 'item_attr_3',
       'item_attr_4', 'item_attr_5', 'item_attr_6', 'item_attr_7',
       'item_attr_8', 'item_attr_9', 'item_attr_10', 'item_attr_11',
       'item_attr_12', 'item_attr_13', 'item_attr_14', 'item_attr_15',
       'item_attr_16', 'item_attr_17', 'item_attr_18', 'item_attr_19', 'price',
       'popularity', 'segment_mainstream', 'segment_premium',
       'category_clothing', 'category_electronics', 'category_home'],
      dtype='object')

In [None]:
# Second name for the merged table: `combined_df` refers to the same DataFrame
# object (no copy is made). `data` itself is rebound to a HeteroData graph in a later cell.
combined_df = data

In [None]:
import torch
import pandas as pd

# Node feature matrices: one row per distinct id, ordered by ascending id.
# Later cells index these rows directly with the raw ids, which presumably
# relies on ids being contiguous and starting at 0 -- TODO confirm.
user_node_cols = [f"user_attr_{i}" for i in range(20)]
user_node_cols += ["total_interactions", "positive_rate"]

# NOTE(review): the segment_* dummies stem from the users table's `segment`
# column, so each item row carries the segment of the first user logged for
# that item. The column layout is kept unchanged here because the price
# lookup further down addresses this matrix by position.
item_node_cols = [f"item_attr_{i}" for i in range(20)]
item_node_cols += ["price", "popularity", "segment_mainstream", "segment_premium",
                   "category_clothing", "category_electronics", "category_home"]

first_row_per_user = combined_df.drop_duplicates("user_idx").sort_values("user_idx")
user_features = first_row_per_user[user_node_cols].values

first_row_per_item = combined_df.drop_duplicates("item_idx").sort_values("item_idx")
# Cast to float so the one-hot columns convert cleanly to a float tensor.
item_features_df = first_row_per_item[item_node_cols].astype(float)

user_tensor = torch.tensor(user_features, dtype=torch.float)
item_tensor = torch.tensor(item_features_df.values, dtype=torch.float)

In [None]:
# Every logged (user, item) pair as a 2 x num_edges index tensor, with the
# relevance value of each interaction as its edge label.
edge_pairs = combined_df[["user_idx", "item_idx"]].values
edge_index = torch.tensor(edge_pairs.T, dtype=torch.long)
edge_label = torch.tensor(combined_df["relevance"].values, dtype=torch.float)

In [None]:
# Graph size, taken from the full user / item tables.
num_users = len(users_df["user_idx"].unique())
num_items = len(items_df["item_idx"].unique())

# `train_df` is only created by the train/test split cell further down, so on
# a fresh top-to-bottom run it does not exist yet when this cell executes and
# the lookups below used to abort the notebook with a NameError. Derive the
# train-side id sets only when the split has already been run.
if "train_df" in globals():
    train_user_ids = train_df["user_idx"].unique()
    train_item_ids = train_df["item_idx"].unique()

In [None]:
# The import of HeteroData near the top of the notebook is commented out, so
# bring the class into scope here; without it this cell raises NameError on a
# fresh kernel.
from torch_geometric.data import HeteroData

# From here on `data` is the heterogeneous graph; the merged DataFrame stays
# reachable under the name `combined_df`.
data = HeteroData()

# Node features
data["user"].x = user_tensor
data["item"].x = item_tensor

# Edges
data["user", "interacts", "item"].edge_index = edge_index
data["user", "interacts", "item"].edge_label = edge_label

# edge_attr
# edge_attr_df = combined_df[["user_idx", "item_idx", "price"]]
# edge_attr = torch.tensor(edge_attr_df.values, dtype=torch.float)
# data["user", "interacts", "item"].edge_attr = edge_attr

# Rev Edges: same pairs with the two index rows swapped (item -> user), sharing the labels.
data["item", "rev_interacts", "user"].edge_index = edge_index.flip(0)
data["item", "rev_interacts", "user"].edge_label = edge_label

In [None]:
# Inspect the column names of the merged interaction table the graph tensors were built from.
combined_df.columns

Index(['user_idx', 'item_idx', 'relevance', 'user_attr_0', 'user_attr_1',
       'user_attr_2', 'user_attr_3', 'user_attr_4', 'user_attr_5',
       'user_attr_6', 'user_attr_7', 'user_attr_8', 'user_attr_9',
       'user_attr_10', 'user_attr_11', 'user_attr_12', 'user_attr_13',
       'user_attr_14', 'user_attr_15', 'user_attr_16', 'user_attr_17',
       'user_attr_18', 'user_attr_19', 'total_interactions', 'positive_rate',
       'item_attr_0', 'item_attr_1', 'item_attr_2', 'item_attr_3',
       'item_attr_4', 'item_attr_5', 'item_attr_6', 'item_attr_7',
       'item_attr_8', 'item_attr_9', 'item_attr_10', 'item_attr_11',
       'item_attr_12', 'item_attr_13', 'item_attr_14', 'item_attr_15',
       'item_attr_16', 'item_attr_17', 'item_attr_18', 'item_attr_19', 'price',
       'popularity', 'segment_mainstream', 'segment_premium',
       'category_clothing', 'category_electronics', 'category_home'],
      dtype='object')

In [None]:
# For train-test split
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Turn the edge tensors back into a flat table so scikit-learn can split it.
edge_index_np = edge_index.cpu().numpy()
edge_label_np = edge_label.cpu().numpy()

edge_df = pd.DataFrame({
    "user_idx": edge_index_np[0],
    "item_idx": edge_index_np[1],
    "label": edge_label_np
})

# 80/20 split, stratified on the label, with a fixed seed.
train_df, test_df = train_test_split(edge_df, test_size=0.2, stratify=edge_df["label"], random_state=42)

# Ids that occur in the training interactions. They are recorded here, right
# after the split and before any sampled negatives are merged into `train_df`,
# so the evaluation cell that filters on them does not depend on an earlier
# cell having been executed out of order.
train_user_ids = train_df["user_idx"].unique()
train_item_ids = train_df["item_idx"].unique()

In [None]:
# Add negative sampling to add negative edges
def sample_negative_edges(train_df, num_users, num_items, num_samples, seed=None):
    """Draw distinct (user, item) pairs that do not occur in ``train_df``.

    Pairs are drawn uniformly from ``[0, num_users) x [0, num_items)`` by
    rejection sampling.

    Args:
        train_df: DataFrame with ``user_idx`` and ``item_idx`` columns; its
            pairs are excluded from the result.
        num_users: Exclusive upper bound for sampled user indices.
        num_items: Exclusive upper bound for sampled item indices.
        num_samples: Number of distinct pairs to return.
        seed: Optional seed for a private random generator. With ``None``
            (the default) NumPy's global random state is used.

    Returns:
        DataFrame with columns ``user_idx``, ``item_idx`` and ``label``
        (always 0), one row per sampled pair.

    Raises:
        ValueError: If fewer than ``num_samples`` unused pairs exist. Without
            this check the sampling loop would never terminate.
    """
    existing = set(zip(train_df["user_idx"], train_df["item_idx"]))

    # Only existing pairs that fall inside the sampling range take away free slots.
    blocked = sum(1 for u, i in existing if 0 <= u < num_users and 0 <= i < num_items)
    available = num_users * num_items - blocked
    if num_samples > available:
        raise ValueError(
            f"Cannot sample {num_samples} negative edges: only {available} unused (user, item) pairs exist"
        )

    rng = np.random if seed is None else np.random.RandomState(seed)

    neg_edges = set()
    while len(neg_edges) < num_samples:
        candidate = (rng.randint(0, num_users), rng.randint(0, num_items))
        if candidate not in existing:
            neg_edges.add(candidate)  # the set silently drops repeated draws

    neg_df = pd.DataFrame(list(neg_edges), columns=["user_idx", "item_idx"])
    neg_df["label"] = 0
    return neg_df

# Seed NumPy's global generator so the sampled negatives, and with them the
# training set, are the same on every run -- in line with the fixed
# random_state used for the train/test split.
np.random.seed(42)

# One sampled negative per training edge.
neg_df = sample_negative_edges(train_df, num_users, num_items, num_samples=len(train_df))

# Append the negatives and shuffle with a fixed seed.
# Note: `train_df` is overwritten, so re-running this cell adds a further batch of negatives.
train_df = pd.concat([train_df, neg_df]).sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# Replace the graph's edges with the rows of `train_df` (forward and reverse direction).
train_pairs = train_df[["user_idx", "item_idx"]].values
train_edge_index = torch.tensor(train_pairs.T, dtype=torch.long)
train_edge_label = torch.tensor(train_df["label"].values, dtype=torch.float)

forward_store = data["user", "interacts", "item"]
forward_store.edge_index = train_edge_index
forward_store.edge_label = train_edge_label

# Reverse direction: same pairs with the index rows swapped, same labels.
reverse_store = data["item", "rev_interacts", "user"]
reverse_store.edge_index = train_edge_index.flip(0)
reverse_store.edge_label = train_edge_label

In [None]:
from torch_geometric.nn import SAGEConv, HeteroConv
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import BatchNorm1d

class GraphSAGERecommender(nn.Module):
    """Two-layer heterogeneous GraphSAGE encoder for the user/item graph.

    Each layer runs one ``SAGEConv`` per edge direction (user -> item and
    item -> user) inside a ``HeteroConv`` with sum aggregation. Between the
    layers the user and item representations are batch-normalised and passed
    through ReLU.

    Args:
        hidden_dim: Width of the hidden and of the output embeddings.
    """

    _FORWARD_EDGE = ("user", "interacts", "item")
    _REVERSE_EDGE = ("item", "rev_interacts", "user")

    def __init__(self, hidden_dim):
        super().__init__()
        self.bn_user = BatchNorm1d(hidden_dim)
        self.bn_item = BatchNorm1d(hidden_dim)

        # (-1, -1) asks PyG to infer the input sizes on the first forward pass.
        self.conv1 = self._make_layer((-1, -1), hidden_dim)
        self.conv2 = self._make_layer((hidden_dim, hidden_dim), hidden_dim)

    @classmethod
    def _make_layer(cls, in_channels, out_channels):
        """Build one HeteroConv with a SAGEConv for each edge direction."""
        return HeteroConv({
            cls._FORWARD_EDGE: SAGEConv(in_channels, out_channels),
            cls._REVERSE_EDGE: SAGEConv(in_channels, out_channels),
        }, aggr="sum")

    def forward(self, data):
        """Return a dict of node embeddings keyed by node type.

        Args:
            data: Graph object exposing ``x_dict`` and ``edge_index_dict``.
        """
        edges = data.edge_index_dict

        hidden = self.conv1(data.x_dict, edges)
        hidden["user"] = self.bn_user(hidden["user"])
        hidden["item"] = self.bn_item(hidden["item"])

        activated = {node_type: F.relu(feats) for node_type, feats in hidden.items()}
        return self.conv2(activated, edges)


In [None]:
import torch
import torch.nn.functional as F

# Fix the seed so that weight initialisation, and with it the logged training
# curve, is repeatable.
torch.manual_seed(42)

model = GraphSAGERecommender(hidden_dim=64)

# The first layer uses lazily sized SAGEConv modules (input size -1). Run one
# forward pass so their parameters are materialised before they are handed to
# the optimizer. eval() keeps this dry run from touching the BatchNorm
# running statistics.
model.eval()
with torch.no_grad():
    model(data)

# weight_decay applies L2 regularization to all trainable params
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=1e-5)
criterion = torch.nn.BCEWithLogitsLoss()

# The supervision edges do not change between epochs, so read them once.
edge_index = data["user", "interacts", "item"].edge_index
edge_label = data["user", "interacts", "item"].edge_label
user_ids = edge_index[0]
item_ids = edge_index[1]

for epoch in range(1, 151):
    model.train()
    out_dict = model(data)
    user_emb = out_dict["user"]
    item_emb = out_dict["item"]

    # Predict link scores by dot product of user emb and item emb of the (user, item) as an edge
    scores = (user_emb[user_ids] * item_emb[item_ids]).sum(dim=1)

    loss = criterion(scores, edge_label)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Accuracy is only needed for the log line, so compute it on logging epochs only.
    if epoch % 10 == 0:
        with torch.no_grad():
            pred_prob = torch.sigmoid(scores)
            pred_binary = (pred_prob > 0.5).float()
            acc = (pred_binary == edge_label).float().mean().item()
        print(f"Epoch {epoch:02d}; Loss: {loss.item():.4f}; Acc: {acc:.4f}")

Epoch 10; Loss: 1.0850; Acc: 0.4249
Epoch 20; Loss: 0.6880; Acc: 0.6303
Epoch 30; Loss: 0.5136; Acc: 0.7520
Epoch 40; Loss: 0.5107; Acc: 0.7528
Epoch 50; Loss: 0.4902; Acc: 0.7534
Epoch 60; Loss: 0.4779; Acc: 0.7551
Epoch 70; Loss: 0.4720; Acc: 0.7564
Epoch 80; Loss: 0.4656; Acc: 0.7583
Epoch 90; Loss: 0.4608; Acc: 0.7598
Epoch 100; Loss: 0.4564; Acc: 0.7607
Epoch 110; Loss: 0.4525; Acc: 0.7624
Epoch 120; Loss: 0.4491; Acc: 0.7642
Epoch 130; Loss: 0.4460; Acc: 0.7650
Epoch 140; Loss: 0.4432; Acc: 0.7665
Epoch 150; Loss: 0.4407; Acc: 0.7680


In [None]:
def update_graph_with_new_edges(data, new_df):
    """Append new user-item interactions to a HeteroData graph, in place.

    Args:
        data: Graph whose ("user", "interacts", "item") edge store already
            holds ``edge_index`` and ``edge_label``.
        new_df: DataFrame with columns ``user_idx``, ``item_idx`` and
            ``label``. May be empty, in which case the edges stay as they are.

    Returns:
        The same ``data`` object: forward edges extended by the new rows and
        the reverse ("item", "rev_interacts", "user") edges rebuilt from them.
    """
    forward = data["user", "interacts", "item"]
    old_edge_index = forward.edge_index
    old_edge_label = forward.edge_label

    # Go through explicit NumPy dtypes: an empty frame has object-typed
    # columns, which torch.tensor() cannot convert.
    new_pairs = new_df[["user_idx", "item_idx"]].to_numpy(dtype="int64")
    new_labels = new_df["label"].to_numpy(dtype="float32")

    # Create the new tensors on the device of the existing ones so that
    # torch.cat also works when the graph has been moved off the CPU.
    new_edge_index = torch.tensor(new_pairs.T, dtype=torch.long, device=old_edge_index.device)
    new_edge_label = torch.tensor(new_labels, dtype=torch.float, device=old_edge_label.device)

    # Append to current edge_index and edge_label
    forward.edge_index = torch.cat([old_edge_index, new_edge_index], dim=1)
    forward.edge_label = torch.cat([old_edge_label, new_edge_label], dim=0)

    # Reverse edge: item -> user (index rows swapped, labels shared)
    reverse = data["item", "rev_interacts", "user"]
    reverse.edge_index = forward.edge_index.flip(0)
    reverse.edge_label = forward.edge_label

    return data

In [None]:
# Only deal with seen users and items during inference time

# Keep only test edges whose user and item both occur in the training id sets.
seen_user_mask = test_df["user_idx"].isin(train_user_ids)
seen_item_mask = test_df["item_idx"].isin(train_item_ids)
test_df = test_df[seen_user_mask & seen_item_mask].reset_index(drop=True)

# Start test: embed all nodes once, without tracking gradients.
model.eval()
with torch.no_grad():
    out = model(data)
user_emb, item_emb = out["user"], out["item"]

test_user_ids = torch.tensor(test_df["user_idx"].values, dtype=torch.long)
test_item_ids = torch.tensor(test_df["item_idx"].values, dtype=torch.long)
test_labels = torch.tensor(test_df["label"].values, dtype=torch.float)

# Score each test pair by the dot product of its two embeddings, then squash to (0, 1).
pair_products = user_emb[test_user_ids] * item_emb[test_item_ids]
test_scores = pair_products.sum(dim=1)
test_probs = torch.sigmoid(test_scores)

from sklearn.metrics import roc_auc_score
auc = roc_auc_score(test_labels.numpy(), test_probs.numpy())
print(f"Test AUC: {auc:.4f}")


Test AUC: 0.9229


In [None]:
# Ranking with expected revenue
def rank_items(user_emb, item_emb, prices, top_k=10):
    """Rank items for every user by expected revenue.

    Expected revenue is the predicted interaction probability times the item
    price. The embedding dot product is a logit (the model is trained with
    BCEWithLogitsLoss), so it is passed through a sigmoid first. Multiplying
    the raw logit by the price would push expensive items with a negative
    logit to the very bottom instead of weighting them by their probability.

    Args:
        user_emb: Tensor of shape (num_users, dim).
        item_emb: Tensor of shape (num_items, dim).
        prices: Tensor of shape (num_items,) with one price per item.
        top_k: Number of items to return per user; capped at ``num_items``.

    Returns:
        Long tensor of shape (num_users, min(top_k, num_items)) holding item
        indices in descending order of expected revenue.
    """
    scores = torch.matmul(user_emb, item_emb.T)
    probabilities = torch.sigmoid(scores)
    expected_revenue = probabilities * prices  # prices broadcast across the user rows

    # torch.topk raises when k exceeds the size of the ranked dimension.
    k = min(top_k, item_emb.size(0))
    return torch.topk(expected_revenue, k, dim=1).indices

In [None]:
# The item feature matrix holds the *standardised* price (zero mean, hence
# negative for below-average items), which cannot serve as a revenue weight.
# Map the column back to its original scale with the statistics learned by
# the item scaler, and locate it by name rather than by a hard-coded offset.
# Assumes `item_scaler` was fitted once on the raw item table -- re-running the
# normalisation cell on already scaled data would invalidate this.
price_col = item_features_df.columns.get_loc("price")
price_stat = item_feat_cols.index("price")
prices = (
    data["item"].x[:, price_col] * float(item_scaler.scale_[price_stat])
    + float(item_scaler.mean_[price_stat])
)
topk_recommendations = rank_items(user_emb, item_emb, prices, top_k=10)

In [None]:
# Show the recommended item indices: one row per user, columns ordered from best to worst.
topk_recommendations

tensor([[129, 167, 986,  ..., 838, 941, 548],
        [129,  48, 897,  ..., 167, 618, 871],
        [987,  42, 142,  ..., 922, 141, 835],
        ...,
        [ 99, 559, 167,  ...,  52, 443, 958],
        [ 99, 604, 817,  ..., 904, 559, 608],
        [277, 676, 320,  ..., 572, 474, 442]])