In [2]:
pip install torch torch-geometric

Defaulting to user installation because normal site-packages is not writeable
Collecting torch
  Downloading torch-2.9.1-cp312-cp312-win_amd64.whl.metadata (30 kB)
Collecting torch-geometric
  Downloading torch_geometric-2.7.0-py3-none-any.whl.metadata (63 kB)
Collecting sympy>=1.13.3 (from torch)
  Using cached sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting xxhash (from torch-geometric)
  Downloading xxhash-3.6.0-cp312-cp312-win_amd64.whl.metadata (13 kB)
Downloading torch-2.9.1-cp312-cp312-win_amd64.whl (110.9 MB)
   ---------------------------------------- 0.0/110.9 MB ? eta -:--:--
   ---------------------------------------- 0.5/110.9 MB 3.4 MB/s eta 0:00:33
    --------------------------------------- 1.6/110.9 MB 4.4 MB/s eta 0:00:25
   - -------------------------------------- 2.9/110.9 MB 4.8 MB/s eta 0:00:23
   - -------------------------------------- 3.9/110.9 MB 4.8 MB/s eta 0:00:23
   - -------------------------------------- 5.0/110.9 MB 4.7 MB/s eta 0:00:23
   -- 



In [None]:
"""
GNN-based user embedding for drift analysis
"""

import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, global_mean_pool
from torch_geometric.data import Data
import pandas as pd
import numpy as np

# Load data
df = pd.read_parquet("runs/enhanced_v1_20251119_001142/data/reviews.parquet")

# Build bipartite user-item graph
user_ids = df['user_id'].astype('category').cat.codes.values
item_ids = df['item_id'].astype('category').cat.codes.values + df['user_id'].nunique()

# Edge list: (user -> item)
edge_index = torch.tensor([user_ids, item_ids], dtype=torch.long)

# Node features: sentiment at each time step
# For simplicity: user nodes get mean sentiment, item nodes get mean stars
n_users = df['user_id'].nunique()
n_items = df['item_id'].nunique()

user_features = df.groupby('user_id')['sent_hybrid'].mean().values.reshape(-1, 1)
item_features = df.groupby('item_id')['stars'].mean().values.reshape(-1, 1)

x = torch.tensor(
    np.vstack([user_features, item_features]),
    dtype=torch.float
)

# Edge features: timestamp (normalized)
edge_attr = torch.tensor(
    (df['ts'] - df['ts'].min()).dt.total_seconds().values / 1e9,
    dtype=torch.float
).reshape(-1, 1)

# Create PyG data object
data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr)

# Simple GNN model
class DriftGNN(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)
    
    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        return x

model = DriftGNN(in_channels=1, hidden_channels=16, out_channels=8)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Training loop (unsupervised: reconstruct node features)
model.train()
for epoch in range(100):
    optimizer.zero_grad()
    z = model(data.x, data.edge_index)
    
    # Simple reconstruction loss
    loss = F.mse_loss(z[:n_users], data.x[:n_users])
    loss.backward()
    optimizer.step()
    
    if epoch % 20 == 0:
        print(f"Epoch {epoch}, Loss: {loss.item():.4f}")

# Extract user embeddings
model.eval()
with torch.no_grad():
    user_embeddings = model(data.x, data.edge_index)[:n_users].numpy()

# Cluster on learned embeddings
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=4, random_state=42)
gnn_clusters = kmeans.fit_predict(user_embeddings)

print(f"GNN-based clustering complete!")
print(f"Cluster distribution: {np.bincount(gnn_clusters)}")

  edge_index = torch.tensor([user_ids, item_ids], dtype=torch.long)
  loss = F.mse_loss(z[:n_users], data.x[:n_users])


Epoch 0, Loss: 0.4022
Epoch 20, Loss: 0.0595
Epoch 40, Loss: 0.0024
Epoch 60, Loss: 0.0002
Epoch 80, Loss: 0.0001


[WinError 2] The system cannot find the file specified
  File "c:\ProgramData\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "c:\ProgramData\anaconda3\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\ProgramData\anaconda3\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "c:\ProgramData\anaconda3\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


GNN-based clustering complete!
Cluster distribution: [ 955  583 1002  117]




In [None]:
"""
GNN-based user embedding for drift analysis
"""

import os
os.environ.setdefault("OMP_NUM_THREADS", "8")
os.environ.setdefault("MKL_NUM_THREADS", "8")
os.environ.setdefault("OPENBLAS_NUM_THREADS", "8")
os.environ.setdefault("NUMEXPR_NUM_THREADS", "8")
os.environ.setdefault("LOKY_MAX_CPU_COUNT", "8")

# --------- (B) Imports ----------
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F

from torch_geometric.data import Data
from torch_geometric.utils import to_undirected
from torch_geometric.nn import SAGEConv  

from sklearn.cluster import KMeans

RUN_ROOT = r"runs\enhanced_v1_20251119_001142"
REVIEWS_PATH = os.path.join(RUN_ROOT, "data", "reviews.parquet")
TRAJ_PATH    = os.path.join(RUN_ROOT, "data", "user_trajectories.parquet")  # optional (for supervised)

df = pd.read_parquet(REVIEWS_PATH)

# Build stable categorical encodings (and keep category ordering)
user_cat = df["user_id"].astype("category")
item_cat = df["item_id"].astype("category")

n_users = len(user_cat.cat.categories)
n_items = len(item_cat.cat.categories)
n_nodes = n_users + n_items

user_codes = user_cat.cat.codes.to_numpy()                 # 0..n_users-1
item_codes = item_cat.cat.codes.to_numpy() + n_users       # n_users..n_users+n_items-1

# Stack into a single ndarray first (shape [2, E]) then convert to tensor
edge_index_np = np.vstack([user_codes, item_codes]).astype(np.int64)
edge_index = torch.from_numpy(edge_index_np)
edge_index = to_undirected(edge_index, num_nodes=n_nodes)  # add reverse edges

# Users: mean hybrid sentiment; Items: mean stars
user_feat_series = df.groupby("user_id")["sent_hybrid"].mean()
item_feat_series = df.groupby("item_id")["stars"].mean()

# Reindex by category order to match codes 0..n_users-1 and 0..n_items-1
user_feat = user_feat_series.reindex(user_cat.cat.categories).to_numpy().reshape(-1, 1)
item_feat = item_feat_series.reindex(item_cat.cat.categories).to_numpy().reshape(-1, 1)

x_np = np.vstack([user_feat, item_feat]).astype(np.float32)  # shape [n_nodes, 1]
x = torch.from_numpy(x_np)

data = Data(x=x, edge_index=edge_index)

y_supervised = None
if os.path.exists(TRAJ_PATH):
    traj = pd.read_parquet(TRAJ_PATH)
    # Align to category order; missing users â†’ NaN (we'll mask them out)
    slope_map = traj.set_index("user_id")["drift_slope"]
    y_arr = slope_map.reindex(user_cat.cat.categories).to_numpy()
    y_supervised = torch.from_numpy(y_arr.astype(np.float32)).reshape(-1, 1)  # [n_users, 1]

# Model 
class DriftGNN(nn.Module):
    def __init__(self, in_dim=1, hid=32, emb=32, recon=True, supervise=False):
        super().__init__()
        self.convs = nn.ModuleList([
            SAGEConv(in_dim, hid),
            SAGEConv(hid, emb),
        ])
        self.recon = recon
        self.supervise = supervise

        # Decoder to reconstruct original 1-D node feature
        if self.recon:
            self.decoder = nn.Sequential(
                nn.Linear(emb, hid),
                nn.ReLU(),
                nn.Linear(hid, 1),
            )

        if self.supervise:
            self.head = nn.Sequential(
                nn.Linear(emb, hid),
                nn.ReLU(),
                nn.Linear(hid, 1),
            )

    def forward(self, x, edge_index):
        h = x
        for i, conv in enumerate(self.convs):
            h = conv(h, edge_index)
            if i == 0:
                h = F.relu(h)
        out = {"emb": h}
        if self.recon:
            out["recon"] = self.decoder(h)
        if self.supervise:
            out["slope"] = self.head(h[:n_users]) 
        return out

USE_SUPERVISED = y_supervised is not None

model = DriftGNN(in_dim=1, hid=32, emb=32, recon=True, supervise=USE_SUPERVISED)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)

# Training loop 
model.train()
EPOCHS = 100
for epoch in range(1, EPOCHS + 1):
    optimizer.zero_grad()
    out = model(data.x, data.edge_index)

    # Reconstruction MSE on ALL nodes (users + items): match original 1-D features
    loss_recon = F.mse_loss(out["recon"], data.x)

    if USE_SUPERVISED:
        mask = ~torch.isnan(y_supervised.squeeze(1))
        if mask.any():
            pred = out["slope"].squeeze(1)[mask]
            targ = y_supervised.squeeze(1)[mask]
            loss_sup = F.l1_loss(pred, targ)  
        else:
            loss_sup = torch.tensor(0.0, requires_grad=True)
        loss = loss_recon + 0.5 * loss_sup
    else:
        loss_sup = torch.tensor(0.0, requires_grad=True)
        loss = loss_recon

    loss.backward()
    optimizer.step()

    if epoch % 20 == 0 or epoch == 1:
        msg = f"[epoch {epoch:3d}] recon={loss_recon.item():.4f}"
        if USE_SUPERVISED:
            msg += f", sup={loss_sup.item():.4f}, total={loss.item():.4f}"
        print(msg)

# Embeddings & k-means 
model.eval()
with torch.no_grad():
    emb = model(data.x, data.edge_index)["emb"]  # [n_nodes, emb_dim]
    user_emb = emb[:n_users].cpu().numpy()

kmeans = KMeans(n_clusters=4, random_state=42, n_init=10)
gnn_clusters = kmeans.fit_predict(user_emb)

counts = np.bincount(gnn_clusters, minlength=4)
print("GNN-based clustering complete!")
print("Cluster distribution:", counts)


[epoch   1] recon=10.3014, sup=0.0835, total=10.3432
[epoch  20] recon=2.7532, sup=0.0275, total=2.7670
[epoch  40] recon=0.0869, sup=0.0256, total=0.0997
[epoch  60] recon=0.0634, sup=0.0246, total=0.0757
[epoch  80] recon=0.0523, sup=0.0238, total=0.0642
[epoch 100] recon=0.0428, sup=0.0235, total=0.0545




GNN-based clustering complete!
Cluster distribution: [1389  109  467  692]
