## UltraGCN KuaiRec

In [2]:
# With W2v Init
import pickle
import numpy as np
import scipy.sparse as sp
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
import os
import time
import pandas as pd
from collections import defaultdict
from gensim.models import Word2Vec
import multiprocessing

# Step 1: Precompute W2V Embeddings
def w2v_emb(embedding_dim=50):
    df_watch = pd.read_csv('KuaiRec/big_matrix.csv')
    df_watch = df_watch[df_watch['watch_ratio'] >= 2].sort_values(['user_id', 'time'])

    unique_users = df_watch['user_id'].unique()
    unique_videos = df_watch['video_id'].unique()

    user2idx = {uid: i for i, uid in enumerate(unique_users)}
    video2idx = {vid: i for i, vid in enumerate(unique_videos)}
    
    train = pd.read_parquet('KuaiRec/train_sequences.parquet')
    sequences = train['video_id'].apply(lambda x: list(map(str, x))).tolist()
    model = Word2Vec(
        vector_size=embedding_dim,
        window=15,
        workers=multiprocessing.cpu_count(),
        sg=1,
        min_count=1,
        compute_loss=True,
    )
    model.build_vocab(sequences)
    model.train(
        corpus_iterable=sequences,
        total_examples=len(sequences),
        epochs=75,
    )
    model_vocab = list(model.wv.index_to_key)
    
    with open("KuaiRec/adj_data.pkl", "rb") as f:
        adj_data = pickle.load(f)
        
    video_emb = np.zeros((adj_data['num_nodes_dict']['video'], embedding_dim), dtype=np.float32)
    for vid, idx in video2idx.items():
        if str(vid) in model_vocab:
            video_emb[idx] = model.wv[str(vid)]

    video_emb = torch.tensor(video_emb, dtype=torch.float32)
    
    num_users = adj_data['num_nodes_dict']['user']
    user_to_videos = defaultdict(list)
    for _, row in df_watch.iterrows():
        user_id = row['user_id']
        video_id_str = str(row['video_id'])
        user_to_videos[user_id].append(video_id_str)

    user_emb = torch.zeros((num_users, embedding_dim), dtype=torch.float32)
    for uid, u_idx in user2idx.items():
        vids_watched = user_to_videos[uid]
        if not vids_watched:
            continue
        sum_vec = np.zeros(embedding_dim, dtype=np.float32)
        count = 0
        for vid_str in vids_watched:
            if vid_str in model.wv:
                sum_vec += model.wv[vid_str]
                count += 1
        if count > 0:
            user_emb[u_idx] = torch.tensor(sum_vec / count, dtype=torch.float32)
    
    return user_emb, video_emb

# Step 2: Data Preparation
with open("KuaiRec/adj_data.pkl", "rb") as f:
    adj_data = pickle.load(f)
with open("KuaiRec/val_data.pkl", "rb") as f:
    val_data_dict = pickle.load(f)
with open("KuaiRec/test_data.pkl", "rb") as f:
    test_data_dict = pickle.load(f)

num_users = adj_data['num_nodes_dict']['user']
num_items = adj_data['num_nodes_dict']['video']

rowptr = adj_data['rowptr_dict'][('user', 'watches', 'video')]
col = adj_data['col_dict'][('user', 'watches', 'video')]
train_data = []
for u in range(num_users):
    start = rowptr[u]
    end = rowptr[u + 1]
    items = col[start:end].tolist()
    train_data.extend([(u, i) for i in items])

train_mat = sp.coo_matrix(
    (np.ones(len(train_data)), ([x[0] for x in train_data], [x[1] for x in train_data])),
    shape=(num_users, num_items)
).tocsr()

items_D = np.array(train_mat.sum(axis=0)).flatten()
users_D = np.array(train_mat.sum(axis=1)).flatten()
beta_uD = (np.sqrt(users_D + 1) / (users_D + 1e-8)).reshape(-1)
beta_iD = (1 / np.sqrt(items_D + 1)).reshape(-1)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
constraint_mat = {
    "beta_uD": torch.from_numpy(beta_uD).float().to(device),
    "beta_iD": torch.from_numpy(beta_iD).float().to(device)
}

def get_ii_constraint_mat(train_mat, num_neighbors, ii_diagonal_zero=False):
    print('Computing item-item constraint matrix efficiently...')
    A = train_mat.T.dot(train_mat).tocsr()
    if ii_diagonal_zero:
        A.setdiag(0)
    A.eliminate_zeros()
    n_items = A.shape[0]
    items_D = np.array(A.sum(axis=0)).flatten()
    users_D = np.array(A.sum(axis=1)).flatten()
    beta_uD = (np.sqrt(users_D + 1) / (users_D + 1e-8)).reshape(-1, 1)
    beta_iD = (1 / np.sqrt(items_D + 1)).reshape(1, -1)
    all_ii_constraint_mat = beta_uD.dot(beta_iD)
    res_mat = torch.zeros((n_items, num_neighbors), dtype=torch.long)
    res_sim_mat = torch.zeros((n_items, num_neighbors), dtype=torch.float32)
    batch_size = 10000
    for start in range(0, n_items, batch_size):
        end = min(start + batch_size, n_items)
        batch_A = A[start:end].toarray()
        batch_weighted = batch_A * all_ii_constraint_mat[start:end]
        batch_tensor = torch.from_numpy(batch_weighted).float()
        row_sims, row_idxs = torch.topk(batch_tensor, k=num_neighbors, dim=1)
        res_mat[start:end] = row_idxs
        res_sim_mat[start:end] = row_sims
        print(f'Processed items {start} to {end-1}')
    print('Item-item constraint matrix computed!')
    return res_mat.to(device), res_sim_mat.to(device)

ii_neighbor_num = 10
ii_cons_mat_path = 'KuaiRec_ii_constraint_mat.pkl'
ii_neigh_mat_path = 'KuaiRec_ii_neighbor_mat.pkl'
if os.path.exists(ii_cons_mat_path) and os.path.exists(ii_neigh_mat_path):
    with open(ii_cons_mat_path, 'rb') as f:
        ii_constraint_mat = pickle.load(f).to(device)
    with open(ii_neigh_mat_path, 'rb') as f:
        ii_neighbor_mat = pickle.load(f).to(device)
else:
    ii_neighbor_mat, ii_constraint_mat = get_ii_constraint_mat(train_mat, ii_neighbor_num)
    with open(ii_cons_mat_path, 'wb') as f:
        pickle.dump(ii_constraint_mat, f)
    with open(ii_neigh_mat_path, 'wb') as f:
        pickle.dump(ii_neighbor_mat, f)

interacted_items = [[] for _ in range(num_users)]
for u, i in train_data:
    interacted_items[u].append(i)

mask = torch.zeros(num_users, num_items).to(device)
for u, items in enumerate(interacted_items):
    mask[u, items] = -np.inf

train_data_tensor = torch.tensor(train_data, dtype=torch.long)
train_loader = DataLoader(
    train_data_tensor, batch_size=8192, shuffle=True, num_workers=16, pin_memory=True
)
val_loader = DataLoader(
    torch.tensor(list(val_data_dict.keys()), dtype=torch.long), batch_size=1024, shuffle=False, num_workers=5, pin_memory=True
)
test_loader = DataLoader(
    torch.tensor(list(test_data_dict.keys()), dtype=torch.long), batch_size=1024, shuffle=False, num_workers=5, pin_memory=True
)

# Step 2: Model Definition with W2V Initialization
class UltraGCN(nn.Module):
    def __init__(self, params, constraint_mat, ii_constraint_mat, ii_neighbor_mat, user_emb_init, item_emb_init):
        super(UltraGCN, self).__init__()
        self.user_num = params['user_num']
        self.item_num = params['item_num']
        self.embedding_dim = params['embedding_dim']
        self.w1 = params['w1']
        self.w2 = params['w2']
        self.w3 = params['w3']
        self.w4 = params['w4']
        self.negative_weight = params['negative_weight']
        self.gamma = params['gamma']
        self.lambda_ = params['lambda']

        # Initialize embeddings with W2V embeddings
        self.user_embeds = nn.Embedding.from_pretrained(user_emb_init, freeze=False)
        self.item_embeds = nn.Embedding.from_pretrained(item_emb_init, freeze=False)

        self.constraint_mat = constraint_mat
        self.ii_constraint_mat = ii_constraint_mat
        self.ii_neighbor_mat = ii_neighbor_mat

    def get_omegas(self, users, pos_items, neg_items):
        device = self.get_device()
        if self.w2 > 0:
            pos_weight = torch.mul(self.constraint_mat['beta_uD'][users], self.constraint_mat['beta_iD'][pos_items])
            pos_weight = self.w1 + self.w2 * pos_weight
        else:
            pos_weight = self.w1 * torch.ones(len(pos_items), device=device)
        if self.w4 > 0:
            neg_weight = torch.mul(
                torch.repeat_interleave(self.constraint_mat['beta_uD'][users], neg_items.size(1)),
                self.constraint_mat['beta_iD'][neg_items.flatten()]
            )
            neg_weight = self.w3 + self.w4 * neg_weight
        else:
            neg_weight = self.w3 * torch.ones(neg_items.size(0) * neg_items.size(1), device=device)
        return torch.cat((pos_weight, neg_weight))

    def cal_loss_L(self, users, pos_items, neg_items, omega_weight):
        device = self.get_device()
        user_embeds = self.user_embeds(users)
        pos_embeds = self.item_embeds(pos_items)
        neg_embeds = self.item_embeds(neg_items)
        pos_scores = (user_embeds * pos_embeds).sum(dim=-1)
        user_embeds = user_embeds.unsqueeze(1)
        neg_scores = (user_embeds * neg_embeds).sum(dim=-1)
        neg_labels = torch.zeros(neg_scores.size(), device=device)
        neg_loss = F.binary_cross_entropy_with_logits(
            neg_scores, neg_labels, weight=omega_weight[len(pos_scores):].view(neg_scores.size()), reduction='none'
        ).mean(dim=-1)
        pos_labels = torch.ones(pos_scores.size(), device=device)
        pos_loss = F.binary_cross_entropy_with_logits(
            pos_scores, pos_labels, weight=omega_weight[:len(pos_scores)], reduction='none'
        )
        return (pos_loss + neg_loss * self.negative_weight).sum()

    def cal_loss_I(self, users, pos_items):
        device = self.get_device()
        neighbor_embeds = self.item_embeds(self.ii_neighbor_mat[pos_items])
        sim_scores = self.ii_constraint_mat[pos_items]
        user_embeds = self.user_embeds(users).unsqueeze(1)
        loss = -sim_scores * (user_embeds * neighbor_embeds).sum(dim=-1).sigmoid().log()
        return loss.sum()

    def norm_loss(self):
        loss = 0.0
        for parameter in self.parameters():
            loss += torch.sum(parameter ** 2)
        return loss / 2

    def forward(self, users, pos_items, neg_items):
        omega_weight = self.get_omegas(users, pos_items, neg_items)
        loss = self.cal_loss_L(users, pos_items, neg_items, omega_weight)
        loss += self.gamma * self.norm_loss()
        loss += self.lambda_ * self.cal_loss_I(users, pos_items)
        return loss

    def test_forward(self, users):
        items = torch.arange(self.item_num).to(users.device)
        user_embeds = self.user_embeds(users)
        item_embeds = self.item_embeds(items)
        return user_embeds.mm(item_embeds.t())

    def get_device(self):
        return self.user_embeds.weight.device

# Step 3: Training Loop
params = {
    'user_num': num_users,
    'item_num': num_items,
    'embedding_dim': 50,
    'w1': 1.0,
    'w2': 1.0,
    'w3': 1.0,
    'w4': 1.0,
    'negative_weight': 1.0,
    'gamma': 0.001,
    'lambda': 0.0001,
    'initial_weight': 0.1,
    'lr': 0.005,
    'batch_size': 8192,
    'max_epoch': 100,
    'early_stop_epoch': 10,
    'device': 'cuda' if torch.cuda.is_available() else 'cpu',
    'topk': [1,2,3,5,10,20,30,40,50,60,70,80,90,100],
    'negative_num': 1,
    'sampling_sift_pos': True,
    'is_validation': True
}

def Sampling(users, pos_items, item_num, neg_ratio, interacted_items, sampling_sift_pos):
    neg_candidates = np.arange(item_num)
    if sampling_sift_pos:
        neg_items = []
        for u in users:
            probs = np.ones(item_num)
            probs[interacted_items[u]] = 0
            probs /= np.sum(probs)
            u_neg_items = np.random.choice(neg_candidates, size=neg_ratio, p=probs, replace=True)
            neg_items.append(u_neg_items)
        neg_items = np.array(neg_items)
    else:
        neg_items = np.random.choice(neg_candidates, (len(users), neg_ratio), replace=True)
    return users, pos_items, torch.from_numpy(neg_items).long().to(users.device)

def evaluate(model, loader, ground_truth_dict, mask, other_dict, top_k, is_validation=True):
    total_recall = {k: 0.0 for k in top_k}
    total_ndcg = {k: 0.0 for k in top_k}
    recommended_items = {k: set() for k in top_k}
    total_items = model.item_num
    user_count = 0

    idcg_cache = {}
    def get_idcg(r_size, k):
        cut = min(k, r_size)
        if (cut, k) not in idcg_cache:
            val = 0.0
            for i in range(cut):
                val += 1.0 / np.log2(i + 2)
            idcg_cache[(cut, k)] = val
        return idcg_cache[(cut, k)]

    with torch.no_grad():
        model.eval()
        for batch_users in loader:
            batch_users = batch_users.to(model.get_device())
            rating = model.test_forward(batch_users)
            rating += mask[batch_users]
            rating = rating.cpu().numpy()

            for i, u in enumerate(batch_users.cpu().numpy()):
                relevant_set = set(ground_truth_dict.get(u, []))
                if not relevant_set:
                    continue
                other_set = set(other_dict.get(u, []))
                scores = rating[i]
                exclude_items = set(interacted_items[u]) | other_set
                valid_items = [vid for vid in range(total_items) if vid not in exclude_items]
                scores_filtered = scores[valid_items]
                ranked_items = np.argsort(-scores_filtered)[:max(top_k)]
                ranked_items = [valid_items[vid] for vid in ranked_items]

                hits_positions = [pos for pos, vid in enumerate(ranked_items) if vid in relevant_set]

                for k in top_k:
                    hits_count = sum(1 for pos in hits_positions if pos < k)
                    recall_k = hits_count / float(len(relevant_set))
                    total_recall[k] += recall_k

                    dcg_val = sum(1.0 / np.log2(pos + 2) for pos in hits_positions if pos < k)
                    idcg_val = get_idcg(len(relevant_set), k)
                    ndcg_k = (dcg_val / idcg_val) if idcg_val > 0 else 0.0
                    total_ndcg[k] += ndcg_k

                    recommended_items[k].update(ranked_items[:k])
                user_count += 1

    if user_count == 0:
        return {f'Recall@{k}': 0.0 for k in top_k} | {f'NDCG@{k}': 0.0 for k in top_k} | {f'Coverage@{k}': 0.0 for k in top_k}

    avg_recall = {f'Recall@{k}': total_recall[k] / user_count for k in top_k}
    avg_ndcg = {f'NDCG@{k}': total_ndcg[k] / user_count for k in top_k}
    coverage = {f'Coverage@{k}': len(recommended_items[k]) / total_items for k in top_k}
    return avg_recall | avg_ndcg | coverage

def train(model, optimizer, train_loader, val_loader, test_loader, mask, val_data_dict, test_data_dict, interacted_items, params):
    device = params['device']
    best_recall, best_epoch = 0, 0
    early_stop_count = 0
    patience = 3

    for epoch in range(params['max_epoch']):
        model.train()
        total_loss = 0
        for batch in train_loader:
            batch = batch.to(device)
            users, pos_items = batch[:, 0], batch[:, 1]
            users, pos_items, neg_items = Sampling(
                users, pos_items, params['item_num'], params['negative_num'], interacted_items, params['sampling_sift_pos']
            )

            optimizer.zero_grad()
            loss = model(users, pos_items, neg_items)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print(f'Epoch {epoch+1}, Loss: {total_loss / len(train_loader):.4f}')

        test_metrics = evaluate(model, val_loader, val_data_dict, mask, test_data_dict if params['is_validation'] else val_data_dict, params['topk'], is_validation=False)
        recall_at_k = test_metrics[f'Recall@{max(params["topk"])}']
        print(f"Test Metrics: {test_metrics}")

        if recall_at_k > best_recall:
            best_recall, best_epoch = recall_at_k, epoch
            early_stop_count = 0
        else:
            early_stop_count += 1
            if early_stop_count >= patience:
                print(f'Early stop at epoch {epoch+1}, Best Recall@{max(params["topk"])}: {best_recall:.4f} at epoch {best_epoch+1}')
                break

    #test_metrics = evaluate(model, test_loader, test_data_dict, mask, val_data_dict if params['is_validation'] else test_data_dict, params['topk'], is_validation=False)
    #print(f"Final Test Metrics: {test_metrics}")
    #return test_metrics

# Step 5: Run Training with W2V Initialization
user_emb_init, item_emb_init = w2v_emb(embedding_dim=params['embedding_dim'])
model = UltraGCN(params, constraint_mat, ii_constraint_mat, ii_neighbor_mat, user_emb_init, item_emb_init)
model = model.to(params['device'])
optimizer = torch.optim.Adam(model.parameters(), lr=params['lr'])
train(model, optimizer, train_loader, val_loader, test_loader, mask, val_data_dict, test_data_dict, interacted_items, params)

Epoch 1, Loss: 14666.1238
Test Metrics: {'Recall@1': 0.006891385514725286, 'Recall@2': 0.011873115186467737, 'Recall@3': 0.015428774474037383, 'Recall@5': 0.022695493450765266, 'Recall@10': 0.03516822008790371, 'Recall@20': 0.05337984265872318, 'Recall@30': 0.06915117598449312, 'Recall@40': 0.0825899097208964, 'Recall@50': 0.09558622840372709, 'Recall@60': 0.105164251826902, 'Recall@70': 0.11464980536034802, 'Recall@80': 0.12200531416562585, 'Recall@90': 0.129472217602437, 'Recall@100': 0.13701526784123028, 'NDCG@1': 0.0963855421686747, 'NDCG@2': 0.08816046476468019, 'NDCG@3': 0.08104624108585463, 'NDCG@5': 0.07487380822636895, 'NDCG@10': 0.0688693441717888, 'NDCG@20': 0.06811990350473553, 'NDCG@30': 0.07073308866113709, 'NDCG@40': 0.0734943292876321, 'NDCG@50': 0.07715753626376758, 'NDCG@60': 0.07986772067833171, 'NDCG@70': 0.08292367190351789, 'NDCG@80': 0.08545078647252426, 'NDCG@90': 0.08809902397118079, 'NDCG@100': 0.09072497359941749, 'Coverage@1': 0.010439970171513796, 'Coverage

In [2]:
# Without W2V init
import pickle
import numpy as np
import scipy.sparse as sp
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
import os
import time

# Step 1: Data Preparation
with open("KuaiRec/adj_data.pkl", "rb") as f:
    adj_data = pickle.load(f)
with open("KuaiRec/val_data.pkl", "rb") as f:
    val_data_dict = pickle.load(f)
with open("KuaiRec/test_data.pkl", "rb") as f:
    test_data_dict = pickle.load(f)

num_users = adj_data['num_nodes_dict']['user']
num_items = adj_data['num_nodes_dict']['video']

rowptr = adj_data['rowptr_dict'][('user', 'watches', 'video')]
col = adj_data['col_dict'][('user', 'watches', 'video')]
train_data = []
for u in range(num_users):
    start = rowptr[u]
    end = rowptr[u + 1]
    items = col[start:end].tolist()
    train_data.extend([(u, i) for i in items])

train_mat = sp.coo_matrix(
    (np.ones(len(train_data)), ([x[0] for x in train_data], [x[1] for x in train_data])),
    shape=(num_users, num_items)
).tocsr()

items_D = np.array(train_mat.sum(axis=0)).flatten()
users_D = np.array(train_mat.sum(axis=1)).flatten()
beta_uD = (np.sqrt(users_D + 1) / (users_D + 1e-8)).reshape(-1)
beta_iD = (1 / np.sqrt(items_D + 1)).reshape(-1)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
constraint_mat = {
    "beta_uD": torch.from_numpy(beta_uD).float().to(device),
    "beta_iD": torch.from_numpy(beta_iD).float().to(device)
}

def get_ii_constraint_mat(train_mat, num_neighbors, ii_diagonal_zero=False):
    print('Computing item-item constraint matrix efficiently...')
    A = train_mat.T.dot(train_mat).tocsr()
    if ii_diagonal_zero:
        A.setdiag(0)
    A.eliminate_zeros()
    n_items = A.shape[0]
    items_D = np.array(A.sum(axis=0)).flatten()
    users_D = np.array(A.sum(axis=1)).flatten()
    beta_uD = (np.sqrt(users_D + 1) / (users_D + 1e-8)).reshape(-1, 1)
    beta_iD = (1 / np.sqrt(items_D + 1)).reshape(1, -1)
    all_ii_constraint_mat = beta_uD.dot(beta_iD)
    res_mat = torch.zeros((n_items, num_neighbors), dtype=torch.long)
    res_sim_mat = torch.zeros((n_items, num_neighbors), dtype=torch.float32)
    batch_size = 10000
    for start in range(0, n_items, batch_size):
        end = min(start + batch_size, n_items)
        batch_A = A[start:end].toarray()
        batch_weighted = batch_A * all_ii_constraint_mat[start:end]
        batch_tensor = torch.from_numpy(batch_weighted).float()
        row_sims, row_idxs = torch.topk(batch_tensor, k=num_neighbors, dim=1)
        res_mat[start:end] = row_idxs
        res_sim_mat[start:end] = row_sims
        print(f'Processed items {start} to {end-1}')
    print('Item-item constraint matrix computed!')
    return res_mat.to(device), res_sim_mat.to(device)

ii_neighbor_num = 10
ii_cons_mat_path = 'KuaiRec_ii_constraint_mat.pkl'
ii_neigh_mat_path = 'KuaiRec_ii_neighbor_mat.pkl'
if os.path.exists(ii_cons_mat_path) and os.path.exists(ii_neigh_mat_path):
    with open(ii_cons_mat_path, 'rb') as f:
        ii_constraint_mat = pickle.load(f).to(device)
    with open(ii_neigh_mat_path, 'rb') as f:
        ii_neighbor_mat = pickle.load(f).to(device)
else:
    ii_neighbor_mat, ii_constraint_mat = get_ii_constraint_mat(train_mat, ii_neighbor_num)
    with open(ii_cons_mat_path, 'wb') as f:
        pickle.dump(ii_constraint_mat, f)
    with open(ii_neigh_mat_path, 'wb') as f:
        pickle.dump(ii_neighbor_mat, f)

interacted_items = [[] for _ in range(num_users)]
for u, i in train_data:
    interacted_items[u].append(i)

mask = torch.zeros(num_users, num_items).to(device)
for u, items in enumerate(interacted_items):
    mask[u, items] = -np.inf

train_data_tensor = torch.tensor(train_data, dtype=torch.long)
train_loader = DataLoader(
    train_data_tensor, batch_size=8192, shuffle=True, num_workers=16, pin_memory=True
)
val_loader = DataLoader(
    torch.tensor(list(val_data_dict.keys()), dtype=torch.long), batch_size=1024, shuffle=False, num_workers=5, pin_memory=True
)
test_loader = DataLoader(
    torch.tensor(list(test_data_dict.keys()), dtype=torch.long), batch_size=1024, shuffle=False, num_workers=5, pin_memory=True
)

# Step 2: Model Definition
class UltraGCN(nn.Module):
    def __init__(self, params, constraint_mat, ii_constraint_mat, ii_neighbor_mat):
        super(UltraGCN, self).__init__()
        self.user_num = params['user_num']
        self.item_num = params['item_num']
        self.embedding_dim = params['embedding_dim']
        self.w1 = params['w1']
        self.w2 = params['w2']
        self.w3 = params['w3']
        self.w4 = params['w4']
        self.negative_weight = params['negative_weight']
        self.gamma = params['gamma']
        self.lambda_ = params['lambda']

        self.user_embeds = nn.Embedding(self.user_num, self.embedding_dim)
        self.item_embeds = nn.Embedding(self.item_num, self.embedding_dim)

        self.constraint_mat = constraint_mat
        self.ii_constraint_mat = ii_constraint_mat
        self.ii_neighbor_mat = ii_neighbor_mat

        self.initial_weight = params['initial_weight']
        self.initial_weights()

    def initial_weights(self):
        nn.init.normal_(self.user_embeds.weight, std=self.initial_weight)
        nn.init.normal_(self.item_embeds.weight, std=self.initial_weight)

    def get_omegas(self, users, pos_items, neg_items):
        device = self.get_device()
        if self.w2 > 0:
            pos_weight = torch.mul(self.constraint_mat['beta_uD'][users], self.constraint_mat['beta_iD'][pos_items])
            pos_weight = self.w1 + self.w2 * pos_weight
        else:
            pos_weight = self.w1 * torch.ones(len(pos_items), device=device)
        if self.w4 > 0:
            neg_weight = torch.mul(
                torch.repeat_interleave(self.constraint_mat['beta_uD'][users], neg_items.size(1)),
                self.constraint_mat['beta_iD'][neg_items.flatten()]
            )
            neg_weight = self.w3 + self.w4 * neg_weight
        else:
            neg_weight = self.w3 * torch.ones(neg_items.size(0) * neg_items.size(1), device=device)
        return torch.cat((pos_weight, neg_weight))

    def cal_loss_L(self, users, pos_items, neg_items, omega_weight):
        device = self.get_device()
        user_embeds = self.user_embeds(users)
        pos_embeds = self.item_embeds(pos_items)
        neg_embeds = self.item_embeds(neg_items)
        pos_scores = (user_embeds * pos_embeds).sum(dim=-1)
        user_embeds = user_embeds.unsqueeze(1)
        neg_scores = (user_embeds * neg_embeds).sum(dim=-1)
        neg_labels = torch.zeros(neg_scores.size(), device=device)
        neg_loss = F.binary_cross_entropy_with_logits(
            neg_scores, neg_labels, weight=omega_weight[len(pos_scores):].view(neg_scores.size()), reduction='none'
        ).mean(dim=-1)
        pos_labels = torch.ones(pos_scores.size(), device=device)
        pos_loss = F.binary_cross_entropy_with_logits(
            pos_scores, pos_labels, weight=omega_weight[:len(pos_scores)], reduction='none'
        )
        return (pos_loss + neg_loss * self.negative_weight).sum()

    def cal_loss_I(self, users, pos_items):
        device = self.get_device()
        neighbor_embeds = self.item_embeds(self.ii_neighbor_mat[pos_items])
        sim_scores = self.ii_constraint_mat[pos_items]
        user_embeds = self.user_embeds(users).unsqueeze(1)
        loss = -sim_scores * (user_embeds * neighbor_embeds).sum(dim=-1).sigmoid().log()
        return loss.sum()

    def norm_loss(self):
        loss = 0.0
        for parameter in self.parameters():
            loss += torch.sum(parameter ** 2)
        return loss / 2

    def forward(self, users, pos_items, neg_items):
        omega_weight = self.get_omegas(users, pos_items, neg_items)
        loss = self.cal_loss_L(users, pos_items, neg_items, omega_weight)
        loss += self.gamma * self.norm_loss()
        loss += self.lambda_ * self.cal_loss_I(users, pos_items)
        return loss

    def test_forward(self, users):
        items = torch.arange(self.item_num).to(users.device)
        user_embeds = self.user_embeds(users)
        item_embeds = self.item_embeds(items)
        return user_embeds.mm(item_embeds.t())

    def get_device(self):
        return self.user_embeds.weight.device

# Step 3: Training Loop
params = {
    'user_num': num_users,
    'item_num': num_items,
    'embedding_dim': 50,
    'w1': 1.0,
    'w2': 1.0,
    'w3': 1.0,
    'w4': 1.0,
    'negative_weight': 1.0,
    'gamma': 0.001,
    'lambda': 0.0001,
    'initial_weight': 0.1,
    'lr': 0.005,
    'batch_size': 8192,
    'max_epoch': 100,
    'early_stop_epoch': 10,
    'device': 'cuda' if torch.cuda.is_available() else 'cpu',
    'topk': [1,2,3,5,10,20,30,40,50,60,70,80,90,100],
    'negative_num': 1,
    'sampling_sift_pos': True,
    'is_validation': True
}

def Sampling(users, pos_items, item_num, neg_ratio, interacted_items, sampling_sift_pos):
    neg_candidates = np.arange(item_num)
    if sampling_sift_pos:
        neg_items = []
        for u in users:
            probs = np.ones(item_num)
            probs[interacted_items[u]] = 0
            probs /= np.sum(probs)
            u_neg_items = np.random.choice(neg_candidates, size=neg_ratio, p=probs, replace=True)
            neg_items.append(u_neg_items)
        neg_items = np.array(neg_items)
    else:
        neg_items = np.random.choice(neg_candidates, (len(users), neg_ratio), replace=True)
    return users, pos_items, torch.from_numpy(neg_items).long().to(users.device)

def evaluate(model, loader, ground_truth_dict, mask, other_dict, top_k, is_validation=True):
    total_recall = {k: 0.0 for k in top_k}
    total_ndcg = {k: 0.0 for k in top_k}
    recommended_items = {k: set() for k in top_k}
    total_items = model.item_num
    user_count = 0

    idcg_cache = {}
    def get_idcg(r_size, k):
        cut = min(k, r_size)
        if (cut, k) not in idcg_cache:
            val = 0.0
            for i in range(cut):
                val += 1.0 / np.log2(i + 2)
            idcg_cache[(cut, k)] = val
        return idcg_cache[(cut, k)]

    with torch.no_grad():
        model.eval()
        for batch_users in loader:
            batch_users = batch_users.to(model.get_device())
            rating = model.test_forward(batch_users)  # Shape: [batch_size, item_num]
            rating += mask[batch_users]  # Mask training items
            rating = rating.cpu().numpy()

            for i, u in enumerate(batch_users.cpu().numpy()):
                relevant_set = set(ground_truth_dict.get(u, []))
                if not relevant_set:
                    continue
                other_set = set(other_dict.get(u, []))
                scores = rating[i]
                # Exclude training and other_set items
                exclude_items = set(interacted_items[u]) | other_set
                valid_items = [vid for vid in range(total_items) if vid not in exclude_items]
                scores_filtered = scores[valid_items]
                ranked_items = np.argsort(-scores_filtered)[:max(top_k)]  # Top-k after exclusion
                ranked_items = [valid_items[vid] for vid in ranked_items]

                hits_positions = [pos for pos, vid in enumerate(ranked_items) if vid in relevant_set]

                for k in top_k:
                    hits_count = sum(1 for pos in hits_positions if pos < k)
                    recall_k = hits_count / float(len(relevant_set))
                    total_recall[k] += recall_k

                    dcg_val = sum(1.0 / np.log2(pos + 2) for pos in hits_positions if pos < k)
                    idcg_val = get_idcg(len(relevant_set), k)
                    ndcg_k = (dcg_val / idcg_val) if idcg_val > 0 else 0.0
                    total_ndcg[k] += ndcg_k

                    recommended_items[k].update(ranked_items[:k])
                user_count += 1

    if user_count == 0:
        return {f'Recall@{k}': 0.0 for k in top_k} | {f'NDCG@{k}': 0.0 for k in top_k} | {f'Coverage@{k}': 0.0 for k in top_k}

    avg_recall = {f'Recall@{k}': total_recall[k] / user_count for k in top_k}
    avg_ndcg = {f'NDCG@{k}': total_ndcg[k] / user_count for k in top_k}
    coverage = {f'Coverage@{k}': len(recommended_items[k]) / total_items for k in top_k}
    return avg_recall | avg_ndcg | coverage

def train(model, optimizer, train_loader, val_loader, test_loader, mask, val_data_dict, test_data_dict, interacted_items, params):
    device = params['device']
    best_recall, best_epoch = 0, 0
    early_stop_count = 0
    patience = 3  # Stop if no improvement for 3 epochs

    for epoch in range(params['max_epoch']):
        model.train()
        total_loss = 0
        for batch in train_loader:
            batch = batch.to(device)
            users, pos_items = batch[:, 0], batch[:, 1]
            users, pos_items, neg_items = Sampling(
                users, pos_items, params['item_num'], params['negative_num'], interacted_items, params['sampling_sift_pos']
            )

            optimizer.zero_grad()
            loss = model(users, pos_items, neg_items)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()


        # Evaluate on validation set
        val_metrics = evaluate(model, val_loader, val_data_dict, mask, test_data_dict if params['is_validation'] else val_data_dict, params['topk'], is_validation=False)
        recall_at_k = val_metrics[f'Recall@{max(params["topk"])}']
        print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader):.4f}, Validation Metrics: {val_metrics}")

        if recall_at_k > best_recall:
            best_recall, best_epoch = recall_at_k, epoch
            early_stop_count = 0
        else:
            early_stop_count += 1
            if early_stop_count >= patience:
                #print(f'Early stop at epoch {epoch+1}, Best Recall@{params["topk"]}: {best_recall:.4f} at epoch {best_epoch+1}')
                break
    print(f'Finished training, Best Recall@{max(params["topk"])}: {best_recall:.4f} at epoch {best_epoch+1}')
    return best_recall, best_epoch


# Step 5: Run Training
model = UltraGCN(params, constraint_mat, ii_constraint_mat, ii_neighbor_mat)
model = model.to(params['device'])
optimizer = torch.optim.Adam(model.parameters(), lr=params['lr'])
train(model, optimizer, train_loader, val_loader, test_loader, mask, val_data_dict, test_data_dict, interacted_items, params)

Epoch 1, Loss: 11151.8186, Validation Metrics: {'Recall@1': 0.011458258828346259, 'Recall@2': 0.020400784553366993, 'Recall@3': 0.02919536287847556, 'Recall@5': 0.042889765728618, 'Recall@10': 0.06455385814906589, 'Recall@20': 0.09404348931488998, 'Recall@30': 0.1169038920067873, 'Recall@40': 0.13386393963555884, 'Recall@50': 0.1466741898961528, 'Recall@60': 0.15957888056991373, 'Recall@70': 0.16950628637928114, 'Recall@80': 0.18006949710091477, 'Recall@90': 0.1883240968726763, 'Recall@100': 0.19523850775447504, 'NDCG@1': 0.1261516654854713, 'NDCG@2': 0.11672872338279806, 'NDCG@3': 0.11267452227568862, 'NDCG@5': 0.10771109849455136, 'NDCG@10': 0.10192417971625058, 'NDCG@20': 0.10439573538819427, 'NDCG@30': 0.10943588573133388, 'NDCG@40': 0.1139148226204892, 'NDCG@50': 0.11757951591212351, 'NDCG@60': 0.12172282364423176, 'NDCG@70': 0.1252918764909931, 'NDCG@80': 0.12896830013756416, 'NDCG@90': 0.13215450901708126, 'NDCG@100': 0.1348304212463064, 'Coverage@1': 0.029921700223713647, 'Cove

(0.26731737793139587, 1)

In [5]:
import pandas as pd
ultragcn_metrics = {
    'Recall@1': 0.02005460183880451,
    'Recall@2': 0.03592174557311276,
    'Recall@3': 0.04616237477553958,
    'Recall@5': 0.06146424723852795,
    'Recall@10': 0.08371859625654751,
    'Recall@20': 0.12273670585885475,
    'Recall@30': 0.1542731458426025,
    'Recall@40': 0.1755284674262601,
    'Recall@50': 0.19592054881885204,
    'Recall@60': 0.21219716560669227,
    'Recall@70': 0.22769812820420393,
    'Recall@80': 0.24225927203264896,
    'Recall@90': 0.2557639022372308,
    'Recall@100': 0.26731737793139587,
    'NDCG@1': 0.2069454287739192,
    'NDCG@2': 0.19652699739552615,
    'NDCG@3': 0.17943761470715774,
    'NDCG@5': 0.16097270795807392,
    'NDCG@10': 0.14173767843350488,
    'NDCG@20': 0.14119209609484723,
    'NDCG@30': 0.14755945593281733,
    'NDCG@40': 0.15261601612128237,
    'NDCG@50': 0.15880867690135625,
    'NDCG@60': 0.1641732034930173,
    'NDCG@70': 0.16956748572605057,
    'NDCG@80': 0.17486813458456182,
    'NDCG@90': 0.1798150527798118,
    'NDCG@100': 0.18428634752100087,
    'Coverage@1': 0.0041946308724832215,
    'Coverage@2': 0.00633855331841909,
    'Coverage@3': 0.008296047725577927,
    'Coverage@5': 0.011558538404175988,
    'Coverage@10': 0.01948173005219985,
    'Coverage@20': 0.034395973154362415,
    'Coverage@30': 0.047818791946308725,
    'Coverage@40': 0.05965697240865026,
    'Coverage@50': 0.07252050708426547,
    'Coverage@60': 0.0837994034302759,
    'Coverage@70': 0.09619686800894854,
    'Coverage@80': 0.10868754660700969,
    'Coverage@90': 0.12052572706935123,
    'Coverage@100': 0.13143176733780762
}

root = 'results/KuaiRec/'
results_path = root + 'results.csv'

# Read the existing results CSV file
df = pd.read_csv(results_path, index_col=0)
df['UltraGCN'] = pd.Series(ultragcn_metrics)

# Save the updated DataFrame back to CSV
df.to_csv(results_path)

## UserKNN KuaiRec

In [1]:
import pickle
import numpy as np
import scipy.sparse as sp
from sklearn.metrics.pairwise import cosine_similarity
import time
import os

class UserKNN:
    def __init__(self, train_mat, K_neighbors=20):
        """
        Initialize UserKNN with the training user-item interaction matrix.
        
        Args:
            train_mat (sp.csr_matrix): User-item interaction matrix (users x items).
            K_neighbors (int): Number of neighbors to consider for recommendations.
        """
        self.train_mat = train_mat  # CSR sparse matrix
        self.K_neighbors = K_neighbors
        self.num_users, self.num_items = train_mat.shape
        print(f"Initializing UserKNN with {self.num_users} users, {self.num_items} items, K={K_neighbors}")
        # Compute user-user similarity matrix
        self.user_sim = self.compute_user_similarity()

    def compute_user_similarity(self):
        """
        Compute cosine similarity between users based on the sparse interaction matrix.
        
        Returns:
            np.ndarray: User-user similarity matrix (num_users x num_users).
        """
        print("Computing user-user similarity matrix...")
        start_time = time.time()
        # Cosine similarity on sparse matrix returns a dense array
        sim = cosine_similarity(self.train_mat, dense_output=True)
        # Set similarity of a user with themselves to -1 to exclude in neighbor selection
        np.fill_diagonal(sim, -1)
        print(f"Similarity matrix computed in {time.time() - start_time:.2f} seconds")
        return sim

    def get_top_k_neighbors(self, user_id):
        """
        Get indices of the top K most similar users for a given user.
        
        Args:
            user_id (int): User index.
        
        Returns:
            np.ndarray: Indices of top K neighbors.
        """
        sim_scores = self.user_sim[user_id]
        # Get top K indices (highest similarities), excluding the user themselves (set to -1)
        top_k_indices = np.argsort(-sim_scores)[:self.K_neighbors]
        return top_k_indices

    def recommend(self, user_id, top_k=100):
        """
        Generate top-k item recommendations for a user.
        
        Args:
            user_id (int): User index.
            top_k (int): Number of items to recommend.
        
        Returns:
            np.ndarray: Indices of top-k recommended items.
        """
        # Get top K neighbors
        neighbors = self.get_top_k_neighbors(user_id)
        # Sum interactions of neighbors (sparse matrix sum along rows)
        neighbor_interactions = self.train_mat[neighbors].sum(axis=0).A1  # Convert to 1D array
        # Exclude items the user has already interacted with
        user_interactions = self.train_mat[user_id].nonzero()[1]
        candidate_items = np.setdiff1d(np.arange(self.num_items), user_interactions)
        # Score candidates based on neighbor interactions
        scores = neighbor_interactions[candidate_items]
        # Get top-k items
        top_k_indices = candidate_items[np.argsort(-scores)[:top_k]]
        return top_k_indices

    def evaluate(self, ground_truth_dict, other_dict, top_k_list=[100], is_validation=True):
        """
        Evaluate UserKNN on validation or test data, matching kuairec_eval metrics.
        
        Args:
            ground_truth_dict (dict): {user_id: [item_ids]} for ground truth.
            other_dict (dict): {user_id: [item_ids]} for items to exclude (e.g., test items during validation).
            top_k_list (list): List of k values for metrics (e.g., [100]).
            is_validation (bool): True if evaluating validation, False for test.
        
        Returns:
            dict: Metrics like Recall@k, NDCG@k, Coverage@k.
        """
        total_recall = {k: 0.0 for k in top_k_list}
        total_ndcg = {k: 0.0 for k in top_k_list}
        recommended_items = {k: set() for k in top_k_list}
        total_items = self.num_items
        user_count = 0

        # Precompute IDCG for efficiency
        idcg_cache = {}
        def get_idcg(r_size, k):
            cut = min(k, r_size)
            if (cut, k) not in idcg_cache:
                val = sum(1.0 / np.log2(i + 2) for i in range(cut))
                idcg_cache[(cut, k)] = val
            return idcg_cache[(cut, k)]

        print(f"Evaluating UserKNN on {'validation' if is_validation else 'test'} set...")
        start_time = time.time()
        for u, relevant_set in ground_truth_dict.items():
            if not relevant_set:
                continue
            other_set = other_dict.get(u, set())
            # Get recommendations
            rec_items = self.recommend(u, max(top_k_list))
            # Exclude items in other_set (e.g., test items during validation)
            rec_items = [item for item in rec_items if item not in other_set]
            # Find positions of hits
            hits_positions = [pos for pos, item in enumerate(rec_items) if item in relevant_set]

            for k in top_k_list:
                hits_count = sum(1 for pos in hits_positions if pos < k)
                recall_k = hits_count / len(relevant_set)
                total_recall[k] += recall_k

                dcg = sum(1.0 / np.log2(pos + 2) for pos in hits_positions if pos < k)
                idcg = get_idcg(len(relevant_set), k)
                ndcg_k = dcg / idcg if idcg > 0 else 0.0
                total_ndcg[k] += ndcg_k

                recommended_items[k].update(rec_items[:k])
            user_count += 1

        if user_count == 0:
            metrics = {f'Recall@{k}': 0.0 for k in top_k_list}
            metrics.update({f'NDCG@{k}': 0.0 for k in top_k_list})
            metrics.update({f'Coverage@{k}': 0.0 for k in top_k_list})
        else:
            metrics = {f'Recall@{k}': total_recall[k] / user_count for k in top_k_list}
            metrics.update({f'NDCG@{k}': total_ndcg[k] / user_count for k in top_k_list})
            metrics.update({f'Coverage@{k}': len(recommended_items[k]) / total_items for k in top_k_list})

        print(f"Evaluation completed in {time.time() - start_time:.2f} seconds")
        return metrics


In [7]:
def build_interaction_matrix(adj_data):
    """
    Build the user-item interaction matrix from adj_data.
    
    Args:
        adj_data (dict): Contains rowptr_dict, col_dict, num_nodes_dict.
    
    Returns:
        sp.csr_matrix: User-item interaction matrix.
    """
    rowptr = adj_data['rowptr_dict'][('user', 'watches', 'video')]
    col = adj_data['col_dict'][('user', 'watches', 'video')]
    num_users = adj_data['num_nodes_dict']['user']
    num_items = adj_data['num_nodes_dict']['video']
    
    print(f"Building interaction matrix for {num_users} users and {num_items} items...")
    train_data = []
    for u in range(num_users):
        start = rowptr[u]
        end = rowptr[u + 1]
        items = col[start:end].tolist()
        train_data.extend([(u, i) for i in items])
    
    train_mat = sp.coo_matrix(
        (np.ones(len(train_data)), ([x[0] for x in train_data], [x[1] for x in train_data])),
        shape=(num_users, num_items)
    ).tocsr()
    print(f"Interaction matrix built with {len(train_data)} interactions")
    return train_mat

# Load KuaiRec data
with open("KuaiRec/adj_data.pkl", "rb") as f:
    adj_data = pickle.load(f)
with open("KuaiRec/val_data.pkl", "rb") as f:
    val_data = pickle.load(f)
with open("KuaiRec/test_data.pkl", "rb") as f:
    test_data = pickle.load(f)
print("Loaded KuaiRec data")

# Build user-item interaction matrix
train_mat = build_interaction_matrix(adj_data)
print("Train Matrix computed")
# Initialize UserKNN
userknn = UserKNN(train_mat, K_neighbors=1500)

# Evaluate on validation and test sets
val_metrics = userknn.evaluate(
    ground_truth_dict=val_data,
    other_dict=test_data,
    top_k_list=[100],
    is_validation=True
)
print("Validation Metrics:", val_metrics)

Loaded KuaiRec data
Building interaction matrix for 7176 users and 10728 items...
Interaction matrix built with 936568 interactions
Train Matrix computed
Initializing UserKNN with 7176 users, 10728 items, K=1500
Computing user-user similarity matrix...
Similarity matrix computed in 1.67 seconds
Evaluating UserKNN on validation set...
Evaluation completed in 2.05 seconds
Validation Metrics: {'Recall@100': 0.23811217463706705, 'NDCG@100': 0.16704731894823835, 'Coverage@100': 0.0267524235645041}


In [8]:
test_metrics = userknn.evaluate(
    ground_truth_dict=test_data,
    other_dict=val_data,
    top_k_list=[1,2,3,5,10,20,30,40,50,60,70,80,90,100],
    is_validation=False
)
print("Test Metrics:", test_metrics)

Evaluating UserKNN on test set...
Evaluation completed in 2.29 seconds
Test Metrics: {'Recall@1': 0.013078878816451046, 'Recall@2': 0.021195434520410204, 'Recall@3': 0.03047994523955555, 'Recall@5': 0.048304170195103246, 'Recall@10': 0.065279947994868, 'Recall@20': 0.10188392715789932, 'Recall@30': 0.13878712499612897, 'Recall@40': 0.1707852716654582, 'Recall@50': 0.18684843349166558, 'Recall@60': 0.20118618388505702, 'Recall@70': 0.21322902820111223, 'Recall@80': 0.22188521953869586, 'Recall@90': 0.2289625564146559, 'Recall@100': 0.23377915539117006, 'NDCG@1': 0.5506732813607371, 'NDCG@2': 0.468696676567592, 'NDCG@3': 0.45417414452862837, 'NDCG@5': 0.44020124249000414, 'NDCG@10': 0.35385302861633733, 'NDCG@20': 0.30207054610804834, 'NDCG@30': 0.289711071974637, 'NDCG@40': 0.2880385713835392, 'NDCG@50': 0.2826861154315943, 'NDCG@60': 0.279640267445158, 'NDCG@70': 0.27816820072313847, 'NDCG@80': 0.277117105820763, 'NDCG@90': 0.27678582620150727, 'NDCG@100': 0.2747678931602998, 'Coverage