### Weighted Metapath2Vec KuaiRec Recommendations

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import multiprocessing
from tqdm import tqdm
from typing import Dict, List, Optional, Tuple
from collections import defaultdict
import pickle
import random
import itertools
import copy
from sklearn.neighbors import NearestNeighbors
from sklearn.manifold import TSNE
from gensim.models.word2vec import Word2Vec
from scipy.stats import wilcoxon
import gc

import torch
from torch_geometric.nn.models import MetaPath2Vec
from torch_geometric.nn import MetaPath2Vec
from torch_geometric.data import HeteroData
from torch import nn, Tensor
from torch.utils.data import DataLoader
from torch_geometric.index import index2ptr
from torch_geometric.typing import EdgeType, NodeType, OptTensor
from torch_geometric.utils import sort_edge_index

In [2]:
%%time
# Train the Word2Vec video/track embeddings; the resulting vectors are used
# further down to initialise the Metapath2vec node embeddings.
train = pd.read_parquet('KuaiRec/train_sequences.parquet')
# Word2Vec is fed string tokens, so every id of every sequence is stringified.
sequences = [[str(video) for video in watched] for watched in train['video_id']]
val = np.load('KuaiRec/val_pairs.npy')

model = Word2Vec(
    vector_size=50, window=15, sg=1, min_count=1,
    compute_loss=True,
    workers=multiprocessing.cpu_count(),
)
model.build_vocab(sequences)
# Kept as the last expression of the cell so that its return value is displayed.
model.train(corpus_iterable=sequences, total_examples=len(sequences), epochs=75)

CPU times: user 27min 29s, sys: 1.4 s, total: 27min 31s
Wall time: 3min 36s


(70180360, 70242600)

In [3]:
# Build the heterogeneous user/video graph from the KuaiRec interaction log
# and the social network file.
df_social = pd.read_csv('KuaiRec/social_network.csv')
df_watch = pd.read_csv('KuaiRec/big_matrix.csv')
# Keep only interactions with watch_ratio >= 2, ordered per user by time.
df_watch = df_watch[df_watch['watch_ratio'] >= 2].sort_values(['user_id', 'time'])

unique_users = df_watch['user_id'].unique()
unique_videos = df_watch['video_id'].unique()

# Contiguous node indices; only users/videos that survive the filter get one.
user2idx = {uid: i for i, uid in enumerate(unique_users)}
video2idx = {vid: i for i, vid in enumerate(unique_videos)}
idx2video = {i: vid for vid, i in video2idx.items()}

user_col = df_watch['user_id'].map(user2idx).values
video_col = df_watch['video_id'].map(video2idx).values
edge_index_uv = np.vstack([user_col, video_col])
edge_weight_uv = torch.tensor(df_watch['watch_ratio'].values, dtype=torch.float)

data = HeteroData()
data['user'].num_nodes = len(user2idx)
data['video'].num_nodes = len(video2idx)

# user -> video edges and their reverse share the same watch_ratio weights.
data['user', 'watches', 'video'].edge_index = torch.from_numpy(edge_index_uv)
data['user', 'watches', 'video'].edge_weight = edge_weight_uv
edge_index_vu = np.flip(edge_index_uv, axis=0).copy(order='C')
data['video', 'watched_by', 'user'].edge_index = torch.from_numpy(edge_index_vu)
data['video', 'watched_by', 'user'].edge_weight = edge_weight_uv

# Parse the "[a, b, c]" friend lists into one (user_id, friend) row per pair.
friend_tokens = df_social['friend_list'].str.strip('[]').str.split(',')
df_social = df_social.assign(friend_list=friend_tokens).explode('friend_list').dropna()
df_social = df_social.assign(friend_list=df_social['friend_list'].str.strip())
# An empty list ("[]") explodes to an empty token, which cannot be cast to int.
df_social = df_social[df_social['friend_list'] != ''].copy()
df_social['friend_list'] = df_social['friend_list'].astype(int)

# Build user-user edge index
mapped_a = df_social['user_id'].map(user2idx)
mapped_b = df_social['friend_list'].map(user2idx)
# Users absent from user2idx (no interaction left after the watch_ratio filter)
# map to NaN; casting NaN to int64 would create bogus node indices, so any pair
# with an unknown endpoint is dropped.
both_known = mapped_a.notna() & mapped_b.notna()
userA = mapped_a[both_known].to_numpy(dtype=np.int64)
userB = mapped_b[both_known].to_numpy(dtype=np.int64)

edge_index_uu = np.vstack([userA, userB])
# No edge_weight is attached to the 'follows' relation here.
data['user', 'follows', 'user'].edge_index = torch.tensor(edge_index_uu, dtype=torch.long)
w2v_dim = model.vector_size

In [4]:
# Create a zero matrix for all video embeddings and overwrite the rows of the
# videos that are present in the trained Word2Vec vocabulary; videos missing
# from the vocabulary keep the zero vector.
model_vocab = list(model.wv.index_to_key)
# key_to_index is a dict: O(1) membership instead of scanning the list above
# once per video.
w2v_key_to_index = model.wv.key_to_index
video_emb = np.zeros((len(video2idx), w2v_dim), dtype=np.float32)
for vid, idx in video2idx.items():
    vid_key = str(vid)
    if vid_key in w2v_key_to_index:
        video_emb[idx] = model.wv[vid_key]

# Set data video embeddings to the W2V embeddings
data['video'].x = torch.tensor(video_emb, dtype=torch.float32)

num_users = len(user2idx)

# Collect, per user, the watched video ids as Word2Vec keys (strings).
# Iterating the two columns directly avoids DataFrame.iterrows(), which builds
# a Series per row (slow) and can upcast integer ids when the frame's columns
# have mixed dtypes.
user_to_videos = defaultdict(list)
for user_id, video_id in zip(df_watch['user_id'].tolist(), df_watch['video_id'].tolist()):
    user_to_videos[user_id].append(str(video_id))

# Initialize user embeddings to their average video embeddings from training watch history
user_emb = torch.zeros((num_users, w2v_dim), dtype=torch.float32)
for uid, u_idx in user2idx.items():
    vids_watched = user_to_videos[uid]
    if not vids_watched:
        continue  # user_emb remains zero if no videos

    sum_vec = np.zeros(w2v_dim, dtype=np.float32)
    count = 0
    for vid_str in vids_watched:
        if vid_str in w2v_key_to_index:  # only videos known to Word2Vec contribute
            sum_vec += model.wv[vid_str]
            count += 1
    if count > 0:
        user_emb[u_idx] = torch.tensor(sum_vec / count, dtype=torch.float32)

# Set data user embeddings to the user's average W2V video embeddings
data['user'].x = user_emb

In [5]:
# Metapath followed by the random walks:
# user -follows-> user -watches-> video -watched_by-> user
meta_path_uuv = [
    ('user', 'follows', 'user'), ('user', 'watches', 'video'), ('video', 'watched_by', 'user'),
]

# Load the pickled validation and test splits.
# NOTE(review): pickle.load executes arbitrary code from the file; only use
# with files produced by this project.
loaded_splits = []
for split_path in ('KuaiRec/val_data.pkl', 'KuaiRec/test_data.pkl'):
    with open(split_path, 'rb') as split_file:
        loaded_splits.append(pickle.load(split_file))
val_data, test_data = loaded_splits

In [6]:
import torch
import copy
from models.Wmetapath2vec import WMetaPath2VecRecommender

# Configuration of a single training run (adjust as needed).
config = dict(
    walk_length=10,
    context_size=3,
    walks_per_node=2,
    num_negative_samples=5,
    lr=0.005,
    embedding_dim=50,
    max_epochs=40,
    patience=12,
    batch_size=512,
    k=[100],          # evaluate at top-100
    device='cpu',     # or 'cuda' if a GPU is available
    verbose=True,
)

recommender = WMetaPath2VecRecommender(config)

# Id mappings and held-out splits are handed unchanged to both fit() and evaluate().
shared_inputs = dict(
    user2idx=user2idx,
    video2idx=video2idx,
    idx2video=idx2video,
    val_data=val_data,
    test_data=test_data,
)

# Train on the graph along meta_path_uuv
# (user -follows-> user -watches-> video -watched_by-> user).
print("Starting training...")
recommender.fit(data=data, metapath=meta_path_uuv, **shared_inputs)
print("Training complete.")

# Score the learned embeddings on the validation split.
print("Evaluating on validation set...")
val_metrics = recommender.evaluate(
    user_emb=recommender.user_emb,
    video_emb=recommender.video_emb,
    top_k=[100],
    is_validation=True,
    progress_bar=True,
    return_per_user=False,
    **shared_inputs,
)

print("Validation metrics:")
print(val_metrics)

Starting training...
Epoch: 1, Loss: 40.038991928100586, NDCG@100: 0.05127449892627967
Epoch: 2, Loss: 29.6527441740036, NDCG@100: 0.05152631456902219
Epoch: 3, Loss: 22.97763967514038, NDCG@100: 0.05212739985338954
Epoch: 4, Loss: 18.724189281463623, NDCG@100: 0.05317924935692166
Epoch: 5, Loss: 15.866265714168549, NDCG@100: 0.05501802231604555
Epoch: 6, Loss: 13.939627826213837, NDCG@100: 0.05709777941487168
Epoch: 7, Loss: 12.49271297454834, NDCG@100: 0.059885960806138124
Epoch: 8, Loss: 11.340573251247406, NDCG@100: 0.062459960644255366
Epoch: 9, Loss: 10.410344183444977, NDCG@100: 0.06481431402392643
Epoch: 10, Loss: 9.619074821472168, NDCG@100: 0.06762018129887763
Epoch: 11, Loss: 9.052366435527802, NDCG@100: 0.06989110751361328
Epoch: 12, Loss: 8.596085727214813, NDCG@100: 0.07354039883199161
Epoch: 13, Loss: 8.316554844379425, NDCG@100: 0.07662213557197808
Epoch: 14, Loss: 7.986779034137726, NDCG@100: 0.08013454615268198
Epoch: 15, Loss: 7.719898879528046, NDCG@100: 0.081811820

100%|██████████| 1411/1411 [00:01<00:00, 1211.28it/s]

Validation metrics:
{'HR@100': 0.1431752373353853, 'NDCG@100': 0.08361179499104114}





### Hyperparameter Search

In [None]:
from models.Wmetapath2vec import WMetaPath2VecRecommender

# Hyperparameter ranges:
WALK_LENGTHS = [4, 10, 15, 25]
CONTEXT_SIZES = [2, 3, 6, 10, 15]
WALKS_PER_NODE = [3, 5, 10, 15, 30]
NUM_NEG_SAMPLES = [3, 6, 10]
LEARNING_RATES = [0.005]

# Other hyperparameters:
MAX_EPOCHS = 40
PATIENCE = 12  # Early stopping patience
BATCH_SIZE = 512
w2v_dim = 50
K = [1, 5, 10, 20, 40, 60, 80, 100]

# Full Cartesian grid of configurations:
all_configs = list(itertools.product(WALK_LENGTHS, CONTEXT_SIZES, WALKS_PER_NODE, NUM_NEG_SAMPLES, LEARNING_RATES))
total_configs = len(all_configs)

# Initialize tracking variables:
best_ndcg100 = -1.0
best_config = None
best_val_metrics = None
# (config, repr(exception)) for every configuration that raised, so failures
# stay visible instead of being silently skipped.
failed_configs = []

# Loop over hyperparameter combinations:
for (wl, cs, wpn, neg, lr) in tqdm(all_configs, total=total_configs, desc="Hyperparam search"):
    config_dict = {
        'walk_length': wl,
        'context_size': cs,
        'walks_per_node': wpn,
        'num_negative_samples': neg,
        'lr': lr,
        'embedding_dim': w2v_dim,
        'max_epochs': MAX_EPOCHS,
        'patience': PATIENCE,
        'batch_size': BATCH_SIZE,
        'k': K,
        'device': 'cpu',  # or 'cuda' if you use GPU
        'verbose': False,
    }

    # Instantiate the recommender with this configuration:
    recommender = WMetaPath2VecRecommender(config_dict)

    try:
        # Fit the model on the graph along meta_path_uuv
        recommender.fit(
            data=data,
            metapath=meta_path_uuv,
            user2idx=user2idx,
            video2idx=video2idx,
            idx2video=idx2video,
            val_data=val_data,
            test_data=test_data
        )

        # Evaluate on validation data:
        val_metrics = recommender.evaluate(
            user_emb=recommender.user_emb,
            video_emb=recommender.video_emb,
            val_data=val_data,
            test_data=test_data,
            user2idx=user2idx,
            video2idx=video2idx,
            idx2video=idx2video,
            top_k=K,
            is_validation=True,
            progress_bar=False,
            return_per_user=False
        )

        # Use NDCG@100 as the key for model selection. Adjust the key name if needed.
        ndcg_100 = val_metrics.get('NDCG@100', 0.0)
        if ndcg_100 > best_ndcg100:
            best_ndcg100 = ndcg_100
            best_config = copy.deepcopy(config_dict)
            best_val_metrics = copy.deepcopy(val_metrics)

    except Exception as e:
        # Best-effort search: a failing configuration must not abort the grid,
        # but it is recorded and reported. tqdm.write prints without breaking
        # the progress bar.
        failed_configs.append((copy.deepcopy(config_dict), repr(e)))
        tqdm.write(f"Config failed, skipping: {config_dict} -> {e!r}")
        continue
    finally:
        # Runs on success and on failure: release the model before the next run.
        del recommender
        gc.collect()

print("\n==============================")
print("Hyperparameter search complete!")
print("Best config:", best_config)
print("Best validation metrics:", best_val_metrics)
if failed_configs:
    print(f"{len(failed_configs)} of {total_configs} configurations failed; see failed_configs.")
print("==============================\n")

Hyperparam search: 100%|█████████▉| 299/300 [33:03:21<37:19, 2239.07s/it]   

In [None]:
# Cell output: the value of `best_config` currently held in the kernel.
best_config

In [None]:
# Cell output: the value of `best_val_metrics` currently held in the kernel.
best_val_metrics

In [13]:
# Search grid for the manual WeightedMetaPath2Vec search in this cell.
# The search loop below iterates over the Cartesian product of these lists.
WALK_LENGTHS = [2, 4, 10, 15, 25]
CONTEXT_SIZES = [2, 3, 6, 10, 15]
WALKS_PER_NODE = [1, 2, 5, 10, 15]
NUM_NEG_SAMPLES = [1, 3, 5, 8, 10]
LEARNING_RATES = [0.005]

# Upper bound on training epochs per configuration.
MAX_EPOCHS = 40
# Number of consecutive epochs without an HR@100 improvement after which
# run_wmetapath2vec_experiment stops training.
PATIENCE = 12  # Early stopping patience

def run_wmetapath2vec_experiment(
    data,  # HeteroData
    evaluate_user_reco,  # your evaluation function
    user2idx, video2idx, idx2video, val_data, test_data,
    walk_length, context_size, walks_per_node, num_negative_samples,
    lr, w2v_dim
):
    """Train one WeightedMetaPath2Vec model and report its best validation scores.

    The model is built on ``data.edge_index_dict`` and initialised from
    ``data['video'].x`` / ``data['user'].x``.  After every epoch the current
    embeddings are scored with ``evaluate_user_reco`` on the validation split;
    training stops after ``PATIENCE`` consecutive epochs without an HR@100
    improvement or after ``MAX_EPOCHS`` epochs.

    Besides its arguments, the function reads these notebook-level names:
    ``WeightedMetaPath2Vec``, ``edge_weight_dict``, ``meta_path_uuv``,
    ``MAX_EPOCHS`` and ``PATIENCE``.

    Args:
        data: HeteroData graph holding the edge indices and initial features.
        evaluate_user_reco: callable returning a dict with the keys
            ``'HR@100'`` and ``'nDCG@100'`` (missing keys count as 0.0).
        user2idx, video2idx, idx2video, val_data, test_data: forwarded
            unchanged to ``evaluate_user_reco``.
        walk_length, context_size, walks_per_node, num_negative_samples:
            forwarded to the ``WeightedMetaPath2Vec`` constructor.
        lr: learning rate of the SparseAdam optimizer.
        w2v_dim: embedding dimension.

    Returns:
        ``(best_hr100, ndcg100_at_that_epoch)``; ``(0.0, 0.0)`` if HR@100 never
        rose above zero; ``(None, None)`` if the model could not be built.
    """
    try:
        # 1) Build the model with these hyperparams
        model_metapath = WeightedMetaPath2Vec(
            data.edge_index_dict,
            edge_weight_dict,
            embedding_dim=w2v_dim,
            metapath=meta_path_uuv,
            walk_length=walk_length,
            context_size=context_size,
            walks_per_node=walks_per_node,
            num_negative_samples=num_negative_samples,
            sparse=True
        ).to('cpu')
    except Exception as e:
        # Only construction is guarded here.  The exception type is printed
        # because a bare AssertionError has an empty message.
        print(f"Error building model ({type(e).__name__}): {e}")
        return (None, None)

    with torch.no_grad():
        # Copy in user & video inits:
        model_metapath('video').data.copy_(data['video'].x.to('cpu'))
        model_metapath('user').data.copy_(data['user'].x.to('cpu'))

    loader = model_metapath.loader(batch_size=128, shuffle=True, num_workers=4)
    optimizer = torch.optim.SparseAdam(model_metapath.parameters(), lr=lr)

    def train_one_epoch():
        """Run one pass over the walk loader and return the summed batch loss."""
        model_metapath.train()
        total_loss = 0
        for pos_rw, neg_rw in loader:
            pos_rw, neg_rw = pos_rw.to('cpu'), neg_rw.to('cpu')
            optimizer.zero_grad()
            loss = model_metapath.loss(pos_rw, neg_rw)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        return total_loss

    best_hr_so_far = 0.0
    # Initialised up front so the return below is defined even when HR@100
    # never improves on 0.0.
    best_ndcg_so_far = 0.0
    epochs_no_improve = 0

    for _ in range(1, MAX_EPOCHS + 1):
        train_one_epoch()

        # Evaluate on validation:
        model_metapath.eval()
        with torch.no_grad():
            user_emb = model_metapath('user').cpu().detach().numpy()
            video_emb = model_metapath('video').cpu().detach().numpy()

        val_metrics = evaluate_user_reco(
            user_emb=user_emb,
            video_emb=video_emb,
            val_data=val_data,
            test_data=test_data,
            user2idx=user2idx,
            video2idx=video2idx,
            idx2video=idx2video,
            is_validation=True,
            top_k=100,
            progress_bar=False
        )
        hr100 = val_metrics.get('HR@100', 0.0)
        ndcg100 = val_metrics.get("nDCG@100", 0.0)

        # Early stopping check (driven by HR@100 only):
        if hr100 > best_hr_so_far:
            best_hr_so_far = hr100
            best_ndcg_so_far = ndcg100
            epochs_no_improve = 0
        else:
            epochs_no_improve += 1

        if epochs_no_improve >= PATIENCE:
            break

    return (best_hr_so_far, best_ndcg_so_far)

# Exhaustive search over the grid defined at the top of this cell.
search_results = []  # one (config, HR@100) tuple per configuration; HR is None on failure
best_hr_overall = 0.0
# Initialised so the summary print works even when no configuration improves
# on the starting HR of 0.0.
best_ndcg_overall = 0.0
best_config = None

for wl, cs, wpn, neg, learning_rate in itertools.product(
    WALK_LENGTHS,
    CONTEXT_SIZES,
    WALKS_PER_NODE,
    NUM_NEG_SAMPLES,
    LEARNING_RATES
):

    # Call the function under the name it is defined with in this cell.
    hr100, ndcg100 = run_wmetapath2vec_experiment(
        data=data,
        evaluate_user_reco=evaluate_user_reco,
        user2idx=user2idx,
        video2idx=video2idx,
        idx2video=idx2video,
        val_data=val_data,
        test_data=test_data,
        walk_length=wl,
        context_size=cs,
        walks_per_node=wpn,
        num_negative_samples=neg,
        lr=learning_rate,
        w2v_dim=w2v_dim
    )

    config = {
        'walk_length': wl,
        'context_size': cs,
        'walks_per_node': wpn,
        'num_negative_samples': neg,
        'lr': learning_rate
    }
    search_results.append((config, hr100))

    # hr100 is None when the model could not be built for this configuration.
    if ((hr100 is not None) and (hr100 > best_hr_overall)):
        best_hr_overall = hr100
        best_ndcg_overall = ndcg100
        best_config = copy.deepcopy(config)

print("\n==============================")
print("Search complete!")
print("Best HR@100:", best_hr_overall, " NDCG@100:", best_ndcg_overall)
print("Best config:", best_config)
print("==============================\n")

Error in training: 
Error in training: 
Error in training: 
Error in training: 
Error in training: 
Error in training: 
Error in training: 
Error in training: 
Error in training: 
Error in training: 
Error in training: 
Error in training: 
Error in training: 
Error in training: 
Error in training: 
Error in training: 
Error in training: 
Error in training: 

Search complete!
Best HR@100: 0.2490927854183785  NDCG@100: 0.16127212701619853
Best config: {'walk_length': 10, 'context_size': 3, 'walks_per_node': 5, 'num_negative_samples': 5, 'lr': 0.01}



In [17]:
# Rebuild the weighted metapath2vec model with the hyperparameters chosen as best.
model_metapath = WeightedMetaPath2Vec(
    data.edge_index_dict,
    edge_weight_dict,
    metapath=meta_path_uuv,
    embedding_dim=50,
    walk_length=10,
    context_size=3,
    walks_per_node=5,
    num_negative_samples=5,
    sparse=True,
)
model_metapath = model_metapath.to('cpu')

# Overwrite the model's video and user embeddings with the initial features
# stored on the graph.
with torch.no_grad():
    for node_type in ('video', 'user'):
        model_metapath(node_type).data.copy_(data[node_type].x.to('cpu'))

loader = model_metapath.loader(batch_size=128, shuffle=True, num_workers=4)
optimizer = torch.optim.SparseAdam(list(model_metapath.parameters()), lr=0.01)

def train_one_epoch():
    """Run one optimisation pass over `loader` and return the summed batch losses.

    Uses the notebook-level `model_metapath`, `loader` and `optimizer`.
    """
    model_metapath.train()
    running_loss = 0
    for batch in loader:
        positive_walks, negative_walks = batch
        positive_walks = positive_walks.to('cpu')
        negative_walks = negative_walks.to('cpu')

        optimizer.zero_grad()
        batch_loss = model_metapath.loss(positive_walks, negative_walks)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
    return running_loss


# Train for epochs 1..19, scoring the current embeddings on the validation
# split after every epoch.
for epoch in range(1, 20):
    epoch_loss = train_one_epoch()
    print(f"Epoch {epoch}, Loss: {epoch_loss:.4f}")

    # Evaluate on validation:
    model_metapath.eval()
    user_emb, video_emb = (
        model_metapath(node_kind).cpu().detach().numpy()
        for node_kind in ('user', 'video')
    )

    val_metrics = evaluate_user_reco(
        user_emb=user_emb, video_emb=video_emb,
        val_data=val_data, test_data=test_data,
        user2idx=user2idx, video2idx=video2idx, idx2video=idx2video,
        is_validation=True,  # score against val_data
        top_k=100,
    )
    print("Val metrics:", val_metrics)

# Embeddings after the last epoch, kept for the test-set evaluation.
user_emb_meta, video_emb_meta = (
    model_metapath(node_kind).cpu().detach().numpy()
    for node_kind in ('user', 'video')
)

In [19]:
# Evaluate the trained metapath2vec embeddings on the test split.
# NOTE(review): val_data is still passed alongside test_data; presumably the
# evaluator uses it to exclude validation items -- confirm in evaluate_user_reco.
test_eval_kwargs = dict(
    user_emb=user_emb_meta,
    video_emb=video_emb_meta,
    val_data=val_data,
    test_data=test_data,
    user2idx=user2idx,
    video2idx=video2idx,
    idx2video=idx2video,
    is_validation=False,
    top_k=100,
    return_per_user=True,
)
meta_test_metrics = evaluate_user_reco(**test_eval_kwargs)
# With return_per_user=True the result is indexable; element 0 is printed here.
print(meta_test_metrics[0])


100%|██████████| 1411/1411 [00:01<00:00, 716.74it/s]

{'HR@100': 0.27245404188231237, 'nDCG@100': 0.23753742950303666}



