## Process KuaiRec
This notebook builds the heterogeneous KuaiRec graph (user–video watch edges plus user–user follow edges), saves its adjacency structures to disk, and creates the validation and test data splits.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import multiprocessing
from tqdm import tqdm
from typing import Dict, List, Optional, Tuple
from collections import defaultdict
import pickle
import random
import itertools
import copy
from sklearn.neighbors import NearestNeighbors
from sklearn.manifold import TSNE
from gensim.models.word2vec import Word2Vec
from scipy.stats import wilcoxon
import os
import logging

import torch
from torch_geometric.nn.models import MetaPath2Vec
from torch_geometric.nn import MetaPath2Vec
from torch_geometric.data import HeteroData
from torch import nn, Tensor
from torch.utils.data import DataLoader
from torch_geometric.index import index2ptr
from torch_geometric.typing import EdgeType, NodeType, OptTensor
from torch_geometric.utils import sort_edge_index

In [16]:
# Load the full interaction log.
df_watch = pd.read_csv('KuaiRec/big_matrix.csv')

# Collect the id vocabularies BEFORE filtering, so users/videos that end up
# with no retained interaction still receive an index.
unique_users = df_watch['user_id'].unique()
unique_videos = df_watch['video_id'].unique()

# Keep only interactions with watch_ratio >= 2, ordered per user by time.
df_watch = (
    df_watch
    .loc[lambda frame: frame['watch_ratio'] >= 2]
    .sort_values(['user_id', 'time'])
)

# Raw id -> contiguous index (order of first appearance), plus the inverse
# lookup for videos.
user2idx = dict(zip(unique_users, range(len(unique_users))))
video2idx = dict(zip(unique_videos, range(len(unique_videos))))
idx2video = dict(enumerate(unique_videos))


# Build user->video edge index: row 0 holds user indices, row 1 video indices.
user_col = df_watch['user_id'].map(user2idx).to_numpy()
video_col = df_watch['video_id'].map(video2idx).to_numpy()
edge_index_uv = np.stack((user_col, video_col), axis=0)
# One weight per edge: the watch ratio of that interaction.
edge_weight_uv = torch.tensor(df_watch['watch_ratio'].to_numpy(), dtype=torch.float)

# Load the social network: one row per user with a stringified list of friend
# ids (e.g. "[12, 34]"). Expand it to one row per (user, friend) pair.
df_social = (
    pd.read_csv('KuaiRec/social_network.csv')
    .assign(friend_list=lambda frame: frame['friend_list'].str.strip('[]').str.split(','))
    .explode('friend_list')
    # An empty list "[]" yields '' after splitting; coerce such entries to NaN
    # so they are dropped below instead of crashing the integer cast.
    .assign(friend_list=lambda frame: pd.to_numeric(frame['friend_list'].str.strip(),
                                                    errors='coerce'))
    .dropna()
    .astype({'friend_list': int})
)

# Build user-user edge index
# `map` returns NaN for ids that never occur in the watch log (and therefore
# have no entry in user2idx). Casting NaN to an integer tensor later would
# silently produce garbage node indices, so keep only fully mapped pairs.
src_idx = df_social['user_id'].map(user2idx)
dst_idx = df_social['friend_list'].map(user2idx)
both_known = src_idx.notna() & dst_idx.notna()

userA = src_idx[both_known].to_numpy(dtype=np.int64)
userB = dst_idx[both_known].to_numpy(dtype=np.int64)

edge_index_uu = np.vstack([userA, userB])

In [17]:
# Assemble the heterogeneous graph: two node types, three edge types.
data = HeteroData()

# Node counts come from the id vocabularies.
data['user'].num_nodes = len(user2idx)
data['video'].num_nodes = len(video2idx)

# The watch relation is stored in both directions; the reverse direction is
# the same edge list with its two rows swapped, and both share the weights.
watch_edges = torch.tensor(edge_index_uv, dtype=torch.long)
weighted_relations = {
    ('user', 'watches', 'video'): watch_edges,
    ('video', 'watched_by', 'user'): watch_edges.flip(0),
}
for edge_type, index in weighted_relations.items():
    data[edge_type].edge_index = index
    data[edge_type].edge_weight = edge_weight_uv

# Unweighted user -> user follow edges.
data['user', 'follows', 'user'].edge_index = torch.tensor(edge_index_uu, dtype=torch.long)

In [18]:
# Display the assembled graph to check the node counts and edge shapes.
data

HeteroData(
  user={ num_nodes=7176 },
  video={ num_nodes=10728 },
  (user, watches, video)={
    edge_index=[2, 936568],
    edge_weight=[936568],
  },
  (video, watched_by, user)={
    edge_index=[2, 936568],
    edge_weight=[936568],
  },
  (user, follows, user)={ edge_index=[2, 670] }
)

In [19]:
def adjacency(data, save_path):
    """Build row-pointer adjacency structures for every edge type and pickle them.

    For each edge type in ``data.edge_index_dict`` the edges are sorted by
    source node and converted to a row-pointer / column layout. Edge weights,
    where an edge type has an ``edge_weight`` attribute, are permuted with the
    same sort so that ``edge_weight_dict[k][i]`` belongs to the edge ending at
    ``col_dict[k][i]``.

    Args:
        data: Graph object exposing ``edge_index_dict`` and per-edge-type
            storages via ``data[edge_type]`` (a ``HeteroData`` in this notebook).
        save_path: Path of the pickle file to write.

    Side effects:
        Writes a dict with the keys ``rowptr_dict``, ``col_dict``,
        ``rowcount_dict``, ``num_nodes_dict`` and ``edge_weight_dict`` to
        ``save_path`` and prints a confirmation line. Edge types without
        weights map to ``None`` in ``edge_weight_dict``.
    """
    edge_index_dict = data.edge_index_dict

    # Infer the node count of every node type from the largest index that
    # appears in any edge list touching that type.
    num_nodes_dict = {}
    for keys, edge_index in edge_index_dict.items():
        # An empty edge list has no maximum; it contributes zero nodes.
        has_edges = edge_index.numel() > 0

        key = keys[0]
        N = int(edge_index[0].max() + 1) if has_edges else 0
        num_nodes_dict[key] = max(N, num_nodes_dict.get(key, N))

        key = keys[-1]
        N = int(edge_index[1].max() + 1) if has_edges else 0
        num_nodes_dict[key] = max(N, num_nodes_dict.get(key, N))

    rowptr_dict, col_dict, rowcount_dict, edge_weight_dict = {}, {}, {}, {}
    for keys, edge_index in edge_index_dict.items():
        sizes = (num_nodes_dict[keys[0]], num_nodes_dict[keys[-1]])
        weight = getattr(data[keys], 'edge_weight', None)

        if weight is None:
            sorted_index = sort_edge_index(edge_index, num_nodes=max(sizes))
        else:
            # Sorting reorders the edges; pass the weights along so they get
            # the same permutation and stay aligned with `col`.
            sorted_index, weight = sort_edge_index(
                edge_index, edge_attr=weight, num_nodes=max(sizes))
            weight = weight.cpu()

        row, col = sorted_index.cpu()
        rowptr = index2ptr(row, size=sizes[0])
        rowptr_dict[keys] = rowptr
        col_dict[keys] = col
        # Out-degree of every source node.
        rowcount_dict[keys] = rowptr[1:] - rowptr[:-1]
        edge_weight_dict[keys] = weight

    save_dict = {
        "rowptr_dict": rowptr_dict,
        "col_dict": col_dict,
        "rowcount_dict": rowcount_dict,
        "num_nodes_dict": num_nodes_dict,
        "edge_weight_dict": edge_weight_dict,
    }

    # Save to disk.
    with open(save_path, "wb") as f:
        pickle.dump(save_dict, f, protocol=pickle.HIGHEST_PROTOCOL)
    print(f"Saved adjacency structures to: {save_path}")

adjacency(data, save_path="KuaiRec/adj_data.pkl")

Saved adjacency structures to: KuaiRec/adj_data.pkl


In [20]:
# Fixed seed so the validation/test split is identical on every run.
SPLIT_SEED = 42
# Fraction of each user's videos that goes to validation; the rest is test.
VAL_FRACTION = 0.2

df = pd.read_csv('KuaiRec/small_matrix.csv')
df = df[df['watch_ratio'] >= 2].sort_values(['user_id', 'time']).reset_index(drop=True)
df['user_id'] = df['user_id'].map(user2idx)
df['video_id'] = df['video_id'].map(video2idx)

# Drop any unmapped values (in case of missing keys). `map` yields NaN for ids
# absent from the graph vocabularies, which would also turn the id columns
# into floats; restore integer ids afterwards.
df = df.dropna(subset=['user_id', 'video_id']).astype({'user_id': int, 'video_id': int})

# Dedicated generator: reproducible and independent of the global `random` state.
split_rng = random.Random(SPLIT_SEED)

val_data = {}
test_data = {}

for user_id, group in df.groupby('user_id'):
    # De-duplicate first: a video watched more than once by the same user
    # could otherwise land in both the validation and the test set.
    videos = group['video_id'].unique().tolist()
    split_rng.shuffle(videos)
    split_idx = int(len(videos) * VAL_FRACTION)  # 20% val, 80% test
    val_videos = videos[:split_idx]
    tst_videos = videos[split_idx:]

    val_data[user_id] = set(val_videos)
    test_data[user_id] = set(tst_videos)

with open('KuaiRec/val_data.pkl', 'wb') as f:
    pickle.dump(val_data, f)
with open('KuaiRec/test_data.pkl', 'wb') as f:
    pickle.dump(test_data, f)