1. 任务描述：

    •   使用三条元路径的嵌入表示实现电影推荐，解决数据异构问题。
    •   预测电影的评分等级（按平均评分分为高、中、低三类），以此近似用户对某电影的潜在兴趣分数
    •   生成movie-embedding和user-embedding，以用于下游任务

2. 元路径描述：
    Alice提供一部分电影特征、用户评分数据，通过共同评分用户构建movie-user-movie元路径。
    Bob提供另一部分电影特征、演员数据、导演数据，通过共同演员、共同导演构建movie-actor-movie、movie-director-movie元路径。


In [1]:
import secretflow as sf

# Shut down any SecretFlow runtime before (re-)initialising, so the cell can be re-run.
sf.shutdown()
# Two simulated parties on the local machine.
# NOTE(review): debug_mode=True presumably selects SecretFlow's single-process debug backend — confirm.
sf.init(parties=['alice', 'bob'], address='local', debug_mode=True)
alice, bob = sf.PYU('alice'), sf.PYU('bob')

# Read as a global by ServerNet.forward and create_fuse_model:
# 'average' averages the two party embeddings, any other value concatenates them.
agg_mode = 'concat'
# Number of rating classes (low / medium / high) used by the fuse model and the metrics.
num_classes = 3

Since the GPL-licensed package `unidecode` is not installed, using Python's `unicodedata` package which yields worse results.


In [2]:
import os
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
import torch

def process_and_save_douban_data(ratings_path, movies_path, movies_features_path, save_dir,
                                 recommend_dir="recommend"):
    """Build meta-path matrices, features, labels and splits from the Douban data and save them.

    Only movies present in both the ratings file and the movies file are kept.
    Users, movies, directors and actors are re-indexed to dense 0-based ids, and
    the id mappings are written to text files in ``save_dir``.

    Args:
        ratings_path: CSV with at least ``USER_MD5``, ``MOVIE_ID`` and ``RATING`` columns.
        movies_path: CSV with at least ``MOVIE_ID``, ``DIRECTOR_IDS`` and ``ACTOR_IDS``
            columns. The id columns hold ``|``-separated entries whose integer id
            follows the last ``:`` of each entry.
        movies_features_path: ``.npy`` file holding a pickled dict ``{MOVIE_ID: feature vector}``.
        save_dir: Directory that receives the ``.npy`` outputs and the mapping files.
        recommend_dir: Directory that receives ``user_movie_matrix.npz``.

    Returns:
        List of ten file paths, in this order: movie-user-movie meta-path,
        movie-director-movie meta-path, Alice's feature half, Bob's feature half,
        ``y_train``, ``y_val``, ``y_test``, ``idx_train``, ``idx_val``, ``idx_test``.

    Raises:
        ValueError: If the two CSV files share no movie id, or the feature dict is empty.
    """
    from scipy.sparse import save_npz

    ratings = pd.read_csv(ratings_path, sep=',', engine='python', encoding='utf-8-sig')
    movies = pd.read_csv(movies_path, sep=',', engine='python', encoding='utf-8-sig')
    # NOTE(security): allow_pickle=True unpickles arbitrary objects; only load trusted files.
    movies_features = np.load(movies_features_path, allow_pickle=True).item()
    if not movies_features:
        raise ValueError("Movie feature dictionary is empty!")

    # Keep only the movies known to both data owners.
    common_movie_ids = set(ratings['MOVIE_ID'].unique()) & set(movies['MOVIE_ID'].unique())
    if not common_movie_ids:
        raise ValueError("No common movie IDs found between ratings and movies!")

    # .copy() so the column assignments below do not write into a slice of the
    # original frame. Duplicate movie rows are dropped so every movie contributes
    # exactly one row to the director/actor adjacency matrices.
    ratings = ratings[ratings['MOVIE_ID'].isin(common_movie_ids)].copy()
    movies = (
        movies[movies['MOVIE_ID'].isin(common_movie_ids)]
        .drop_duplicates(subset='MOVIE_ID', keep='first')
        .reset_index(drop=True)
    )

    # Dense 0-based ids, in order of first appearance.
    user_encoder = {user: idx for idx, user in enumerate(ratings['USER_MD5'].unique())}
    movie_encoder = {movie: idx for idx, movie in enumerate(movies['MOVIE_ID'].unique())}
    num_users = len(user_encoder)
    num_movies = len(movie_encoder)

    ratings['user_idx'] = ratings['USER_MD5'].map(user_encoder)
    ratings['movie_idx'] = ratings['MOVIE_ID'].map(movie_encoder)
    movies['movie_idx'] = movies['MOVIE_ID'].map(movie_encoder)

    # Movie feature matrix; movies without an entry in the dict keep an all-zero row.
    feature_dim = len(next(iter(movies_features.values())))
    movie_feature_matrix = np.zeros((num_movies, feature_dim))
    for movie_id, embedding in movies_features.items():
        row = movie_encoder.get(movie_id)
        if row is not None:
            movie_feature_matrix[row] = embedding
    movie_feature_matrix = torch.FloatTensor(movie_feature_matrix)

    # movie-user-movie meta-path (rating-weighted) and the user-movie rating matrix.
    rating_values = ratings['RATING'].to_numpy()
    rating_users = ratings['user_idx'].to_numpy()
    rating_movies = ratings['movie_idx'].to_numpy()
    movie_user_adj = csr_matrix(
        (rating_values, (rating_movies, rating_users)), shape=(num_movies, num_users)
    )
    meta_path_movie_user_movie = movie_user_adj @ movie_user_adj.T
    user_movie_matrix = csr_matrix(
        (rating_values, (rating_users, rating_movies)), shape=(num_users, num_movies)
    )

    def parse_ids(id_column):
        """Parse e.g. "nconst:123|nconst:456" into [123, 456]; missing/blank cells give []."""
        if not isinstance(id_column, str) or not id_column.strip():
            return []
        ids = []
        for entry in id_column.split('|'):
            if ':' not in entry:
                continue
            try:
                ids.append(int(entry.split(':')[-1]))
            except ValueError:
                continue
        return ids

    def index_entities(raw_column):
        """Return (raw id -> dense index map, per-movie lists of dense indices)."""
        id_lists = [parse_ids(cell) for cell in raw_column]
        entity_map = {
            entity: idx
            for idx, entity in enumerate(sorted({e for ids in id_lists for e in ids}))
        }
        return entity_map, [[entity_map[e] for e in ids] for ids in id_lists]

    def build_membership_adj(idx_lists, num_entities):
        """Binary movie x entity matrix whose rows follow the encoded movie index."""
        rows, cols = [], []
        # Use the encoded movie index, not the row position in the frame.
        for movie_row, entity_ids in zip(movies['movie_idx'], idx_lists):
            rows.extend([movie_row] * len(entity_ids))
            cols.extend(entity_ids)
        data = np.ones(len(rows), dtype=np.int64)
        return csr_matrix((data, (rows, cols)), shape=(num_movies, num_entities))

    director_map, director_idx_lists = index_entities(movies['DIRECTOR_IDS'])
    actor_map, actor_idx_lists = index_entities(movies['ACTOR_IDS'])

    os.makedirs(save_dir, exist_ok=True)
    os.makedirs(recommend_dir, exist_ok=True)

    # Persist the id mappings so dense indices can be traced back to raw ids.
    mapping_files = [
        ("user_idx_mapping.txt", "USER_MD5", "user_idx", user_encoder),
        ("movie_idx_mapping.txt", "MOVIE_ID", "movie_idx", movie_encoder),
        ("director_idx_mapping.txt", "DIRECTOR_ID", "director_idx", director_map),
        ("actor_idx_mapping.txt", "ACTOR_ID", "actor_idx", actor_map),
    ]
    for file_name, key_label, idx_label, mapping in mapping_files:
        with open(os.path.join(save_dir, file_name), "w", encoding="utf-8") as f:
            f.writelines(
                f"{key_label}: {key} -> {idx_label}: {idx}\n" for key, idx in mapping.items()
            )

    # movie-director-movie and movie-actor-movie meta-paths (shared-entity counts).
    movie_director_adj = build_membership_adj(director_idx_lists, len(director_map))
    meta_path_movie_director_movie = movie_director_adj @ movie_director_adj.T
    movie_actor_adj = build_membership_adj(actor_idx_lists, len(actor_map))
    meta_path_movie_actor_movie = movie_actor_adj @ movie_actor_adj.T

    user_movie_matrix_path = os.path.join(recommend_dir, "user_movie_matrix.npz")
    save_npz(user_movie_matrix_path, user_movie_matrix)
    print(f"user_movie_matrix shape: {user_movie_matrix.shape}, saved to {user_movie_matrix_path}")

    # Labels: class of each movie's mean rating (<3 low, [3, 4) medium, >=4 high).
    # reindex aligns the means with movie_idx explicitly; a movie without a mean stays class 0.
    movie_avg_rating = (
        ratings.groupby('movie_idx')['RATING'].mean().reindex(np.arange(num_movies)).to_numpy()
    )
    y = np.zeros(num_movies, dtype=np.int64)
    y[(movie_avg_rating >= 3) & (movie_avg_rating < 4)] = 1
    y[movie_avg_rating >= 4] = 2

    # Contiguous 70 / 15 / 15 split over the movie index.
    train_end = int(0.7 * num_movies)
    val_end = int(0.85 * num_movies)
    idx_train = np.arange(0, train_end)
    idx_val = np.arange(train_end, val_end)
    idx_test = np.arange(val_end, num_movies)

    # Each label vector is full length and holds 0 outside its own split,
    # so consumers need the idx_* arrays to tell real labels from padding.
    y_train = np.zeros_like(y)
    y_val = np.zeros_like(y)
    y_test = np.zeros_like(y)
    y_train[idx_train] = y[idx_train]
    y_val[idx_val] = y[idx_val]
    y_test[idx_test] = y[idx_test]

    # Alice gets the first half of the feature columns, Bob the remaining columns.
    half = movie_feature_matrix.shape[1] // 2
    features_alice = movie_feature_matrix.numpy()[:, :half]
    features_bob = movie_feature_matrix.numpy()[:, half:]

    # NOTE: the second file keeps its historical name 'meta_path_movie_genre_movie.npy'
    # for compatibility but stores the movie-director-movie meta-path.
    # The movie-actor-movie meta-path is computed but not written to disk.
    arrays_by_name = [
        ('meta_path_movie_user_movie.npy', meta_path_movie_user_movie.toarray()),
        ('meta_path_movie_genre_movie.npy', meta_path_movie_director_movie.toarray()),
        ('features_alice.npy', features_alice),
        ('features_bob.npy', features_bob),
        ('y_train.npy', y_train),
        ('y_val.npy', y_val),
        ('y_test.npy', y_test),
        ('idx_train.npy', idx_train),
        ('idx_val.npy', idx_val),
        ('idx_test.npy', idx_test),
    ]
    saved_files = []
    for name, array in arrays_by_name:
        path = os.path.join(save_dir, name)
        np.save(path, array)
        saved_files.append(path)

    print(f"meta_path_movie_user_movie shape: {meta_path_movie_user_movie.shape}")
    print(f"meta_path_movie_director_movie shape: {meta_path_movie_director_movie.shape}")
    print(f"meta_path_movie_actor_movie shape: {meta_path_movie_actor_movie.shape}")
    print(f"movie_features shape: {movie_feature_matrix.shape}")
    print(f"y_train shape: {y_train.shape}, y_val shape: {y_val.shape}, y_test shape: {y_test.shape}")
    print(f"Data saved to {save_dir}")
    return saved_files

In [3]:
# Input files of the partial Douban dataset and the output directory for the processed arrays.
ratings_path = './moviedata_partitial/ratings.csv'
movies_path = './moviedata_partitial/movies.csv'
movies_features_path = './moviedata_partitial/movie_embeddings_dict.npy'
save_dir = './saved_files_douban_partitial'

# saved_files is an ordered list of ten paths; later cells index into it by position.
saved_files = process_and_save_douban_data(ratings_path, movies_path, movies_features_path, save_dir)

user_movie_matrix shape: (265187, 18810), saved to recommend/user_movie_matrix.npz
meta_path_movie_user_movie shape: (18810, 18810)
meta_path_movie_director_movie shape: (18810, 18810)
meta_path_movie_actor_movie shape: (18810, 18810)
movie_features shape: torch.Size([18810, 2996])
y_train shape: (18810,), y_val shape: (18810,), y_test shape: (18810,)
Data saved to ./saved_files_douban_partitial


In [4]:
from secretflow.data.ndarray import load

# Each party loads one meta-path matrix: Alice gets saved_files[0] (movie-user-movie),
# Bob gets saved_files[1] (written from the movie-director-movie meta-path).
meta_path_list = load({alice: saved_files[0], bob: saved_files[1]})
# Feature halves: saved_files[2] for Alice, saved_files[3] for Bob.
features = load({alice: saved_files[2], bob: saved_files[3]})
# Labels and split indices are held by Alice only.
Y_train = load({alice: saved_files[4]})
Y_val = load({alice: saved_files[5]})
Y_test = load({alice: saved_files[6]})
idx_train = load({alice: saved_files[7]})
idx_val = load({alice: saved_files[8]})
idx_test = load({alice: saved_files[9]})

# Per-party shapes; the feature width of each party is used later to size its base model.
partition_shapes = features.partition_shape()
input_shape_alice = partition_shapes[alice]
input_shape_bob = partition_shapes[bob]
print(f"input_shape_alice: {input_shape_alice}, input_shape_bob: {input_shape_bob}")
meta_path_shapes = meta_path_list.partition_shape()
meta_path_alice_shape = meta_path_shapes[alice]
meta_path_bob_shape = meta_path_shapes[bob]
print(f"meta_path alice shape: {meta_path_alice_shape}, meta_path bob shape: {meta_path_bob_shape}")
y_shape = Y_train.partition_shape()
y_shape_alice = y_shape[alice]
print(f"y shape alice: {y_shape_alice}")

input_shape_alice: (18810, 1498), input_shape_bob: (18810, 1498)
meta_path alice shape: (18810, 18810), meta_path bob shape: (18810, 18810)
y shape alice: (18810,)


In [5]:
from secretflow.ml.nn.core.torch import BaseModule
import torch.nn as nn
import torch.nn.functional as F

class NodeAttentionLayer(BaseModule):
    """Single-head graph-attention layer over one dense adjacency matrix.

    Adapted from Diego999/pyGAT.
    """

    def __init__(self, in_feature_dim, out_feature_dim, dropout, alpha):
        super().__init__()
        self.in_feature_dim = in_feature_dim
        self.out_feature_dim = out_feature_dim
        self.dropout = dropout
        # The paper didn't specify but the author used the default 0.2 in tensorflow.
        self.leakyrelu = nn.LeakyReLU(alpha)

        projection = torch.empty(size=(in_feature_dim, out_feature_dim))
        coefficients = torch.empty(size=(out_feature_dim * 2, 1))
        # Gain recommended for a leaky relu with a slope of 0.2.
        nn.init.xavier_uniform_(projection, gain=1.387)
        nn.init.xavier_uniform_(coefficients, gain=1.387)
        self.weight = nn.Parameter(projection)
        self.attention_coef = nn.Parameter(coefficients)

    def forward(self, x, adj):
        """Attend over the neighbours marked by ``adj > 0`` and return ELU-activated features."""
        projected = torch.mm(x, self.weight)          # (N, out_feature_dim)
        scores = self._prepare_attention(projected)   # (N, N) pairwise scores

        # Non-neighbours receive a very large negative score, so softmax gives them ~0 weight.
        blocked = torch.full_like(scores, -1e12)
        weights = torch.where(adj > 0, scores, blocked)
        weights = F.softmax(weights, dim=1)
        weights = F.dropout(weights, self.dropout, training=self.training)

        aggregated = torch.matmul(weights, projected)  # (N, out_feature_dim)
        return F.elu(aggregated)

    def _prepare_attention(self, Wh):
        """Return LeakyReLU(a1^T Wh_i + a2^T Wh_j) for every node pair (i, j)."""
        split_at = self.out_feature_dim
        source_term = torch.matmul(Wh, self.attention_coef[:split_at, :])   # (N, 1)
        target_term = torch.matmul(Wh, self.attention_coef[split_at:, :])   # (N, 1)
        # Column vector + row vector broadcasts to the full (N, N) score matrix.
        pairwise = source_term + target_term.T
        return self.leakyrelu(pairwise)


class SemanticAttentionLayer(BaseModule):
    """Semantic-level (meta-path-level) attention.

    Learns one importance weight per meta-path and returns the weighted sum of
    the per-meta-path node embeddings.

    Args:
        in_feature_dim: Size of each per-meta-path node embedding.
        q_vector: Size of the hidden projection used to score a meta-path.
    """

    def __init__(self, in_feature_dim, q_vector):
        super(SemanticAttentionLayer, self).__init__()
        self.weight = nn.Parameter(torch.empty(size=(in_feature_dim, q_vector)))
        self.bias = nn.Parameter(torch.empty(size=(1, q_vector)))
        self.q = nn.Parameter(torch.empty(size=(q_vector, 1)))

        # Recommended gain value for tanh.
        nn.init.xavier_uniform_(self.weight, gain=1.667)
        nn.init.xavier_uniform_(self.bias, gain=1.667)
        nn.init.xavier_uniform_(self.q, gain=1.667)

    def forward(self, z):
        """Fuse per-meta-path embeddings.

        Args:
            z: Tensor of shape (N, M, in_feature_dim), N nodes and M meta-paths.

        Returns:
            Tensor of shape (N, in_feature_dim).
        """
        Wh = torch.matmul(z, self.weight) + self.bias    # (N, M, q_vector)
        Wh = torch.tanh(Wh)             # F.tanh is deprecated in favour of torch.tanh
        w = torch.matmul(Wh, self.q)    # (N, M, 1)
        w = w.mean(0)                   # (M, 1): average score of each meta-path
        # Normalise across the M meta-paths. The previous dim=1 ran softmax over
        # the size-1 axis, which made every weight exactly 1.0.
        beta = F.softmax(w, dim=0)
        beta = beta.expand((z.shape[0],) + beta.shape)    # (N, M, 1)

        return (beta * z).sum(1)       # (N, in_feature_dim)

class HAN(BaseModule):
    """Per-party base model: multi-head node attention followed by semantic attention.

    ``forward`` takes ``inputs[0]`` as the node features and ``inputs[1]`` as the
    single meta-path adjacency matrix held by this party.
    """

    def __init__(self, feature_dim, hidden_dim, dropout, num_heads, alpha, q_vector):
        super().__init__()
        self.dropout = dropout
        self.q_vector = q_vector
        self.hidden_dim = hidden_dim
        self.num_heads = num_heads

        # The heads live in a plain list; add_module registers each one under
        # the name 'attention_<i>' so its parameters are tracked.
        self.attentions = []
        for head_no in range(num_heads):
            head = NodeAttentionLayer(feature_dim, hidden_dim, self.dropout, alpha)
            self.attentions.append(head)
        for head_no, head in enumerate(self.attentions):
            self.add_module(f'attention_{head_no}', head)
        self.semantic_attention = SemanticAttentionLayer(hidden_dim * num_heads, q_vector)

    def forward(self, inputs):
        """Return the fused node embedding for this party's features and meta-path."""
        if inputs is None:
            raise ValueError("Received None as input to HAN model.")
        # Unpack the real inputs from the combined input sequence.
        x = inputs[0]
        meta_path_adj = inputs[1]

        # Node-level attention: concatenate the outputs of all heads.
        x = F.dropout(x, self.dropout, training=self.training)
        head_outputs = [head(x, meta_path_adj) for head in self.attentions]
        node_embedding = torch.cat(head_outputs, dim=1)
        node_embedding = F.dropout(node_embedding, self.dropout, training=self.training)

        # Meta-path-level attention over the (single) stacked meta-path embedding.
        stacked = torch.stack([node_embedding], dim=1)
        return self.semantic_attention(stacked)

    def output_num(self):
        return 1

class ServerNet(BaseModule):
    """Fuse model: combines the two party embeddings and classifies them.

    The global ``agg_mode`` selects averaging ('average') or concatenation
    (any other value). Outside training mode, the combined embedding is also
    written to ``<save_path>/movie_embedding.npy`` on every forward pass.
    """

    def __init__(self, input_shape, num_classes, save_path='recommend'):
        super().__init__()
        self.out_layer = nn.Linear(input_shape, num_classes)
        self.save_path = save_path
        if self.save_path and not os.path.exists(self.save_path):
            os.makedirs(self.save_path)

    def forward(self, inputs):
        """Return class logits for the combined embedding of the two parties."""
        received = len(inputs)
        if received != 2:
            raise ValueError("Expected two inputs to concatenate, got {}".format(received))

        # Both parties must provide the same number of rows.
        first, second = inputs[0], inputs[1]
        if first.size(0) != second.size(0):
            raise ValueError("Batch size mismatch between alice and bob's inputs")

        if agg_mode == 'average':
            embeddings = torch.stack(inputs).mean(dim=0)   # element-wise average
        else:
            embeddings = torch.cat(inputs, dim=1)          # feature-wise concatenation

        if not self.training and self.save_path:
            # Persist the embedding as a numpy array for the downstream recommendation cells.
            embeddings = sf.reveal(embeddings)
            target_file = os.path.join(self.save_path, "movie_embedding.npy")
            np.save(target_file, embeddings.detach().cpu().numpy())

        return self.out_layer(embeddings)

In [6]:
def create_base_model(feature_dim, hidden_dim, dropout, num_heads, alpha, q_vector):
    """Return a zero-argument factory that builds a HAN with the given hyper-parameters."""
    han_args = (feature_dim, hidden_dim, dropout, num_heads, alpha, q_vector)

    def base_model():
        # Construction is deferred until the factory is called.
        return HAN(*han_args)

    return base_model

def create_fuse_model(hidden_dim, num_heads, num_classes):
    """Return a zero-argument factory that builds the ServerNet fuse model.

    The input width depends on the global ``agg_mode``: one embedding width
    for 'average', twice that when the two party embeddings are concatenated.
    """

    def fuse_model():
        input_shape = hidden_dim * num_heads
        if agg_mode != 'average':
            input_shape = 2 * hidden_dim * num_heads
        return ServerNet(input_shape, num_classes)

    return fuse_model

In [7]:
from secretflow.ml.nn.core.torch import metric_wrapper,optim_wrapper,TorchModel
from secretflow.ml.nn import SLModel
import torch.optim as optim
from torchmetrics import Recall, Accuracy, Precision

import torch.nn as nn
import torch.nn.functional as F

class ArcFaceLoss(nn.Module):
    """Additive angular margin loss applied to the target-class logit.

    Args:
        s: Scale applied to all logits before cross-entropy.
        m: Angular margin (radians) added to the target-class angle.
    """

    def __init__(self, s=30.0, m=0.50):
        super().__init__()
        self.s, self.m = s, m

    def forward(self, input, target):
        """Return the cross-entropy of the scaled, margin-adjusted logits."""
        num_classes = input.size(1)
        target_mask = F.one_hot(target, num_classes=num_classes).float()
        # Clamp into the open interval (-1, 1) so acos stays finite.
        clamped = torch.clamp(input, -1.0+1e-7, 1.0-1e-7)
        margin_logits = torch.cos(torch.acos(clamped) + self.m)
        # Only the target class takes the margin-adjusted value;
        # the other classes keep their raw (unclamped) logit.
        mixed = input * (1 - target_mask) + margin_logits * target_mask
        return F.cross_entropy(self.s * mixed, target)

# Hyper-parameters shared by both parties' HAN base models and the fuse model.
lr = 0.0002
# NOTE(review): weight_decay and patience are defined but not referenced by any code
# visible in this notebook; the Adam wrapper below is built with lr only.
weight_decay = 0.0001
hidden_dim = 64
num_heads = 8
dropout = 0.4
alpha = 0.2
q_vector = 128
patience = 100

# loss_fn = nn.CrossEntropyLoss
# The loss class itself (not an instance) is handed to TorchModel.
loss_fn = ArcFaceLoss
optim_fn = optim_wrapper(optim.Adam, lr=lr)
# Alice's base model: HAN sized to Alice's feature width.
base_model_alice = TorchModel(
    model_fn = create_base_model(feature_dim=input_shape_alice[1],
                                    hidden_dim=hidden_dim,
                                    dropout=dropout,
                                    num_heads=num_heads,
                                    alpha=alpha,
                                    q_vector=q_vector),
    loss_fn = loss_fn,
    optim_fn = optim_fn,
    metrics= [
        metric_wrapper(Accuracy, task="multiclass", num_classes=num_classes, average='micro'),
        metric_wrapper(Precision, task="multiclass", num_classes=num_classes, average='micro'),
        metric_wrapper(Recall, task="multiclass", num_classes=num_classes, average='micro'),
    ],
)
# Bob's base model: same architecture, sized to Bob's feature width.
base_model_bob = TorchModel(
    model_fn = create_base_model(feature_dim=input_shape_bob[1],
                                    hidden_dim=hidden_dim,
                                    dropout=dropout,
                                    num_heads=num_heads,
                                    alpha=alpha,
                                    q_vector=q_vector),
    loss_fn = loss_fn,
    optim_fn = optim_fn,
    metrics= [
        metric_wrapper(Accuracy, task="multiclass", num_classes=num_classes, average='micro'),
        metric_wrapper(Precision, task="multiclass", num_classes=num_classes, average='micro'),
        metric_wrapper(Recall, task="multiclass", num_classes=num_classes, average='micro'),
    ],
)

# Fuse model (ServerNet) that combines both embeddings and outputs class logits.
fuse_model = TorchModel(
    model_fn = create_fuse_model(hidden_dim=hidden_dim, 
                                    num_heads=num_heads, 
                                    num_classes=num_classes),
    loss_fn = loss_fn,
    optim_fn = optim_fn,
    metrics= [
        metric_wrapper(Accuracy, task="multiclass", num_classes=num_classes, average='micro'),
        metric_wrapper(Precision, task="multiclass", num_classes=num_classes, average='micro'),
        metric_wrapper(Recall, task="multiclass", num_classes=num_classes, average='micro'),
    ],
)

base_model_dict = {
    alice: base_model_alice,
    bob: base_model_bob,
}

# Split-learning model; the labels and the fuse model are placed on Alice (device_y).
sl_model = SLModel(
    base_model_dict=base_model_dict,
    device_y=alice,
    model_fuse=fuse_model,
    random_seed=1234,
    backend='torch',
)

INFO:root:Create proxy actor <class 'abc.ActorPYUSLTorchModel'> with party alice.


INFO:root:Create proxy actor <class 'abc.ActorPYUSLTorchModel'> with party bob.


In [8]:
epochs = 10

# Full-batch training: batch_size equals the number of movie nodes, because the
# attention layers operate on the whole (N, N) adjacency matrix at once.
# NOTE(review): Y_train is zero-filled outside idx_train and the sample_weight mask
# below is commented out, so those rows presumably train as class 0 — confirm.
sl_model.fit(
    x = [features, meta_path_list],
    y = Y_train,
    epochs = epochs,
    batch_size = input_shape_alice[0],
    # sample_weight = idx_train,
    # validation_data = ([features, meta_path_list], Y_val, idx_val),
)

INFO:root:SL Train Params: {'self': <secretflow.ml.nn.sl.sl_model.SLModel object at 0x7f9e112836d0>, 'x': [FedNdarray(partitions={PYURuntime(alice): <secretflow.device.device.pyu.PYUObject object at 0x7f9e11263a60>, PYURuntime(bob): <secretflow.device.device.pyu.PYUObject object at 0x7f9e11263ca0>}, partition_way=<PartitionWay.VERTICAL: 'vertical'>), FedNdarray(partitions={PYURuntime(alice): <secretflow.device.device.pyu.PYUObject object at 0x7f9e11262a70>, PYURuntime(bob): <secretflow.device.device.pyu.PYUObject object at 0x7f9e11263f10>}, partition_way=<PartitionWay.VERTICAL: 'vertical'>)], 'y': FedNdarray(partitions={PYURuntime(alice): <secretflow.device.device.pyu.PYUObject object at 0x7f9e11263fd0>}, partition_way=<PartitionWay.VERTICAL: 'vertical'>), 'batch_size': 18810, 'epochs': 6, 'verbose': 1, 'callbacks': None, 'validation_data': None, 'shuffle': False, 'sample_weight': None, 'validation_freq': 1, 'dp_spent_step_freq': None, 'dataset_builder': None, 'audit_log_params': {}, '

Epoch 1/6


Train Processing: :   0%|          | 0/1 [00:38<?, ?it/s, {'train_loss': array(21.194305, dtype=float32), 'train_MulticlassAccuracy': array(0.31807548, dtype=float32), 'train_MulticlassPrecision': array(0.31807548, dtype=float32), 'train_MulticlassRecall': array(0.31807548, dtype=float32)}]
Train Processing: :   0%|          | 0/1 [00:00<?, ?it/s]

Epoch 2/6


Train Processing: :   0%|          | 0/1 [00:38<?, ?it/s, {'train_loss': array(16.220493, dtype=float32), 'train_MulticlassAccuracy': array(0.53636366, dtype=float32), 'train_MulticlassPrecision': array(0.53636366, dtype=float32), 'train_MulticlassRecall': array(0.53636366, dtype=float32)}]
Train Processing: :   0%|          | 0/1 [00:00<?, ?it/s]

Epoch 3/6


Train Processing: :   0%|          | 0/1 [00:39<?, ?it/s, {'train_loss': array(17.088148, dtype=float32), 'train_MulticlassAccuracy': array(0.5879851, dtype=float32), 'train_MulticlassPrecision': array(0.5879851, dtype=float32), 'train_MulticlassRecall': array(0.5879851, dtype=float32)}]
Train Processing: :   0%|          | 0/1 [00:00<?, ?it/s]

Epoch 4/6


Train Processing: :   0%|          | 0/1 [00:39<?, ?it/s, {'train_loss': array(16.729155, dtype=float32), 'train_MulticlassAccuracy': array(0.5854865, dtype=float32), 'train_MulticlassPrecision': array(0.5854865, dtype=float32), 'train_MulticlassRecall': array(0.5854865, dtype=float32)}]
Train Processing: :   0%|          | 0/1 [00:00<?, ?it/s]

Epoch 5/6


Train Processing: :   0%|          | 0/1 [00:38<?, ?it/s, {'train_loss': array(15.826693, dtype=float32), 'train_MulticlassAccuracy': array(0.5579479, dtype=float32), 'train_MulticlassPrecision': array(0.5579479, dtype=float32), 'train_MulticlassRecall': array(0.5579479, dtype=float32)}]
Train Processing: :   0%|          | 0/1 [00:00<?, ?it/s]

Epoch 6/6


Train Processing: :   0%|          | 0/1 [00:38<?, ?it/s, {'train_loss': array(15.536063, dtype=float32), 'train_MulticlassAccuracy': array(0.5180223, dtype=float32), 'train_MulticlassPrecision': array(0.5180223, dtype=float32), 'train_MulticlassRecall': array(0.5180223, dtype=float32)}]


{'train_loss': [array(21.194305, dtype=float32),
  array(16.220493, dtype=float32),
  array(17.088148, dtype=float32),
  array(16.729155, dtype=float32),
  array(15.826693, dtype=float32),
  array(15.536063, dtype=float32)],
 'train_MulticlassAccuracy': [array(0.31807548, dtype=float32),
  array(0.53636366, dtype=float32),
  array(0.5879851, dtype=float32),
  array(0.5854865, dtype=float32),
  array(0.5579479, dtype=float32),
  array(0.5180223, dtype=float32)],
 'train_MulticlassPrecision': [array(0.31807548, dtype=float32),
  array(0.53636366, dtype=float32),
  array(0.5879851, dtype=float32),
  array(0.5854865, dtype=float32),
  array(0.5579479, dtype=float32),
  array(0.5180223, dtype=float32)],
 'train_MulticlassRecall': [array(0.31807548, dtype=float32),
  array(0.53636366, dtype=float32),
  array(0.5879851, dtype=float32),
  array(0.5854865, dtype=float32),
  array(0.5579479, dtype=float32),
  array(0.5180223, dtype=float32)]}

In [9]:
# Full-batch evaluation; in eval mode ServerNet also writes recommend/movie_embedding.npy.
# NOTE(review): Y_test is zero-filled outside idx_test and no mask is passed, so the
# metrics presumably include those padded rows — confirm before trusting the reported accuracy.
sl_model.evaluate(
    x = [features, meta_path_list],
    y = Y_test,
    batch_size = input_shape_alice[0],
    # sample_weight = idx_test,
)

Evaluate Processing: :   0%|          | 0/1 [00:14<?, ?it/s, loss=12.608366, MulticlassAccuracy=0.84949493, MulticlassPrecision=0.84949493, MulticlassRecall=0.84949493]


{'loss': array(12.608366, dtype=float32),
 'MulticlassAccuracy': array(0.84949493, dtype=float32),
 'MulticlassPrecision': array(0.84949493, dtype=float32),
 'MulticlassRecall': array(0.84949493, dtype=float32)}

In [10]:
import numpy as np
from scipy.sparse import load_npz
# Movie embedding written by ServerNet.forward in eval mode.
movie_embedding = np.load("recommend/movie_embedding.npy")
print(movie_embedding.shape)

# Sparse user x movie rating matrix written by process_and_save_douban_data.
user_movie_matrix = load_npz("recommend/user_movie_matrix.npz")
print(user_movie_matrix.shape)

(18810, 1024)
(265187, 18810)


至此得到了movie的embedding，后面可以用它做下游任务，完成推荐任务

In [11]:
# Aggregate user features: each user's embedding is the rating-weighted average
# of the embeddings of the movies that user interacted with.
# scipy's sparse .sum(axis=1) returns an np.matrix; dividing by it would turn
# user_embedding into an np.matrix too (rows then stay 2-D downstream).
# Coerce both operands to plain ndarrays and keep the totals as an (N, 1) column.
rating_totals = np.asarray(user_movie_matrix.sum(axis=1)).reshape(-1, 1)
weighted_sum = np.asarray(user_movie_matrix.dot(movie_embedding))
user_embedding = weighted_sum / (rating_totals + 1e-10)  # epsilon guards against division by zero
user_embedding_path = "recommend/user_embedding.npy"
np.save(user_embedding_path, user_embedding)
print("用户特征 shape:", user_embedding.shape)  # [num_users, embedding_dim]

用户特征 shape: (265187, 1024)


In [12]:
# Sample users: take the first `num_per_step` users of every block of `step` users.
step = 10000
num_per_step = 10
num_users_total = user_embedding.shape[0]
selected_user_indices = [
    user_idx
    for start in range(0, num_users_total, step)
    for user_idx in range(start, min(start + num_per_step, num_users_total))
]

# Features and interactions of the selected users.
# np.asarray guards against user_embedding being an np.matrix, which would make
# every recommendation row 2-D and print as [[...]].
limited_user_features = np.asarray(user_embedding)[selected_user_indices]  # [num_selected_users, embedding_dim]
limited_user_movie_matrix = user_movie_matrix[selected_user_indices]  # [num_selected_users, num_movies]

# User-to-movie scores (unnormalised dot product).
similarity_matrix = np.dot(limited_user_features, movie_embedding.T)  # [num_selected_users, num_movies]

# Never recommend a movie the user already interacted with. The stored positive
# entries are masked through their COO coordinates in one assignment instead of
# densifying one sparse row per user.
interactions = limited_user_movie_matrix.tocoo()
already_rated = interactions.data > 0
similarity_matrix[interactions.row[already_rated], interactions.col[already_rated]] = -np.inf

# Top 5 movies per user, best first.
top_k = 5
recommended_movies = np.argsort(similarity_matrix, axis=1)[:, -top_k:]
recommended_movies = np.flip(recommended_movies, axis=1)

# Print the recommendations.
for idx, movie_ids in zip(selected_user_indices, recommended_movies):
    print(f"用户 {idx} 推荐的电影: {movie_ids}")

用户 0 推荐的电影: [[7053 2296 8055 1547 2050]]
用户 1 推荐的电影: [[ 7053  2296  8055  1547 12678]]
用户 2 推荐的电影: [[ 7053  8055  2296  1547 12678]]
用户 3 推荐的电影: [[ 7053  2296  8055  1547 12678]]
用户 4 推荐的电影: [[7053 2296 8055 1547 2050]]
用户 5 推荐的电影: [[ 7053  8055  2296  1547 12678]]
用户 6 推荐的电影: [[7053 2296 8055 1547 2050]]
用户 7 推荐的电影: [[7053 2296 8055 1547 2050]]
用户 8 推荐的电影: [[7053 2296 1547 8055 2050]]
用户 9 推荐的电影: [[7053 2296 8055 1547 2050]]
用户 10000 推荐的电影: [[7053 2296 1547 2050 8055]]
用户 10001 推荐的电影: [[7053 2296 1547 2050 8055]]
用户 10002 推荐的电影: [[7053 2296 1547 2050 8055]]
用户 10003 推荐的电影: [[7053 2296 1547 2050 8055]]
用户 10004 推荐的电影: [[ 7053  2049 17250 11774 15204]]
用户 10005 推荐的电影: [[7053 2296 8055 1547 2050]]
用户 10006 推荐的电影: [[7053 2296 8055 1547 2050]]
用户 10007 推荐的电影: [[7053 2296 8055 1547 2050]]
用户 10008 推荐的电影: [[7053 2296 8055 1547 2050]]
用户 10009 推荐的电影: [[7053 2296 8055 1547 2050]]
用户 20000 推荐的电影: [[7053 2296 1547 8055 2050]]
用户 20001 推荐的电影: [[7053 2296 8055 1547 2050]]
用户 20002 推荐的电影: [[7053 22