In [136]:
import random
import numpy as np
import pandas as pd
from collections import defaultdict
from sklearn.metrics import roc_auc_score
import csv

训练集（training.txt）包含了用于模型训练/计算的数据，内含 942 个用户，1412 个商品项目，44,234 条点击信息. 文件内每一行包含三个字段，分别为：`user_id`, `item_id`, `click`。先将数据读取出来：

In [137]:
path = 'RSBD_Project_2024/training.txt'
data = pd.read_csv(path, sep=' ', header=None, names=['user_id', 'item_id', 'click'])
data.head()

Unnamed: 0,user_id,item_id,click
0,298,474,1
1,253,465,1
2,286,1014,1
3,200,222,1
4,122,387,1


划分训练集和测试集

In [138]:
from sklearn.model_selection import train_test_split
Data_train, Data_test = train_test_split(data, train_size=0.8, random_state=None)
Data_train.head()

Unnamed: 0,user_id,item_id,click
29066,692,300,1
30686,711,202,1
30902,901,144,1
16338,338,427,1
38805,599,846,1


In [139]:
# 输入用户和物品的交互信息，输出用户的交互字典(key是user_id，value是这个用户交互过的item_id)
def get_user_rating_dict(data_train):
    user_rating_dict = defaultdict()
    for index, row in data_train.iterrows():
        u_id = row['user_id']
        i_id = row['item_id']
    
        # 如果 user_id 不在字典中，则将其添加进去并初始化一个空列表
        if u_id not in user_rating_dict:
            user_rating_dict[u_id] = []
        
        # 将当前行的 item_id 添加到对应 user_id 的列表中
        user_rating_dict[u_id].append(i_id)
        
    return user_rating_dict

定义一些常量

In [140]:
Learning_rate = 0.01    # 学习率
Lambda_reg = 0.01       # 正则化参数
Iterations = 10000      # 迭代次数

num_users = 942         # 用户数
num_items = 1412        # 物品数
vector_dimension = 12   # 向量长度

# 初始化用户、物品和物品偏置项的向量
User_factors = np.random.rand(num_users, vector_dimension) * 0.01
Item_factors = np.random.rand(num_items, vector_dimension) * 0.01
Bias_factors = np.random.rand(num_items) * 0.01

In [141]:
# 将用户和物品的id抽离出来
User_ids = data.loc[:, 'user_id'].unique().tolist()
Item_ids = data.loc[:, 'item_id'].unique().tolist()

# 为了知道每一行向量代表的是哪一个user或item，添加一个从 id 映射到 index 的 map    
User_id_map = {user_id : index for index, user_id in enumerate(User_ids)}
Item_id_map = {item_id : index for index, item_id in enumerate(Item_ids)}

# 训练数据中出现的 user_id
User_ids_train = Data_train.loc[:, 'user_id'].unique().tolist()

# 初始化用户交互矩阵，其中行代表user_id对应的index，列代表item_id对应的index，如果交互过则值为1，否则为0
User_rating_metrix = np.zeros(shape=(num_users, num_items))
User_rating_metrix_test = np.zeros(shape=(num_users, num_items))

# 填充训练集矩阵
for index in range(Data_train.shape[0]):
    user_index = User_id_map[Data_train.iloc[index]['user_id']]
    item_index = Item_id_map[Data_train.iloc[index]['item_id']]
    User_rating_metrix[user_index, item_index] = 1

for index in range(Data_test.shape[0]):
    user_index = User_id_map[Data_train.iloc[index]['user_id']]
    item_index = Item_id_map[Data_train.iloc[index]['item_id']]
    User_rating_metrix_test[user_index, item_index] = 1

# 获取用户的交互字典(key是user_id，value是这个用户交互过的item_id，value是item_id的列表)
User_rating_dict = get_user_rating_dict(Data_train)
User_rating_dict_test = get_user_rating_dict(Data_test)
User_rating_dict_total = get_user_rating_dict(data)

BPR的目标是使得用户u对物品i的评分$u_i$

In [143]:
# 训练BPR模型参数，采用随机梯度下降训练方法
# 输入参数分别为（用户交互字典、用户矩阵、物品矩阵、物品评分偏移矩阵、学习率、正则化参数lambda、迭代次数）
def train_bpr(user_rating_dict, user_factors, item_factors, bias_factors, learning_rate, lambda_reg, iterations):
    for ite1 in range(iterations):
        for ite2 in range(len(User_ids)): # 每次训练随机抽取用户 num_users 次
            # 随机获取一个用户
            u = User_ids[random.randint(0, num_users-1)]
            
            # 如果这个用户没有在测试集中，则直接跳过（这种情况一般不会发生）
            if u not in User_ids_train:
                continue
            
            # 随机选取一个用户交互过的物品i，i是被u点击过的物品
            i = random.sample(user_rating_dict[u], 1)[0]
            # 随机选取一个用户没有交互过的物品j
            j = Item_ids[random.randint(0, num_items-1)]
            while j in user_rating_dict[u]:
                j = Item_ids[random.randint(0, num_items-1)]
                
            # 上面的u, i, j都是用户或者物品的id，我们需要构造的三元组是它们对应的索引
            u_index = User_id_map[u]
            i_index = Item_id_map[i]
            j_index = Item_id_map[j]
            
            # 用户u对物品i, j的评分
            x_ui = np.dot(user_factors[u_index], item_factors[i_index]) + bias_factors[i_index]
            x_uj = np.dot(user_factors[u_index], item_factors[j_index]) + bias_factors[j_index]
            # 用户对物品i, j的评分差
            x_uij = x_ui - x_uj
            
            # 下面的梯度基于论文中的推理
            loss_func = -1.0 / (1 + np.exp(x_uij))
            
            # 参数梯度
            grad_u = loss_func * (item_factors[i_index] - item_factors[j_index]) + lambda_reg * user_factors[u_index]
            grad_i = loss_func * user_factors[u_index] + lambda_reg * item_factors[i_index]
            grad_j = loss_func * (-user_factors[u_index]) + lambda_reg * item_factors[j_index]
            grad_bias_i = loss_func + lambda_reg * bias_factors[i_index]
            grad_bias_j = -loss_func + lambda_reg * bias_factors[j_index]
                
            # 同时更新参数
            user_factors[u_index] -= learning_rate * grad_u
            item_factors[i_index] -= learning_rate * grad_i
            item_factors[j_index] -= learning_rate * grad_j
            bias_factors[i_index] -= learning_rate * grad_bias_i
            bias_factors[j_index] -= learning_rate * grad_bias_j
        
    return user_factors, item_factors, bias_factors

得到预测矩阵，预测评分矩阵的行列坐标分别代表用户和物品的索引而非id，最后输出结果时需要做一个简单的转换

In [144]:
# 获取用户对物品的评分
def get_predictions(user_factors, item_factors, bias_factors):
    predictions = np.dot(user_factors, item_factors.T) + bias_factors
    return predictions

过滤掉训练集中已经存在的正反馈项目，使它们在预测结果中不会被推荐

In [145]:
def filter_known_interactions(user_rating_dict, predictions):
    # 确保推荐不能是训练集中的正项
    for uid in user_rating_dict.keys():
        for jid in user_rating_dict[uid]:
            predictions[User_id_map(uid) * num_items + Item_id_map(jid)] = 0
    return predictions

下面开始训练模型

In [146]:
User_factors, Item_factors, Bias_factors = train_bpr(User_rating_dict, User_factors, Item_factors, Bias_factors, Learning_rate, Lambda_reg, Iterations)

将模型参数写入文件中，这样下次就不用再训练模型了，可以直接将参数读取出来直接做预测

In [147]:
# 写入 CSV 文件
with open('parameters\\User_factors.csv', mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerows(User_factors)
    
# 写入 CSV 文件
with open('parameters\\Item_factors.csv', mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerows(Item_factors)

# 将一维数组转换为二维列表，每行一个元素
Bias_factors_list = Bias_factors.reshape(-1, 1)

# 写入 CSV 文件
with open('parameters\\Bias_factors.csv', mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerows(Bias_factors_list)

计算 AUC（曲线下面积）得分，测试模型的预测结果

In [148]:
# 进行预测，得到的`Prediction_metrix`表示每个用户对每个项目的预测评分
Prediction_metrix = get_predictions(User_factors, Item_factors, Bias_factors)

# 将矩阵平展，便于后续评估
Predictions = Prediction_metrix.reshape(-1)
Tests = np.zeros(num_items * num_users)

# 填充测试标签
for u in range(num_users):
    for i in range(num_items): # 这里的i是索引，从0开始
        if int(User_rating_metrix_test[u][i]) == 1:
            Tests[u * num_items + i] = 1
        else:
            Tests[u * num_items + i] = 0
            
# 计算 AUC 得分
auc_score = roc_auc_score(Tests, Predictions)
auc_score

0.9104754536499771

计算模型的precision，测试模型的预测结果

In [171]:
def calculate_precision(k = 10):
    relevant_recommended = 0
    total = 0
    
    for u in User_ids:
        # 如果用户不在测试集中，则跳过
        if u not in User_rating_dict_test:
            continue
        
        relevant_items = User_rating_dict_test[u]   # 用户实际交互过的物品(测试集)
        u_index = User_id_map[u]
        
        # 计算用户对所有物品的评分
        scores = np.dot(User_factors[u_index], Item_factors.T) + Bias_factors
        
        # 获取评分最高的 K 个物品
        recommended_items_index = np.argsort(scores)[::-1]
        recommended_items = []
        
        count = 0
        
        # 确保推荐的物品过滤掉训练集中用户交互过的物品
        for i_index in recommended_items_index:
            if Item_ids[i_index] not in User_rating_dict[u]:
                recommended_items.append(Item_ids[i_index])
                count = count + 1
                if count == k:
                    break
        
        # 计算推荐的物品和实际喜欢的物品之间的交集
        relevant_recommended += len(set(relevant_items) & set(recommended_items))
        total += k
    
    precision = relevant_recommended / total if total > 0 else 0
    return precision

Precision = calculate_precision()
print(Precision)

0.12902869757174393


输入用户矩阵、物品矩阵和需要作推荐的`user_id`，为该用户推荐top10物品

In [151]:
# 为用户推荐物品，去除掉用户已经交互过的物品
def get_top_k_recommendations(user_factors, item_factors, bias_factors, user_id, user_rating_dict_total, k = 10):
    u_index = User_id_map[user_id]
    scores = np.dot(item_factors, user_factors[u_index]) + bias_factors # 使用内积计算出该用户对所有物品的评分，并加上物品的全局评分偏移
    top_k_items_index = scores.argsort()[::-1]          # 对scores进行升序排序，得到升序的索引，随后将这个索引反转，相当于获取了scores降序排序的索引，
                                                        # 取前10位作为该用户的tok_10推荐，即得到了该用户评分最高的10个`item_id`
    
    top_k_items = []
    count = 0
    for i in top_k_items_index:
        if Item_ids[i] not in user_rating_dict_total[user_id]:
            top_k_items.append(Item_ids[i])
            count = count+1
            if count == 10:
                break
    return top_k_items

为每个用户生成物品top10推荐

In [152]:
# 读取需要预测的`user_id`
path = 'RSBD_Project_2024/test.txt'
unpredicted_user_ids = pd.read_csv(path, header=None, names=['user_id'])

# 使用矩阵存储推荐结果，其中第一列为`user_id`，剩余十列是`itemi_id`(其中1<=i<=10)
top_10_recommendations = np.zeros((len(unpredicted_user_ids), 11))

# 将第一列赋值为对应的`user_id`
unpredicted_user_ids = unpredicted_user_ids.values.flatten()
top_10_recommendations[:, 0] = unpredicted_user_ids
top_10_recommendations = top_10_recommendations.astype(int)

# 为每个用户推荐top10物品
for idx, u in enumerate(unpredicted_user_ids):
    recommendation = get_top_k_recommendations(User_factors, Item_factors, Bias_factors, u, User_rating_dict_total)
    # print(recommendation)
    top_10_recommendations[idx, 1:] = recommendation
    for item_id in recommendation:
        if item_id not in Item_ids:
            print("false")
# top_10_recommendations[:10]

# 将推荐结果存储在文件中
file_path = '2024110681_result.txt'

# 写入文件
with open(file_path, 'w') as file:
    for row in top_10_recommendations:
        user_id = row[0]
        item_ids_str = ','.join(map(str, row[1:]))  # 将推荐的物品 ID 转换为逗号分隔的字符串
        file.write(f"{user_id}: {item_ids_str}\n")

In [153]:
top_10_recommendations[:]

array([[ 81, 181,  50, ..., 100, 405, 222],
       [399,  56,  98, ..., 405,  28,  69],
       [890, 183, 180, ...,  79, 182, 511],
       ...,
       [516, 258, 127, ..., 300, 100, 318],
       [272, 174, 603, ..., 318, 182, 173],
       [629, 100, 318, ...,  79,  28, 483]])