In [92]:
import json

from tqdm import tqdm
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity,pairwise_distances
from collections import defaultdict

In [74]:
def load_data_and_map_ids(file_path, test_size=0.3):
    # 读取数据
    data = pd.read_csv(file_path, sep=' ', header=None, names=['user_id', 'item_id', 'click'])

    # 1.1 将id映射到指定范围
    user_id_map = {id: i for i, id in enumerate(data['user_id'].unique())}
    item_id_map = {id: i for i, id in enumerate(data['item_id'].unique())}

    data['user_id'] = data['user_id'].map(user_id_map)
    data['item_id'] = data['item_id'].map(item_id_map)

    # 1.2 划分训练集和测试集
    train_data = []
    test_data = []
    for user, group in data.groupby('user_id'):
        n_test_items = max(1, int(len(group) * test_size))
        test_items = group.sample(n=n_test_items)
        train_items = group.drop(test_items.index)
        train_data.append(train_items)
        test_data.append(test_items)

    return data, user_id_map, item_id_map, pd.concat(train_data), pd.concat(test_data)

In [82]:
def build_user_item_matrix(data,user_id_map,item_id_map):
    user_item_matrix = np.zeros((len(user_id_map),len(item_id_map)))
    for line in data.itertuples():
        user_item_matrix[line[1]-1,line[2]-1] = line[3]
    return pd.DataFrame(user_item_matrix,index=user_id_map.values(),columns=item_id_map.values())

In [4]:
def compute_user_similarity(user_item_matrix,method="cos"):
    if method == 'cos':
        similarity = cosine_similarity(user_item_matrix)
    else: # method == 'pearson':
        similarity = 1 - pairwise_distances(user_item_matrix,metric='correlation')
    return pd.DataFrame(similarity,index=user_item_matrix.index,columns=user_item_matrix.index)

In [122]:
def compute_item_similarity(user_item_matrix,method="cos"):
    if method == 'cos':
        similarity = cosine_similarity(user_item_matrix.T)
    else: # method == 'pearson':
        similarity = 1 - pairwise_distances(user_item_matrix.T,metric='correlation')
    return pd.DataFrame(similarity,index=user_item_matrix.T.index,columns=user_item_matrix.T.index)

In [127]:
def get_top_n_recommendations(user_item_matrix, similarity, n=10, top_k=10, method="user"):
    recommendations = defaultdict(list)
    # 3.1 基于用户的top-k推荐
    if method == "user":
        user_ids = user_item_matrix.index
        # 获取前n个相似度最高的用户
        for user_id in tqdm(user_ids, desc="Processing users"):
            user_vector = user_item_matrix.iloc[user_id]
            similar_users = similarity[user_id].argsort()[::-1][1:top_k + 1]  # 取前K个最相似的用户（除了自己）
            # 遍历item，获取top-k最高得分的item
            for item_id in user_item_matrix.columns:
                if user_vector[item_id] == 0:  # 用户未交互过的物品
                    similar_users_ratings = user_item_matrix.iloc[similar_users, item_id]
                    similar_users_similarities = similarity.iloc[user_id, similar_users]
                    # 加权平均得分
                    score = np.sum(similar_users_ratings * similar_users_similarities) / np.sum(
                        similar_users_similarities)
                    if score > 0:
                        recommendations[user_id].append((item_id, score))
            recommendations[user_id] = sorted(recommendations[user_id], key=lambda x: x[1], reverse=True)[:n]
    # 3.2 基于物品的top-k推荐
    elif method == "item":
        user_ids = user_item_matrix.index
        for user_id in tqdm(user_ids, desc="Processing users"):
            # 获取交互过的物品和候选物品
            user_vector = user_item_matrix.iloc[user_id]
            interacted_items = user_vector[user_vector > 0].index
            candidate_items = user_item_matrix.columns[~user_item_matrix.columns.isin(interacted_items)]

            # 计算得分
            item_scores = defaultdict(float)
            for item in candidate_items:
                for interacted_item in interacted_items:
                    item_scores[item] += similarity.iloc[item, interacted_item] * user_vector[interacted_item]

            top_items = sorted(item_scores.items(), key=lambda x: x[1], reverse=True)[:top_k]
            recommendations[user_id] = top_items
    return recommendations

In [7]:
def evaluate(recommendations, test_data, k=10):
    precision = defaultdict(float)
    recall = defaultdict(float)

    for user, user_recs in recommendations.items():
        user_test = set(test_data[test_data['user_id'] == user]['item_id']) # 测试集中的用户交互
        recs = set([item for item, _ in user_recs[:k]]) # 推荐的物品

        if len(recs) > 0:
            precision[user] = len(recs & user_test) / len(recs)
        if len(user_test) > 0:
            recall[user] = len(recs & user_test) / len(user_test)

    avg_precision = np.mean(list(precision.values()))
    avg_recall = np.mean(list(recall.values()))

    return avg_precision, avg_recall

In [142]:
def convert_recommendations_to_original_ids(recommendations, user_id_map, item_id_map):
    original_recommendations = {}
    reverse_user_id_map = {v: k for k, v in user_id_map.items()}
    reverse_item_id_map = {v: k for k, v in item_id_map.items()}

    for user, recs in recommendations.items():
        original_user_id = int(reverse_user_id_map[user])
        original_recs = [(int(reverse_item_id_map[item]), score) for item, score in recs]
        original_recommendations[original_user_id] = original_recs

    return original_recommendations

In [83]:
data, user_id_map, item_id_map, train_data, test_data = load_data_and_map_ids('../data/training.txt')

In [84]:
data

Unnamed: 0,user_id,item_id,click
0,0,0,1
1,1,1,1
2,2,2,1
3,3,3,1
4,4,4,1
...,...,...,...
44229,88,600,1
44230,847,401,1
44231,421,43,1
44232,790,286,1


In [85]:
user_item_matrix = build_user_item_matrix(train_data,user_id_map,item_id_map)

In [86]:
user_item_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1402,1403,1404,1405,1406,1407,1408,1409,1410,1411
0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
937,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
938,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
939,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
940,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [87]:
user_similarity = compute_user_similarity(user_item_matrix,method="pearson")

In [88]:
user_similarity

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,932,933,934,935,936,937,938,939,940,941
0,1.000000,0.110227,0.197857,0.036433,0.168030,0.096915,0.066873,0.101349,0.138053,0.070612,...,-0.013178,0.062053,-0.014024,-0.020987,0.051916,0.034926,-0.011438,-0.011438,-0.006594,0.103466
1,0.110227,1.000000,0.063055,0.000224,0.070822,0.075874,0.059616,0.106813,0.061542,0.017222,...,0.028595,0.068900,0.011485,0.057979,-0.032932,0.040851,-0.018420,-0.018420,-0.010620,0.020605
2,0.197857,0.063055,1.000000,0.050247,0.135557,0.107442,0.007235,-0.011838,0.033966,0.133773,...,0.025690,0.057438,0.016487,-0.030715,-0.004363,0.048605,0.028540,0.028540,0.068664,0.144524
3,0.036433,0.000224,0.050247,1.000000,-0.021008,0.041054,0.018264,0.099992,0.092729,0.038498,...,-0.000878,0.069015,-0.011183,0.027026,0.028576,-0.011792,-0.009121,-0.009121,-0.005259,0.114797
4,0.168030,0.070822,0.135557,-0.021008,1.000000,0.071272,0.037117,-0.002016,0.118193,0.078563,...,0.025339,0.064539,0.043520,-0.012214,0.035312,0.070502,-0.018894,-0.018894,-0.010893,0.046646
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
937,0.034926,0.040851,0.048605,-0.011792,0.070502,0.020450,-0.011792,0.088896,0.028791,-0.014420,...,0.027195,-0.018402,0.099359,0.061347,0.063437,1.000000,-0.005517,0.124318,0.221381,0.076275
938,-0.011438,-0.018420,0.028540,-0.009121,-0.018894,0.036159,-0.009121,-0.016009,-0.021586,-0.011154,...,0.100514,0.038106,0.131577,0.176443,-0.007629,-0.005517,1.000000,-0.004267,-0.002460,0.047101
939,-0.011438,-0.018420,0.028540,-0.009121,-0.018894,-0.014694,-0.009121,0.078179,-0.021586,-0.011154,...,-0.013027,0.038106,-0.005232,0.084306,-0.007629,0.124318,-0.004267,1.000000,0.576531,0.047101
940,-0.006594,-0.010620,0.068664,-0.005259,-0.010893,-0.008471,-0.005259,0.072224,-0.012445,-0.006431,...,-0.007510,0.082320,-0.003016,0.154844,-0.004399,0.221381,-0.002460,0.576531,1.000000,-0.007141


In [123]:
item_similarity = compute_item_similarity(user_item_matrix)

In [124]:
item_similarity

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1402,1403,1404,1405,1406,1407,1408,1409,1410,1411
0,1.000000,0.120483,0.114770,0.041204,0.000000,0.224702,0.087121,0.054708,0.212429,0.280877,...,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.130369
1,0.120483,1.000000,0.081650,0.102598,0.000000,0.046625,0.000000,0.045408,0.165295,0.116563,...,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.069561
2,0.114770,0.081650,1.000000,0.083771,0.069007,0.171312,0.132842,0.176107,0.242933,0.199864,...,0.0,0.091287,0.091287,0.0,0.0,0.0,0.0,0.0,0.0,0.094660
3,0.041204,0.102598,0.083771,1.000000,0.086711,0.095673,0.166924,0.046587,0.101753,0.023918,...,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.095157
4,0.000000,0.000000,0.069007,0.086711,1.000000,0.000000,0.000000,0.076753,0.083820,0.118217,...,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1407,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.073922,0.000000,...,0.0,0.000000,0.000000,0.0,0.0,1.0,0.0,0.0,0.0,0.000000
1408,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.000000,0.0,0.0,0.0,1.0,0.0,0.0,0.000000
1409,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,1.0,0.0,0.000000
1410,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,1.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,1.0,0.000000


In [94]:
recs = get_top_n_recommendations(user_item_matrix,user_similarity)

Processing users: 100%|██████████| 942/942 [21:43<00:00,  1.38s/it]


In [95]:
evaluate(recs,test_data)

(0.02335456475583864, 0.014633382035289999)

In [128]:
recs = get_top_n_recommendations(user_item_matrix,item_similarity,method="item")

Processing users: 100%|██████████| 942/942 [22:26<00:00,  1.43s/it] 


In [129]:
evaluate(recs,test_data)

(0.027176220806794056, 0.020906998936881065)

In [143]:
ori_recs = convert_recommendations_to_original_ids(recs,user_id_map,item_id_map)

In [144]:
with open('../res/item_CF_rec_real.json', 'w', encoding='utf-8') as f:
    json.dump(ori_recs, f, ensure_ascii=False, indent=4)