In [1]:
import json

import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity,pairwise_distances
from collections import defaultdict

In [2]:
def load_data_and_map_ids(file_path, test_size=0.3):
    # 读取数据
    data = pd.read_csv(file_path, sep=' ', header=None, names=['user_id', 'item_id', 'click'])

    # 1.1 将id映射到指定范围
    user_id_map = {id: i for i, id in enumerate(data['user_id'].unique())}
    item_id_map = {id: i for i, id in enumerate(data['item_id'].unique())}

    data['user_id'] = data['user_id'].map(user_id_map)
    data['item_id'] = data['item_id'].map(item_id_map)

    # 1.2 划分训练集和测试集
    train_data = []
    test_data = []
    for user, group in data.groupby('user_id'):
        n_test_items = max(1, int(len(group) * test_size))
        test_items = group.sample(n=n_test_items)
        train_items = group.drop(test_items.index)
        train_data.append(train_items)
        test_data.append(test_items)

    return data, user_id_map, item_id_map, pd.concat(train_data), pd.concat(test_data)

In [3]:
def build_user_item_matrix(data):
    return data.pivot(index='user_id', columns='item_id', values='click').fillna(0)

In [4]:
def compute_user_similarity(user_item_matrix,method="cos"):
    if method == 'cos':
        similarity = cosine_similarity(user_item_matrix)
    else: # method == 'pearson':
        similarity = 1 - pairwise_distances(user_item_matrix,metric='correlation')
    return pd.DataFrame(similarity,index=user_item_matrix.index,columns=user_item_matrix.index)

In [5]:
def compute_item_similarity(user_item_matrix,method="cos"):
    if method == 'cos':
        similarity = cosine_similarity(user_item_matrix.T)
    else: # method == 'pearson':
        similarity = 1 - pairwise_distances(user_item_matrix.T,metric='correlation')
    return pd.DataFrame(similarity,index=user_item_matrix.T.index,columns=user_item_matrix.T.index)

In [6]:
def get_top_n_recommendations(user_item_matrix, similarity, n=10, top_k=10, method="user"):
    recommendations = defaultdict(list)
    if method == "user":
        user_ids = user_item_matrix.index
        # 3.1 获取前n个相似度最高的用户
        for i, user_id in enumerate(user_ids):
            user_vector = user_item_matrix.iloc[i]
            similar_users = similarity[i].argsort()[::-1][1:top_k + 1]  # 取前K个最相似的用户（除了自己）
            # 3.2 遍历item，获取top-k最高得分的item
            for item_id in user_item_matrix.columns:
                if user_vector[item_id] == 0:  # 用户未交互过的物品
                    similar_users_ratings = user_item_matrix.iloc[similar_users, item_id]
                    similar_users_similarities = similarity.iloc[i, similar_users]
                    # 加权平均得分
                    score = np.sum(similar_users_ratings * similar_users_similarities) / np.sum(similar_users_similarities)
    
                    if score > 0:
                        recommendations[user_id].append((item_id, score))
    
            recommendations[user_id] = sorted(recommendations[user_id], key=lambda x: x[1], reverse=True)[:n]
    elif method == "item":
        item_ids = user_item_matrix.T.index
        pass
    return recommendations

In [7]:
def evaluate(recommendations, test_data, k=10):
    precision = defaultdict(float)
    recall = defaultdict(float)

    for user, user_recs in recommendations.items():
        user_test = set(test_data[test_data['user_id'] == user]['item_id'])
        recs = set([item for item, _ in user_recs[:k]])

        if len(recs) > 0:
            precision[user] = len(recs & user_test) / len(recs)
        if len(user_test) > 0:
            recall[user] = len(recs & user_test) / len(user_test)

    avg_precision = np.mean(list(precision.values()))
    avg_recall = np.mean(list(recall.values()))

    return avg_precision, avg_recall

In [8]:
def convert_recommendations_to_original_ids(recommendations, user_id_map, item_id_map):
    original_recommendations = {}
    reverse_user_id_map = {v: k for k, v in user_id_map.items()}
    reverse_item_id_map = {v: k for k, v in item_id_map.items()}

    for user, recs in recommendations.items():
        original_user_id = reverse_user_id_map[user]
        original_recs = [(reverse_item_id_map[item], score) for item, score in recs]
        original_recommendations[original_user_id] = original_recs

    return original_recommendations

In [9]:
data, user_id_map, item_id_map, train_data, test_data = load_data_and_map_ids('../data/training.txt')

In [10]:
data

Unnamed: 0,user_id,item_id,click
0,0,0,1
1,1,1,1
2,2,2,1
3,3,3,1
4,4,4,1
...,...,...,...
44229,88,600,1
44230,847,401,1
44231,421,43,1
44232,790,286,1


In [11]:
user_item_matrix = build_user_item_matrix(train_data)

In [12]:
user_item_matrix

item_id,0,1,2,3,4,5,6,7,8,9,...,1398,1399,1401,1402,1403,1405,1407,1408,1409,1410
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
937,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
938,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
939,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
940,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
user_similarity = compute_user_similarity(user_item_matrix,method="pearson")

In [14]:
user_similarity

user_id,0,1,2,3,4,5,6,7,8,9,...,932,933,934,935,936,937,938,939,940,941
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.000000,0.147700,0.062719,0.126463,0.113708,0.029670,-0.008511,0.028836,0.119005,0.116659,...,0.019215,0.061415,0.105777,0.032760,0.074536,0.077665,0.029413,0.046566,0.046566,-0.007481
1,0.147700,1.000000,0.124201,0.196203,0.004857,0.056480,0.134274,0.096284,0.099556,0.163873,...,0.107990,0.050486,0.080459,-0.014691,0.013325,0.087234,-0.015492,-0.011982,-0.011982,-0.006907
2,0.062719,0.124201,1.000000,0.127768,0.057912,0.036677,0.085837,-0.001618,0.092158,0.057443,...,0.025247,0.040218,0.013935,0.010451,0.033529,-0.010961,0.007421,-0.019318,-0.019318,-0.011137
3,0.126463,0.196203,0.127768,1.000000,0.005575,0.066115,0.091242,0.005575,0.023519,0.089697,...,-0.026888,0.038757,0.125958,0.015554,-0.007238,0.019839,0.047676,-0.017550,0.027804,0.068320
4,0.113708,0.004857,0.057912,0.005575,1.000000,0.015895,-0.008734,0.055163,0.098577,0.073437,...,-0.014635,-0.002182,0.017903,-0.011712,0.026265,0.027836,-0.012350,-0.009552,-0.009552,-0.005507
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
937,0.077665,0.087234,-0.010961,0.019839,0.027836,0.010729,0.058721,-0.017081,0.049892,0.021567,...,0.049833,0.103955,0.002917,-0.009795,0.089440,1.000000,0.063007,-0.007989,-0.007989,-0.004606
938,0.029413,-0.015492,0.007421,0.047676,-0.012350,0.037798,0.098588,-0.012350,0.051479,0.027562,...,0.076424,0.070523,0.021366,0.205231,0.203917,0.063007,1.000000,-0.005776,0.124093,0.221277
939,0.046566,-0.011982,-0.019318,-0.017550,-0.009552,-0.019817,0.035518,-0.009552,-0.016782,-0.022654,...,-0.006845,0.043180,0.142292,0.131366,0.452684,-0.007989,-0.005776,1.000000,0.162944,0.286958
940,0.046566,-0.011982,-0.019318,0.027804,-0.009552,-0.019817,0.035518,-0.009552,0.030383,-0.022654,...,-0.006845,0.043180,0.037486,0.268210,0.083977,-0.007989,0.124093,0.162944,1.000000,0.576492


In [15]:
item_similarity = compute_item_similarity(user_item_matrix)

In [16]:
item_similarity

item_id,0,1,2,3,4,5,6,7,8,9,...,1398,1399,1401,1402,1403,1405,1407,1408,1409,1410
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.000000,0.128388,0.068626,0.162681,0.089398,0.000000,0.000000,0.096197,0.186813,0.177264,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.104828,0.0,0.0,0.0
1,0.128388,1.000000,0.133631,0.093169,0.087039,0.000000,0.102062,0.093659,0.106990,0.150075,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.204124,0.0,0.0,0.0
2,0.068626,0.133631,1.000000,0.199205,0.000000,0.000000,0.000000,0.050063,0.045751,0.112306,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
3,0.162681,0.093169,0.199205,1.000000,0.116775,0.034503,0.136931,0.125656,0.172251,0.275174,...,0.0,0.0,0.091287,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
4,0.089398,0.087039,0.000000,0.116775,1.000000,0.080582,0.159901,0.048912,0.044699,0.141073,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1405,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.073521,...,0.0,0.0,0.000000,0.0,0.0,1.0,0.000000,0.0,0.0,0.0
1407,0.104828,0.204124,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.000000,0.0,0.0,0.0,1.000000,0.0,0.0,0.0
1408,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.073521,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,1.0,0.0,0.0
1409,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,1.0,0.0


In [17]:
user_ids = user_item_matrix.index
print(user_ids)
# 3.1 获取前n个相似度最高的用户
for i, user_id in enumerate(user_ids):
    user_vector = user_item_matrix.iloc[i]
    similar_users = user_similarity[i].argsort()[::-1][1:11]  # 取前K个最相似的用户（除了自己）
    # 3.2 遍历item，获取top-k最高得分的item
    for item_id in user_item_matrix.columns:
        if user_vector[item_id] == 0:  # 用户未交互过的物品
            similar_users_ratings = user_item_matrix.iloc[similar_users, item_id]
            similar_users_similarities = user_similarity.iloc[i, similar_users]
            # 过滤掉相似用户中未评分的情况
            mask = similar_users_ratings > 0
            if mask.sum() == 0:
                continue

            filtered_similar_users_ratings = similar_users_ratings[mask]
            filtered_similar_users_similarities = similar_users_similarities[mask]
            
            print("Similar users' ratings:", filtered_similar_users_ratings)
            print("Similar users' similarities:", filtered_similar_users_similarities)
            print("Sum of similar users' ratings:", np.sum(filtered_similar_users_ratings))

            # 加权平均得分
            score = np.sum(filtered_similar_users_ratings * filtered_similar_users_similarities) / np.sum(filtered_similar_users_similarities)
            print("Predicted score for item", item_id, ":", score)
            break
    break

Int64Index([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
            ...
            932, 933, 934, 935, 936, 937, 938, 939, 940, 941],
           dtype='int64', name='user_id', length=942)
user_id
356    0.0
792    0.0
471    0.0
673    0.0
885    0.0
73     0.0
69     0.0
771    0.0
883    0.0
31     0.0
Name: 1, dtype: float64


InvalidIndexError: (0, user_id
940    356
939    792
938    471
937    673
936    885
935     73
934     69
933    771
932    883
931     31
Name: 0, dtype: int64)

In [19]:
similar_users = user_similarity[0].argsort()[::-1][1:11]

In [20]:
print(similar_users)

user_id
940    356
939    792
938    471
937    673
936    885
935     73
934     69
933    771
932    883
931     31
Name: 0, dtype: int64


In [27]:
similar_users_ratings = user_item_matrix.iloc[similar_users, 101]
print(similar_users_ratings)

user_id
356    1.0
792    0.0
471    1.0
673    0.0
885    0.0
73     1.0
69     0.0
771    0.0
883    0.0
31     0.0
Name: 101, dtype: float64


In [28]:
similar_users_similarities = user_similarity.iloc[0, similar_users]
print(similar_users_similarities)

user_id
356    0.301166
792    0.286587
471    0.279055
673    0.272116
885    0.266444
73     0.266285
69     0.262847
771    0.262222
883    0.258791
31     0.258791
Name: 0, dtype: float64


In [29]:
np.sum(similar_users_ratings * similar_users_similarities) / np.sum(similar_users_similarities)

0.3118681907430994

In [30]:
similar_users_ratings * similar_users_similarities

user_id
356    0.301166
792    0.000000
471    0.279055
673    0.000000
885    0.000000
73     0.266285
69     0.000000
771    0.000000
883    0.000000
31     0.000000
dtype: float64

In [31]:
np.sum(similar_users_similarities)

2.714303360053586

In [32]:
np.sum(similar_users_ratings * similar_users_similarities)

0.8465048780278275