In [1]:
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict

# Load the raw ratings table.
ratings = pd.read_csv('data/ratings.csv')

# Minimum number of ratings a user / a movie must have to be kept.
min_user_ratings = 50
min_movie_ratings = 50

# Both counts are taken from the unfiltered table.
user_counts = ratings['userId'].value_counts()
movie_counts = ratings['movieId'].value_counts()

# IDs that meet their respective threshold.
active_users = user_counts.loc[user_counts.ge(min_user_ratings)].index
popular_movies = movie_counts.loc[movie_counts.ge(min_movie_ratings)].index

# Keep only rows whose user AND movie are both frequent enough.
keep_rows = ratings['userId'].isin(active_users) & ratings['movieId'].isin(popular_movies)
ratings = ratings[keep_rows]

In [2]:
# Per-user mean rating, kept as a standalone lookup table.
user_avg = ratings.groupby("userId")["rating"].mean().reset_index()
user_avg.columns = ["userId", "avg_rating"]

# Attach each user's mean to their rows. assign() overwrites an existing
# "avg_rating" column instead of creating suffixed duplicates the way a
# repeated merge would, so this cell can safely be re-run.
# reset_index(drop=True) gives the same fresh RangeIndex the merge produced.
ratings = ratings.assign(
    avg_rating=ratings.groupby("userId")["rating"].transform("mean"),
).reset_index(drop=True)

# Mean-centre every rating on its user's average.
ratings["adjusted_rating"] = ratings["rating"] - ratings["avg_rating"]

# User x movie matrix of mean-centred ratings.
# NOTE: after fillna(0) a 0 entry means either "not rated" or "rated exactly
# at the user's mean"; the two cases cannot be told apart downstream.
adjusted_matrix = ratings.pivot_table(
    index="userId", columns="movieId", values="adjusted_rating"
).fillna(0)
sparse_adjusted_matrix = csr_matrix(adjusted_matrix.values)

In [3]:
# Transpose the user x movie matrix so that each row is one movie.
item_user_matrix = sparse_adjusted_matrix.transpose()

# Movie IDs in pivot-column order; position i here corresponds to
# row/column i of the similarity matrix below.
movie_ids = adjusted_matrix.columns.tolist()

# Pairwise cosine similarity between movie rows (dense result).
cos_sim = cosine_similarity(item_user_matrix, dense_output=True)

In [4]:
# Maps movie ID -> list of (neighbour movie ID, cosine similarity),
# holding the K most similar other movies in descending similarity order.
similar_items = {}
K = 20

for idx in range(len(cos_sim)):
    # Drop the movie itself by index, not by rank. Self is not guaranteed to
    # sort first: an all-zero item vector has self-similarity 0, and ties or
    # rounding against an identical vector can place another movie ahead.
    sim_scores = [
        (i, score) for i, score in enumerate(cos_sim[idx]) if i != idx
    ]
    # Stable descending sort, then keep the K best neighbours.
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[:K]
    # Translate matrix positions back into movie IDs.
    similar_items[movie_ids[idx]] = [(movie_ids[i], score) for i, score in sim_scores]

In [None]:
def recommend_items(user_id, top_n=10):
    """Recommend movies to a user via item-based collaborative filtering.

    Every movie the user rated above their own mean acts as a seed. Each of
    the seed's neighbours in ``similar_items`` receives
    ``similarity * mean-centred rating`` added to its score. Movies the user
    has already rated are never recommended.

    Args:
        user_id: Row label of the user in ``adjusted_matrix``.
        top_n: Maximum number of recommendations to return.

    Returns:
        A list of ``(movie_id, score)`` tuples sorted by descending score,
        with at most ``top_n`` entries.

    Raises:
        KeyError: If ``user_id`` is not a row label of ``adjusted_matrix``.
    """
    user_rated = adjusted_matrix.loc[user_id]

    # Any non-zero entry is a movie the user has rated. Below-mean (negative)
    # ratings must count as "seen" too, otherwise disliked movies could be
    # recommended back to the user. A rating exactly at the user's mean is
    # stored as 0 and cannot be distinguished from "unrated" here.
    seen_movies = set(user_rated.index[user_rated != 0])

    # Only movies rated above the user's mean are used as seeds.
    liked = user_rated[user_rated > 0]

    scores = defaultdict(float)
    for movie_id, weight in liked.items():
        for similar_movie, sim in similar_items.get(movie_id, ()):
            if similar_movie not in seen_movies:
                # Weight the similarity by how far above their mean the
                # user rated the seed movie.
                scores[similar_movie] += sim * weight

    # Highest score first, truncated to the requested length.
    recommended = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:top_n]
    return recommended

In [11]:
# Example: fetch the top 10 recommendations for user 116 and print them.
user_id = 116
recommendations = recommend_items(user_id, top_n=10)

# Header line, then one line per (movie ID, score) pair.
print("推荐给用户", user_id, "的电影：")
for movie_id, score in recommendations:
    print(f"电影ID: {movie_id}, 推荐分数: {score:.4f}")

推荐给用户 116 的电影：
电影ID: 1196, 推荐分数: 4.8042
电影ID: 1210, 推荐分数: 3.4871
电影ID: 318, 推荐分数: 2.8098
电影ID: 1214, 推荐分数: 2.7837
电影ID: 1270, 推荐分数: 2.4911
电影ID: 50, 推荐分数: 2.1790
电影ID: 541, 推荐分数: 1.8814
电影ID: 593, 推荐分数: 1.8194
电影ID: 858, 推荐分数: 1.5731
电影ID: 2716, 推荐分数: 1.3812


In [None]:
from joblib import dump
import numpy as np


# Bundle everything needed to serve recommendations later.
model_data = {
    "similar_items": similar_items,
    "adjusted_matrix": adjusted_matrix,
    "movie_ids": movie_ids,
}

# Write the bundle as a compressed joblib file (pickle protocol 4).
model_path = "data/itemcf_model.joblib"
dump(model_data, model_path, compress=3, protocol=4)

['itemcf_model.joblib']