## 基于物品/用户协同过滤算法的动漫推荐系统

In [1]:
import pandas as pd
import numpy as np
import scipy as sp
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
import operator
%matplotlib inline

In [2]:
anime = pd.read_csv('D:/git2/data_analysis/anime_data/anime.csv')
rating = pd.read_csv('D:/git2/data_analysis/anime_data/rating.csv')

评分表中，许多用户的评分为-1，这里用空值替换

In [3]:
rating.rating.replace({-1: np.nan}, regex=True, inplace = True)
rating.head()

Unnamed: 0,user_id,anime_id,rating
0,1,20,
1,1,24,
2,1,79,
3,1,226,
4,1,241,


本次分析中，只考虑TV动漫，动漫电影暂不考虑

In [13]:
anime_tv = anime[anime['type']=='TV']  #取出类型为TV的样本
anime_tv.shape

(3787, 7)

In [14]:
merged = rating.merge(anime_tv, left_on = 'anime_id', right_on = 'anime_id', suffixes= ['_user', ''])  #矩阵连接
merged.rename(columns = {'rating_user':'user_rating'}, inplace = True)   #更改列名

In [18]:
merged=merged[['user_id', 'name', 'user_rating']]    #取出需要计算的列，用户id，动漫名，用户评分
merged_1= merged[merged.user_id <= 20000]
merged_1.head()
# merged_1.shape

Unnamed: 0,user_id,name,user_rating
0,1,Naruto,
1,3,Naruto,8.0
2,5,Naruto,6.0
3,6,Naruto,
4,10,Naruto,


In [19]:
train_data,test_data =train_test_split(merged_1,test_size=0.3,random_state=42)
train_data.shape
test_data.shape

(423801, 3)

In [20]:
train_piv = train_data.pivot_table(index=['user_id'], columns=['name'], values='user_rating')  
test_piv = test_data.pivot_table(index=['user_id'], columns=['name'], values='user_rating')  #创建透视表

In [22]:
print(test_piv.shape)
test_piv.head()

(18671, 2546)


name,.hack//Roots,.hack//Sign,.hack//Tasogare no Udewa Densetsu,009-1,07-Ghost,11eyes,12-sai.: Chicchana Mune no Tokimeki,3 Choume no Tama: Uchi no Tama Shirimasenka?,30-sai no Hoken Taiiku,91 Days,...,Zombie-Loan,"Zone of the Enders: Dolores, I",ef: A Tale of Melodies.,ef: A Tale of Memories.,gdgd Fairies,gdgd Fairies 2,iDOLM@STER Xenoglossia,s.CRY.ed,xxxHOLiC,xxxHOLiC Kei
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,7.0,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [23]:
train_norm = train_piv.apply(lambda x: (x-np.mean(x))/(np.max(x)-np.min(x)), axis=1)  #数据归一化,修正计算余弦相似度的原始值
train_norm.fillna(0, inplace=True)                                                    #用0填补缺失值
train_norm = train_norm.T
train_norm = train_norm.loc[:, (train_norm != 0).any(axis=0)] #删除某个用户评分全都为0的用户

In [44]:
train_norm.head()

user_id,3,5,7,8,11,12,14,17,18,19,...,19986,19987,19989,19990,19993,19994,19995,19996,19997,19998
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
.hack//Roots,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
.hack//Sign,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
.hack//Tasogare no Udewa Densetsu,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
009-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
07-Ghost,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,-0.032164,0.0,0.0,0.0,0.0,0.0


In [25]:
train_matrix = sp.sparse.csr_matrix(train_norm.values)  # 压缩为举证
print(train_matrix.todense().shape)

(2782, 16509)


In [28]:
item_sim = cosine_similarity(train_matrix)  # itemCF基于物品的推荐，计算余弦相似度,返回数组的第i行第j列表示item[i]与item[j]的余弦相似度
print(item_sim)

[[ 1.00000000e+00  1.37185171e-01  1.82221890e-01 ...  2.22250748e-02
  -1.79347261e-02 -2.45631136e-02]
 [ 1.37185171e-01  1.00000000e+00  1.07284212e-01 ... -7.76485644e-04
  -1.38520577e-02 -2.84871107e-02]
 [ 1.82221890e-01  1.07284212e-01  1.00000000e+00 ...  1.90523410e-02
  -3.36589529e-02 -2.48423613e-02]
 ...
 [ 2.22250748e-02 -7.76485644e-04  1.90523410e-02 ...  1.00000000e+00
  -1.21744099e-02 -1.27964129e-02]
 [-1.79347261e-02 -1.38520577e-02 -3.36589529e-02 ... -1.21744099e-02
   1.00000000e+00  3.91342656e-01]
 [-2.45631136e-02 -2.84871107e-02 -2.48423613e-02 ... -1.27964129e-02
   3.91342656e-01  1.00000000e+00]]


In [30]:
user_sim = cosine_similarity(train_matrix.T)  # userCF基于用户的推荐
print(user_sim)

[[ 1.          0.04606225  0.04953166 ... -0.06628728 -0.01280369
   0.00971892]
 [ 0.04606225  1.          0.06164367 ... -0.03169311  0.
   0.05186251]
 [ 0.04953166  0.06164367  1.         ...  0.          0.
   0.04005594]
 ...
 [-0.06628728 -0.03169311  0.         ...  1.          0.
   0.        ]
 [-0.01280369  0.          0.         ...  0.          1.
   0.        ]
 [ 0.00971892  0.05186251  0.04005594 ...  0.          0.
   1.        ]]


In [31]:
item_df = pd.DataFrame(item_sim, index = train_norm.index, columns = train_norm.index)  # 将矩阵转换为DataFrame格式，索引和列名换为item名
user_df = pd.DataFrame(user_sim, index = train_norm.columns, columns = train_norm.columns)# 将矩阵转换为DataFrame格式，索引和列名换为userID

In [32]:
def top_animes(anime_name):    #函数，提取相似度最大的前10个anime
    count = 1
    print('和 {} 相似的动漫有:\n'.format(anime_name))
    for item in item_df.sort_values(by = anime_name, ascending = False).index[1:11]:
        print('No. {}: {}'.format(count, item))
        count +=1  

In [37]:
def top_users(user):   #函数，提取相似度最大的前10个user及对应的相似度
    
    if user not in train_norm.columns:
        return('无用户 {} 相关的数据'.format(user))
    
    print('与用户 {} 兴趣相似的用户有:\n'.format(user))
    sim_values = user_df.sort_values(by=user, ascending=False).loc[:,user].tolist()[1:11]
    sim_users = user_df.sort_values(by=user, ascending=False).index[1:11]
    zipped = zip(sim_users, sim_values,)
    for user, sim in zipped:
        print('用户 #{0}, 相似度: {1:.2f}'.format(user, sim)) 

In [45]:
def similar_user_like(user):   # 函数，计算与用户相似度最高的前10个用户，并统计这些用户最喜欢的动漫，返回喜欢的人数前5的动漫
    
    if user not in train_norm.columns:
        return('No data available on user {}'.format(user))
    
    sim_users = user_df.sort_values(by=user, ascending=False).index[1:11]  #得到与用户相似度最高的前10个用户
    best = []
    most_common = {}
    
    for i in sim_users:
        max_score = train_norm.loc[:, i].max()   # 得到相似用户其最高评分
        best.append(train_norm[train_norm.loc[:, i]==max_score].index.tolist())  #将每个用户的最高评分的动漫放入best中
    for i in range(len(best)):                  # 统计前10个相似用户评分最高的动漫的个数，返回数量前5的动漫
        for j in best[i]:
            if j in most_common:
                most_common[j] += 1
            else:
                most_common[j] = 1
    sorted_list = sorted(most_common.items(), key=operator.itemgetter(1), reverse=True)
    return sorted_list[:5]    

###  UserCF和ItemCF预测用户对动漫的评分

In [57]:
# UserCF 预测某一用户对某动漫的评分
def user_predicted_rating(anime_name, user):
    sim_users = user_df.sort_values(by=user, ascending=False).index[1:100]   #相似度前100的用户
    user_values = user_df.sort_values(by=user, ascending=False).loc[:,user].tolist()[1:100]    # 相似度前100的值
    rating_list = []
    weight_list = []
    for j, i in enumerate(sim_users):
        rating = train_piv.loc[i, anime_name]  # 找到相似用户和动漫对应的评分
        similarity = user_values[j]            # 找到相似用户的相似度
        if np.isnan(rating):
            continue
        elif not np.isnan(rating):
            rating_list.append(rating*similarity)  #计算用户对动漫的兴趣程度
            weight_list.append(similarity)
    return sum(rating_list)/sum(weight_list)  #返回用户对动漫的预测评分值

In [55]:
# ItemCF 预测某一用户对某动漫的评分
def item_predicted_rating(anime_name, user):
    sim_items = item_df.sort_values(by=anime_name, ascending=False).index[1:200]   #相似度前200的动漫
    item_values = item_df.sort_values(by=anime_name, ascending=False).loc[:,anime_name].tolist()[1:200]    # 相似度前200的值
    rating_list = []
    weight_list = []
    for j, i in enumerate(sim_items):
        rating = train_piv.loc[user, i]  # 找到用户及用户对相似动漫的评分
        similarity = item_values[j]            # 找到相似动漫的相似度
        if np.isnan(rating):
            continue
        elif not np.isnan(rating):
            rating_list.append(rating*similarity)  #计算用户对动漫的兴趣程度
            weight_list.append(similarity)
    return sum(rating_list)/sum(weight_list)  #返回用户对动漫的预测评分值

In [33]:
top_animes('Naruto')

和 Naruto 相似的动漫有:

No. 1: Bleach
No. 2: Yu☆Gi☆Oh! Duel Monsters
No. 3: Time Bokan Series: Itadakiman
No. 4: Dragon Ball GT
No. 5: Dragon Ball Z
No. 6: Ao no Exorcist
No. 7: Sekirei
No. 8: Tom Sawyer no Bouken
No. 9: Tesagure! Bukatsumono Encore
No. 10: Kyouryuu Tankentai Born Free


当输入火影忍者，返回与火影忍者相似的动漫有：死神，龙珠系列，妖精的尾巴，驱魔少年，通灵王，一骑当千，青之驱魔师等，分析可知，结果较为合理。

In [38]:
top_users(5)

与用户 5 兴趣相似的用户有:

用户 #2300, 相似度: 0.24
用户 #4341, 相似度: 0.23
用户 #13342, 相似度: 0.22
用户 #12033, 相似度: 0.22
用户 #17955, 相似度: 0.21
用户 #2127, 相似度: 0.21
用户 #16079, 相似度: 0.21
用户 #14531, 相似度: 0.21
用户 #4232, 相似度: 0.21
用户 #14507, 相似度: 0.21


In [46]:
similar_user_like(5)   # 预测5号用户可能喜欢的动漫

[('Steins;Gate', 6),
 ('Great Teacher Onizuka', 5),
 ('Tengen Toppa Gurren Lagann', 4),
 ('Clannad: After Story', 3),
 ('One Punch Man', 3)]

In [59]:
user_predicted_rating('Naruto', 3)  #基于用户的协同过滤预测3号用户对火影的评分

7.8053948487555

In [60]:
item_predicted_rating('Naruto', 3)   #基于物品的协同过滤预测3号用户对火影的评分

7.405785759803792

In [97]:
# 针对某个用户计算MSE
def mean_squared_error(user):
    errors = []
    watched = test_piv.T[test_piv.loc[user,:]>0].index.tolist()
    for i in watched:
        actual=test_piv.loc[user, i]  # 真实评分
        predicted = item_predicted_rating(i, user)  # 预测评分
        errors.append((predicted-actual)**2)  # 残差平方
    return np.mean(errors)  # 均方误差

In [98]:
mean_squared_error(3)

1.3892033052862747