In [1]:
import pandas as pd

In [2]:
data = pd.read_csv("data/data.csv")
data.set_index('user_id',inplace = True)
print(len(data.columns))
movie_matrix = data.pivot_table(index = 'user_id',columns = 'title',values = 'rating')
movie_matrix.fillna(0,inplace = True)
movie_matrix.index

10


Int64Index([   1,    2,    3,    4,    5,    6,    7,    8,    9,   10,
            ...
            6031, 6032, 6033, 6034, 6035, 6036, 6037, 6038, 6039, 6040],
           dtype='int64', name='user_id', length=6040)

## 1、基于用户相似

### 1.1 欧式距离

In [3]:
# 1、欧式距离（x1与x2的差的平方和开根号）
# 注意这里只针对两个用户共同评价的的电影计算欧式距离;movie_matrix
# 参考链接：
# 1、https://blog.csdn.net/qq_25948717/article/details/81839463
# 2、欧式距离：https://blog.csdn.net/sunnyyoona/article/details/39721485
# 3\https://www.cnblogs.com/liuning8023/p/5417052.html

from math import * 

# 计算两个用户之间的相似度
def Euclidean(user1,user2):
    distance = 0
    array1 = movie_matrix.loc[user1].values
    array2 = movie_matrix.loc[user2].values
    length = len(array1)
    for i in range(length):
        if float(array1[i]) > 0 and float(array2[i]) > 0 :
            distance += pow(float(array1[i]) - float(array2[i]),2)
    return 1/(1 + sqrt(distance))

# 计算某用户与所有其他用户之间的相似度，并选取最相似的那一个
def top_user(user):
    res = []
    length = len(movie_matrix.index)
    for i in movie_matrix.index:
        if user != i:
            similar = Euclidean(user,i)
            res.append((i,similar))
    res.sort(key=lambda val:val[1],reverse = True)
    return res[:3]

# 向目标用户user推荐电影
def recommend(user):
    top_sim_user = top_user(user)[0][0]
    recommends = []
    items = movie_matrix.loc[user].values
    length = len(movie_matrix.columns)
    for i in range(length):
        if movie_matrix.iloc[user-1][i] == 0 and movie_matrix.iloc[top_sim_user-1][i] > 0:
            recommends.append((movie_matrix.columns[i],movie_matrix.iloc[top_sim_user-1][i]))
    recommends.sort(key = lambda val:val[1],reverse = True)
    return recommends[:3]

recommend(5)

[('Amityville Horror, The (1979)', 5.0),
 ('Baby, The (1973)', 5.0),
 ('Blair Witch Project, The (1999)', 5.0)]

# 基于记忆的协同过滤整个baseline

### （1）数据处理

In [4]:
from sklearn import model_selection

# 'user_id', 'item_id', 'rating', 'timestamp'
header = ['user_id', 'item_id', 'rating', 'timestamp'] 
df = pd.read_table('data/ratings.dat',sep='::',names = header)
train_data,test_data = model_selection.train_test_split(df,test_size = 0.25)
df.head()

  """


Unnamed: 0,user_id,item_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [5]:
n_users = df.user_id.unique().shape[0]
n_items = df.item_id.unique().shape[0]
# print(n_users,n_items)

train_data_matrix = train_data.pivot_table(index = 'user_id',columns = 'item_id',values = 'rating')
test_data_matrix = test_data.pivot_table(index = 'user_id',columns = 'item_id',values = 'rating')

### （2）计算余弦相似性

In [13]:
train_data_matrix.fillna(0,inplace = True)
test_data_matrix.fillna(0,inplace = True)

train_data_matrix = train_data_matrix.values
test_data_matrix = test_data_matrix.values

from sklearn.metrics.pairwise import pairwise_distances
user_similarity = pairwise_distances(train_data_matrix,metric='cosine')
item_similarity = pairwise_distances(train_data_matrix.T,metric='cosine')
user_similarity[:5]#和自己本身的相似度不计算，记为0

array([[0.        , 0.90030823, 0.89250953, ..., 1.        , 0.86166535,
        0.90627809],
       [0.90030823, 0.        , 0.84358166, ..., 0.97379625, 0.95198501,
        0.80746825],
       [0.89250953, 0.84358166, 0.        , ..., 0.90729282, 0.92237371,
        0.90643797],
       [0.94155176, 0.83401254, 0.88597029, ..., 0.95160661, 0.9067459 ,
        0.90162304],
       [0.94843388, 0.90084985, 0.92155709, ..., 0.99375801, 0.98969001,
        0.80480405]])

### （3）预测

> 例如：设想，用户k对他最喜欢的电影评价4颗星，其他好电影评价3颗星。假设现在另一个用户t对他最喜欢的一部电影评价为5颗星，看了想睡觉的一部电影评价为3颗星。这两位用户电影口味可能很相似，但使用评价体系的方法不同。

(3)-1 user-based CF(需要修正用户评价)

In [17]:
import numpy as np

#要知道我们在文章中看到的公式都是就某个值进行计算，现实中都是大量数组数据同时计算
def predict(ratings,similarity,type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        ratings_diff = (ratings - mean_user_rating[:,np.newaxis])#np.newaxis相当于None
        pred = mean_user_rating[:,np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T#修正
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    
    return pred

item_prediction = predict(train_data_matrix,item_similarity,type = 'item')
user_prediction = predict(train_data_matrix,user_similarity,type = 'user')

# 踩坑：科学计算的时候array更方便，df主要适用于数据操作

### (4)评估

mean square error(均方误差)

In [18]:
from sklearn.metrics import mean_squared_error
from math import sqrt

def rmse(prediction,ground_truth):
    # ground_truth.nonzero():过滤掉prediction中不必要的元素
    prediction = prediction[ground_truth.nonzero()].flatten()#flatten:按行展开成一维数组
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction,ground_truth))

print('user-based CF RMSE:' + str(rmse(user_prediction,test_data_matrix)))
print('item-based CF RMSE:' + str(rmse(item_prediction,test_data_matrix)))

user-based CF RMSE:3.6190856160870934
item-based CF RMSE:3.6377405519496184


## 2、基于模型的协同过滤

当面对新用户或者开启新项目的时候，基于记忆的算法不能扩展到现实世界的场景

学习用户潜在喜好和项目的潜在特征

矩阵因式分解

Collaborative Filtering（https://blog.csdn.net/pipisorry/article/details/51788955/） 和Content-based 模型?

个性化）推荐系统构建三大方法：基于内容的推荐content-based，协同过滤collaborative filtering，隐语义模型

https://www.tuicool.com/articles/6vqyYfR

https://blog.csdn.net/u011537073/article/details/54143441