In [1]:
import pandas as pd

In [19]:
data = pd.read_csv("data/data.csv")
data.set_index('user_id',inplace = True)
print(len(data.columns))
movie_matrix = data.pivot_table(index = 'user_id',columns = 'title',values = 'rating')
movie_matrix.fillna(0,inplace = True)
movie_matrix.index

11


Index(['$1,000,000 Duck (1971)', ''Night Mother (1986)',
       ''Til There Was You (1997)', ''burbs, The (1989)',
       '...And Justice for All (1979)', '1-900 (1994)',
       '10 Things I Hate About You (1999)', '101 Dalmatians (1961)',
       '101 Dalmatians (1996)', '12 Angry Men (1957)',
       ...
       'Young Poisoner's Handbook, The (1995)', 'Young Sherlock Holmes (1985)',
       'Young and Innocent (1937)', 'Your Friends and Neighbors (1998)',
       'Zachariah (1971)', 'Zed & Two Noughts, A (1985)', 'Zero Effect (1998)',
       'Zero Kelvin (Kj鎟lighetens kj鴗ere) (1995)', 'Zeus and Roxanne (1997)',
       'eXistenZ (1999)'],
      dtype='object', name='title', length=3706)

## 1、基于用户相似

### 1.1 欧式距离

In [3]:
# 1、欧式距离（x1与x2的差的平方和开根号）
# 注意这里只针对两个用户共同评价的的电影计算欧式距离;movie_matrix
# 参考链接：
# 1、https://blog.csdn.net/qq_25948717/article/details/81839463
# 2、欧式距离：https://blog.csdn.net/sunnyyoona/article/details/39721485
# 3\https://www.cnblogs.com/liuning8023/p/5417052.html

from math import * 

# 计算两个用户之间的相似度
def Euclidean(user1,user2):
    distance = 0
    array1 = movie_matrix.loc[user1].values
    array2 = movie_matrix.loc[user2].values
    length = len(array1)
    for i in range(length):
        if float(array1[i]) > 0 and float(array2[i]) > 0 :
            distance += pow(float(array1[i]) - float(array2[i]),2)
    return 1/(1 + sqrt(distance))

# 计算某用户与所有其他用户之间的相似度，并选取最相似的那一个
def top_user(user):
    res = []
    length = len(movie_matrix.index)
    for i in movie_matrix.index:
        if user != i:
            similar = Euclidean(user,i)
            res.append((i,similar))
    res.sort(key=lambda val:val[1],reverse = True)
    return res[:3]

# 向目标用户user推荐电影
def recommend(user):
    top_sim_user = top_user(user)[0][0]
    recommends = []
    items = movie_matrix.loc[user].values
    length = len(movie_matrix.columns)
    for i in range(length):
        if movie_matrix.iloc[user-1][i] == 0 and movie_matrix.iloc[top_sim_user-1][i] > 0:
            recommends.append((movie_matrix.columns[i],movie_matrix.iloc[top_sim_user-1][i]))
    recommends.sort(key = lambda val:val[1],reverse = True)
    return recommends[:3]

recommend(5)

[('Amityville Horror, The (1979)', 5.0),
 ('Baby, The (1973)', 5.0),
 ('Blair Witch Project, The (1999)', 5.0)]

## 2、基于模型的推荐系统

1、模型评价：均方根误差

### （1）数据处理

In [15]:
from sklearn import model_selection

# 'user_id', 'item_id', 'rating', 'timestamp'
header = ['user_id', 'item_id', 'rating', 'timestamp'] 
df = pd.read_table('data/ratings.dat',sep='::',names = header)
train_data,test_data = model_selection.train_test_split(df,test_size = 0.25)
df.head()

  """


Unnamed: 0,user_id,item_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [28]:
n_users = df.user_id.unique().shape[0]
n_items = df.item_id.unique().shape[0]
# print(n_users,n_items)

train_data_matrix = train_data.pivot_table(index = 'user_id',columns = 'item_id',values = 'rating')
test_data_matrix = test_data.pivot_table(index = 'user_id',columns = 'item_id',values = 'rating')

### （2）计算余弦相似性

In [29]:
train_data_matrix.fillna(0,inplace = True)
test_data_matrix.fillna(0,inplace = True)

from sklearn.metrics.pairwise import pairwise_distances
user_similarity = pairwise_distances(train_data_matrix,metric='cosine')
item_similarity = pairwise_distances(train_data_matrix.T,metric='cosine')

### （3）预测

> 例如：设想，用户k对他最喜欢的电影评价4颗星，其他好电影评价3颗星。假设现在另一个用户t对他最喜欢的一部电影评价为5颗星，看了想睡觉的一部电影评价为3颗星。这两位用户电影口味可能很相似，但使用评价体系的方法不同。

(3)-1 user-based CF(需要修正用户评价)

In [None]:
def predict(ratings,similarity,type='user'):
    if type == 'user':
        mean_user_rating = (ratings.meaning[:,np.newaxis])