In [96]:
import numpy as np
import pandas as pd

In [97]:
header = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv('ml-100k/u.data', sep='\t', names=header)

In [98]:
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


　先看一眼数据集中的前两行。接下来，让我们统计其中的用户和电影总数。



In [99]:
n_users = df.user_id.unique().shape[0]
n_items = df.item_id.unique().shape[0]
print ('Number of users = ' + str(n_users) + ' | Number of movies = ' + str(n_items))

Number of users = 943 | Number of movies = 1682


In [100]:
from sklearn import cross_validation as cv
train_data, test_data = cv.train_test_split(df, test_size=0.25)

# 基于内存协同过滤法



　　基于内存协同过滤法可以被主要分为两部分：用户-项目过滤（user-item filtering）和项目-项目过滤（ item-item filtering）。 user-item filtering选取一个特定用户，基于评价相似性找到与该用户相似的其他用户，并推荐那些相似用户所喜欢的项目。相比之下， item-item filtering 先选取一个项目，然后找出也喜欢这个项目的其他用户，并找出这些用户或相似用户也喜欢的其他项目，推荐过程需要项目并输出其他项目。

Item-Item Collaborative Filtering: “Users who liked this item also liked …”
User-Item Collaborative Filtering: “Users who are similar to you also liked …”
在这两种情况中，你根据整个数据集创建了一个用户-项目的矩阵。因为已经把数据分成了测试和训练两部分所以你需要创建两个[943 x 1682]矩阵。训练矩阵包含75%的评价，测试矩阵包含25%的矩阵。

用户-项目矩阵例子：



创建了用户-项目矩阵之后，计算相似性并创建一个相似度矩阵。

Item-Item Collaborative Filtering算法中项目之间的相似度依靠观测所有的已对相同项目评价的用户来测算。



对于User-Item Collaborative Filtering算法，用户之间的相似性依靠观测相同用户已评价的所有项目。



　　推荐系统中通常使用余弦相似性作为距离度量，在n维孔空间中评价被视为向量，基于这些向量之间的夹角来计算相似性。

　　用户a和m可以用下面的公式计算余弦相似性，其中你可以使用用户向量uk和ua之间的点积然后除以这两个向量欧式长度之乘。



　　而计算项目m和b之间的相似度可以用下面的公式：





首先创建user-item矩阵，因此你需要创建两个矩阵为测试和训练数据集。

In [101]:
#Create two user-item matrices, one for training and another for testing
train_data_matrix = np.zeros((n_users, n_items))
for line in train_data.itertuples():
    train_data_matrix[line[1]-1, line[2]-1] = line[3] 
    
test_data_matrix = np.zeros((n_users, n_items))
for line in test_data.itertuples():
    test_data_matrix[line[1]-1, line[2]-1] = line[3]

你可以使用 sklearn 的pairwise_distances函数来计算余弦相似性。注意，因为评价都为正值输出取值应为0到1.

In [102]:
from sklearn.metrics.pairwise import pairwise_distances
user_similarity = pairwise_distances(train_data_matrix, metric='cosine')
item_similarity = pairwise_distances(train_data_matrix.T, metric='cosine')

下一步是作出预测。既然构造了相似度矩阵user_similarity和item_similarity，因此你可以运用下面的公式为user-based CF做一个预测：



　　用户k和用户a之间的相似度根据一个相似用户a的一系列评价的乘积（修正为该用户的平均评价）的权重。你将需要标准化相似度这样可以使评价维持在1到5之间，最后一步，统计你想预测用户平均评价的总和。

　　这里考虑到的问题是一些用户评价所有电影时可能要么给最高分，要么给最低分。这些用户给出评价的相对不同比绝对值更重要。例如：设想，用户k对他最喜欢的电影评价4颗星，其他的好电影则评价3颗星。假设现在另一个用户t对他/她喜欢的一部电影评价为5颗星，看了想睡觉的一部电影评价为3颗星。这两位用户电影口味可能很相似但使用评价体系的方法不同。

　　当为item-based CF做一个推荐时候，你不要纠正用户的平均评价，因为用户本身用查询来做预测。

In [103]:
def predict(ratings, similarity, types):
    if types == 'user':
        mean_user_rating = ratings.mean(axis=1)
        #You use np.newaxis so that mean_user_rating has same format as ratings
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif types == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])    
    return pred

In [104]:
item_similarity = 1 - item_similarity
user_prediction = 1 - user_prediction

In [105]:
item_prediction = predict(train_data_matrix, item_similarity, 'item')
user_prediction = predict(train_data_matrix, user_similarity, 'user')

In [106]:
train_data_matrix

array([[ 0.,  3.,  4., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 5.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  5.,  0., ...,  0.,  0.,  0.]])

In [107]:
item_similarity

array([[ 1.        ,  0.27955118,  0.26455616, ...,  0.        ,
         0.05465623,  0.        ],
       [ 0.27955118,  1.        ,  0.22326236, ...,  0.        ,
         0.09065968,  0.        ],
       [ 0.26455616,  0.22326236,  1.        , ...,  0.        ,
         0.        ,  0.        ],
       ..., 
       [ 0.        ,  0.        ,  0.        , ...,  1.        ,
         0.        ,  0.        ],
       [ 0.05465623,  0.09065968,  0.        , ...,  0.        ,
         1.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  1.        ]])

In [108]:
train_data_matrix.dot(item_similarity)

array([[ 207.33881468,  193.29434831,  149.19050297, ...,    0.        ,
          24.95477152,    0.        ],
       [  40.63337675,   22.85132375,   24.97669612, ...,    0.        ,
           3.06766976,    0.        ],
       [  21.11581952,   15.02151818,   13.35527799, ...,    0.        ,
           1.40533718,    0.        ],
       ..., 
       [  25.41090616,   13.15115717,   14.37093791, ...,    0.        ,
           2.44785185,    0.        ],
       [  75.39447593,   60.88229237,   38.56576454, ...,    0.        ,
           8.49521379,    0.        ],
       [ 131.83468536,  141.46934654,   91.49183252, ...,    0.        ,
          19.20075592,    0.        ]])

In [109]:
 np.array([np.abs(item_similarity).sum(axis=1)])

array([[ 229.7910334 ,  217.32317181,  160.87485905, ...,    1.        ,
          24.93489801,    1.        ]])

In [110]:
item_prediction

array([[ 0.90229289,  0.88943276,  0.92736991, ...,  0.        ,
         1.00079702,  0.        ],
       [ 0.17682751,  0.10514904,  0.15525543, ...,  0.        ,
         0.12302716,  0.        ],
       [ 0.0918914 ,  0.06912065,  0.08301656, ...,  0.        ,
         0.05636025,  0.        ],
       ..., 
       [ 0.11058267,  0.06051429,  0.08932992, ...,  0.        ,
         0.09816972,  0.        ],
       [ 0.32810016,  0.28014635,  0.23972524, ...,  0.        ,
         0.34069575,  0.        ],
       [ 0.57371553,  0.65096301,  0.5687143 , ...,  0.        ,
         0.77003547,  0.        ]])

In [111]:
user_prediction

array([[ 1.60983095,  0.57066506,  0.49957309, ...,  0.29883223,
         0.30119721,  0.29883223],
       [ 1.33491331,  0.27509894,  0.15861048, ..., -0.0699804 ,
        -0.0667833 , -0.0699804 ],
       [ 1.34525195,  0.2435305 ,  0.13870624, ..., -0.09971135,
        -0.09642453, -0.09971135],
       ..., 
       [ 1.19570112,  0.20175209,  0.09167406, ..., -0.13072517,
        -0.12769873, -0.13072517],
       [ 1.37465031,  0.31427483,  0.23291778, ...,  0.00380413,
         0.00656176,  0.00380413],
       [ 1.41983143,  0.37542525,  0.31395489, ...,  0.10816345,
         0.11053602,  0.10816345]])

In [112]:
for i in range(item_prediction.shape[0]):
    for j in range(item_prediction.shape[1]):
        if item_prediction[i][j] == max(item_prediction[i]):
            print ("user:%s , film:%s , max_ranting:%f" %(i,j,item_prediction[i][j]))

user:0 , film:118 , max_ranting:1.797643
user:1 , film:1670 , max_ranting:0.527797
user:2 , film:1611 , max_ranting:0.650608
user:3 , film:1670 , max_ranting:0.587794
user:4 , film:1617 , max_ranting:0.843272
user:5 , film:710 , max_ranting:1.458614
user:6 , film:598 , max_ranting:3.963528
user:6 , film:676 , max_ranting:3.963528
user:7 , film:1543 , max_ranting:0.428393
user:8 , film:1670 , max_ranting:0.279504
user:9 , film:710 , max_ranting:4.211352
user:10 , film:713 , max_ranting:0.913365
user:11 , film:1672 , max_ranting:0.401275
user:12 , film:851 , max_ranting:3.287100
user:12 , film:856 , max_ranting:3.287100
user:13 , film:919 , max_ranting:1.520452
user:14 , film:1373 , max_ranting:0.775584
user:15 , film:1617 , max_ranting:0.954004
user:16 , film:1605 , max_ranting:0.218763
user:17 , film:956 , max_ranting:2.106082
user:18 , film:1617 , max_ranting:0.240980
user:19 , film:1543 , max_ranting:0.417791
user:20 , film:986 , max_ranting:1.961388
user:21 , film:1617 , max_ranting

In [113]:
user_prediction

array([[ 1.60983095,  0.57066506,  0.49957309, ...,  0.29883223,
         0.30119721,  0.29883223],
       [ 1.33491331,  0.27509894,  0.15861048, ..., -0.0699804 ,
        -0.0667833 , -0.0699804 ],
       [ 1.34525195,  0.2435305 ,  0.13870624, ..., -0.09971135,
        -0.09642453, -0.09971135],
       ..., 
       [ 1.19570112,  0.20175209,  0.09167406, ..., -0.13072517,
        -0.12769873, -0.13072517],
       [ 1.37465031,  0.31427483,  0.23291778, ...,  0.00380413,
         0.00656176,  0.00380413],
       [ 1.41983143,  0.37542525,  0.31395489, ...,  0.10816345,
         0.11053602,  0.10816345]])

# 评估　　

In [114]:
from sklearn.metrics import mean_squared_error
from math import sqrt
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))


In [115]:

print ('User-based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix)))
print ('Item-based CF RMSE: ' + str(rmse(item_prediction, test_data_matrix)))

User-based CF RMSE: 3.1299052969657892
Item-based CF RMSE: 3.1700674073538098
