In [1]:
import numpy as np
import pandas as pd

In [2]:
header = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv('ml-100k/u.data', sep='\t', names=header)
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [3]:
n_users = df.user_id.unique().shape[0]
n_items = df.item_id.unique().shape[0]
print (str(n_users) + 'users')
print (str(n_items) + 'items')

943users
1682items


构造 用户-电影评分矩阵


In [4]:
ratings = np.zeros((n_users, n_items))
for row in df.itertuples():
    ratings[row[1]-1, row[2]-1] = row[3]
ratings

array([[ 5.,  3.,  4., ...,  0.,  0.,  0.],
       [ 4.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 5.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  5.,  0., ...,  0.,  0.,  0.]])

计算数据稀疏度


In [5]:
sparsity = float(len(ratings.nonzero()[0]))
sparsity /= (ratings.shape[0] * ratings.shape[1])
sparsity *= 100
print ('Sparsity: {:4.2f}%'.format(sparsity))

Sparsity: 6.30%


　把每个用户对电影的10个评分拿出来，将数据分为训练集与测试机两部分


In [6]:
from sklearn import cross_validation as cv
train, test = cv.train_test_split(ratings, test_size=0.25)



In [7]:
def fast_similarity(ratings, kind='user', epsilon=1e-9):
    # epsilon -> small number for handling dived-by-zero errors
    if kind == 'user':
        sim = ratings.dot(ratings.T) + epsilon
    elif kind == 'item':
        sim = ratings.T.dot(ratings) + epsilon
    norms = np.array([np.sqrt(np.diagonal(sim))])
    return (sim / norms / norms.T)

In [8]:
%timeit fast_similarity(train, kind='user')

The slowest run took 5.06 times longer than the fastest. This could mean that an intermediate result is being cached.
10 loops, best of 3: 17 ms per loop


In [9]:
user_similarity = fast_similarity(train, kind='user')
item_similarity = fast_similarity(train, kind='item')

In [10]:
user_similarity

array([[ 1.        ,  0.42163526,  0.49092877, ...,  0.21150221,
         0.07409907,  0.06463378],
       [ 0.42163526,  1.        ,  0.37068397, ...,  0.07308831,
         0.13291959,  0.11517464],
       [ 0.49092877,  0.37068397,  1.        , ...,  0.16927496,
         0.07605612,  0.07414699],
       ..., 
       [ 0.21150221,  0.07308831,  0.16927496, ...,  1.        ,
         0.03946981,  0.06594862],
       [ 0.07409907,  0.13291959,  0.07605612, ...,  0.03946981,
         1.        ,  0.34569567],
       [ 0.06463378,  0.11517464,  0.07414699, ...,  0.06594862,
         0.34569567,  1.        ]])

In [11]:
item_similarity

array([[  1.00000000e+00,   3.84825821e-01,   3.18238612e-01, ...,
          6.72672794e-12,   5.38138235e-02,   5.38138235e-02],
       [  3.84825821e-01,   1.00000000e+00,   2.65926434e-01, ...,
          1.54450516e-11,   9.26703095e-02,   9.26703095e-02],
       [  3.18238612e-01,   2.65926434e-01,   1.00000000e+00, ...,
          1.93891684e-11,   1.29261122e-11,   1.16335010e-01],
       ..., 
       [  6.72672794e-12,   1.54450516e-11,   1.93891684e-11, ...,
          1.00000000e+00,   1.66666667e-10,   1.66666667e-10],
       [  5.38138235e-02,   9.26703095e-02,   1.29261122e-11, ...,
          1.66666667e-10,   1.00000000e+00,   1.11111111e-10],
       [  5.38138235e-02,   9.26703095e-02,   1.16335010e-01, ...,
          1.66666667e-10,   1.11111111e-10,   1.00000000e+00]])

In [12]:
def predict_fast_simple(ratings, similarity, kind='user'):
    if kind == 'user':
        return similarity.dot(ratings) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif kind == 'item':
        return ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])

In [13]:
item_prediction = predict_fast_simple(train, item_similarity, kind='item')
user_prediction = predict_fast_simple(train, user_similarity, kind='user')

In [14]:
for i in range(item_prediction.shape[0]):
    for j in range(item_prediction.shape[1]):
        if item_prediction[i][j] > max(item_prediction[i]) * 0.8:
            print ("user:%s , film:%s , max_ranting:%f" %(i,j,item_prediction[i][j]))

user:0 , film:710 , max_ranting:1.700683
user:0 , film:972 , max_ranting:1.512581
user:0 , film:1672 , max_ranting:1.383583
user:0 , film:1676 , max_ranting:1.477298
user:1 , film:700 , max_ranting:0.741038
user:1 , film:972 , max_ranting:0.802954
user:1 , film:1129 , max_ranting:0.762784
user:1 , film:1672 , max_ranting:0.908161
user:2 , film:700 , max_ranting:0.841196
user:2 , film:710 , max_ranting:0.971632
user:2 , film:972 , max_ranting:0.954515
user:2 , film:1121 , max_ranting:0.852467
user:2 , film:1123 , max_ranting:0.795368
user:2 , film:1125 , max_ranting:0.801357
user:2 , film:1298 , max_ranting:0.791603
user:2 , film:1532 , max_ranting:0.902487
user:2 , film:1672 , max_ranting:0.846523
user:3 , film:972 , max_ranting:1.261148
user:3 , film:1129 , max_ranting:1.339317
user:3 , film:1281 , max_ranting:1.264564
user:3 , film:1672 , max_ranting:1.550833
user:4 , film:700 , max_ranting:1.035587
user:4 , film:710 , max_ranting:1.238077
user:4 , film:972 , max_ranting:1.048290
use

使用sklearn计算MSE,首先去除数据矩阵中的无效0值，然后直接调用sklearn里面的mean_squared_error函数计算MSE


In [15]:
from sklearn.metrics import mean_squared_error

def get_mse(pred, actual):
    # Ignore nonzero terms.
    pred = pred[actual.nonzero()].flatten()
    actual = actual[actual.nonzero()].flatten()
    return mean_squared_error(pred, actual)

In [16]:

print ('User-based CF MSE:' + str(get_mse(user_prediction, test)))
print ('Item-based CF MSE:'  + str(get_mse(item_prediction, test)))

User-based CF MSE:9.01059180519
Item-based CF MSE:11.1304713507


 为提高预测的MSE，可以只考虑使用与目标用户最相似的k个用户的数据，进行Top-k预测并进行MSE计算



In [17]:
def predict_topk(ratings, similarity, kind='user', k=40):
    pred = np.zeros(ratings.shape)
    if kind == 'user':
        for i in range(ratings.shape[0]):
            top_k_users = [np.argsort(similarity[:,i])[:-k-1:-1]]
            for j in range(ratings.shape[1]):
                pred[i, j] = similarity[i, :][top_k_users].dot(ratings[:, j][top_k_users]) 
                pred[i, j] /= np.sum(np.abs(similarity[i, :][top_k_users]))
    if kind == 'item':
        for j in range(ratings.shape[1]):
            top_k_items = [np.argsort(similarity[:,j])[:-k-1:-1]]
            for i in range(ratings.shape[0]):
                pred[i, j] = similarity[j, :][top_k_items].dot(ratings[i, :][top_k_items].T) 
                pred[i, j] /= np.sum(np.abs(similarity[j, :][top_k_items]))        
    
    return pred

In [None]:
pred = predict_topk(train, user_similarity, kind='user', k=40)
print ('Top-k User-based CF MSE:'  + str(get_mse(pred, test)))

pred = predict_topk(train, item_similarity, kind='item', k=40)
print ('Top-k Item-based CF MSE: ' + str(get_mse(pred, test)))

Top-k User-based CF MSE:8.80845775018
Top-k Item-based CF MSE: 9.27217280415


为进一步降低MSE，这里尝试使用不同的k值寻找最小的MSE，使用matplotlib 可视化输出结果


In [None]:
k_array = [5, 15, 30, 50, 100, 200]
user_train_mse = []
user_test_mse = []
item_test_mse = []
item_train_mse = []

def get_mse(pred, actual):
    pred = pred[actual.nonzero()].flatten()
    actual = actual[actual.nonzero()].flatten()
    return mean_squared_error(pred, actual)

for k in k_array:
    user_pred = predict_topk(train, user_similarity, kind='user', k=k)
    item_pred = predict_topk(train, item_similarity, kind='item', k=k)
    
    user_train_mse += [get_mse(user_pred, train)]
    user_test_mse += [get_mse(user_pred, test)]
    
    item_train_mse += [get_mse(item_pred, train)]
    item_test_mse += [get_mse(item_pred, test)]  

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

pal = sns.color_palette("Set2", 2)

plt.figure(figsize=(8, 8))
plt.plot(k_array, user_train_mse, c=pal[0], label='User-based train', alpha=0.5, linewidth=5)
plt.plot(k_array, user_test_mse, c=pal[0], label='User-based test', linewidth=5)
plt.plot(k_array, item_train_mse, c=pal[1], label='Item-based train', alpha=0.5, linewidth=5)
plt.plot(k_array, item_test_mse, c=pal[1], label='Item-based test', linewidth=5)
plt.legend(loc='best', fontsize=20)
plt.xticks(fontsize=16);
plt.yticks(fontsize=16);
plt.xlabel('k', fontsize=30);
plt.ylabel('MSE', fontsize=30);

 计算无偏置下均方根误差MSE


In [None]:
def predict_nobias(ratings, similarity, kind='user'):
    if kind == 'user':
        user_bias = ratings.mean(axis=1)
        ratings = (ratings - user_bias[:, np.newaxis]).copy()
        pred = similarity.dot(ratings) / np.array([np.abs(similarity).sum(axis=1)]).T
        pred += user_bias[:, np.newaxis]
    elif kind == 'item':
        item_bias = ratings.mean(axis=0)
        ratings = (ratings - item_bias[np.newaxis, :]).copy()
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
        pred += item_bias[np.newaxis, :]
        
    return pred

In [None]:
user_pred = predict_nobias(train, user_similarity, kind='user')
print ('Bias-subtracted User-based CF MSE:'  + str(get_mse(user_pred, test)))

item_pred = predict_nobias(train, item_similarity, kind='item')
print ('Bias-subtracted Item-based CF MSE:'  + str(get_mse(item_pred, test)))

In [None]:
import requests
import json


In [None]:
# response = requests.get('http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)')
# print (response.url.split('/')[-2])

In [None]:
# # Get base url filepath structure. w185 corresponds to size of movie poster.
# headers = {'Accept': 'application/json'}
# payload = {'api_key': '这里填入你的API'} 
# response = requests.get("http://api.themoviedb.org/3/configuration", params=payload, headers=headers)
# response = json.loads(response.text)
# base_url = response['images']['base_url'] + 'w185'

# def get_poster(imdb_url, base_url):
#     # Get IMDB movie ID
#     response = requests.get(imdb_url)
#     movie_id = response.url.split('/')[-2]
    
#     # Query themoviedb.org API for movie poster path.
#     movie_url = 'http://api.themoviedb.org/3/movie/{:}/images'.format(movie_id)
#     headers = {'Accept': 'application/json'}
#     payload = {'api_key': '这里填入你的API'} 
#     response = requests.get(movie_url, params=payload, headers=headers)
#     try:
#         file_path = json.loads(response.text)['posters'][0]['file_path']
#     except:
#         # IMDB movie ID is sometimes no good. Need to get correct one.
#         movie_title = imdb_url.split('?')[-1].split('(')[0]
#         payload['query'] = movie_title
#         response = requests.get('http://api.themoviedb.org/3/search/movie', params=payload, headers=headers)
#         movie_id = json.loads(response.text)['results'][0]['id']
#         payload.pop('query', None)
#         movie_url = 'http://api.themoviedb.org/3/movie/{:}/images'.format(movie_id)
#         response = requests.get(movie_url, params=payload, headers=headers)
#         file_path = json.loads(response.text)['posters'][0]['file_path']
        
#     return base_url + file_path