In [2]:
import numpy as np
import pandas as pd

In [3]:
header = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv('u.data', sep='\t', names=header)


In [4]:
n_users = df.user_id.unique().shape[0]
n_items = df.item_id.unique().shape[0]
print ('Number of users = ' + str(n_users) + ' | Number of movies = ' + str(n_items))

Number of users = 943 | Number of movies = 1682


In [5]:
from sklearn import cross_validation as cv
train_data, test_data = cv.train_test_split(df, test_size=0.25)



In [6]:
#Create two user-item matrices, one for training and another for testing
train_data_matrix = np.zeros((n_users, n_items))
for line in train_data.itertuples():
    train_data_matrix[line[1]-1, line[2]-1] = line[3]

test_data_matrix = np.zeros((n_users, n_items))
for line in test_data.itertuples():
    test_data_matrix[line[1]-1, line[2]-1] = line[3]

In [7]:
from sklearn.metrics.pairwise import pairwise_distances
user_similarity = pairwise_distances(train_data_matrix, metric='cosine')
item_similarity = pairwise_distances(train_data_matrix.T, metric='cosine')

In [8]:
user_similarity

array([[ 0.        ,  0.856434  ,  0.9565955 , ...,  0.88879327,
         0.85538312,  0.71803181],
       [ 0.856434  ,  0.        ,  0.96122583, ...,  0.85812597,
         0.88903925,  0.90724243],
       [ 0.9565955 ,  0.96122583,  0.        , ...,  0.91723797,
         0.93298167,  0.97988699],
       ..., 
       [ 0.88879327,  0.85812597,  0.91723797, ...,  0.        ,
         0.97267212,  0.94233371],
       [ 0.85538312,  0.88903925,  0.93298167, ...,  0.97267212,
         0.        ,  0.93389849],
       [ 0.71803181,  0.90724243,  0.97988699, ...,  0.94233371,
         0.93389849,  0.        ]])

In [9]:
train_data_matrix.shape

(943, 1682)

In [10]:
test_data_matrix.shape

(943, 1682)

In [11]:
item_similarity

array([[ 0.        ,  0.66732332,  0.78971878, ...,  1.        ,
         1.        ,  1.        ],
       [ 0.66732332,  0.        ,  0.85271605, ...,  1.        ,
         1.        ,  1.        ],
       [ 0.78971878,  0.85271605,  0.        , ...,  1.        ,
         1.        ,  0.88205783],
       ..., 
       [ 1.        ,  1.        ,  1.        , ...,  0.        ,
         1.        ,  1.        ],
       [ 1.        ,  1.        ,  1.        , ...,  1.        ,
         0.        ,  1.        ],
       [ 1.        ,  1.        ,  0.88205783, ...,  1.        ,
         1.        ,  0.        ]])

In [12]:
item_similarity.shape

(1682, 1682)

In [13]:
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        #You use np.newaxis so that mean_user_rating has same format as ratings
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred

In [14]:
item_prediction = predict(train_data_matrix, item_similarity, type='item')
user_prediction = predict(train_data_matrix, user_similarity, type='user')

In [15]:
train_data_matrix

array([[ 5.,  3.,  4., ...,  0.,  0.,  0.],
       [ 4.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 5.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  5.,  0., ...,  0.,  0.,  0.]])

In [16]:
test_data_matrix

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [17]:
df.shape

(100000, 4)

In [18]:
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [19]:
print(train_data.shape, " ",test_data.shape)

(75000, 4)   (25000, 4)


In [20]:
from sklearn.metrics import mean_squared_error
from math import sqrt
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

In [21]:
print ('User-based CF RMSE: ', str(rmse(user_prediction, test_data_matrix)))
print ('Item-based CF RMSE: ', str(rmse(item_prediction, test_data_matrix)))

User-based CF RMSE:  3.123971361925488
Item-based CF RMSE:  3.450764690126627


In [22]:
user_prediction 

array([[  1.52438571e+00,   5.51761840e-01,   4.62490156e-01, ...,
          2.96124422e-01,   2.96085842e-01,   2.95777378e-01],
       [  1.25399691e+00,   2.66435517e-01,   1.39473807e-01, ...,
         -5.68253582e-02,  -5.54125115e-02,  -5.54052350e-02],
       [  1.27073562e+00,   2.22068846e-01,   1.02975159e-01, ...,
         -9.75067029e-02,  -9.58265815e-02,  -9.56650450e-02],
       ..., 
       [  1.12647321e+00,   1.82130081e-01,   6.45508502e-02, ...,
         -1.23346140e-01,  -1.22420933e-01,  -1.22160659e-01],
       [  1.32541862e+00,   2.97016490e-01,   1.94605115e-01, ...,
         -1.48486131e-03,  -6.32492699e-04,  -3.57930686e-04],
       [  1.34977236e+00,   3.52908215e-01,   2.72521214e-01, ...,
          1.01901651e-01,   1.01987851e-01,   1.01862955e-01]])

In [23]:
item_prediction

array([[ 0.36963022,  0.38222799,  0.39550905, ...,  0.44717863,
         0.43720284,  0.43295648],
       [ 0.09167753,  0.1069525 ,  0.10451885, ...,  0.10941902,
         0.11091993,  0.11069   ],
       [ 0.06785928,  0.07104139,  0.06958623, ...,  0.06727019,
         0.07044579,  0.07084463],
       ..., 
       [ 0.02782559,  0.03563003,  0.03560138, ...,  0.04101952,
         0.04060796,  0.04115975],
       [ 0.13806681,  0.14569971,  0.15308378, ...,  0.15718853,
         0.15602287,  0.15778335],
       [ 0.20349337,  0.19933987,  0.22209691, ...,  0.25454232,
         0.24761504,  0.24685515]])

In [24]:
user_prediction.shape

(943, 1682)

In [25]:
import scipy.sparse as sp
from scipy.sparse.linalg import svds

#get SVD components from train matrix. Choose k.
u, s, vt = svds(train_data_matrix, k = 20)
s_diag_matrix=np.diag(s)
X_pred = np.dot(np.dot(u, s_diag_matrix), vt)
print ('User-based CF MSE: ' + str(rmse(X_pred, test_data_matrix)))

User-based CF MSE: 2.7186381350771183


In [26]:
idx_to_movie = {}
with open('u.item', 'r') as f:
    for line in f.readlines():
        info = line.split('|')
        idx_to_movie[int(info[0])-1] = info[1]
idx_to_movie

{0: 'Toy Story (1995)',
 1: 'GoldenEye (1995)',
 2: 'Four Rooms (1995)',
 3: 'Get Shorty (1995)',
 4: 'Copycat (1995)',
 5: 'Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)',
 6: 'Twelve Monkeys (1995)',
 7: 'Babe (1995)',
 8: 'Dead Man Walking (1995)',
 9: 'Richard III (1995)',
 10: 'Seven (Se7en) (1995)',
 11: 'Usual Suspects, The (1995)',
 12: 'Mighty Aphrodite (1995)',
 13: 'Postino, Il (1994)',
 14: "Mr. Holland's Opus (1995)",
 15: 'French Twist (Gazon maudit) (1995)',
 16: 'From Dusk Till Dawn (1996)',
 17: 'White Balloon, The (1995)',
 18: "Antonia's Line (1995)",
 19: 'Angels and Insects (1995)',
 20: 'Muppet Treasure Island (1996)',
 21: 'Braveheart (1995)',
 22: 'Taxi Driver (1976)',
 23: 'Rumble in the Bronx (1995)',
 24: 'Birdcage, The (1996)',
 25: 'Brothers McMullen, The (1995)',
 26: 'Bad Boys (1995)',
 27: 'Apollo 13 (1995)',
 28: 'Batman Forever (1995)',
 29: 'Belle de jour (1967)',
 30: 'Crimson Tide (1995)',
 31: 'Crumb (1994)',
 32: 'Desperado (1995)',
 33: 'D

In [39]:
for x in range(item_similarity.shape[0]):
    for y in range(item_similarity[x].shape[0]):
        item_similarity[x][y] = 1 - item_similarity[x][y]
item_similarity        

array([[ 1.        ,  0.33267668,  0.21028122, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.33267668,  1.        ,  0.14728395, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.21028122,  0.14728395,  1.        , ...,  0.        ,
         0.        ,  0.11794217],
       ..., 
       [ 0.        ,  0.        ,  0.        , ...,  1.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         1.        ,  0.        ],
       [ 0.        ,  0.        ,  0.11794217, ...,  0.        ,
         0.        ,  1.        ]])

In [40]:
# FOR "COLD START" PROBLEM
def top_k_movies(similarity, mapper, movie_idx, k=6):
    return [mapper[x] for x in np.argsort(similarity[movie_idx,:])[:-k-1:-1]]

In [53]:
# WE WANT TO SUGGEST MOVIES TO A USER WHO LIKES THIS MOVIE.
fav = 'Batman Forever (1995)' 
with open('u.item', 'r') as f:
    for line in f.readlines():
        info = line.split('|')
        if info[1] == fav:
            idx = int(info[0])
            break
idx

29

In [54]:
idx = idx - 1
k = 7
movies = top_k_movies(item_similarity, idx_to_movie, idx, k = k + 1)
movies[1:k + 1]

['Cliffhanger (1993)',
 'Batman (1989)',
 'Judge Dredd (1995)',
 'Batman Returns (1992)',
 'Star Trek IV: The Voyage Home (1986)',
 'Demolition Man (1993)',
 'Star Trek VI: The Undiscovered Country (1991)']