In [132]:
#import all libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cross_validation import train_test_split

In [133]:
#load data and analyse it
column_names=['user_id',"item_id","rating","timestamp"]
movie_data=pd.read_csv(r"C:\Users\Preeti Saraswat\Desktop\jupyter notebooks\ml-100k\ml-100k\u.data", sep="\t", names=column_names)

In [213]:
movie_data.head(10)

Unnamed: 0,user_id,item_id,rating,timestamp,title
0,196,242,3,881250949,Kolya (1996)
1,63,242,3,875747190,Kolya (1996)
2,226,242,5,883888671,Kolya (1996)
3,154,242,3,879138235,Kolya (1996)
4,306,242,5,876503793,Kolya (1996)
5,296,242,4,884196057,Kolya (1996)
6,34,242,5,888601628,Kolya (1996)
7,271,242,4,885844495,Kolya (1996)
8,201,242,4,884110598,Kolya (1996)
9,209,242,4,883589606,Kolya (1996)


In [237]:
movie_titles = pd.read_csv(r"C:\Users\Preeti Saraswat\Desktop\jupyter notebooks\ml-100k\ml-100k\Movie_Id_Titles")
movie_titles.head(50)

Unnamed: 0,item_id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)
5,6,Shanghai Triad (Yao a yao yao dao waipo qiao) ...
6,7,Twelve Monkeys (1995)
7,8,Babe (1995)
8,9,Dead Man Walking (1995)
9,10,Richard III (1995)


In [215]:
movie_data = pd.merge(movie_data, movie_titles, on='item_id')
movie_data.head()

Unnamed: 0,user_id,item_id,rating,timestamp,title_x,title_y
0,196,242,3,881250949,Kolya (1996),Kolya (1996)
1,63,242,3,875747190,Kolya (1996),Kolya (1996)
2,226,242,5,883888671,Kolya (1996),Kolya (1996)
3,154,242,3,879138235,Kolya (1996),Kolya (1996)
4,306,242,5,876503793,Kolya (1996),Kolya (1996)


In [216]:
movie_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 99999
Data columns (total 6 columns):
user_id      100000 non-null int64
item_id      100000 non-null int64
rating       100000 non-null int64
timestamp    100000 non-null int64
title_x      100000 non-null object
title_y      100000 non-null object
dtypes: int64(4), object(2)
memory usage: 5.3+ MB


In [138]:
num_of_users=len(np.unique(movie_data['user_id']))
num_of_users

943

In [139]:
num_of_movies=len(np.unique(movie_data['item_id']))
num_of_movies

1682

In [140]:
#create user preference matrix based on explicit feedback model.
#map user/item ID's to user/item indices
ratings = np.zeros((num_of_users, num_of_movies))

In [141]:
for row in movie_data.itertuples():
    ratings[row[1]-1, row[2]-1] = row[3]
ratings

array([[5., 3., 4., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [5., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 5., 0., ..., 0., 0., 0.]])

In [142]:
#now convert it to sparsematrix
sparsity = float(len(ratings.nonzero()[0]))
sparsity /= (ratings.shape[0] * ratings.shape[1])
sparsity *= 100
print ('Sparsity: {:4.2f}%'.format(sparsity))

Sparsity: 6.30%


In [143]:
#split the data into train-test
train_data, test_data = train_test_split(ratings, test_size=0.25)

In [144]:
# now we need to find out the similarity in train data to make predictions in test data

In [145]:
from sklearn.metrics.pairwise import pairwise_distances
user_similarity = pairwise_distances(train_data, metric='cosine')
item_similarity = pairwise_distances(train_data.T, metric='cosine') #transpose because now we will have each movie in row and users in cols


In [146]:
print (item_similarity[:4, :4])

[[0.         0.59331855 0.66646388 0.54361739]
 [0.59331855 0.         0.76279149 0.51149275]
 [0.66646388 0.76279149 0.         0.68483253]
 [0.54361739 0.51149275 0.68483253 0.        ]]


In [147]:
print (user_similarity[:4, :4])

[[0.         0.76950819 0.83507977 0.5017549 ]
 [0.76950819 0.         0.87507369 0.81176469]
 [0.83507977 0.87507369 0.         0.73637513]
 [0.5017549  0.81176469 0.73637513 0.        ]]


In [148]:
#now we have similarity between users and similarity between movies so we need to predict for test data

In [149]:
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        #You use np.newaxis so that mean_user_rating has same format as ratings
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis]) 
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])     
    return pred

In [150]:
item_prediction = predict(train_data, item_similarity, type='item')
user_prediction = predict(train_data, user_similarity, type='user')

In [151]:
user_prediction[1].sort()

In [152]:
from sklearn.metrics import mean_squared_error
from math import sqrt
def rmse(prediction, actual):
    prediction = prediction[actual.nonzero()].flatten() 
    actual = actual[actual.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, actual))

In [153]:
print('User-based CF RMSE: ' + str(rmse(user_prediction, test_data)))
print('Item-based CF RMSE: ' + str(rmse(item_prediction, test_data)))

User-based CF RMSE: 3.0984866032288285
Item-based CF RMSE: 3.5140906960221923


In [117]:
idx_to_movie = {}
with open(r'C:\Users\Preeti Saraswat\Desktop\jupyter notebooks\ml-100k\ml-100k\u.item', 'r') as f:
    for line in f.readlines():
        info = line.split('|')
        idx_to_movie[int(info[0])-1] = info[4]
def top_k_movies(similarity, mapper, movie_idx, k=6):
    return [x for x in np.argsort(similarity[movie_idx,:])[:-k-1:-1]]

In [238]:
#now to find similar item ex for user1
idx = 29 # Batman Forever
similar_movie_ids = top_k_movies(item_similarity, idx_to_movie, idx)

In [239]:
similar_movie_ids

[800, 436, 1361, 1362, 1363, 1364]

In [240]:
movie_titles.loc[movie_titles['item_id'].isin(similar_movie_ids)]

Unnamed: 0,item_id,title
435,436,"American Werewolf in London, An (1981)"
799,800,In the Mouth of Madness (1995)
1360,1361,"Search for One-eye Jimmy, The (1996)"
1361,1362,American Strays (1996)
1362,1363,"Leopard Son, The (1996)"
1363,1364,Bird of Prey (1996)


it gives us quite similar recommendations (Genre: Fantasy)