In [1]:
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.model_selection import train_test_split

In [2]:
actual_ratings = pd.read_csv('ratings.csv')
actual_ratings.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,100836.0,100836.0,100836.0,100836.0
mean,326.127564,19435.295718,3.501557,1205946000.0
std,182.618491,35530.987199,1.042529,216261000.0
min,1.0,1.0,0.5,828124600.0
25%,177.0,1199.0,3.0,1019124000.0
50%,325.0,2991.0,3.5,1186087000.0
75%,477.0,8122.0,4.0,1435994000.0
max,610.0,193609.0,5.0,1537799000.0


In [3]:
movies = pd.read_csv('movies.csv')
movies.describe()


Unnamed: 0,movieId
count,9742.0
mean,42200.353623
std,52160.494854
min,1.0
25%,3248.25
50%,7300.0
75%,76232.0
max,193609.0


In [5]:
ratings = actual_ratings

In [6]:
n_ratings = len(ratings)
n_movies = len(ratings['movieId'].unique())
n_users = len(ratings['userId'].unique())
print(f"Number of ratings: {n_ratings}")
print(f"Number of movies: {n_movies}")
print(f"Number of users: {n_users}")
print(f"Average ratings per user: {round(n_ratings/n_users, 2)}")
print(f"Average ratings per movie: {round(n_ratings/n_movies, 2)}")

Number of ratings: 100836
Number of movies: 9724
Number of users: 610
Average ratings per user: 165.3
Average ratings per movie: 10.37


In [7]:
user_freq = ratings[['userId', 'movieId']].groupby('userId').count().reset_index()
user_freq.columns = ['userId', 'n_ratings']
user_freq.head()

Unnamed: 0,userId,n_ratings
0,1,232
1,2,29
2,3,39
3,4,216
4,5,44


In [8]:
from scipy.sparse import csr_matrix

In [9]:
def create_matrix(df):
    N = len(df['userId'].unique())
    M = len(df['movieId'].unique())
    
    user_mapper = dict(zip(np.unique(df["userId"]), list(range(N))))
    movie_mapper = dict(zip(np.unique(df['movieId']), list(range(M))))
    user_inv_mapper = dict(zip(list(range(N)), np.unique(df['userId'])))
    movie_inv_mapper = dict(zip(list(range(M)), np.unique(df['movieId'])))
    user_index = [user_mapper[i] for i in df['userId']]
    movie_index = [movie_mapper[i] for i in df['movieId']]
    df['rating'] = (df['rating'] - df.groupby('userId')['rating'].transform('mean'))
    X = csr_matrix((df['rating'], (movie_index, user_index)), shape=(M, N))
    return X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper

In [10]:
X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper = create_matrix(ratings)

In [11]:
from sklearn.neighbors import NearestNeighbors


In [12]:
def find_similar_movies(movieId, X, k, metric='cosine', show_distance=False):
    neighbour_ids = []
    movie_ind = movie_mapper[movieId]
    movie_vec = X[movie_ind]
    k += 1
    kNN = NearestNeighbors(n_neighbors=k, algorithm='brute', metric='cosine')
    nn = kNN.fit(X)
    movie_vec = movie_vec.reshape(1, -1)
    neighbour = kNN.kneighbors(movie_vec, return_distance=show_distance)
    for i in range(0, k):
        try:
            n = neighbour.item(i)
            neighbour_ids.append(movie_inv_mapper[n])
        except IndexError:
            break
        
    neighbour_ids.pop(0)
    return neighbour_ids, nn


In [13]:
movie_titles = dict(zip(movies['movieId'], movies['title']))

movie_id = 2 #An Example

similar_ids,nn = find_similar_movies(movie_id, X, k=10)
movie_title = movie_titles[movie_id]
print(f"Since you've watched {movie_title}")
for i in similar_ids:
    print(movie_titles[i])

Since you've watched Jumanji (1995)
Loser (1991)
Return from Witch Mountain (1978)
Bad Words (2013)
Timeline (2003)
Synecdoche, New York (2008)
Fool's Gold (2008)
Young Adult (2011)
Extract (2009)
Aloha (2015)
Senseless (1998)


In [14]:
from sklearn.metrics import mean_squared_error
from evaluation_metric import *
dataset = pd.merge(ratings,movies, on='movieId').drop(['timestamp', 'genres'], axis= 1)
pivot_table = dataset.pivot_table(values= 'rating', index= 'userId', columns= 'title').fillna(0)
pivot_table = pivot_table.apply(np.sign)
missing_pivot = dataset.pivot_table(values='rating', index='userId', columns='title')

rate = {}
rows_indexes = {}

for i, row in missing_pivot.iterrows():
    rows = [x for x in range(0,len(missing_pivot.columns))]
    combine = list(zip(row.index, row.values, rows))
    rated = [(x,z) for x,y,z in combine if str(y) != 'nan']
    index = [i[1] for i in rated]
    row_names = [i[0] for i in rated]
    rows_indexes[i] = index
    rate[i] = row_names

missing_pivot.head()

notrated = {}
notrated_indexes = {}

for i,row in pivot_table.iterrows():
    rows = [x for x in range(0, len(missing_pivot.columns))]
    combine = list(zip(row.index, row.values, row))
    idx_row = [(idx,col) for idx, val, col  in combine if not val > 0]
    indices = [i[1] for i in idx_row]
    row_names = [i[0] for i in idx_row]
    notrated[i] = row_names
    notrated_indexes[i] = indices
item_distance, item_indices = nn.kneighbors(pivot_table.T.values)
item_distance = 1 - item_distance
predictions = item_distance.T.dot(pivot_table.T.values)/np.array([np.abs(item_distance.T).sum(axis=1)]).T
ground_truth = pivot_table.T.values[item_distance.argsort()[0]]


rmse_result = calculate_rmse(predictions, ground_truth)
mae_result = calculate_mae(predictions, ground_truth, nn)
map_result, precision_result, recall_result = calculate_map_precision_recall(predictions, ground_truth, nn)

print(f"RMSE: {rmse_result}")
print(f"MAE: {mae_result}")
print(f"MAP: {map_result}")
print(f"Precision: {precision_result}")
print(f"Recall: {recall_result}")



RMSE: 0.9666088458741167
MAE: 0.9629328841886303
MAP: 0.7613654460186111
Precision: 0.6629328841886304
Recall: 0.7566088458741167
