Importing Necessary Libraries

In [38]:
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter(action = 'ignore',category=FutureWarning)

In [39]:
# load the dataset
ratings = pd.read_csv("/content/ratings.csv")
print(ratings.head())
movies = pd.read_csv("/content/movies.csv")
print(movies.head())

   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  


In [40]:
print(ratings.columns)
print(movies.columns)

Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')
Index(['movieId', 'title', 'genres'], dtype='object')


In [41]:
n_ratings = len(ratings)
n_movies = len(ratings['movieId'].unique())
n_users = len(ratings['movieId'].unique())

print(f"Number of ratings: {n_ratings}")
print(f"Number of unique moviesID's: {n_movies}")
print(f"Number of unique users: {n_users}")
print(f"Average ratiings per user: {round(n_ratings/n_users, 2)}")
print(f"Average ratiings per movies: {round(n_ratings/n_movies, 2)}")

Number of ratings: 100836
Number of unique moviesID's: 9724
Number of unique users: 9724
Average ratiings per user: 10.37
Average ratiings per movies: 10.37


In [42]:
user_freq = ratings[['userId','movieId']].groupby('userId').count().reset_index()
user_freq.columns = ['userId','n_ratings']
print(user_freq.head())

   userId  n_ratings
0       1        232
1       2         29
2       3         39
3       4        216
4       5         44


In [43]:
mean_rating = ratings.groupby('movieId')[['rating']].mean()  #finds highest and lowest rated

lowest_rated = mean_rating['rating'].idxmin()     #lowest rated movies
movies.loc[movies['movieId'] == lowest_rated]

highest_rated = mean_rating['rating'].idxmin()    #highest rated movies
movies.loc[movies['movieId'] == highest_rated]

ratings[ratings['movieId'] == highest_rated]       #people who watched these highest rated movies

ratings[ratings['movieId'] == lowest_rated]       #people who watched lowest rated movies

movie_stats = ratings.groupby('movieId')[['rating']].agg(['count','mean'])   #adding baysian average
movie_stats.column = movie_stats.columns.droplevel()

  movie_stats.column = movie_stats.columns.droplevel()


In [44]:
from scipy.sparse import csr_matrix
def create_matrix(df):
  N = len(df['userId'].unique())
  M = len(df['movieId'].unique())

  user_mapper = dict(zip(np.unique(df['userId']), list(range(N))))     #mapping id to index
  movie_mapper = dict(zip(np.unique(df['movieId']), list(range(M))))

  user_inv_mapper = dict(zip(list(range(N)), np.unique(df['userId'])))    #mapping index to id
  movie_inv_mapper = dict(zip(list(range(N)), np.unique(df['movieId'])))

  user_index = [user_mapper[i] for i in df['userId']]
  movie_index = [movie_mapper[i] for i in df['movieId']]

  X = csr_matrix((df['rating'], (movie_index, user_index)), shape=(M,N))

  return X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper

X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper = create_matrix(ratings)

In [24]:
# from sklearn.neighbors import NearestNeighbors
# def find_similar_movies(movie_id, X, k, metric='cosine',show_distance=False):

#   neighbour_id = []

#   movie_ind = movie_mapper[movie_id]
#   movie_vec = X[movie_ind]
#   k += 1
#   kNN = NearestNeighbors(n_neighbors = k, algorithm='brute', metric = metric)
#   kNN.fit(X)
#   movie_vec = movie_vec.reshape(1,-1)
#   neighbour = kNN.kneighbors(movie_vec, return_distance = show_distance)
#   for i in range(0,k):
#     n = neighbour.item(i)
#     neighbour_id.append(movie_inv_mapper[n])
#   neighbour_id.pop(0)
#   return neighbour_id

# movie_titles = dict(zip(movies['movieId'], movies['title']))

# movie_id = 3

# similar_ids = find_similar_movies(movie_id, X, k=10)
# movie_title=movie_titles[movie_id]

# print(f"Since you watched{movie_title}")
# for i in similar_ids:
#   print(movie_titles[i])

In [51]:
from sklearn.neighbors import NearestNeighbors
def find_similar_movies(movie_id, X, k, metric='cosine', show_distance=False):

    neighbour_ids = []

    movie_ind = movie_mapper[movie_id]
    movie_vec = X[movie_ind]
    k+=1
    kNN = NearestNeighbors(n_neighbors=k, algorithm="brute", metric=metric)
    kNN.fit(X)
    movie_vec = movie_vec.reshape(1,-1)
    neighbour = kNN.kneighbors(movie_vec, return_distance=show_distance)

    for i in range(0,k):
        n = neighbour.item(i)
        neighbour_ids.append(movie_inv_mapper[n])
    neighbour_ids.pop(0)

    return neighbour_ids


movie_titles = dict(zip(movies['movieId'], movies['title']))

movie_id = 3

similar_ids = find_similar_movies(movie_id, X, k=10)
movie_title = movie_titles[movie_id]

print(f"Since you watched {movie_title}")
for i in similar_ids:
    print(movie_titles[i])

KeyError: 2578

In [36]:
def recommend_movies_for_user(user_id, X, user_mapper, movie_mapper, movie_inv_mapper, k=10):
    df1 = ratings[ratings['userId'] == user_id]

    if df1.empty:
        print(f"User with ID {user_id} does not exist.")
        return

    movie_id = df1[df1['rating'] == max(df1['rating'])]['movieId'].iloc[0]

    movie_titles = dict(zip(movies['movieId'], movies['title']))

    similar_ids = find_similar_movies(movie_id, X, k)
    movie_title = movie_titles.get(movie_id, "Movie not found")

    if movie_title == "Movie not found":
        print(f"Movie with ID {movie_id} not found.")
        return

    print(f"Since you watched {movie_title}, you might also like:")
    for i in similar_ids:
        print(movie_titles.get(i, "Movie not found"))

In [37]:
user_id = 150
recommended_movies_for_user(user_id, X, user_mapper, movie_mapper, movie_inv_mapper, k=10)

NameError: name 'recommended_movies_for_user' is not defined