In [2]:
# Importing Libraries
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
#loading rating dataset
ratings = pd.read_csv(r"C:\Users\sugan\anaconda3\movie\Action.csv" )
print(ratings.head())

                                                name  year movie_rated  \
0                                    The Dark Knight  2008       PG-13   
1                                          Inception  2010       PG-13   
2                                         The Matrix  1999           R   
3  The Lord of the Rings: The Fellowship of the Ring  2001       PG-13   
4                              The Dark Knight Rises  2012       PG-13   

  run_length                       genres            release_date  rating  \
0   2h 32min       Action; Crime; Drama;       18 July 2008 (USA)     9.0   
1   2h 28min  Action; Adventure; Sci-Fi;       16 July 2010 (USA)     8.8   
2   2h 16min             Action; Sci-Fi;      31 March 1999 (USA)     8.7   
3   2h 58min   Action; Adventure; Drama;   19 December 2001 (USA)     8.8   
4   2h 44min          Action; Adventure;       20 July 2012 (USA)     8.4   

   num_raters  num_reviews                                         review_url  
0     222452

In [4]:
n_ratings = len(ratings)
n_movies = len(ratings['name'].unique())
n_genres = len(ratings['genres'].unique())
 
print(f"Number of ratings: {n_ratings}")
print(f"Number of unique movieId's: {n_movies}")
print(f"Number of unique genres: {n_genres}")
print(f"Average ratings per genre: {round(n_ratings/n_genres, 2)}")
print(f"Average ratings per movie: {round(n_ratings/n_movies, 2)}")

Number of ratings: 100
Number of unique movieId's: 100
Number of unique genres: 21
Average ratings per genre: 4.76
Average ratings per movie: 1.0


In [5]:
user_freq = ratings[['genres', 'name']].groupby(
    'genres').count().reset_index()
user_freq.columns = ['genres', 'n_ratings']
print(user_freq.head())

                         genres  n_ratings
0           Action; Adventure;           5
1   Action; Adventure; Comedy;           7
2    Action; Adventure; Drama;           5
3  Action; Adventure; Fantasy;          16
4   Action; Adventure; Horror;           1


In [6]:
# Find Lowest and Highest rated movies:
mean_rating = ratings.groupby('name')[['rating']].mean()
# Lowest rated movies
lowest_rated = mean_rating['rating'].idxmin()
ratings.loc[ratings['name'] == lowest_rated]
# Highest rated movies
highest_rated = mean_rating['rating'].idxmax()
ratings.loc[ratings['name'] == highest_rated]
# show number of people who rated movies rated movie highest
ratings[ratings['name']==highest_rated]
# show number of people who rated movies rated movie lowest
ratings[ratings['name']==lowest_rated]
 
## the above movies has very low dataset. We will use bayesian average
movie_stats = ratings.groupby('name')[['rating']].agg(['count', 'mean'])
movie_stats.columns = movie_stats.columns.droplevel()

In [7]:
# Now, we create user-item matrix using scipy csr matrix
from scipy.sparse import csr_matrix
 
def create_matrix(df):
     
    N = len(df['genres'].unique())
    M = len(df['name'].unique())
     
    # Map Ids to indices
    user_mapper = dict(zip(np.unique(df["genres"]), list(range(N))))
    movie_mapper = dict(zip(np.unique(df["name"]), list(range(M))))
     
    # Map indices to IDs
    user_inv_mapper = dict(zip(list(range(N)), np.unique(df["genres"])))
    movie_inv_mapper = dict(zip(list(range(M)), np.unique(df["name"])))
     
    user_index = [user_mapper[i] for i in df['genres']]
    movie_index = [movie_mapper[i] for i in df['name']]
 
    X = csr_matrix((df["rating"], (movie_index, user_index)), shape=(M, N))
     
    return X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper
     
X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper = create_matrix(ratings)

In [14]:
"""
Find similar movies using KNN
"""
from sklearn.neighbors import NearestNeighbors

def find_similar_movies(movie_id, X, k, metric='cosine', show_distance=False):
     
    neighbour_ids = []
     
    movie_ind = movie_mapper[movie_id]
    movie_vec = X[movie_ind]
    k+=1
    kNN = NearestNeighbors(n_neighbors=k, algorithm="brute", metric=metric)
    kNN.fit(X)
    movie_vec = movie_vec.reshape(1,-1)
    neighbour = kNN.kneighbors(movie_vec, return_distance=show_distance)
    for i in range(0,k):
        n = neighbour.item(i)
        neighbour_ids.append(movie_inv_mapper[n])
    neighbour_ids.pop(0)
    return neighbour_ids
 
 
movie_titles = dict(zip(ratings['name'], ratings['name']))
 
name = 'The Matrix'
 
#similar_ids = find_similar_movies(movie_id, X, k=10)
similar_ids = find_similar_movies(name, X, k=10)
#movie_title = movie_titles[movie_id]
movie_title = movie_titles[name]
 
print(f"Since you watched {movie_title}")
for i in similar_ids:
    print(movie_titles[i])

Since you watched The Matrix
The Matrix Reloaded
The Matrix
Terminator 2: Judgment Day
Star Wars: Episode IV - A New Hope
Star Wars: Episode III - Revenge of the Sith
Star Wars: Episode VI - Return of the Jedi
Suicide Squad
Star Wars: Episode VIII - The Last Jedi
Star Wars: Episode V - The Empire Strikes Back
Star Wars: Episode VII - The Force Awakens
