# Movie Recommender Notebook 
##### From: https://www.kaggle.com/code/atasaygin/movie-recommendation-engine-and-eda/notebook
##### Check also: https://sijanb.com.np/posts/designing-recommendation-system-using-k-nearest-neighbor-knn/

In [149]:
import pandas as pd

In [150]:

dNames = pd.read_csv("movies.csv")
dRatings = pd.read_csv("ratings.csv")

In [151]:
dRatings.head(3)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224


In [152]:
dNames.head(3)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance


In [153]:
data = pd.merge(dRatings, dNames, how='inner')
data.head(3)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [154]:
print("Number of users who rated at least one movie: ", data.userId.nunique())
print("--"*10)

# Number of Movies in the dataset:
print("Number of movies in the dataset:", data.title.nunique())
print("--"*10)

Number of users who rated at least one movie:  610
--------------------
Number of movies in the dataset: 9719
--------------------


In [155]:
# Extract year into a separate column 
data['movie_year'] = data.title.str.extract('.*\((.*)\).*')
data.head(3)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,movie_year
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995


In [156]:
# Removing year from the movie titles
data['title'] = data.title.str.split('(').str[0]
data.head(3)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,movie_year
0,1,1,4.0,964982703,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,5,1,4.0,847434962,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
2,7,1,4.5,1106635946,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995


In [157]:
# Change timestamp to readable one
from datetime import *
def UNIX_to_Readable(df):
    return pd.to_datetime(datetime.fromtimestamp(df).strftime('%Y-%m-%d %H:%M:%S'))


# Converting Unix date-format to readable format
data.timestamp = data.timestamp.apply(UNIX_to_Readable)
data.head(3)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,movie_year
0,1,1,4.0,2000-07-30 14:45:03,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,5,1,4.0,1996-11-08 01:36:02,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
2,7,1,4.5,2005-01-25 01:52:26,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995


In [158]:
# Look at the ratings

print("Unique Rating points:", data.rating.unique())

Unique Rating points: [4.  4.5 2.5 3.5 3.  5.  0.5 2.  1.5 1. ]


In [159]:
# Removing decimal values to the ceiling value to decrease number of rating classes
import numpy as np
data.rating = np.ceil(data.rating)
print("Unique Rating Points:", data.rating.unique())

Unique Rating Points: [4. 5. 3. 1. 2.]


In [160]:
ratings_more20 = data['movieId'].value_counts()[data['movieId'].value_counts() >= 10].index
data = data[data.movieId.isin(ratings_more20)]
print("Minimum Number of Rated Movies after Drop:\n",data.title.value_counts().nsmallest(5))  

Minimum Number of Rated Movies after Drop:
 Skulls, The           10
Doom                  10
Urban Legend          10
Detroit Rock City     10
Fast Five             10
Name: title, dtype: int64


In [161]:
# import matplotlib.pyplot as plt
import plotly.express as px
rating_val_count = data.rating.value_counts()
fig = px.bar(rating_val_count, x=rating_val_count.index, y=rating_val_count, text=rating_val_count,
             labels={
                 "index": "Ratings",
                 'y': 'Number of Ratings'},
             color=rating_val_count
             )
fig.update_traces(textposition='outside')
fig.update_layout(title_text='Frequency of the Ratings',
                  title_x=0.5, title_font=dict(size=24))
fig.update_traces(marker=dict(line=dict(color='blue', width=2)))
fig.update_layout({'plot_bgcolor': 'rgba(0, 0, 0, 0)'})
fig.show()

In [162]:
genres_value_counts = data['genres'].str.split('|', expand=True).stack().value_counts()
genres_value_counts

Drama          32956
Comedy         31221
Action         27199
Thriller       22703
Adventure      21910
Sci-Fi         15485
Romance        14914
Crime          14272
Fantasy        10384
Children        7865
Mystery         6540
Animation       5906
Horror          5255
War             4126
IMAX            3878
Musical         3471
Western         1606
Film-Noir        716
Documentary      399
dtype: int64

In [163]:
fig = px.bar(genres_value_counts, x=genres_value_counts.index, y=genres_value_counts, text=genres_value_counts,
             labels={
                 "index": "Genres",
                 'y': 'Frequency'},
             color=genres_value_counts
             )
fig.update_traces(textposition='outside')
fig.update_layout(title_text='Top Frequent the Movie Genres',
                  title_x=0.5, title_font=dict(size=24))
fig.update_traces(marker=dict(line=dict(color='#000000', width=2)))
fig.update_layout({'plot_bgcolor': 'rgba(0, 0, 0, 0)'})
fig.show()

In [164]:
# Created a (movieId: title) dictionary for all movieId's for replacing them with their names
movieId_dict = data.drop_duplicates('title')[['movieId', 'title']].set_index('movieId').to_dict()['title']

# First 5 elements of this dictionary
list(movieId_dict.items())[:5]

[(1, 'Toy Story '),
 (3, 'Grumpier Old Men '),
 (6, 'Heat '),
 (47, 'Seven '),
 (50, 'Usual Suspects, The ')]

In [165]:
# Creating a pivot table that has indexes as user ratings, and columns as each movie title
dataRecommendation = data.pivot(index='userId', columns='movieId', values='rating').fillna(0)

# Replacing dataRecommendation columns with the movie titles
dataRecommendation.columns = dataRecommendation.columns.map(movieId_dict)

dataRecommendation.head(5)

movieId,Toy Story,Jumanji,Grumpier Old Men,Father of the Bride Part II,Heat,Sabrina,Sudden Death,GoldenEye,"American President, The",Dracula: Dead and Loving It,...,Moana,Rogue One: A Star Wars Story,Hidden Figures,Get Out,Logan,Dunkirk,Blade Runner 2049,Coco,Star Wars: The Last Jedi,Deadpool 2
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [166]:
dataRecommendation.describe()

movieId,Toy Story,Jumanji,Grumpier Old Men,Father of the Bride Part II,Heat,Sabrina,Sudden Death,GoldenEye,"American President, The",Dracula: Dead and Loving It,...,Moana,Rogue One: A Star Wars Story,Hidden Figures,Get Out,Logan,Dunkirk,Blade Runner 2049,Coco,Star Wars: The Last Jedi,Deadpool 2
count,610.0,610.0,610.0,610.0,610.0,610.0,610.0,610.0,610.0,610.0,...,610.0,610.0,610.0,610.0,610.0,610.0,610.0,610.0,610.0,610.0
mean,1.419672,0.645902,0.285246,0.252459,0.672131,0.286885,0.083607,0.780328,0.429508,0.077049,...,0.059016,0.180328,0.063934,0.093443,0.181967,0.077049,0.119672,0.078689,0.067213,0.078689
std,1.986227,1.425498,0.981242,0.88883,1.540065,0.968125,0.529485,1.544397,1.227403,0.481818,...,0.482664,0.86547,0.506941,0.60874,0.890381,0.539687,0.709152,0.564731,0.519322,0.579087
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,...,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0


In [167]:
dataRecommendation.tail(5)

movieId,Toy Story,Jumanji,Grumpier Old Men,Father of the Bride Part II,Heat,Sabrina,Sudden Death,GoldenEye,"American President, The",Dracula: Dead and Loving It,...,Moana,Rogue One: A Star Wars Story,Hidden Figures,Get Out,Logan,Dunkirk,Blade Runner 2049,Coco,Star Wars: The Last Jedi,Deadpool 2
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
606,3.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,3.0,2.0,2.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
609,3.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
610,5.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,...,0.0,4.0,0.0,5.0,5.0,0.0,0.0,0.0,0.0,0.0


In [168]:
def encode_units(k):
    if k <= 0:
        return 0
    if k >= 1:
        return 1

sets = dataRecommendation.applymap(encode_units)
sets.tail()

movieId,Toy Story,Jumanji,Grumpier Old Men,Father of the Bride Part II,Heat,Sabrina,Sudden Death,GoldenEye,"American President, The",Dracula: Dead and Loving It,...,Moana,Rogue One: A Star Wars Story,Hidden Figures,Get Out,Logan,Dunkirk,Blade Runner 2049,Coco,Star Wars: The Last Jedi,Deadpool 2
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
606,1,0,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
607,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
608,1,1,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
609,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
610,1,0,0,0,1,0,0,0,0,0,...,0,1,0,1,1,0,0,0,0,0


In [169]:
from sklearn.neighbors import NearestNeighbors
knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=11, n_jobs=-1)
knn.fit(dataRecommendation.values.T)

In [170]:
# Here is our recommendations for Blade Runner, there will be 7 movie recommendations
recommendation_result = list(knn.kneighbors([dataRecommendation['Blade Runner'].values], 8))
recommendation_result

KeyError: 'Blade Runner'

In [None]:
recommendations = pd.DataFrame(np.vstack((recommendation_result[1], recommendation_result[0])),
                 index=['movieId', 'Cosine_Similarity (degree)']).T
recommendations = recommendations.drop([0]).reset_index(drop=True)
recommendations  # In this step, I created a dataframe that stores the movieId (for dataRecommendation column order) and Cosine Similarity in degrees