# <center>Movie Recommender Notebook</center>
From: https://www.kaggle.com/code/atasaygin/movie-recommendation-engine-and-eda/notebook
    
Check also: https://sijanb.com.np/posts/designing-recommendation-system-using-k-nearest-neighbor-knn/

##### KNN is a natural go-to model for item-based collaborative filtering and a very good baseline for recommender-system development, so it is the algorithm used in this notebook. See also this blog post:
https://labelyourdata.com/articles/movie-recommendation-with-machine-learning

In [181]:
import os
import plotly.express as px
import numpy as np
from datetime import datetime
import pandas as pd
import plotly.graph_objects as go
import seaborn as sns
import re
from sklearn.neighbors import NearestNeighbors
import random

In [182]:
# Load the two input tables from the current working directory:
# movie metadata (movies.csv) and the individual ratings (ratings.csv).
dataNames = pd.read_csv("movies.csv")
dataRatings = pd.read_csv("ratings.csv")

In [183]:
# Preview of the ratings table read from ratings.csv.
dataRatings.head(3)  # First dataset

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224


In [184]:
# Preview of the movie metadata table read from movies.csv.
dataNames.head(3)  # Second dataset

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance


In [185]:
# Inner join of the ratings with the movie metadata on movieId.
# The key is spelled out so the join cannot silently widen to every column
# the two files happen to share (the default when `on` is omitted).
data = pd.merge(dataRatings, dataNames, how='inner', on='movieId')
data.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [186]:
# Quick size check of the merged table.
divider = "-"*25

# Distinct user ids present in the merged ratings
print("Number of Users who rated at least one move: ", data['userId'].nunique())
print(divider)

# Distinct title strings
# NOTE(review): movies that share one title string are counted once here.
print("Number of Movies in the dataset:", data['title'].nunique())
print(divider)

# Every distinct rating value that occurs
print("Unique Rating points:", data['rating'].unique())

Number of Users who rated at least one move:  610
-------------------------
Number of Movies in the dataset: 9719
-------------------------
Unique Rating points: [4.  4.5 2.5 3.5 3.  5.  0.5 2.  1.5 1. ]


#  EDA etc. 


In [187]:
# Extracting movie release years into one column.
# The pattern is a raw string: "\(" and "\)" are regex escapes, not Python
# string escapes, and a non-raw literal triggers an invalid-escape warning.
# The greedy leading ".*" makes the group capture the last "(...)" chunk of
# the title; titles without parentheses yield NaN.
data['movie_year'] = data.title.str.extract(r'.*\((.*)\).*')
data.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,movie_year
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995


#### From now on, each movie's release year is available in the `movie_year` column.

In [188]:
# Removing the year (and any other parenthesised suffix) from the movie titles.
# Everything from the first "(" that is not the very first character is cut,
# together with the whitespace in front of it:
#   "Toy Story (1995)"            -> "Toy Story"
#   "(500) Days of Summer (2009)" -> "(500) Days of Summer"   (was "")
#   "Babylon 5"                   -> "Babylon 5"              (was "Babylon ")
# Titles without such a suffix do not match the pattern and are left untouched.
data['title'] = data.title.str.replace(r'^(.+?)\s*\(.*$', r'\1', regex=True)
data.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,movie_year
0,1,1,4.0,964982703,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,5,1,4.0,847434962,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
2,7,1,4.5,1106635946,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
3,15,1,2.5,1510577970,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
4,17,1,4.5,1305696483,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995


In [189]:
def UNIX_to_Readable(df, tz=None):
    """Convert one Unix timestamp (seconds since the epoch) to a pandas Timestamp.

    Parameters
    ----------
    df : int or float
        A single epoch value in seconds (despite the name, not a DataFrame).
        Fractions of a second are discarded.
    tz : datetime.tzinfo, optional
        Time zone whose wall-clock time is returned. With the default ``None``
        the machine's local time zone is used, so the same input gives
        different results on differently configured machines; pass
        ``datetime.timezone.utc`` for a reproducible result.

    Returns
    -------
    pandas.Timestamp
        Time-zone-naive timestamp with second resolution.
    """
    moment = datetime.fromtimestamp(df, tz)
    # Dropping microseconds and tzinfo reproduces what formatting with
    # '%Y-%m-%d %H:%M:%S' and re-parsing the string used to yield.
    return pd.Timestamp(moment.replace(microsecond=0, tzinfo=None))


# Converting Unix date-format to readable format.
# The column is overwritten in place, so this cell is not safe to run twice:
# a second run would pass already-converted values to UNIX_to_Readable.
data.timestamp = data.timestamp.apply(UNIX_to_Readable)
data.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,movie_year
0,1,1,4.0,2000-07-30 14:45:03,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,5,1,4.0,1996-11-08 01:36:02,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
2,7,1,4.5,2005-01-25 01:52:26,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
3,15,1,2.5,2017-11-13 07:59:30,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
4,17,1,4.5,2011-05-18 01:28:03,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995


#### Some ratings are not whole numbers. To reduce the number of distinct rating values, each rating is rounded up to the next integer (ceiling), e.g. 3.5 becomes 4.

In [190]:
# Round every rating up to the next whole number so fewer distinct
# rating classes remain (half-star values move to the star above).
data['rating'] = np.ceil(data['rating'])
print("Unique Rating Points:", data['rating'].unique())

Unique Rating Points: [4. 5. 3. 1. 2.]


In [191]:
# Bar chart: number of ratings recorded for each (rounded) rating value.
rating_val_count = data.rating.value_counts()
fig = px.bar(
    rating_val_count,
    x=rating_val_count.index,
    y=rating_val_count,
    text=rating_val_count,
    color=rating_val_count,
    labels={"index": "Ratings", 'y': 'Number of Ratings'},
)
# Value labels above the bars, blue outline around each bar.
fig.update_traces(
    textposition='outside',
    marker=dict(line=dict(color='blue', width=2)),
)
# Centered title and a transparent plot background.
fig.update_layout(
    title_text='Frequency of the Ratings',
    title_x=0.5,
    title_font=dict(size=24),
    plot_bgcolor='rgba(0, 0, 0, 0)',
)
fig.show()

In [192]:
# `genres_value_counts` is not created by any cell shown in this notebook, so
# on a fresh kernel this cell stopped with a NameError. It is built here.
# NOTE(review): the original definition is not visible; this version counts
# each '|'-separated genre tag once per distinct movieId — confirm that this
# is the quantity the chart is meant to show.
genres_value_counts = (
    data.drop_duplicates('movieId')['genres']
    .str.split('|')
    .explode()
    .value_counts()
)

fig = px.bar(genres_value_counts, x=genres_value_counts.index, y=genres_value_counts, text=genres_value_counts,
             labels={
                 "index": "Genres",
                 'y': 'Frequency'},
             # square root compresses the colour scale so rare genres stay distinguishable
             color=genres_value_counts**0.50
             )
fig.update_traces(textposition='outside')
fig.update_layout(title_text='Top Frequent the Movie Genres',
                  title_x=0.5, title_font=dict(size=24))
fig.update_traces(marker=dict(line=dict(color='#032323', width=2)))
fig.update_layout({'plot_bgcolor': 'rgba(0, 0, 1, 0)'})
fig.show()

In [193]:
# Keep only movies that were rated at least 10 times.
ratings_per_movie = data['movieId'].value_counts()
movieFrequency_greater_10 = ratings_per_movie[ratings_per_movie >= 10].index
data = data[data.movieId.isin(movieFrequency_greater_10)]

# Sanity check: the five least-rated titles that survived the filter.
print("Minimum Number of Rated Movies after Drop:\n", data.title.value_counts().nsmallest(5))

Minimum Number of Rated Movies after Drop:
 Skulls, The          10
Doom                 10
Urban Legend         10
Detroit Rock City    10
Fast Five            10
Name: title, dtype: int64


In [194]:
# One count table per grouping key: rows are the key's values, columns are
# the rating values, cells hold the number of ratings (0 where none exist).
genre_vs_rating, year_vs_rating, movie_vs_rating = (
    data.groupby([key, 'rating']).size().unstack().fillna(0)
    for key in ('genres', 'movie_year', 'title')
)

In [195]:
# Let's calculate the Weighted Average for dataframe rows
def Weighted_Average(df):
    """Weighted mean of the column labels for every row of ``df``.

    The column labels are the values being averaged (they must be numeric)
    and the cells of a row are their weights. The computation is done on the
    whole table at once instead of row by row.

    Parameters
    ----------
    df : pandas.DataFrame
        Table of weights with numeric column labels.

    Returns
    -------
    list of float
        One value per row, rounded to 2 decimals, in row order.

    Raises
    ------
    ZeroDivisionError
        If the weights of any row sum to zero.
    """
    weights = df.to_numpy(dtype=float)
    values = df.columns.to_numpy(dtype=float)
    totals = weights.sum(axis=1)
    if (totals == 0).any():
        raise ZeroDivisionError("Weights sum to zero, can't be normalized")
    return np.round((weights * values).sum(axis=1) / totals, 2).tolist()

# Weighted Average calculation for the rows of movie_vs_rating,
# year_vs_rating and genre_vs_rating.
# A 'weightedAverage' column left over from an earlier run is excluded before
# averaging: Weighted_Average treats every column label as a rating value, so
# feeding the text label back in made a second run of this cell fail.
for rating_table in (movie_vs_rating, year_vs_rating, genre_vs_rating):
    rating_table['weightedAverage'] = Weighted_Average(
        rating_table.drop(columns='weightedAverage', errors='ignore')
    )

# Best-rated titles first. This used to sit in the middle of the cell, where
# its result was computed and thrown away; as the last expression it is shown.
movie_vs_rating.sort_values('weightedAverage', ascending=False).head()

In [196]:
# Creating a new DataFrame for unique movies with their weightedAverages and Genres.
# The join key is named explicitly; 'title' becomes a regular column of
# movie_vs_rating through reset_index().
# NOTE(review): the name `a` is reassigned to a different object further down.
a = (
    data.merge(movie_vs_rating.reset_index(), on='title')
    .drop_duplicates('title')[['title', 'genres', 'weightedAverage']]
)
a.head()

Unnamed: 0,title,genres,weightedAverage
0,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,4.03
215,Grumpier Old Men,Comedy|Romance,3.35
267,Heat,Action|Crime|Thriller,4.02
369,Seven,Mystery|Thriller,4.09
572,"Usual Suspects, The",Crime|Mystery|Thriller,4.37


In [197]:
# (movieId: title) dictionary used to replace movieIds with readable names.
# One row per movieId is kept, so every id receives a label. De-duplicating on
# 'title' instead left every later movieId that shares a title out of the
# dictionary, and such ids turn into NaN when used with Index.map.
unique_movies = data.drop_duplicates('movieId')[['movieId', 'title', 'movie_year']]

# The first movie with a given title keeps the plain title (as before); later
# movies with the same title get " (<movie_year>)" appended to tell them apart.
# NOTE(review): two same-titled movies with the same movie_year would still collide.
repeated_title = unique_movies.duplicated('title')
display_title = unique_movies['title'].where(
    ~repeated_title,
    unique_movies['title'] + ' (' + unique_movies['movie_year'].astype(str) + ')',
)
movieId_dict = dict(zip(unique_movies['movieId'], display_title))

# First 5 elements of this dictionary
list(movieId_dict.items())[:5]

[(1, 'Toy Story'),
 (3, 'Grumpier Old Men'),
 (6, 'Heat'),
 (47, 'Seven'),
 (50, 'Usual Suspects, The')]

In [198]:
# User x movie rating matrix: one row per userId, one column per movieId;
# a user/movie pair without a rating is stored as 0.
user_movie_ratings = data.pivot(index='userId', columns='movieId', values='rating')
dataRecommendation = user_movie_ratings.fillna(0)

# Swap the movieId column labels for the titles held in movieId_dict
dataRecommendation.columns = dataRecommendation.columns.map(movieId_dict)

dataRecommendation.head(10)

movieId,Toy Story,Jumanji,Grumpier Old Men,Father of the Bride Part II,Heat,Sabrina,Sudden Death,GoldenEye,"American President, The",Dracula: Dead and Loving It,...,Moana,Rogue One: A Star Wars Story,Hidden Figures,Get Out,Logan,Dunkirk,Blade Runner 2049,Coco,Star Wars: The Last Jedi,Deadpool 2
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,4.0,5.0,5.0,4.0,4.0,0.0,3.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,4.0,0.0,0.0,0.0,0.0,0.0,2.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [199]:
def encode_units(k):
    """Binary "has a rating" flag for one cell of the rating matrix.

    Returns 1 for any positive value and 0 otherwise. Values strictly between
    0 and 1 (e.g. a half-star rating) used to fall through both branches and
    return None; they now count as rated. NaN maps to 0.
    """
    return 1 if k > 0 else 0


# Element-wise 0/1 encoding of the user x movie table via encode_units.
# NOTE(review): `sets` is not referenced again in the cells shown here.
sets = dataRecommendation.applymap(encode_units)
sets.head()

movieId,Toy Story,Jumanji,Grumpier Old Men,Father of the Bride Part II,Heat,Sabrina,Sudden Death,GoldenEye,"American President, The",Dracula: Dead and Loving It,...,Moana,Rogue One: A Star Wars Story,Hidden Figures,Get Out,Logan,Dunkirk,Blade Runner 2049,Coco,Star Wars: The Last Jedi,Deadpool 2
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [200]:
# Item-based model: the rating matrix is transposed so that every sample
# handed to NearestNeighbors is one movie described by its ratings per user.
# Neighbours are found by brute-force search under the cosine metric.
item_vectors = dataRecommendation.values.T
knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=11, n_jobs=-1)
knn.fit(item_vectors)

In [201]:
# Query the fitted model with the 'Blade Runner' column; the result is a
# two-element list: [distances, positions of the neighbours in the fit data].
query_vector = dataRecommendation['Blade Runner'].values
recommendation_result = list(knn.kneighbors([query_vector], 8))
recommendation_result

[array([[0.        , 0.33327106, 0.39746001, 0.41320104, 0.4165723 ,
         0.41812178, 0.42141976, 0.42396941]]),
 array([[261, 371, 481, 449, 501, 465, 318, 475]])]

In [202]:
# Two-column table of the neighbours: the column position of each neighbour
# in dataRecommendation (stored as float) and its distance to the query.
# The numbers are the cosine *distances* returned by kneighbors (0 for the
# query itself, smaller = more similar), not similarities and not degrees;
# only the column label says otherwise.
neighbor_distances, neighbor_positions = recommendation_result
recommendations = pd.DataFrame({
    'movieId': neighbor_positions[0].astype(float),
    'Cosine_Similarity (degree)': neighbor_distances[0],
})
# The first row is skipped on the assumption that it is the query movie itself.
recommendations = recommendations.iloc[1:].reset_index(drop=True)
recommendations

Unnamed: 0,movieId,Cosine_Similarity (degree)
0,371.0,0.333271
1,481.0,0.39746
2,449.0,0.413201
3,501.0,0.416572
4,465.0,0.418122
5,318.0,0.42142
6,475.0,0.423969


In [203]:
# Lookup from column position in dataRecommendation to the column's label,
# used to turn the neighbour positions into titles.
# Running this cell twice maps the already-replaced titles again.
a = dict(enumerate(dataRecommendation.columns))
recommendations['movieId'] = recommendations['movieId'].map(a)
recommendations

Unnamed: 0,movieId,Cosine_Similarity (degree)
0,2001: A Space Odyssey,0.333271
1,Alien,0.39746
2,Monty Python and the Holy Grail,0.413201
3,"Terminator, The",0.416572
4,Star Wars: Episode V - The Empire Strikes Back,0.418122
5,Dr. Strangelove or: How I Learned to Stop Worr...,0.42142
6,Apocalypse Now,0.423969


# Some Movie Recommendations

In [204]:
# Movie Recommendation as Function

def movie_recommendation(movie_name, num_of_recommendations):
    """Return the movies whose rating vectors are closest to ``movie_name``.

    Uses the module-level ``knn`` model and ``dataRecommendation`` matrix.

    Parameters
    ----------
    movie_name : str
        Column label of ``dataRecommendation`` to use as the query.
    num_of_recommendations : int
        Number of neighbouring movies to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``'movieId'`` (the neighbour's title) and
        ``'Cosine_Similarity (degree)'``, ordered from closest to farthest.
        Despite its label, the second column holds the distances returned by
        ``knn.kneighbors`` (smaller = more similar); the label is kept so
        existing consumers of the frame keep working.

    Raises
    ------
    KeyError
        If ``movie_name`` is not a column of ``dataRecommendation``.
    """
    titles = dataRecommendation.columns
    query_vector = dataRecommendation[movie_name].values

    # One extra neighbour is requested because the query movie is itself part
    # of the fitted data and normally comes back as its own nearest neighbour.
    distances, positions = knn.kneighbors([query_vector], num_of_recommendations + 1)
    distances, positions = distances[0], positions[0]

    # Remove the query by its own column position instead of blindly dropping
    # the first row: when another movie is at distance 0 as well, the query is
    # not guaranteed to be listed first.
    own_position = np.flatnonzero(titles == movie_name)
    not_self = ~np.isin(positions, own_position)
    positions = positions[not_self][:num_of_recommendations]
    distances = distances[not_self][:num_of_recommendations]

    return pd.DataFrame({
        'movieId': titles.to_numpy()[positions],
        'Cosine_Similarity (degree)': distances,
    })

In [207]:
# Seven nearest neighbours of 'Final Destination' in the fitted KNN model.
movie_recommendation('Final Destination', 7)

Unnamed: 0,movieId,Cosine_Similarity (degree)
0,Bedazzled,0.49745
1,American Psycho,0.519241
2,xXx,0.534563
3,28 Weeks Later,0.538634
4,Evolution,0.561056
5,Resident Evil,0.579539
6,Phone Booth,0.584776


In [209]:
# Seven nearest neighbours of 'Pulp Fiction' in the fitted KNN model.
movie_recommendation('Pulp Fiction', 7)

Unnamed: 0,movieId,Cosine_Similarity (degree)
0,"Silence of the Lambs, The",0.289614
1,"Shawshank Redemption, The",0.296947
2,Seven,0.301255
3,Forrest Gump,0.314539
4,"Usual Suspects, The",0.329722
5,Fight Club,0.370092
6,Braveheart,0.373015


In [214]:
# Seven nearest neighbours of 'Casablanca' in the fitted KNN model.
movie_recommendation('Casablanca', 7)

Unnamed: 0,movieId,Cosine_Similarity (degree)
0,Rear Window,0.457141
1,Citizen Kane,0.457896
2,Dr. Strangelove or: How I Learned to Stop Worr...,0.473591
3,"Godfather, The",0.476248
4,"Graduate, The",0.486059
5,North by Northwest,0.486758
6,It's a Wonderful Life,0.496012


In [215]:
# Seven nearest neighbours of "Harry Potter and the Sorcerer's Stone" in the fitted KNN model.
movie_recommendation("Harry Potter and the Sorcerer's Stone", 7)

Unnamed: 0,movieId,Cosine_Similarity (degree)
0,Harry Potter and the Chamber of Secrets,0.216779
1,Harry Potter and the Goblet of Fire,0.27284
2,Harry Potter and the Prisoner of Azkaban,0.274688
3,Shrek,0.378849
4,Harry Potter and the Half-Blood Prince,0.390901
5,Harry Potter and the Order of the Phoenix,0.404071
6,Spider-Man,0.413676
