<a href="https://colab.research.google.com/github/safakan/CCTB_repo_safak_atakan_celik/blob/main/DEA113/NOTES_Build_Recommendation_Systems_for_Movies_Like_Netflix.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Building Recommendation Systems

## Libraries

In [172]:
%pip install tqdm==4.66.4  | tail -n 1
%pip install pandas==2.1.4  | tail -n 1
%pip install scikit-learn==1.5.1  | tail -n 1



In [173]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
import statistics
import kagglehub

# just for practice sake
def warn(*args, **kwargs):
    pass

import warnings

warnings.warn = warn
warnings.filterwarnings('ignore')


## Extract: Movie Lens Small Latest Dataset

In [174]:
# import kagglehub
# Download latest version
path = kagglehub.dataset_download("shubhammehta21/movie-lens-small-latest-dataset")

print("Path to dataset files:", path)

Path to dataset files: /root/.cache/kagglehub/datasets/shubhammehta21/movie-lens-small-latest-dataset/versions/1


In [175]:
df_movies = pd.read_csv(path + "/movies.csv")
df_ratings = pd.read_csv(path + "/ratings.csv")
df_tags = pd.read_csv(path + "/tags.csv")

In [176]:
df_movies.sample(5)

Unnamed: 0,movieId,title,genres
462,528,"Scout, The (1994)",Comedy|Drama
3319,4492,Critters (1986),Comedy|Sci-Fi
2724,3658,Quatermass and the Pit (1967),Horror|Sci-Fi
933,1233,"Boot, Das (Boat, The) (1981)",Action|Drama|War
9293,158783,The Handmaiden (2016),Drama|Romance|Thriller


In [177]:
df_ratings.sample(5)

Unnamed: 0,userId,movieId,rating,timestamp
60283,388,62999,5.0,1386694287
67507,437,296,5.0,859722193
5359,37,231,5.0,845239442
80298,506,47629,4.0,1424487000
28053,193,2858,2.5,1436558605


In [178]:
df_tags.sample(5)

Unnamed: 0,userId,movieId,tag,timestamp
1540,474,1996,ghosts,1137373825
3081,567,7361,mind-bending,1525282728
524,184,27156,epic,1537094270
587,318,48698,the catholic church is the most corrupt organi...,1276006189
2066,474,6041,In Netflix queue,1137200888


## Check, Transform, PreProcess dataset

Merging all three dataframes into one dataframe to be used: movies, ratings, tags

In [179]:
df_movies_n_ratings = df_movies.merge(df_ratings, on="movieId", how="inner")
df_movies_n_ratings_n_tags = df_movies_n_ratings.merge(df_tags, on=["movieId", "userId"], how="inner")
df = df_movies_n_ratings_n_tags
df

Unnamed: 0,movieId,title,genres,userId,rating,timestamp_x,tag,timestamp_y
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336,4.0,1122227329,pixar,1139045764
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474,4.0,978575760,pixar,1137206825
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567,3.5,1525286001,fun,1525286013
3,2,Jumanji (1995),Adventure|Children|Fantasy,62,4.0,1528843890,fantasy,1528843929
4,2,Jumanji (1995),Adventure|Children|Fantasy,62,4.0,1528843890,magic board game,1528843932
...,...,...,...,...,...,...,...,...
3471,187595,Solo: A Star Wars Story (2018),Action|Adventure|Children|Sci-Fi,62,4.0,1528934550,star wars,1528934552
3472,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184,3.5,1537098554,anime,1537098582
3473,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184,3.5,1537098554,comedy,1537098587
3474,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184,3.5,1537098554,gintama,1537098603


Exploratory Data Analysis (EDA)

In [180]:
# shape: row and column count
df.shape

(3476, 8)

In [181]:
# data types
df.dtypes

Unnamed: 0,0
movieId,int64
title,object
genres,object
userId,int64
rating,float64
timestamp_x,int64
tag,object
timestamp_y,int64


In [182]:
# missing values
df.isnull().sum()

Unnamed: 0,0
movieId,0
title,0
genres,0
userId,0
rating,0
timestamp_x,0
tag,0
timestamp_y,0


Dropping the timestamp columns because they're irrelevant in this case

In [183]:
df.drop(columns=["timestamp_x", "timestamp_y"], inplace=True)
df

Unnamed: 0,movieId,title,genres,userId,rating,tag
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336,4.0,pixar
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474,4.0,pixar
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567,3.5,fun
3,2,Jumanji (1995),Adventure|Children|Fantasy,62,4.0,fantasy
4,2,Jumanji (1995),Adventure|Children|Fantasy,62,4.0,magic board game
...,...,...,...,...,...,...
3471,187595,Solo: A Star Wars Story (2018),Action|Adventure|Children|Sci-Fi,62,4.0,star wars
3472,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184,3.5,anime
3473,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184,3.5,comedy
3474,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184,3.5,gintama


In [184]:
df.shape

(3476, 6)

## Recommendation Systems

### Popularity-based recommendation

In [201]:
# copy the dataframe for this specific system's use case
df_popularity = df

In [202]:
df_popularity

Unnamed: 0,movieId,title,genres,userId,rating,tag
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336,4.0,pixar
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474,4.0,pixar
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567,3.5,fun
3,2,Jumanji (1995),Adventure|Children|Fantasy,62,4.0,fantasy
4,2,Jumanji (1995),Adventure|Children|Fantasy,62,4.0,magic board game
...,...,...,...,...,...,...
3471,187595,Solo: A Star Wars Story (2018),Action|Adventure|Children|Sci-Fi,62,4.0,star wars
3472,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184,3.5,anime
3473,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184,3.5,comedy
3474,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184,3.5,gintama


Number of votes (numVotes) Feature

In [203]:
# calculate and store
num_votes = df_popularity.groupby("movieId").size().reset_index(name="numVotes")


# merge
df_popularity = pd.merge(df_popularity, num_votes, on="movieId")
df_popularity

Unnamed: 0,movieId,title,genres,userId,rating,tag,numVotes
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336,4.0,pixar,3
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474,4.0,pixar,3
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567,3.5,fun,3
3,2,Jumanji (1995),Adventure|Children|Fantasy,62,4.0,fantasy,4
4,2,Jumanji (1995),Adventure|Children|Fantasy,62,4.0,magic board game,4
...,...,...,...,...,...,...,...
3471,187595,Solo: A Star Wars Story (2018),Action|Adventure|Children|Sci-Fi,62,4.0,star wars,2
3472,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184,3.5,anime,4
3473,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184,3.5,comedy,4
3474,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184,3.5,gintama,4


Average Rating (avgRating) Feature

In [204]:
# calculate and store
avg_rating = df_popularity.groupby("movieId")["rating"].mean().reset_index(name="avgRating")

# merge
df_popularity = pd.merge(df_popularity, avg_rating, on="movieId")
df_popularity

Unnamed: 0,movieId,title,genres,userId,rating,tag,numVotes,avgRating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336,4.0,pixar,3,3.833333
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474,4.0,pixar,3,3.833333
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567,3.5,fun,3,3.833333
3,2,Jumanji (1995),Adventure|Children|Fantasy,62,4.0,fantasy,4,3.750000
4,2,Jumanji (1995),Adventure|Children|Fantasy,62,4.0,magic board game,4,3.750000
...,...,...,...,...,...,...,...,...
3471,187595,Solo: A Star Wars Story (2018),Action|Adventure|Children|Sci-Fi,62,4.0,star wars,2,4.000000
3472,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184,3.5,anime,4,3.500000
3473,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184,3.5,comedy,4,3.500000
3474,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184,3.5,gintama,4,3.500000


Cleaning by removing unnecessary columns and rows

In [205]:
# df_popularity.drop(columns=["genres", "userId", "rating", "tag"], inplace=True)
# df_popularity.drop_duplicates(inplace=True)
# df_popularity

Unnamed: 0,movieId,title,numVotes,avgRating
0,1,Toy Story (1995),3,3.833333
3,2,Jumanji (1995),4,3.750000
7,3,Grumpier Old Men (1995),2,2.500000
9,5,Father of the Bride Part II (1995),2,1.500000
11,7,Sabrina (1995),1,3.000000
...,...,...,...,...
3461,183611,Game Night (2018),3,4.000000
3464,184471,Tomb Raider (2018),3,3.500000
3467,187593,Deadpool 2 (2018),3,4.000000
3470,187595,Solo: A Star Wars Story (2018),2,4.000000


Calculating Weighted Scores

In [206]:
import statistics

def calculate_weighted_score(avg_rating, num_votes, C, m):
  return (num_votes * avg_rating + m * C) / (num_votes + m)

# Calculating the global average rating (C)
average_avgRating = statistics.mean(df_popularity["avgRating"])

# Calculating the average number of votes (m) - threshold
average_numVotes = statistics.mean(df_popularity["numVotes"])

# Calculating and storing weighted scores
df_popularity["weighted_score"] = df_popularity.apply(lambda row: calculate_weighted_score(row["avgRating"], row["numVotes"], average_avgRating, average_numVotes), axis=1)
df_popularity

Unnamed: 0,movieId,title,numVotes,avgRating,weighted_score
0,1,Toy Story (1995),3,3.833333,3.788714
3,2,Jumanji (1995),4,3.750000,3.743421
7,3,Grumpier Old Men (1995),2,2.500000,3.168895
9,5,Father of the Bride Part II (1995),2,1.500000,2.711680
11,7,Sabrina (1995),1,3.000000,3.515304
...,...,...,...,...,...
3461,183611,Game Night (2018),3,4.000000,3.881749
3464,184471,Tomb Raider (2018),3,3.500000,3.602644
3467,187593,Deadpool 2 (2018),3,4.000000,3.881749
3470,187595,Solo: A Star Wars Story (2018),2,4.000000,3.854716


Exercise 1 - Get the top 5 suggestions sorting by score in descending order

In [207]:
df_popularity.sort_values(by="weighted_score", ascending=False).head(5)

Unnamed: 0,movieId,title,numVotes,avgRating,weighted_score
199,296,Pulp Fiction (1994),181,4.983425,4.967226
1337,2959,Fight Club (1999),54,4.944444,4.893394
604,924,2001: A Space Odyssey (1968),41,4.95122,4.884498
998,1732,"Big Lebowski, The (1998)",32,4.953125,4.868802
164,293,Léon: The Professional (a.k.a. The Professiona...,35,4.928571,4.852577


### Content Based Recommendation

In [212]:
df_content = df_popularity.copy()

In [213]:
df_content

Unnamed: 0,movieId,title,numVotes,avgRating,weighted_score
0,1,Toy Story (1995),3,3.833333,3.788714
1,2,Jumanji (1995),4,3.750000,3.743421
2,3,Grumpier Old Men (1995),2,2.500000,3.168895
3,5,Father of the Bride Part II (1995),2,1.500000,2.711680
4,7,Sabrina (1995),1,3.000000,3.515304
...,...,...,...,...,...
1459,183611,Game Night (2018),3,4.000000,3.881749
1460,184471,Tomb Raider (2018),3,3.500000,3.602644
1461,187593,Deadpool 2 (2018),3,4.000000,3.881749
1462,187595,Solo: A Star Wars Story (2018),2,4.000000,3.854716
