In [56]:
# Imports
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import joblib

In [57]:
# Load cleaned data
# The CSV location can be overridden with the NETFLIX_CLEAN_CSV environment
# variable, so the notebook can run on machines where the absolute path below
# does not exist. When the variable is unset, the original path is used.
path = os.environ.get(
    "NETFLIX_CLEAN_CSV",
    r"C:\Users\Admibn\OneDrive\Desktop\Netflix Project\netflix-analysis-project\data\cleaned\netflix_titles_clean.csv",
)
df = pd.read_csv(path)
df.shape

(8804, 13)

In [58]:
# Preview the first five rows (head() defaults to n=5)
df.head()

Unnamed: 0,title,show_id,type,release_year,primary_country,genres_str,rating,date_added,date_added_year,date_added_month,director_primary,cast_primary,metadata
0,Dick Johnson Is Dead,s1,Movie,2020,United States,Documentaries,PG-13,2021-09-25,2021.0,9.0,Kirsten Johnson,,Dick Johnson Is Dead | As her father nears the...
1,Blood & Water,s2,Tv Show,2021,South Africa,International TV Shows|TV Dramas|TV Mysteries,TV-MA,2021-09-24,2021.0,9.0,,Ama Qamata,Blood & Water | After crossing paths at a part...
2,Ganglands,s3,Tv Show,2021,Unknown,Crime TV Shows|International TV Shows|TV Actio...,TV-MA,2021-09-24,2021.0,9.0,Julien Leclercq,Sami Bouajila,Ganglands | To protect his family from a power...
3,Jailbirds New Orleans,s4,Tv Show,2021,Unknown,Docuseries|Reality TV,TV-MA,2021-09-24,2021.0,9.0,,,"Jailbirds New Orleans | Feuds, flirtations and..."
4,Kota Factory,s5,Tv Show,2021,India,International TV Shows|Romantic TV Shows|TV Co...,TV-MA,2021-09-24,2021.0,9.0,,Mayur More,Kota Factory | In a city of coaching centers k...


In [59]:
# Build one lower-cased text field per title: the metadata text followed by the genres.
# Missing values are treated as empty strings so the concatenation never yields NaN.
metadata_text = df["metadata"].fillna("").str.lower()
genre_text = df["genres_str"].fillna("").str.lower()
df["combined_features"] = metadata_text + " " + genre_text

In [60]:
# TF-IDF vectorization
# Turn each title's combined text into a numeric vector so titles can be compared
# by content (and therefore recommended by similarity). English stop words are
# dropped and the vocabulary size is capped.
MAX_TFIDF_FEATURES = 5000
tfidf = TfidfVectorizer(max_features=MAX_TFIDF_FEATURES, stop_words="english")
tfidf_matrix = tfidf.fit_transform(df["combined_features"])
tfidf_matrix.shape

(8804, 5000)

In [61]:
# Cosine similarity between every pair of shows.
# Called with a single argument, cosine_similarity compares the rows of the
# matrix against themselves, which is the same as passing the matrix twice.
cosine_sim = cosine_similarity(tfidf_matrix)
cosine_sim.shape

(8804, 8804)

In [62]:
# Map each show title to its df index value.
# Why? The cosine similarity matrix is addressed by row number, not by title.
# Note: Series.drop_duplicates() compares the *values* (the df index labels, which
# are already unique), so it never removed repeated titles. De-duplicate on the
# Series index instead, keeping the first occurrence of each title.
indices = pd.Series(df.index, index=df["title"])
indices = indices[~indices.index.duplicated(keep="first")]
indices.head()

title
Dick Johnson Is Dead     0
Blood & Water            1
Ganglands                2
Jailbirds New Orleans    3
Kota Factory             4
dtype: int64

In [63]:
# Recommendation function
# Given a show title, return the top 10 (by default) most similar shows
def recommend(title, cosine_sim=cosine_sim, top_n=10):
    """Return the titles most similar to ``title``.

    Parameters
    ----------
    title : str
        Title to look up. Matching ignores case and surrounding whitespace.
        If several rows share the title, the first one is used.
    cosine_sim : array-like
        Pairwise similarity matrix whose row/column ``i`` corresponds to the
        ``i``-th row of ``df``. Defaults to the matrix computed in this notebook.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    list of str or str
        Up to ``top_n`` titles ordered from most to least similar, or the
        string ``"Title not found in the dataset."`` when no row matches
        (kept as a string so existing callers keep working).
    """
    query = title.strip().lower()  # Normalize input

    # Normalize the catalogue the same way as the query and use this single
    # lookup both for the existence check and for locating the row.
    catalogue = df["title"].str.strip().str.lower()
    is_match = (catalogue == query).fillna(False).to_numpy(dtype=bool)
    matches = np.flatnonzero(is_match)
    if matches.size == 0:
        return "Title not found in the dataset."
    idx = int(matches[0])  # positional row number of the first matching title

    scores = np.asarray(cosine_sim[idx]).ravel()  # similarity of every row to the query
    # Stable descending sort: rows with equal scores stay in ascending row order.
    ranked = np.argsort(-scores, kind="stable")
    # Remove the query row explicitly. Dropping "whatever is ranked first" is
    # wrong when another row ties with the query's self-similarity.
    ranked = ranked[ranked != idx][:max(top_n, 0)]
    return df["title"].iloc[ranked].to_list()

In [64]:
# Try the recommender with a mixed-case title
sample_recommendations = recommend("breaKing BAD")
print(sample_recommendations)

['Futmalls.com', 'Ozark', 'Dare Me', 'Get Shorty', 'Have You Ever Fallen in Love, Miss Jiang?', 'Bad Blood', 'Sparta', 'Tunnel', 'The Lizzie Borden Chronicles', 'The Judgement']


In [65]:
# Save the model and data reference
# The output directory can be overridden with the NETFLIX_MODEL_DIR environment
# variable; when it is unset, the original location is used.
model_dir = os.environ.get(
    "NETFLIX_MODEL_DIR",
    r"C:\Users\Admibn\OneDrive\Desktop\Netflix Project\netflix-analysis-project\app\models",
)
os.makedirs(model_dir, exist_ok=True)
# os.path.join inserts the platform's path separator instead of a hard-coded backslash.
joblib.dump(tfidf, os.path.join(model_dir, "tfidf_vectorizer.pkl"))
joblib.dump(cosine_sim, os.path.join(model_dir, "cosine_similarity.pkl"))
# Joblib is used because it saves large numpy arrays efficiently, and the
# cosine similarity matrix can be large.
joblib.dump(df[["title", "combined_features"]], os.path.join(model_dir, "data_reference.pkl"))

['C:\\Users\\Admibn\\OneDrive\\Desktop\\Netflix Project\\netflix-analysis-project\\app\\models\\data_reference.pkl']

In [66]:
# Confirm the artifacts were written by listing the model directory
saved_files = os.listdir(model_dir)
saved_files

['cosine_similarity.pkl', 'data_reference.pkl', 'tfidf_vectorizer.pkl']