In [60]:
import numpy as np
import pandas as pd

In [61]:
# Colab only: mount Google Drive at /content/drive so the CSV inputs read
# below from /content/drive/MyDrive are available.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [62]:
# Folder holding every input file; change this one constant to run the
# notebook against a different location.
DATA_DIR = "/content/drive/MyDrive"

movies = pd.read_csv(f"{DATA_DIR}/movies (1).csv")
netflixTitles = pd.read_csv(f"{DATA_DIR}/netflix_titles.csv")
# Read with ISO-8859-1 (presumably because the file is not valid UTF-8 - confirm).
netflixOrig = pd.read_csv(f"{DATA_DIR}/NetflixOriginals.csv", encoding="ISO-8859-1")
# NOTE(review): this file name has no ".csv" extension - confirm it matches the file on Drive.
netflix_movies = pd.read_csv(f"{DATA_DIR}/netflix_movies")

In [63]:
#lowercase the column names of both Netflix frames so they share the "title" key
netflixOrig = netflixOrig.rename(columns=str.lower)
netflixTitles = netflixTitles.rename(columns=str.lower)

#inner-join the two Netflix datasets on title (only titles present in both are kept)
merged = netflixOrig.merge(netflixTitles, how="inner", on="title")


In [64]:
#DATA CLEANUP OF MERGED (trying to match movie dataset)

#Keep titles released from 2000 through 2023 (release_year only has year granularity)
merged = merged[merged["release_year"].between(2000, 2023)]

#Keep English-language titles only, we have 346 rows after this.
#.copy() gives an independent frame so the column assignments below do not
#trigger pandas' SettingWithCopyWarning.
merged = merged[merged["language"] == "English"].copy()

#Clean up title, genre, description, cast, director:
#lowercase, and replace every punctuation character with a space.
#The pattern is a raw string: in a plain literal "\w" and "\s" are invalid
#escape sequences, which newer Python versions warn about.
for text_col in ["title", "genre", "description", "cast", "director"]:
    merged[text_col] = merged[text_col].str.lower().str.replace(r'[^\w\s]', ' ', regex=True)

#dropped unnecessary columns
merged = merged.drop(columns = ["show_id", "date_added", "duration", "listed_in"])

In [65]:
#Feature text for the similarity model: just the "description" for now.
#Rows whose features value is NaN are removed in the same step.
merged = merged.assign(features=merged["description"]).dropna(subset=["features"])

merged

Unnamed: 0,title,genre,premiere,runtime,imdb score,language,type,director,cast,country,release_year,rating,description,features
3,the open house,horror thriller,"January 19, 2018",94,3.2,English,Movie,matt angel suzanne coote,dylan minnette piercey dalton patricia bethu...,"Canada, United States",2018,TV-MA,following a tragedy a mother and her teen son...,following a tragedy a mother and her teen son...
7,the last days of american crime,heist film thriller,"June 5, 2020",149,3.7,English,Movie,olivier megaton,edgar ramírez michael pitt anna brewster pa...,United States,2020,TV-MA,a bank robber joins a plot to commit one final...,a bank robber joins a plot to commit one final...
8,paradox,musical western fantasy,"March 23, 2018",73,3.9,English,Movie,daryl hannah,neil young lukas nelson micah nelson corey ...,United States,2018,TV-MA,neil young and his band of outlaws sow seeds o...,neil young and his band of outlaws sow seeds o...
13,mercy,thriller,"November 22, 2016",90,4.2,English,Movie,chris sparling,james wolk caitlin fitzgerald tom lipinski ...,United States,2016,TV-MA,two brothers clash with their half siblings wh...,two brothers clash with their half siblings wh...
16,the last thing he wanted,political thriller,"February 21, 2020",115,4.3,English,Movie,dee rees,anne hathaway ben affleck willem dafoe toby...,United States,2020,R,a hard hitting reporter becomes entangled in t...,a hard hitting reporter becomes entangled in t...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
499,klaus,animation christmas comedy adventure,"November 15, 2019",97,8.2,English,Movie,sergio pablos,jason schwartzman j k simmons rashida jones...,"Spain, United Kingdom",2019,PG,a selfish postman and a reclusive toymaker for...,a selfish postman and a reclusive toymaker for...
500,seaspiracy,documentary,"March 24, 2021",89,8.2,English,Movie,ali tabrizi,,United States,2021,TV-14,passionate about ocean life a filmmaker sets ...,passionate about ocean life a filmmaker sets ...
502,dancing with the birds,documentary,"October 23, 2019",51,8.3,English,Movie,huw cordey,stephen fry,United States,2019,TV-PG,from ruffling their majestic feathers to naili...,from ruffling their majestic feathers to naili...
504,springsteen on broadway,one man show,"December 16, 2018",153,8.5,English,Movie,thom zimny,bruce springsteen,United States,2018,TV-MA,bruce springsteen shares personal stories from...,bruce springsteen shares personal stories from...


In [66]:
#DATA CLEANUP OF MOVIES COPIED FROM REVENUE MODEL FOR CONSISTENCY
#Get all the released movies
movies = movies[movies['status'] == 'Released']

#Get all the movies released between 2014-01-01 and 2023-08-31.
#release_date is compared as text, which orders correctly only for YYYY-MM-DD strings.
#NOTE(review): the Netflix cleanup keeps release_year >= 2000 and the earlier comment
#here said "January 2000" - confirm that 2014 is the intended lower bound.
movies = movies[(movies['release_date'] >= '2014-01-01') & (movies['release_date'] <= '2023-08-31')]

#Get all movies that have English as original language??
#Might be easier to build model and predict revenue if we eliminate disparties that could come from diff countries
movies = movies[movies['original_language'] == 'en']

#Off the bat these columns seem pretty useless so deleting them
movies = movies.drop(['poster_path', 'backdrop_path', 'recommendations'], axis=1)

#Drop rows with a missing runtime
movies = movies[movies['runtime'].notna()]

#Drop rows whose revenue is exactly 0. Rows with a missing (NaN) revenue pass
#this comparison and are kept.
#.copy() gives an independent frame so the column assignments below do not
#trigger pandas' SettingWithCopyWarning.
movies = movies[movies['revenue'] != 0].copy()


#Clean up genres, title, overview, credits, keywords:
#lowercase, and replace every punctuation character with a space.
#The pattern is a raw string: in a plain literal "\w" and "\s" are invalid
#escape sequences, which newer Python versions warn about.
for text_col in ["genres", "title", "overview", "credits", "keywords"]:
    movies[text_col] = movies[text_col].str.lower().str.replace(r'[^\w\s]', ' ', regex=True)

movies

Unnamed: 0,id,title,genres,original_language,overview,popularity,production_companies,release_date,budget,revenue,runtime,status,tagline,vote_average,vote_count,credits,keywords
0,615656,meg 2 the trench,action science fiction horror,en,an exploratory dive into the deepest depths of...,8763.998,Apelles Entertainment-Warner Bros. Pictures-di...,2023-08-02,129000000.0,352056482.0,116.0,Released,Back for seconds.,7.079,1365.0,jason statham wu jing shuya sophia cai sergio ...,based on novel or book sequel kaiju
1,758323,the pope s exorcist,horror mystery thriller,en,father gabriele amorth chief exorcist of the v...,5953.227,Screen Gems-2.0 Entertainment-Jesus & Mary-Wor...,2023-04-05,18000000.0,65675816.0,103.0,Released,Inspired by the actual files of Father Gabriel...,7.433,545.0,russell crowe daniel zovatto alex essoe franco...,spain rome italy vatican pope pig possession c...
2,667538,transformers rise of the beasts,action adventure science fiction,en,when a new threat capable of destroying the en...,5409.104,Skydance-Paramount-di Bonaventura Pictures-Bay...,2023-06-06,200000000.0,407045464.0,127.0,Released,Unite or fall.,7.340,1007.0,anthony ramos dominique fishback luna lauren v...,peru alien end of the world based on cartoon b...
3,640146,ant man and the wasp quantumania,action adventure science fiction,en,super hero partners scott lang and hope van dy...,4425.387,Marvel Studios-Kevin Feige Productions,2023-02-15,200000000.0,475766228.0,125.0,Released,Witness the beginning of a new dynasty.,6.507,2811.0,paul rudd evangeline lilly jonathan majors kat...,hero ant sequel superhero based on comic famil...
4,677179,creed iii,drama action,en,after dominating the boxing world adonis creed...,3994.342,Metro-Goldwyn-Mayer-Proximity Media-Balboa Pro...,2023-03-01,75000000.0,269000000.0,116.0,Released,You can't run from your past.,7.262,1129.0,michael b jordan tessa thompson jonathan majo...,philadelphia pennsylvania husband wife relatio...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
718621,543887,abuela s luck,drama comedy crime,en,abuela s luck is a story about appreciation an...,0.600,,2018-08-25,8000.0,10000.0,9.0,Released,A young man's relationship with his grandmothe...,7.000,1.0,manny ureña manuel cabrero wilton guzman shaki...,robbery latin hold up robbery underage drinkin...
719150,544967,romeo and juliet,comedy romance drama,en,adaption of william shakespeare s classic trag...,0.600,Memeteam,2015-12-13,0.0,100.0,16.0,Released,,0.000,0.0,james andersson filip holmberg alicia hirvenoj...,
719372,576819,active shooter,documentary crime,en,a documentary regarding the active shooter phe...,0.600,U.S. Department of Homeland Security,2017-04-01,150000.0,250000.0,90.0,Released,Everything you need for survival.,0.000,0.0,connor patrick griffin kristina anderson dave ...,
720449,554742,salty reef interview,comedy documentary,en,a seasoned reporter is faced by a new challeng...,0.600,,2016-04-17,0.0,75.0,1.0,Released,"If our reefs could speak, what would they say?",10.000,1.0,cameron hazlip,coral reef interview satire reef


In [67]:
#The data was too big for the loop that concatenated title, genres, credits and
#overview into one string, so only the movie overview is used as the feature text.
movies2 = movies.copy()

#overview was already lowercased and stripped of punctuation in the cleanup cell;
#applying the same step again changes nothing but keeps this cell self-contained.
#The pattern is a raw string so "\w" and "\s" are not treated as (invalid) string escapes.
movies2["features"] = movies2["overview"].str.lower().str.replace(r'[^\w\s]', ' ', regex=True)

#drop movies that have no overview text
movies2 = movies2.dropna(subset = ["features"])

movies2

Unnamed: 0,id,title,genres,original_language,overview,popularity,production_companies,release_date,budget,revenue,runtime,status,tagline,vote_average,vote_count,credits,keywords,features
0,615656,meg 2 the trench,action science fiction horror,en,an exploratory dive into the deepest depths of...,8763.998,Apelles Entertainment-Warner Bros. Pictures-di...,2023-08-02,129000000.0,352056482.0,116.0,Released,Back for seconds.,7.079,1365.0,jason statham wu jing shuya sophia cai sergio ...,based on novel or book sequel kaiju,an exploratory dive into the deepest depths of...
1,758323,the pope s exorcist,horror mystery thriller,en,father gabriele amorth chief exorcist of the v...,5953.227,Screen Gems-2.0 Entertainment-Jesus & Mary-Wor...,2023-04-05,18000000.0,65675816.0,103.0,Released,Inspired by the actual files of Father Gabriel...,7.433,545.0,russell crowe daniel zovatto alex essoe franco...,spain rome italy vatican pope pig possession c...,father gabriele amorth chief exorcist of the v...
2,667538,transformers rise of the beasts,action adventure science fiction,en,when a new threat capable of destroying the en...,5409.104,Skydance-Paramount-di Bonaventura Pictures-Bay...,2023-06-06,200000000.0,407045464.0,127.0,Released,Unite or fall.,7.340,1007.0,anthony ramos dominique fishback luna lauren v...,peru alien end of the world based on cartoon b...,when a new threat capable of destroying the en...
3,640146,ant man and the wasp quantumania,action adventure science fiction,en,super hero partners scott lang and hope van dy...,4425.387,Marvel Studios-Kevin Feige Productions,2023-02-15,200000000.0,475766228.0,125.0,Released,Witness the beginning of a new dynasty.,6.507,2811.0,paul rudd evangeline lilly jonathan majors kat...,hero ant sequel superhero based on comic famil...,super hero partners scott lang and hope van dy...
4,677179,creed iii,drama action,en,after dominating the boxing world adonis creed...,3994.342,Metro-Goldwyn-Mayer-Proximity Media-Balboa Pro...,2023-03-01,75000000.0,269000000.0,116.0,Released,You can't run from your past.,7.262,1129.0,michael b jordan tessa thompson jonathan majo...,philadelphia pennsylvania husband wife relatio...,after dominating the boxing world adonis creed...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
718621,543887,abuela s luck,drama comedy crime,en,abuela s luck is a story about appreciation an...,0.600,,2018-08-25,8000.0,10000.0,9.0,Released,A young man's relationship with his grandmothe...,7.000,1.0,manny ureña manuel cabrero wilton guzman shaki...,robbery latin hold up robbery underage drinkin...,abuela s luck is a story about appreciation an...
719150,544967,romeo and juliet,comedy romance drama,en,adaption of william shakespeare s classic trag...,0.600,Memeteam,2015-12-13,0.0,100.0,16.0,Released,,0.000,0.0,james andersson filip holmberg alicia hirvenoj...,,adaption of william shakespeare s classic trag...
719372,576819,active shooter,documentary crime,en,a documentary regarding the active shooter phe...,0.600,U.S. Department of Homeland Security,2017-04-01,150000.0,250000.0,90.0,Released,Everything you need for survival.,0.000,0.0,connor patrick griffin kristina anderson dave ...,,a documentary regarding the active shooter phe...
720449,554742,salty reef interview,comedy documentary,en,a seasoned reporter is faced by a new challeng...,0.600,,2016-04-17,0.0,75.0,1.0,Released,"If our reefs could speak, what would they say?",10.000,1.0,cameron hazlip,coral reef interview satire reef,a seasoned reporter is faced by a new challeng...


In [68]:
#Both datasets go through one TF-IDF vectorizer, so they are stacked into a single frame.
#From the resulting vector matrix, similar rows can be found with cosine_similarity.
#When looking up similar movies we just make sure the most similar movie is not a Netflix movie.

#isNetflix tells the two datasets apart: 1 for rows from merged, 0 for rows from movies2.
#assign() returns a new frame, so merged and movies2 themselves are left unchanged.
netflixDf = merged.assign(isNetflix=1)
theatreDf = movies2.assign(isNetflix=0)

#stack the rows, Netflix rows first (row-wise concatenation is the pd.concat default)
netflix_theatre_concat = pd.concat([netflixDf, theatreDf])

netflix_theatre_concat

Unnamed: 0,title,genre,premiere,runtime,imdb score,language,type,director,cast,country,...,production_companies,release_date,budget,revenue,status,tagline,vote_average,vote_count,credits,keywords
3,the open house,horror thriller,"January 19, 2018",94.0,3.2,English,Movie,matt angel suzanne coote,dylan minnette piercey dalton patricia bethu...,"Canada, United States",...,,,,,,,,,,
7,the last days of american crime,heist film thriller,"June 5, 2020",149.0,3.7,English,Movie,olivier megaton,edgar ramírez michael pitt anna brewster pa...,United States,...,,,,,,,,,,
8,paradox,musical western fantasy,"March 23, 2018",73.0,3.9,English,Movie,daryl hannah,neil young lukas nelson micah nelson corey ...,United States,...,,,,,,,,,,
13,mercy,thriller,"November 22, 2016",90.0,4.2,English,Movie,chris sparling,james wolk caitlin fitzgerald tom lipinski ...,United States,...,,,,,,,,,,
16,the last thing he wanted,political thriller,"February 21, 2020",115.0,4.3,English,Movie,dee rees,anne hathaway ben affleck willem dafoe toby...,United States,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
718621,abuela s luck,,,9.0,,,,,,,...,,2018-08-25,8000.0,10000.0,Released,A young man's relationship with his grandmothe...,7.0,1.0,manny ureña manuel cabrero wilton guzman shaki...,robbery latin hold up robbery underage drinkin...
719150,romeo and juliet,,,16.0,,,,,,,...,Memeteam,2015-12-13,0.0,100.0,Released,,0.0,0.0,james andersson filip holmberg alicia hirvenoj...,
719372,active shooter,,,90.0,,,,,,,...,U.S. Department of Homeland Security,2017-04-01,150000.0,250000.0,Released,Everything you need for survival.,0.0,0.0,connor patrick griffin kristina anderson dave ...,
720449,salty reef interview,,,1.0,,,,,,,...,,2016-04-17,0.0,75.0,Released,"If our reefs could speak, what would they say?",10.0,1.0,cameron hazlip,coral reef interview satire reef


In [69]:
#keep one row per title; the first occurrence wins (drop_duplicates default)
netflix_theatre_concat1 = netflix_theatre_concat.drop_duplicates(subset=['title'])

#movie_id is a running number 0..n-1 used to differentiate rows later.
#assign() builds a new frame, which also avoids the SettingWithCopyWarning
#that a direct column assignment on the de-duplicated frame would raise.
netflix_theatre_concat = netflix_theatre_concat1.assign(
    movie_id=np.arange(netflix_theatre_concat1.shape[0])
)


In [70]:
#Converting our "feature" data to vectors using TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

#we can experiment with TfidfVectorizer() parameters to see the impact on the vectorization
#ex. vector = TfidfVectorizer(max_df=0.8, min_df=2, ngram_range=(1, 2))

#default settings for now; fit_transform learns the vocabulary from the "features"
#column and returns the TF-IDF matrix (converted to a dense array further down with .toarray())
vector = TfidfVectorizer()
vectors = vector.fit_transform(netflix_theatre_concat["features"])


In [71]:
from sklearn.metrics.pairwise import cosine_similarity
#cosine similarity (dot product of two vectors divided by the product of their magnitudes)
#called with a single argument, so every row of `vectors` is compared with every row of `vectors`

cos_sim = cosine_similarity(vectors)

In [72]:
cos_sim.shape

(3265, 3265)

In [73]:
#Check whether a title is one of the Netflix titles
def doesTitleExist(name):
    """Return "YES" if `name` is among the values of netflixDf["title"], else "NO".

    The lookup is an exact match against the title column of the
    module-level `netflixDf` frame.
    """
    netflix_titles = netflixDf["title"].values
    return "YES" if name in netflix_titles else "NO"

# Dimensionality Reduction Visual

This visualization covers both the TF-IDF model and the Word2Vec model. Using each movie's description, we want to see how the Netflix movies are mapped relative to one another, and whether any pattern of similarity emerges based on the genre of the movie.

In [74]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# To make the visualizations
!git clone https://github.com/CAHLR/d3-scatterplot.git
from google.colab.output import eval_js
from IPython.display import Javascript
from gensim.models import KeyedVectors, Word2Vec

fatal: destination path 'd3-scatterplot' already exists and is not an empty directory.


In [75]:
#dense version of the TF-IDF matrix
vector_list_tdif = vectors.toarray()
#the same values as a DataFrame whose columns are labelled with the vectorizer's feature names
vector_df = pd.DataFrame(vector_list_tdif, columns=vector.get_feature_names_out())
vector_list_tdif.shape

(3265, 15308)

In [76]:
#positional (0..n-1) view of the combined frame so its rows line up with the vector rows
concat_positional = netflix_theatre_concat.reset_index(drop=True)

#one row of TF-IDF values per movie, plus title and genre for labelling
only_netflix = pd.DataFrame(vector_list_tdif)
only_netflix["Title"] = concat_positional["title"]
only_netflix["genre"] = concat_positional["genre"]

#keep only the rows whose title is a Netflix title
only_netflix["In Netflix"] = only_netflix["Title"].apply(doesTitleExist)
only_netflix = only_netflix[only_netflix["In Netflix"] == "YES"]

#vector columns only, i.e. without the three label columns added above
only_netflix_vectors = only_netflix.drop(columns = ["Title", "In Netflix", "genre"])
only_netflix_vectors.shape

(346, 15308)

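# Using UMAP and HDBSCAN to Cluster Instead of KMeans

KMeans did not perform as well on this data, so the clustering below uses UMAP to reduce the vectors to two dimensions and HDBSCAN to cluster the result.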

In [77]:
from IPython.display import Image
# from sklearn.manifold import TSNE


import gensim
import requests
import string
from google.colab.output import eval_js

In [78]:
pip install umap-learn



In [79]:
pip install hdbscan



In [80]:
import umap
import hdbscan
from matplotlib import pyplot as plt

# Project the Netflix TF-IDF vectors down to 2 components with UMAP.
# random_state is fixed so repeated runs give the same embedding.
umap_reducer = umap.UMAP(n_neighbors=50, min_dist=0.0, n_components=2, random_state=42)
embedding_ = umap_reducer.fit_transform(only_netflix_vectors)

# Cluster the 2-D points with HDBSCAN; hbd_labels holds one cluster label per row.
hdbscan_clusterer = hdbscan.HDBSCAN(min_samples=1, min_cluster_size=6)
hbd_labels = hdbscan_clusterer.fit_predict(embedding_)



  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


In [81]:
# Positional (0..n-1) view of the Netflix rows so titles and genres line up with the embedding rows.
netflix_positional = only_netflix.reset_index(drop=True)

umap_df = pd.DataFrame({
    "x": embedding_[:, 0],
    "y": embedding_[:, 1],
    "Cluster": hbd_labels,
    "Title": netflix_positional["Title"],
    "Genre": netflix_positional["genre"],
})

# Attach Revenue by title. merge() defaults to an inner join, so titles that do not
# appear in netflix_movies are dropped here.
umap_df = umap_df.merge(
    netflix_movies[["title", "Revenue"]], left_on="Title", right_on="title"
).drop(columns=["title"])
umap_df

Unnamed: 0,x,y,Cluster,Title,Genre,Revenue
0,7.550463,6.073264,-1,the open house,horror thriller,14521.0
1,8.603117,5.612448,5,the last days of american crime,heist film thriller,31220247.0
2,10.126534,4.086634,-1,paradox,musical western fantasy,209752.0
3,7.559603,4.849699,13,mercy,thriller,1136167.0
4,7.197787,6.407810,7,the last thing he wanted,political thriller,1431904.0
...,...,...,...,...,...,...
341,7.749982,3.861218,-1,klaus,animation christmas comedy adventure,195673000.0
342,8.492708,5.205153,9,seaspiracy,documentary,770945583.0
343,7.713545,3.613604,1,dancing with the birds,documentary,120000.0
344,10.109854,4.952626,-1,springsteen on broadway,one man show,18144644.0


In [82]:
%matplotlib inline
from matplotlib import pyplot as plt
# Write the plot data as a tab-separated file inside the d3-scatterplot folder,
# which is the directory the HTTP server further down is started in.
umap_df.to_csv('d3-scatterplot/tdif_vectorized_umap_version1.tsv', sep='\t', index=False)

In [83]:
def show_port(port, data_file, width=600, height=800):
  """Embed the page served on a local port as an iframe in the cell output.

  Args:
    port: integer port of the local HTTP server; resolved to a URL through
      google.colab.kernel.proxyPort.
    data_file: name of the dataset file, passed to index.html as the
      ``dataset`` query parameter. It is URL-encoded before being inserted so
      spaces, quotes or ``&`` cannot break the URL or the surrounding
      JavaScript string.
    width: accepted for backward compatibility but not used; the iframe
      width is always '90%'.
    height: iframe height, formatted as an integer.
  """
  from urllib.parse import quote  # local import keeps this cell self-contained

  dataset_param = quote(data_file)
  # `const` keeps the iframe handle local to the async function instead of
  # creating an implicit global in the page.
  display(Javascript("""
  (async ()=>{
    const fm = document.createElement('iframe')
    fm.src = await google.colab.kernel.proxyPort(%d) + '/index.html?dataset=%s'
    fm.width = '90%%'
    fm.height = '%d'
    fm.frameBorder = 0
    document.body.append(fm)
  })();
  """ % (port, dataset_param, height)))

port = 8000
data_file = 'tdif_vectorized_umap_version1.tsv'
height = 1500

# Serve the d3-scatterplot folder in the background so the iframe can load it.
get_ipython().system_raw('cd d3-scatterplot && python3 -m http.server %d &' % port)
# height must be passed by keyword: the third positional parameter of show_port is
# `width`, so show_port(port, data_file, height) left the iframe at its default height.
show_port(port, data_file, height=height)

<IPython.core.display.Javascript object>

In [84]:
 #Investigating the Cluster
# Mean revenue per cluster. The aggregation is named by string: passing np.mean
# to agg() raises a FutureWarning on recent pandas versions.
revenue_grouped = umap_df.groupby("Cluster").agg({"Revenue": "mean"})

# Number of movies per (Cluster, Genre) pair.
genre_grouped = umap_df.groupby(by=["Cluster", "Genre"])[["x"]].count()

# Most frequent genre per cluster: pivot genres into columns (0 where a genre does
# not occur in a cluster) and take the column with the highest count in each row.
common_genre = pd.DataFrame(genre_grouped["x"].unstack(fill_value=0).idxmax(axis=1))

# One row per cluster: mean revenue and most common genre, highest revenue first.
# NOTE(review): HDBSCAN labels noise points as -1, so the -1 row is presumably
# the unclustered movies rather than a real cluster - confirm before interpreting it.
grouped = (
    revenue_grouped
    .merge(common_genre, right_index=True, left_index=True)
    .rename({0: "Common Genre"}, axis=1)
    .sort_values(by="Revenue", ascending=False)
)
grouped

Unnamed: 0_level_0,Revenue,Common Genre
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1
9,122062200.0,comedy
17,105552300.0,romantic comedy
14,100058100.0,drama
7,84105490.0,drama
13,76694980.0,comedy
4,73717120.0,documentary
12,61372520.0,romantic comedy
8,45010820.0,adventure comedy
2,44670270.0,documentary
-1,42831980.0,documentary


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=419f63df-2e68-44d4-9d98-aec60329482b' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>