In [None]:
import numpy as np
import pandas as pd

In [None]:
# Read the three source tables (theatre movies, Netflix catalogue, Netflix originals) in one pass
movies, netflixTitles, netflixOrig = (
    pd.read_csv(csv_path)
    for csv_path in ("movies.csv", "netflix_titles.csv", "NetflixOriginals.csv")
)

In [None]:
#Lowercase every column name in both Netflix tables
netflixOrig = netflixOrig.rename(columns=str.lower)
netflixTitles = netflixTitles.rename(columns=str.lower)

#Inner-join the two Netflix tables on "title": only titles present in both are kept
merged = netflixOrig.merge(netflixTitles, on="title", how="inner")


In [None]:
#DATA CLEANUP OF MERGED (trying to match movie dataset)

#Keep titles with a release_year from 2000 through 2023
#(release_year only has year granularity, so no month cut-off is applied here)
merged = merged[(merged["release_year"] >= 2000) & (merged["release_year"] <= 2023)]

#Get all movies in English (346 rows in the recorded run)
#.copy() makes `merged` an independent frame, so the column assignments below
#do not trigger pandas' SettingWithCopyWarning
merged = merged[merged["language"] == "English"].copy()

#Clean up title, genre, description, cast, director:
#lowercase, then replace every character that is neither a word character nor
#whitespace with a space.
#The pattern is a raw string so \w and \s reach the regex engine untouched instead
#of being parsed as (invalid) string escape sequences.
for text_col in ["title", "genre", "description", "cast", "director"]:
    merged[text_col] = merged[text_col].str.lower().str.replace(r"[^\w\s]", " ", regex=True)

#dropped unnecessary columns
merged = merged.drop(columns=["show_id", "date_added", "duration", "listed_in"])

In [None]:
#Get important features
#Adding string Series is already vectorised over every row, so a single assignment
#builds the whole column (no per-row loop needed).
#If any of the three parts is NaN, the resulting feature string is NaN as well.
merged["features"] = merged["title"] + " " + merged["genre"] + " " + merged["description"]

#drop nan values from "features"
merged = merged.dropna(subset=["features"])

merged["features"]

3      the open house horror thriller following a tra...
7      the last days of american crime heist film thr...
8      paradox musical western fantasy neil young and...
13     mercy thriller two brothers clash with their h...
16     the last thing he wanted political thriller a ...
                             ...                        
498    klaus animation christmas comedy adventure a s...
499    seaspiracy documentary passionate about ocean ...
501    dancing with the birds documentary from ruffli...
503    springsteen on broadway one man show bruce spr...
504    david attenborough  a life on our planet docum...
Name: features, Length: 346, dtype: object

In [None]:
#DATA CLEANUP OF MOVIES COPIED FROM REVENUE MODEL FOR CONSISTENCY
#Get all the released movies
movies = movies[movies['status'] == 'Released']

#Get all the movies released from 2014-01-01 through 2023-08-31
#NOTE(review): this comment previously said "January 2000" while the filter starts in 2014 - confirm which is intended
movies = movies[(movies['release_date'] >= '2014-01-01') & (movies['release_date'] <= '2023-08-31')]

#Get all movies that have English as original language
#Might be easier to build model and predict revenue if we eliminate disparities that could come from diff countries
movies = movies[movies['original_language'] == 'en']

#Off the bat these columns seem pretty useless so deleting them
movies = movies.drop(['poster_path', 'backdrop_path', 'recommendations'], axis=1)

#Drop null runtime
movies = movies[movies['runtime'].notna()]

#Drop movies whose revenue is exactly 0 (a missing revenue is not equal to 0, so such rows stay)
#.copy() makes `movies` an independent frame, so the column assignments below
#do not trigger pandas' SettingWithCopyWarning
movies = movies[movies['revenue'] != 0].copy()


#Clean up genres, title, overview, credits, keywords:
#lowercase, then replace every character that is neither a word character nor
#whitespace with a space.
#The pattern is a raw string so \w and \s reach the regex engine untouched instead
#of being parsed as (invalid) string escape sequences.
for text_col in ["genres", "title", "overview", "credits", "keywords"]:
    movies[text_col] = movies[text_col].str.lower().str.replace(r"[^\w\s]", " ", regex=True)

movies

Unnamed: 0,id,title,genres,original_language,overview,popularity,production_companies,release_date,budget,revenue,runtime,status,tagline,vote_average,vote_count,credits,keywords
0,615656,meg 2 the trench,action science fiction horror,en,an exploratory dive into the deepest depths of...,8763.998,Apelles Entertainment-Warner Bros. Pictures-di...,2023-08-02,129000000.0,352056482.0,116.0,Released,Back for seconds.,7.079,1365.0,jason statham wu jing shuya sophia cai sergio ...,based on novel or book sequel kaiju
1,758323,the pope s exorcist,horror mystery thriller,en,father gabriele amorth chief exorcist of the v...,5953.227,Screen Gems-2.0 Entertainment-Jesus & Mary-Wor...,2023-04-05,18000000.0,65675816.0,103.0,Released,Inspired by the actual files of Father Gabriel...,7.433,545.0,russell crowe daniel zovatto alex essoe franco...,spain rome italy vatican pope pig possession c...
2,667538,transformers rise of the beasts,action adventure science fiction,en,when a new threat capable of destroying the en...,5409.104,Skydance-Paramount-di Bonaventura Pictures-Bay...,2023-06-06,200000000.0,407045464.0,127.0,Released,Unite or fall.,7.340,1007.0,anthony ramos dominique fishback luna lauren v...,peru alien end of the world based on cartoon b...
3,640146,ant man and the wasp quantumania,action adventure science fiction,en,super hero partners scott lang and hope van dy...,4425.387,Marvel Studios-Kevin Feige Productions,2023-02-15,200000000.0,475766228.0,125.0,Released,Witness the beginning of a new dynasty.,6.507,2811.0,paul rudd evangeline lilly jonathan majors kat...,hero ant sequel superhero based on comic famil...
4,677179,creed iii,drama action,en,after dominating the boxing world adonis creed...,3994.342,Metro-Goldwyn-Mayer-Proximity Media-Balboa Pro...,2023-03-01,75000000.0,269000000.0,116.0,Released,You can't run from your past.,7.262,1129.0,michael b jordan tessa thompson jonathan majo...,philadelphia pennsylvania husband wife relatio...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
718621,543887,abuela s luck,drama comedy crime,en,abuela s luck is a story about appreciation an...,0.600,,2018-08-25,8000.0,10000.0,9.0,Released,A young man's relationship with his grandmothe...,7.000,1.0,manny ureña manuel cabrero wilton guzman shaki...,robbery latin hold up robbery underage drinkin...
719150,544967,romeo and juliet,comedy romance drama,en,adaption of william shakespeare s classic trag...,0.600,Memeteam,2015-12-13,0.0,100.0,16.0,Released,,0.000,0.0,james andersson filip holmberg alicia hirvenoj...,
719372,576819,active shooter,documentary crime,en,a documentary regarding the active shooter phe...,0.600,U.S. Department of Homeland Security,2017-04-01,150000.0,250000.0,90.0,Released,Everything you need for survival.,0.000,0.0,connor patrick griffin kristina anderson dave ...,
720449,554742,salty reef interview,comedy documentary,en,a seasoned reporter is faced by a new challeng...,0.600,,2016-04-17,0.0,75.0,1.0,Released,"If our reefs could speak, what would they say?",10.000,1.0,cameron hazlip,coral reef interview satire reef


In [None]:

#Work on a copy so `movies` itself keeps its original columns
movies2 = movies.copy()

#Text used for the similarity search: title + genres + overview.
#Adding string Series is already vectorised over every row, so a single assignment
#builds the whole column (no per-row loop needed).
#If any of the three parts is NaN, the resulting feature string is NaN as well.
movies2["features"] = movies2["title"] + " " + movies2["genres"] + " " + movies2["overview"]

#drop the rows for which no feature string could be built
movies2 = movies2.dropna(subset=["features"])

movies2


Unnamed: 0,id,title,genres,original_language,overview,popularity,production_companies,release_date,budget,revenue,runtime,status,tagline,vote_average,vote_count,credits,keywords,features
0,615656,meg 2 the trench,action science fiction horror,en,an exploratory dive into the deepest depths of...,8763.998,Apelles Entertainment-Warner Bros. Pictures-di...,2023-08-02,129000000.0,352056482.0,116.0,Released,Back for seconds.,7.079,1365.0,jason statham wu jing shuya sophia cai sergio ...,based on novel or book sequel kaiju,meg 2 the trench action science fiction horro...
1,758323,the pope s exorcist,horror mystery thriller,en,father gabriele amorth chief exorcist of the v...,5953.227,Screen Gems-2.0 Entertainment-Jesus & Mary-Wor...,2023-04-05,18000000.0,65675816.0,103.0,Released,Inspired by the actual files of Father Gabriel...,7.433,545.0,russell crowe daniel zovatto alex essoe franco...,spain rome italy vatican pope pig possession c...,the pope s exorcist horror mystery thriller fa...
2,667538,transformers rise of the beasts,action adventure science fiction,en,when a new threat capable of destroying the en...,5409.104,Skydance-Paramount-di Bonaventura Pictures-Bay...,2023-06-06,200000000.0,407045464.0,127.0,Released,Unite or fall.,7.340,1007.0,anthony ramos dominique fishback luna lauren v...,peru alien end of the world based on cartoon b...,transformers rise of the beasts action advent...
3,640146,ant man and the wasp quantumania,action adventure science fiction,en,super hero partners scott lang and hope van dy...,4425.387,Marvel Studios-Kevin Feige Productions,2023-02-15,200000000.0,475766228.0,125.0,Released,Witness the beginning of a new dynasty.,6.507,2811.0,paul rudd evangeline lilly jonathan majors kat...,hero ant sequel superhero based on comic famil...,ant man and the wasp quantumania action adven...
4,677179,creed iii,drama action,en,after dominating the boxing world adonis creed...,3994.342,Metro-Goldwyn-Mayer-Proximity Media-Balboa Pro...,2023-03-01,75000000.0,269000000.0,116.0,Released,You can't run from your past.,7.262,1129.0,michael b jordan tessa thompson jonathan majo...,philadelphia pennsylvania husband wife relatio...,creed iii drama action after dominating the bo...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
718621,543887,abuela s luck,drama comedy crime,en,abuela s luck is a story about appreciation an...,0.600,,2018-08-25,8000.0,10000.0,9.0,Released,A young man's relationship with his grandmothe...,7.000,1.0,manny ureña manuel cabrero wilton guzman shaki...,robbery latin hold up robbery underage drinkin...,abuela s luck drama comedy crime abuela s luck...
719150,544967,romeo and juliet,comedy romance drama,en,adaption of william shakespeare s classic trag...,0.600,Memeteam,2015-12-13,0.0,100.0,16.0,Released,,0.000,0.0,james andersson filip holmberg alicia hirvenoj...,,romeo and juliet comedy romance drama adaption...
719372,576819,active shooter,documentary crime,en,a documentary regarding the active shooter phe...,0.600,U.S. Department of Homeland Security,2017-04-01,150000.0,250000.0,90.0,Released,Everything you need for survival.,0.000,0.0,connor patrick griffin kristina anderson dave ...,,active shooter documentary crime a documentary...
720449,554742,salty reef interview,comedy documentary,en,a seasoned reporter is faced by a new challeng...,0.600,,2016-04-17,0.0,75.0,1.0,Released,"If our reefs could speak, what would they say?",10.000,1.0,cameron hazlip,coral reef interview satire reef,salty reef interview comedy documentary a seas...


In [None]:
#PLAN
#make one large dataset, concatenate netflix and theatre movie data sets 
#make a binary column: isNetflix yes/no 
#give row id to every movie
#use TF-IDF vectorizer to create big matrix
#use cosine similarity to compare rows/find most similar
#check if it isNetflix and show only non netflix similar movies

In [None]:
#The TF-IDF vectorizer needs a single corpus, so both datasets are stacked into one frame.
#From that frame we can build the vector matrix and find similar rows with cosine_similarity.
#When we look up similar movies we make sure that the most similar movie returned is not a Netflix movie.

#isNetflix tells the two sources apart after stacking: 1 = Netflix, 0 = theatre release
netflixDf = merged.assign(isNetflix=1)
theatreDf = movies2.assign(isNetflix=0)

#stack the Netflix rows on top of the theatre rows
netflix_theatre_concat = pd.concat([netflixDf, theatreDf])

netflix_theatre_concat

Unnamed: 0,title,genre,premiere,runtime,imdb score,language,show_id,type,director,cast,...,production_companies,release_date,budget,revenue,status,tagline,vote_average,vote_count,credits,keywords
3,the open house,horror thriller,"January 19, 2018",94.0,3.2,English,s5074,Movie,matt angel suzanne coote,dylan minnette piercey dalton patricia bethu...,...,,,,,,,,,,
7,the last days of american crime,heist film thriller,"June 5, 2020",149.0,3.7,English,s2425,Movie,olivier megaton,edgar ramírez michael pitt anna brewster pa...,...,,,,,,,,,,
8,paradox,musical western fantasy,"March 23, 2018",73.0,3.9,English,s4973,Movie,daryl hannah,neil young lukas nelson micah nelson corey ...,...,,,,,,,,,,
13,mercy,thriller,"November 22, 2016",90.0,4.2,English,s5710,Movie,chris sparling,james wolk caitlin fitzgerald tom lipinski ...,...,,,,,,,,,,
16,the last thing he wanted,political thriller,"February 21, 2020",115.0,4.3,English,s2898,Movie,dee rees,anne hathaway ben affleck willem dafoe toby...,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
718621,abuela s luck,,,9.0,,,,,,,...,,2018-08-25,8000.0,10000.0,Released,A young man's relationship with his grandmothe...,7.0,1.0,manny ureña manuel cabrero wilton guzman shaki...,robbery latin hold up robbery underage drinkin...
719150,romeo and juliet,,,16.0,,,,,,,...,Memeteam,2015-12-13,0.0,100.0,Released,,0.0,0.0,james andersson filip holmberg alicia hirvenoj...,
719372,active shooter,,,90.0,,,,,,,...,U.S. Department of Homeland Security,2017-04-01,150000.0,250000.0,Released,Everything you need for survival.,0.0,0.0,connor patrick griffin kristina anderson dave ...,
720449,salty reef interview,,,1.0,,,,,,,...,,2016-04-17,0.0,75.0,Released,"If our reefs could speak, what would they say?",10.0,1.0,cameron hazlip,coral reef interview satire reef


In [None]:
#Keep only the first row of every title in the concatenated frame
netflix_theatre_concat1 = netflix_theatre_concat.drop_duplicates(subset="title")

#continue on an explicit copy, which avoids a pandas warning when the column is added below
netflix_theatre_concat = netflix_theatre_concat1.copy()

#number the rows 0..n-1 in their current order; movie_id is used to tell rows apart later
netflix_theatre_concat["movie_id"] = list(range(len(netflix_theatre_concat)))


In [None]:
#Turn the "features" text of every movie into a TF-IDF vector
from sklearn.feature_extraction.text import TfidfVectorizer

vector = TfidfVectorizer()
feature_text = netflix_theatre_concat["features"]
vectors = vector.fit_transform(feature_text)
vectors

<3175x15862 sparse matrix of type '<class 'numpy.float64'>'
	with 120147 stored elements in Compressed Sparse Row format>

In [None]:
#Printing the sparse matrix lists its stored (non-zero) entries, one per line, as "(A, B)  C":
#A: Document index B: Specific word-vector index C: TF-IDF score for word B in document A
print(vectors)

  (0, 14149)	0.1446689138885501
  (0, 580)	0.15433663443129692
  (0, 3186)	0.30725364790630194
  (0, 5619)	0.18629691267571286
  (0, 14802)	0.28754772741409856
  (0, 4611)	0.28754772741409856
  (0, 15466)	0.14742555842079944
  (0, 6746)	0.14317784482956492
  (0, 15002)	0.22249217391192433
  (0, 15001)	0.3228951169536776
  (0, 11642)	0.30725364790630194
  (0, 14310)	0.053394903172258534
  (0, 9369)	0.2147509789015037
  (0, 13131)	0.17046953189854605
  (0, 14028)	0.21716280228408075
  (0, 6603)	0.09797747078242591
  (0, 817)	0.10940173030532734
  (0, 9339)	0.1643115421446226
  (0, 14429)	0.219735529379675
  (0, 5598)	0.20637736629694958
  (0, 14217)	0.09647754325668399
  (0, 6812)	0.1117125110793326
  (0, 6847)	0.1719825340250802
  (0, 9969)	0.2286776496729017
  (0, 14141)	0.045605098516711184
  :	:
  (3174, 460)	0.27130908583819674
  (3174, 5972)	0.5426181716763935
  (3174, 14164)	0.24884170877814935
  (3174, 9876)	0.2356991357084589
  (3174, 7432)	0.20823518426294857
  (3174, 12059)	0.

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
#Cosine similarity: dot product of two vectors divided by the product of their magnitudes.
#Called with a single matrix, every row of `vectors` is compared with every row,
#so cos_sim[i][j] is the similarity between document i and document j.

cos_sim = cosine_similarity(vectors)

In [None]:
#Sanity check: one row and one column per movie, so the matrix should be square
cos_sim.shape

(3175, 3175)

# Final Function:

In [None]:
def moviefinder_2(name, top_n=1):
    """Return the non-Netflix movies most similar to the movie titled `name`.

    Reads two notebook-level objects: the `netflix_theatre_concat` frame
    (columns "title", "movie_id", "isNetflix") and the `cos_sim` similarity
    matrix. "movie_id" is used as the row/column position inside `cos_sim`
    and as the row position inside the frame.

    Parameters
    ----------
    name : str
        Title to look up. It is compared for exact equality with the "title"
        column, so it has to be written the way titles are stored there.
    top_n : int, default 1
        Maximum number of similar titles to return.

    Returns
    -------
    list
        Up to `top_n` titles whose isNetflix value is 0, most similar first.
        Empty when no row carries the requested title.
    """
    #Find the movie id of the title
    matches = netflix_theatre_concat.loc[
        netflix_theatre_concat["title"] == name, "movie_id"
    ].to_numpy()
    #no movie with this title
    if len(matches) == 0:
        return []
    query_id = matches[0]

    #similarity of the query movie to every movie, ranked from most to least similar;
    #the stable sort keeps movies with equal scores in row order
    scores = np.asarray(cos_sim[query_id])
    ranked = np.argsort(-scores, kind="stable")

    #plain arrays are much cheaper to index inside the loop than DataFrame.iloc
    titles = netflix_theatre_concat["title"].to_numpy()
    is_netflix = netflix_theatre_concat["isNetflix"].to_numpy()

    nonNetflixSimilarMovies = []
    for candidate in ranked:
        if len(nonNetflixSimilarMovies) >= top_n:
            break
        #skip the query movie by id instead of assuming it is ranked first:
        #another movie with the same top score may be ranked ahead of it
        if candidate == query_id:
            continue
        #keep only movies that are not from Netflix
        if is_netflix[candidate] == 0:
            nonNetflixSimilarMovies.append(titles[candidate])
    return nonNetflixSimilarMovies

# Manually evaluating netflix movies and similar theatre-released movies

In [None]:
moviefinder_2("tall girl") #Match: both are comedy/romance

['she s funny that way']

In [None]:
moviefinder_2("army of the dead") #Match: the returned title is a zombie ("living dead") movie as well

['not another zombie movie    about the living dead']

In [None]:
moviefinder_2("the princess switch") #Match: both titles are princess stories

['the secret princess']

In [None]:
moviefinder_2("the perfection") #Partial match: both movies are about musical prodigies,

#but "the perfection" is horror while "whiplash" is drama

['whiplash']

In [None]:
moviefinder_2("wine country") #Partial match: both are about a getaway with a group of friends
#"wine country" is a fun birthday getaway in Napa (comedy/drama)
#"the cabin house" is about a getaway that turns into a weekend of pure terror (horror)

['the cabin house']

In [None]:
moviefinder_2("christmas inheritance") #Match: both are romance movies about finding love in a hometown/small town

['the best of me']

In [None]:
#Attach to every Netflix title the similar non-Netflix movie found by moviefinder_2
newDf = netflixDf.copy()
#moviefinder_2 returns a list of titles; it is flattened into one comma-separated string
newDf["Similar Movie"] = newDf["title"].apply(
    lambda netflix_title: ",".join(map(str, moviefinder_2(netflix_title)))
)
#fresh 0..n-1 numbering of the rows of newDf, in their current order
newDf["movie_id"] = list(range(len(newDf)))

In [None]:
#Display the Netflix titles together with their "Similar Movie" and "movie_id" columns
newDf

Unnamed: 0,title,genre,premiere,runtime,imdb score,language,show_id,type,director,cast,...,date_added,release_year,rating,duration,listed_in,description,features,isNetflix,Similar Movie,movie_id
3,the open house,horror thriller,"January 19, 2018",94,3.2,English,s5074,Movie,matt angel suzanne coote,dylan minnette piercey dalton patricia bethu...,...,"January 19, 2018",2018,TV-MA,95 min,"Horror Movies, Thrillers",following a tragedy a mother and her teen son...,the open house horror thriller following a tra...,1,sinister 2,0
7,the last days of american crime,heist film thriller,"June 5, 2020",149,3.7,English,s2425,Movie,olivier megaton,edgar ramírez michael pitt anna brewster pa...,...,"June 5, 2020",2020,TV-MA,149 min,"Action & Adventure, Dramas, Independent Movies",a bank robber joins a plot to commit one final...,the last days of american crime heist film thr...,1,the hurricane heist,1
8,paradox,musical western fantasy,"March 23, 2018",73,3.9,English,s4973,Movie,daryl hannah,neil young lukas nelson micah nelson corey ...,...,"March 23, 2018",2018,TV-MA,74 min,"Dramas, Independent Movies, Music & Musicals",neil young and his band of outlaws sow seeds o...,paradox musical western fantasy neil young and...,1,never grow old,2
13,mercy,thriller,"November 22, 2016",90,4.2,English,s5710,Movie,chris sparling,james wolk caitlin fitzgerald tom lipinski ...,...,"November 22, 2016",2016,TV-MA,88 min,"Dramas, Thrillers",two brothers clash with their half siblings wh...,mercy thriller two brothers clash with their h...,1,the dinner,3
16,the last thing he wanted,political thriller,"February 21, 2020",115,4.3,English,s2898,Movie,dee rees,anne hathaway ben affleck willem dafoe toby...,...,"February 21, 2020",2020,R,116 min,"Dramas, Thrillers",a hard hitting reporter becomes entangled in t...,the last thing he wanted political thriller a ...,1,winter s tale,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
498,klaus,animation christmas comedy adventure,"November 15, 2019",97,8.2,English,s3274,Movie,sergio pablos,jason schwartzman j k simmons rashida jones...,...,"November 15, 2019",2019,PG,98 min,"Children & Family Movies, Comedies",a selfish postman and a reclusive toymaker for...,klaus animation christmas comedy adventure a s...,1,saving christmas,341
499,seaspiracy,documentary,"March 24, 2021",89,8.2,English,s1172,Movie,ali tabrizi,,...,"March 24, 2021",2021,TV-14,90 min,Documentaries,passionate about ocean life a filmmaker sets ...,seaspiracy documentary passionate about ocean ...,1,the batman,342
501,dancing with the birds,documentary,"October 23, 2019",51,8.3,English,s3387,Movie,huw cordey,stephen fry,...,"October 23, 2019",2019,TV-PG,52 min,Documentaries,from ruffling their majestic feathers to naili...,dancing with the birds documentary from ruffli...,1,the angry birds movie 2,343
503,springsteen on broadway,one man show,"December 16, 2018",153,8.5,English,s4294,Movie,thom zimny,bruce springsteen,...,"December 16, 2018",2018,TV-MA,153 min,Music & Musicals,bruce springsteen shares personal stories from...,springsteen on broadway one man show bruce spr...,1,blinded by the light,344


In [None]:
#Exporting csv file
#to_csv is called with its defaults, so the DataFrame index is written as the first, unnamed column
newDf.to_csv("withSimilarMovies.csv")

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=419f63df-2e68-44d4-9d98-aec60329482b' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>