In [1]:
import pandas as pd
import sklearn as sk
import scipy.stats as stats
import numpy as np
from scipy.spatial import distance #we are using this to find the eucledian distance between two vectors
import ast #we are using this to convert string of dictionaries to tuple of dictionaries
from sklearn.preprocessing import MultiLabelBinarizer #we are using this to create vectors for multi-hot encoding
mlb = MultiLabelBinarizer() #Creating the binarizer instance; it is fitted on the per-movie keyword lists later in the notebook

In [2]:
#Load the TMDB movies CSV file into a dataframe called tmdb_movies
#The path is relative, so the file has to be in the notebook's working directory
tmdb_movies = pd.read_csv("tmdb_5000_movies.csv")

In [3]:
#Load the TMDB credits CSV file into a dataframe called tmdb_credits
#The path is relative, so the file has to be in the notebook's working directory
tmdb_credits = pd.read_csv("tmdb_5000_credits.csv")

In [4]:
#Join the movie metadata with the credits: tmdb_movies.id is matched against tmdb_credits.movie_id
#how='outer' keeps every row of both dataframes, so a movie present in only one file still shows up
#(with NaN in the columns coming from the other file). The result is stored in complete_tmdb.
complete_tmdb = tmdb_movies.merge(tmdb_credits, how='outer', left_on="id", right_on="movie_id")

In [5]:
#Preview the first rows of the merged dataframe to check the result of the join
complete_tmdb.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,spoken_languages,status,tagline,title_x,vote_average,vote_count,movie_id,title_y,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...",...,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...",...,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,49026,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]",...,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,49529,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [6]:
#Show the column labels of the merged dataframe
complete_tmdb.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title_x', 'vote_average',
       'vote_count', 'movie_id', 'title_y', 'cast', 'crew'],
      dtype='object')

In [7]:
#Largest value found in the 'popularity' column
complete_tmdb['popularity'].max()

875.581305

In [8]:
#Title of the movie with the highest popularity
#idxmax() gives the index label of the row that holds the maximum 'popularity';
#.loc with that label and the column name then reads the matching 'original_title' cell
most_popular_row = complete_tmdb['popularity'].idxmax()
complete_tmdb.loc[most_popular_row, 'original_title']

'Minions'

In [9]:
#Data Cleaning 1
#Number of movies before the rows with NaN in 'release_date' are dropped
complete_tmdb.shape[0]

4803

In [10]:
#Display the rows whose 'release_date' is NaN
missing_release_date = complete_tmdb['release_date'].isna()
complete_tmdb[missing_release_date]

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,spoken_languages,status,tagline,title_x,vote_average,vote_count,movie_id,title_y,cast,crew
4553,0,[],,380097,[],en,America Is Still the Place,1971 post civil rights San Francisco seemed li...,0.0,[],...,[],Released,,America Is Still the Place,0.0,0,380097,America Is Still the Place,[],[]


In [11]:
#Data Cleaning 1
#dropna with subset=['release_date'] removes every row that has NaN in the 'release_date' column
#The cleaned data is stored in a new dataframe called complete_tmdb1; complete_tmdb itself is not modified
complete_tmdb1 = complete_tmdb.dropna(subset=['release_date'])

In [12]:
#Data Cleaning 1
#Number of movies left once the rows with NaN in 'release_date' are gone
complete_tmdb1.shape[0]

4802

In [13]:
#Inspect the 'keywords' column: a movie without keywords holds the literal string "[]" and not an empty string
complete_tmdb1['keywords']

0       [{"id": 1463, "name": "culture clash"}, {"id":...
1       [{"id": 270, "name": "ocean"}, {"id": 726, "na...
2       [{"id": 470, "name": "spy"}, {"id": 818, "name...
3       [{"id": 849, "name": "dc comics"}, {"id": 853,...
4       [{"id": 818, "name": "based on novel"}, {"id":...
                              ...                        
4798    [{"id": 5616, "name": "united states\u2013mexi...
4799                                                   []
4800    [{"id": 248, "name": "date"}, {"id": 699, "nam...
4801                                                   []
4802    [{"id": 1523, "name": "obsession"}, {"id": 224...
Name: keywords, Length: 4802, dtype: object

In [14]:
#Data Cleaning 2
#Number of movies before the rows with empty 'keywords' are removed
complete_tmdb1.shape[0]

4802

In [15]:
#Data Cleaning 2
#An empty 'keywords' cell is the two-character string "[]", so every value shorter than 3 characters counts as empty
#Step 1: map len over the column and collect the index labels of the rows below that length
#Step 2: drop those labels; the cleaned data goes into a new dataframe called complete_tmdb2
keyword_lengths = complete_tmdb1['keywords'].map(len)
empty_keyword_rows = complete_tmdb1[keyword_lengths < 3].index
complete_tmdb2 = complete_tmdb1.drop(index=empty_keyword_rows)

In [16]:
#Data Cleaning 2
#Number of movies left once the rows with empty 'keywords' are removed
complete_tmdb2.shape[0]

4391

In [17]:
#Data Cleaning 3
#Mean 'runtime' while the 0 entries are still in the column
complete_tmdb2['runtime'].mean()

108.11642743221691

In [18]:
#Data Cleaning 3
#Every runtime of 0 is replaced by the column mean
#The mean is taken first, on the untouched column, so the 0 entries still contribute to it
runtime_mean = complete_tmdb2['runtime'].mean()
complete_tmdb2['runtime'] = complete_tmdb2['runtime'].replace({0: runtime_mean})

In [19]:
#Data Cleaning 3
#Mean 'runtime' after the 0 entries were replaced by the earlier mean
complete_tmdb2['runtime'].mean()

108.38739592202197

In [20]:
#Data Cleaning 4
#Handling outliers: normalize popularity with the z-score and remove movies whose normalized popularity is larger than 3 or less than -3
#The number of movies is output before and after this step.
#The dataset for this step is referred to as complete_tmdb3
#Note: this is a plain assignment, so complete_tmdb3 and complete_tmdb2 are two names for the same dataframe (no copy is made)
complete_tmdb3 = complete_tmdb2

In [21]:
#Data Cleaning 4
#Number of movies before the popularity outliers are removed
complete_tmdb3.shape[0]

4391

In [22]:
#z-score of every value in the 'popularity' column, computed with stats.zscore
stats.zscore(complete_tmdb3['popularity'])

0       3.888483
1       3.541590
2       2.572979
3       2.723778
4       0.634592
          ...   
4795   -0.679327
4796    0.004681
4798   -0.271434
4800   -0.663247
4802   -0.648417
Name: popularity, Length: 4391, dtype: float64

In [23]:
#Data Cleaning 4
#Remove the rows whose popularity z-score is greater than 3
#Step 1: build a boolean mask from the z-scores of 'popularity'
#Step 2: take the index labels of the masked rows and hand them to .drop
#The reduced dataset is stored in a new dataframe called complete_tmdb4
is_high_outlier = stats.zscore(complete_tmdb3['popularity']) > 3
high_outlier_rows = complete_tmdb3[is_high_outlier].index
complete_tmdb4 = complete_tmdb3.drop(index=high_outlier_rows)

In [24]:
#Data Cleaning 4
#Number of movies left after the rows with z-score greater than 3 were dropped
complete_tmdb4.shape[0]

4343

In [25]:
#Data Cleaning 4
#Remove the rows whose popularity z-score is less than -3
#The z-scores are taken from complete_tmdb3, i.e. the data as it was before the high outliers were dropped,
#so the lower bound is judged against the same mean and standard deviation as the upper bound.
#(Recomputing them on complete_tmdb4 would normalize against an already filtered column.)
#np.asarray makes the comparison work whether stats.zscore hands back a Series or an array
reference_z = np.asarray(stats.zscore(complete_tmdb3['popularity']))
low_outlier_rows = complete_tmdb3.index[reference_z < -3]
#errors='ignore' keeps the cell re-runnable: labels that are already gone are skipped instead of raising
complete_tmdb4 = complete_tmdb4.drop(index=low_outlier_rows, errors='ignore')

In [26]:
#Number of movies left after the rows with z-score less than -3 were dropped
#The count is unchanged, so there were no z-score values below -3
complete_tmdb4.shape[0]

4343

In [27]:
#Using the 5th version of the dataset
#Note: this is a plain assignment, so complete_tmdb5 and complete_tmdb4 are two names for the same dataframe (no copy is made)
complete_tmdb5 = complete_tmdb4

In [28]:
#Collect the raw 'keywords' strings, one per movie, in a Python list called keywds
keywds = list(complete_tmdb5['keywords'])
keywds

['[{"id": 470, "name": "spy"}, {"id": 818, "name": "based on novel"}, {"id": 4289, "name": "secret agent"}, {"id": 9663, "name": "sequel"}, {"id": 14555, "name": "mi6"}, {"id": 156095, "name": "british secret service"}, {"id": 158431, "name": "united kingdom"}]',
 '[{"id": 849, "name": "dc comics"}, {"id": 853, "name": "crime fighter"}, {"id": 949, "name": "terrorist"}, {"id": 1308, "name": "secret identity"}, {"id": 1437, "name": "burglar"}, {"id": 3051, "name": "hostage drama"}, {"id": 3562, "name": "time bomb"}, {"id": 6969, "name": "gotham city"}, {"id": 7002, "name": "vigilante"}, {"id": 9665, "name": "cover-up"}, {"id": 9715, "name": "superhero"}, {"id": 9990, "name": "villainess"}, {"id": 10044, "name": "tragic hero"}, {"id": 13015, "name": "terrorism"}, {"id": 14796, "name": "destruction"}, {"id": 18933, "name": "catwoman"}, {"id": 156082, "name": "cat burglar"}, {"id": 156395, "name": "imax"}, {"id": 173272, "name": "flood"}, {"id": 179093, "name": "criminal underworld"}, {"id

In [29]:
# declaring the empty list keywords2 that will hold the keyword strings once their square brackets are removed
keywords2=[]

In [30]:
#Removing the enclosing square brackets from every 'keywords' string, which leaves a comma-separated
#run of dictionaries such as '{"id": 470, "name": "spy"}, {"id": 818, "name": "based on novel"}'
#keywords2 is rebuilt from keywds in one go, so running this cell again cannot append the same strings twice
#Only the outermost pair of brackets is removed; a bracket inside a keyword name is left alone
def _strip_outer_brackets(serialized):
    """Return serialized without its enclosing '[' and ']' (unchanged when it is not bracketed)."""
    if serialized.startswith("[") and serialized.endswith("]"):
        return serialized[1:-1]
    return serialized

keywords2 = [_strip_outer_brackets(entry) for entry in keywds]

#Preview a few entries instead of printing the whole list
keywords2[:3]

['{"id": 470, "name": "spy"}, {"id": 818, "name": "based on novel"}, {"id": 4289, "name": "secret agent"}, {"id": 9663, "name": "sequel"}, {"id": 14555, "name": "mi6"}, {"id": 156095, "name": "british secret service"}, {"id": 158431, "name": "united kingdom"}',
 '{"id": 849, "name": "dc comics"}, {"id": 853, "name": "crime fighter"}, {"id": 949, "name": "terrorist"}, {"id": 1308, "name": "secret identity"}, {"id": 1437, "name": "burglar"}, {"id": 3051, "name": "hostage drama"}, {"id": 3562, "name": "time bomb"}, {"id": 6969, "name": "gotham city"}, {"id": 7002, "name": "vigilante"}, {"id": 9665, "name": "cover-up"}, {"id": 9715, "name": "superhero"}, {"id": 9990, "name": "villainess"}, {"id": 10044, "name": "tragic hero"}, {"id": 13015, "name": "terrorism"}, {"id": 14796, "name": "destruction"}, {"id": 18933, "name": "catwoman"}, {"id": 156082, "name": "cat burglar"}, {"id": 156395, "name": "imax"}, {"id": 173272, "name": "flood"}, {"id": 179093, "name": "criminal underworld"}, {"id": 

In [31]:
#Building keywords_super, the master list that holds one list of keyword names per movie:
#[[Keywords Movie1],[Keywords Movie2],[Keywords Movie 3]...]
keywords_super=[]
for serialized in keywords2: #one bracket-stripped string of dictionaries per movie
    if not serialized.strip(): #nothing to parse for a movie without keywords
        keywords_super.append([])
        continue
    parsed=ast.literal_eval(serialized) #literal_eval turns the string of dictionaries into Python objects
    #Several dictionaries come back as a tuple of dicts, a lone dictionary comes back as a plain dict.
    #The lone dict is wrapped in a tuple so both cases share one code path; looping over len() of the
    #dict itself would run once per key ("id", "name") and record the single keyword twice.
    if isinstance(parsed, dict):
        parsed=(parsed,)
    keywords_super.append([entry["name"] for entry in parsed]) #keyword names of this movie

In [32]:
#Wrap the master list in a pandas Series (one list of keyword names per movie); it is the input for mlb.fit_transform
#NOTE(review): the Series is named "Genre" although it holds keywords
keywordsdf = pd.Series(data=keywords_super, name="Genre")
keywordsdf

0       [spy, based on novel, secret agent, sequel, mi...
1       [dc comics, crime fighter, terrorist, secret i...
2       [based on novel, mars, medallion, space travel...
3       [dual identity, amnesia, sandstorm, love of on...
4       [hostage, magic, horse, fairy tale, musical, p...
                              ...                        
4338     [gang, audition, police fake, homeless, actress]
4339    [distrust, garage, identity crisis, time trave...
4340    [united states–mexico barrier, legs, arms, pap...
4341    [date, love at first sight, narration, investi...
4342            [obsession, camcorder, crush, dream girl]
Name: Genre, Length: 4343, dtype: object

In [33]:
#Creating the multi-hot encoding: mlb.fit_transform fits the binarizer on the keyword lists in 'keywordsdf' and encodes them in one step
#The displayed result is a 2-D array of 0/1 values
keywords_mlb = mlb.fit_transform(keywordsdf)
keywords_mlb

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [34]:
#keywords_mlb is a NumPy array; tolist() turns it into a list of lists so that it can be added to the main dataset as a column
keywordslst=keywords_mlb.tolist()

In [35]:
#Adding a new column 'Vector' to the dataset; each cell holds the multi-hot keyword vector of that movie
#keywordslst is a plain list, so its entries are matched to the rows by position
#The keyword strings were read from complete_tmdb5, which is the same dataframe object as complete_tmdb4, so the row order lines up
complete_tmdb4["Vector"]=keywordslst
complete_tmdb4.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,status,tagline,title_x,vote_average,vote_count,movie_id,title_y,cast,crew,Vector
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...",...,Released,A Plan No One Escapes,Spectre,6.3,4466,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...",...,Released,The Legend Ends,The Dark Knight Rises,7.6,9106,49026,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]",...,Released,"Lost in our world, found in another.",John Carter,6.1,2124,49529,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."
5,258000000,"[{""id"": 14, ""name"": ""Fantasy""}, {""id"": 28, ""na...",http://www.sonypictures.com/movies/spider-man3/,559,"[{""id"": 851, ""name"": ""dual identity""}, {""id"": ...",en,Spider-Man 3,The seemingly invincible Spider-Man goes up ag...,115.699814,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...",...,Released,The battle within.,Spider-Man 3,5.9,3576,559,Spider-Man 3,"[{""cast_id"": 30, ""character"": ""Peter Parker / ...","[{""credit_id"": ""52fe4252c3a36847f80151a5"", ""de...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
6,260000000,"[{""id"": 16, ""name"": ""Animation""}, {""id"": 10751...",http://disney.go.com/disneypictures/tangled/,38757,"[{""id"": 1562, ""name"": ""hostage""}, {""id"": 2343,...",en,Tangled,When the kingdom's most wanted-and most charmi...,48.681969,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,Released,They're taking adventure to new lengths.,Tangled,7.4,3330,38757,Tangled,"[{""cast_id"": 34, ""character"": ""Flynn Rider (vo...","[{""credit_id"": ""52fe46db9251416c91062101"", ""de...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [None]:
#Ask the user for the two movie titles to compare; l2distance looks them up by exact match in the 'original_title' column
#NOTE(review): input() waits for the user, so this cell pauses an automated "Run All"
First_Movie = input("Please enter the First Movie: ")
Second_Movie = input("Please enter the Second Movie: ")

#Creating a function which takes two movies as an input to find the l2distance between them
def l2distance(First_Movie, Second_Movie, movies=None):
    """Print and return the Euclidean (L2) distance between the keyword vectors of two movies.

    Parameters
    ----------
    First_Movie, Second_Movie : str
        Titles to compare; each is matched for exact equality against the
        'original_title' column.
    movies : pandas.DataFrame, optional
        Dataframe with an 'original_title' column and a 'Vector' column that
        holds one numeric vector per row. When omitted, the notebook-level
        dataframe complete_tmdb4 is used.

    Returns
    -------
    float
        The L2 distance between the two vectors.

    Raises
    ------
    ValueError
        If one of the titles does not match any row.
    """
    if movies is None:
        movies = complete_tmdb4

    def _vector_for(title):
        # Look up the 'Vector' cell(s) of every row carrying this title.
        matches = movies.loc[movies['original_title'] == title, 'Vector']
        if matches.empty:
            raise ValueError("Movie not found in the dataset: " + repr(title))
        # Several rows can share a title; the first match is used. Taking a single
        # cell yields a 1-D vector, which is what distance.euclidean expects.
        return np.asarray(matches.iloc[0], dtype=float)

    l2 = distance.euclidean(_vector_for(First_Movie), _vector_for(Second_Movie))
    print("The L2 distance between "+First_Movie+" and " + Second_Movie+" is: ",l2)
    return l2
    
#Run the comparison for the two titles entered above; the function prints the L2 distance itself
l2distance(First_Movie, Second_Movie);