In [1]:
import numpy as np
import pandas as pd

In [2]:
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

In [3]:
# movies.shape # (4803, 20)
# credits.shape # (4803, 4) 

#Now there are two df but it will be easier for us if had only one df. So we merge these two and the colm of credits will go 
# the end of the df
movies.merge(credits, on='title').shape # (4809, 23)

# now since the merging happened on 'title' colm so it doesn't get add again in the merged df 
# hence we have only 23 colm and not 24 

movies = movies.merge(credits, on='title') # now we reassigned the merged df to movies 

In [4]:
# now there are lot of unnecessary columns in out df, first we will remove them
movies = movies[[ 'movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]

In [5]:
# now lets clean out data

# First check if the df contains any null value
movies.isnull().sum() #this gives the number of null entries in each colm

# there are 3 movies for which the overview is null so we can drop them
movies.loc[movies.overview.isnull()] # here are the 3 movies which have overview as null

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
2662,370980,Chiamatemi Francesco - Il Papa della gente,,"[{""id"": 18, ""name"": ""Drama""}]","[{""id"": 717, ""name"": ""pope""}, {""id"": 5565, ""na...","[{""cast_id"": 5, ""character"": ""Jorge Mario Berg...","[{""credit_id"": ""5660019ac3a36875f100252b"", ""de..."
4147,459488,"To Be Frank, Sinatra at 100",,"[{""id"": 99, ""name"": ""Documentary""}]","[{""id"": 6027, ""name"": ""music""}, {""id"": 225822,...","[{""cast_id"": 0, ""character"": ""Narrator"", ""cred...","[{""credit_id"": ""592b25e4c3a368783e065a2f"", ""de..."
4437,292539,Food Chains,,"[{""id"": 99, ""name"": ""Documentary""}]",[],[],"[{""credit_id"": ""5470c3b1c3a368085e000abd"", ""de..."


In [6]:
# to remove them we simply use dropna()
movies.dropna(inplace=True)

In [7]:
# now we check if there exists any duplicate data
movies.duplicated().sum() # there is no duplicate data

0

In [8]:
# now if we look closely the format of genres, keywords, cast and crew are list of dictionaries
movies.iloc[0].genres

# '[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'
# # we have to convert the above format into below format for every columns
# ['Action', 'Adventure', 'Fantasy', 'Science Fiction']

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [9]:
import ast
ast.literal_eval(movies.iloc[0].genres) #this converts the list in string to normal list

[{'id': 28, 'name': 'Action'},
 {'id': 12, 'name': 'Adventure'},
 {'id': 14, 'name': 'Fantasy'},
 {'id': 878, 'name': 'Science Fiction'}]

In [10]:
# so lets write a helper func with the help of above method
def convert(obj):
    L = []
    for i in ast.literal_eval(obj):
        L.append(i['name'])
    return L

In [11]:
movies['genres'].apply(convert) # here we get the result we want

0       [Action, Adventure, Fantasy, Science Fiction]
1                        [Adventure, Fantasy, Action]
2                          [Action, Adventure, Crime]
3                    [Action, Crime, Drama, Thriller]
4                [Action, Adventure, Science Fiction]
                            ...                      
4804                        [Action, Crime, Thriller]
4805                                [Comedy, Romance]
4806               [Comedy, Drama, Romance, TV Movie]
4807                                               []
4808                                    [Documentary]
Name: genres, Length: 4806, dtype: object

In [12]:
movies['genres'] = movies['genres'].apply(convert) # assign it to the original df

In [13]:
movies['keywords'].apply(convert) # using the same func we also get the keywords col in the same format

0       [culture clash, future, space war, space colon...
1       [ocean, drug abuse, exotic island, east india ...
2       [spy, based on novel, secret agent, sequel, mi...
3       [dc comics, crime fighter, terrorist, secret i...
4       [based on novel, mars, medallion, space travel...
                              ...                        
4804    [united states–mexico barrier, legs, arms, pap...
4805                                                   []
4806    [date, love at first sight, narration, investi...
4807                                                   []
4808            [obsession, camcorder, crush, dream girl]
Name: keywords, Length: 4806, dtype: object

In [14]:
movies['keywords'] = movies['keywords'].apply(convert) # now we also got the keywords colm

In [15]:
# now there are a lot of cast in our data of cast but we will extract the the names of top 3 characters only

def convert_cast(obj):
    L = []    # empty list to store the names
    l = ast.literal_eval(obj) # convert the string dict to python dict
    l = l[:3]    # take only first 3 dict 
    for i in l:  # then store the names of the 3 characters in the list
        L.append(i['name'])
    return L

In [16]:
movies['cast'] = movies['cast'].apply(convert_cast) # this does the cast column

In [17]:
# now only crew column is remaining to process

# in crew we will only take the name of the director 
def convert_crew(obj):
    cr = ast.literal_eval(obj)
    L = []
    for i in cr:
        if i['job'] == 'Director':
            L.append(i['name'])
            break
    return L

In [18]:
movies['crew'] = movies['crew'].apply(convert_crew)

In [19]:
# now if we see the overview then it's a stirng let's convert it into the list
movies.iloc[0][2]

'In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization.'

In [20]:
# code for coverting it into the list
movies.overview.apply(lambda x: x.split())

0       [In, the, 22nd, century,, a, paraplegic, Marin...
1       [Captain, Barbossa,, long, believed, to, be, d...
2       [A, cryptic, message, from, Bond’s, past, send...
3       [Following, the, death, of, District, Attorney...
4       [John, Carter, is, a, war-weary,, former, mili...
                              ...                        
4804    [El, Mariachi, just, wants, to, play, his, gui...
4805    [A, newlywed, couple's, honeymoon, is, upended...
4806    ["Signed,, Sealed,, Delivered", introduces, a,...
4807    [When, ambitious, New, York, attorney, Sam, is...
4808    [Ever, since, the, second, grade, when, he, fi...
Name: overview, Length: 4806, dtype: object

In [21]:
movies['overview'] = movies.overview.apply(lambda x: x.split())

In [22]:
# now we will have to merge the stirng which is separated by sopce 
# eg 'Sam Worthington' -> 'SamWorthington'

# we are doing this because if it were separate then 2 tags will be created 'Sam' and 'Worthington' and if we like the movies 
# by 'Sam' then the movies of other sam's also gets recommended and we do this for all 4 columns

# to do this we use list comprehension
movies['cast'].apply(lambda x:[i.replace(" ", "") for i in x]) # we do this for last 4 col and assign to the columns

0        [SamWorthington, ZoeSaldana, SigourneyWeaver]
1           [JohnnyDepp, OrlandoBloom, KeiraKnightley]
2            [DanielCraig, ChristophWaltz, LéaSeydoux]
3            [ChristianBale, MichaelCaine, GaryOldman]
4          [TaylorKitsch, LynnCollins, SamanthaMorton]
                             ...                      
4804    [CarlosGallardo, JaimedeHoyos, PeterMarquardt]
4805         [EdwardBurns, KerryBishé, MarshaDietlein]
4806           [EricMabius, KristinBooth, CrystalLowe]
4807            [DanielHenney, ElizaCoupe, BillPaxton]
4808    [DrewBarrymore, BrianHerzlinger, CoreyFeldman]
Name: cast, Length: 4806, dtype: object

In [23]:
movies['genres'] = movies['genres'].apply(lambda x: [i.replace(" ", "") for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x: [i.replace(" ", "") for i in x])
movies['cast'] = movies['cast'].apply(lambda x: [i.replace(" ", "") for i in x])
movies['crew'] = movies['crew'].apply(lambda x: [i.replace(" ", "") for i in x])

In [24]:
# now for creating tags we will merge all the 5 columns into one 

movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

In [25]:
# now we can remove the columns as we have merged them into single column
movies = movies[['movie_id', 'title', 'tags']]

In [26]:
# now we convert the tags which is list into string
movies['tags'].apply(lambda x: " ".join(x))

0       In the 22nd century, a paraplegic Marine is di...
1       Captain Barbossa, long believed to be dead, ha...
2       A cryptic message from Bond’s past sends him o...
3       Following the death of District Attorney Harve...
4       John Carter is a war-weary, former military ca...
                              ...                        
4804    El Mariachi just wants to play his guitar and ...
4805    A newlywed couple's honeymoon is upended by th...
4806    "Signed, Sealed, Delivered" introduces a dedic...
4807    When ambitious New York attorney Sam is sent t...
4808    Ever since the second grade when he first saw ...
Name: tags, Length: 4806, dtype: object

In [27]:
movies['tags'] = movies['tags'].apply(lambda x: " ".join(x))

In [28]:
# now we will convert everything in tags colm to lowercase
movies['tags'].apply(lambda x: x.lower())

0       in the 22nd century, a paraplegic marine is di...
1       captain barbossa, long believed to be dead, ha...
2       a cryptic message from bond’s past sends him o...
3       following the death of district attorney harve...
4       john carter is a war-weary, former military ca...
                              ...                        
4804    el mariachi just wants to play his guitar and ...
4805    a newlywed couple's honeymoon is upended by th...
4806    "signed, sealed, delivered" introduces a dedic...
4807    when ambitious new york attorney sam is sent t...
4808    ever since the second grade when he first saw ...
Name: tags, Length: 4806, dtype: object

In [29]:
movies['tags'] = movies['tags'].apply(lambda x: x.lower())

In [30]:
import nltk
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [31]:
def stem(text):
    y = []
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y);

In [32]:
movies['tags'].apply(stem)

0       in the 22nd century, a parapleg marin is dispa...
1       captain barbossa, long believ to be dead, ha c...
2       a cryptic messag from bond’ past send him on a...
3       follow the death of district attorney harvey d...
4       john carter is a war-weary, former militari ca...
                              ...                        
4804    el mariachi just want to play hi guitar and ca...
4805    a newlyw couple' honeymoon is upend by the arr...
4806    "signed, sealed, delivered" introduc a dedic q...
4807    when ambiti new york attorney sam is sent to s...
4808    ever sinc the second grade when he first saw h...
Name: tags, Length: 4806, dtype: object

In [33]:
movies['tags'] = movies['tags'].apply(stem)

In [34]:
# now we will do text vectorization using sklearn

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000, stop_words='english') 

# we are first concatenate all the tags columns into one single stirng and then calculate the 5000 most frequent words 
# using max_features param and we also remove the stop_words before finding the most frequent words
# stop_words -> the, is, are, in, from, ... which are not keywords 

# then based on the 5000 freq words we will match the 5000 words in every row and calc the freq of the 5000 words 
# in that particular row and if we do that we will get a df of (no_of_movies X 5000 words), in which every 
# row is considered as vector for that movie and if we liked one movie then we first check its vector and then 
# give the 5 most nearest vectors as recommendations to the user

In [35]:
vectors = cv.fit_transform(movies['tags']).toarray()

In [36]:
vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [37]:
# we can also check the most common 5000 words
cv.get_feature_names_out()

array(['000', '007', '10', ..., 'zone', 'zoo', 'zooeydeschanel'],
      dtype=object)

In [38]:
# now if we look at the most frequent word then if we observe then the word (accept, accepted, accepts) are coming as separate 
# words but they means the same so we are taking the same word with slighly different spelling multiple times
# to resolve this problem we can use the technique called 'stemming'

# stemming -> Stemming is a natural language processing technique that is used to reduce words to their base form, 
#             also known as the root form. The process of stemming is used to normalize text and make it easier to process
    
# so if there are words in a list [loving, lovely, loved] then stemming will change this list into [love, love, love]
# in this way then the word can only be taken at once and it will also give more accurate count for the frequency

# for stemming we use nltk library used above

In [39]:
# now we are ready with to calc the distances between the vectors
# for calc dist we will use cosine_distance because if there are more dimensions then the eucledian dist fails

from sklearn.metrics.pairwise import cosine_similarity

In [40]:
cosine_similarity(vectors)
# this calc the dist of each movie with other movies so the shape of this distances is 4806 X 4806

array([[1.        , 0.08346223, 0.0860309 , ..., 0.04499213, 0.        ,
        0.        ],
       [0.08346223, 1.        , 0.06063391, ..., 0.02378257, 0.        ,
        0.02615329],
       [0.0860309 , 0.06063391, 1.        , ..., 0.02451452, 0.        ,
        0.        ],
       ...,
       [0.04499213, 0.02378257, 0.02451452, ..., 1.        , 0.03962144,
        0.04229549],
       [0.        , 0.        , 0.        , ..., 0.03962144, 1.        ,
        0.08714204],
       [0.        , 0.02615329, 0.        , ..., 0.04229549, 0.08714204,
        1.        ]])

In [41]:
similarity = cosine_similarity(vectors)

In [42]:
# now we will write our recommend() func
# now we have the similarity of each movie then suppose if one ask the recommendations for  movie_x then we simply the sort
# the similarity array of that movie and then give the 5 other movies which have minimum distance

# now to get the index of the movie
movies[movies['title'] == 'Inception']

Unnamed: 0,movie_id,title,tags
96,27205,Inception,"cobb, a skill thief who commit corpor espionag..."


In [43]:
movies[movies['title'] == 'Inception'].index

Int64Index([96], dtype='int64')

In [44]:
movies[movies['title'] == 'Inception'].index[0] # this gives us our index of the movie to check the similarity of that position

96

In [45]:
# now the problem is that if we sort the similarity then we lost the index of the movie 
# because now at index=0 there is dist with movie 0, at 1 there is dist with movie 1 and so on..
# but if we sort this then this changes hence we have to first preserve the the dist with the movie no
# and enumerate will help us do that
list(enumerate(similarity[0])) # now even if we sort it then also we have our movie no

[(0, 1.0000000000000002),
 (1, 0.08346223261119858),
 (2, 0.08603090020146065),
 (3, 0.0734718358370645),
 (4, 0.1892994097121204),
 (5, 0.10838874619051501),
 (6, 0.04024218182927669),
 (7, 0.14673479641335554),
 (8, 0.05923488777590923),
 (9, 0.0967301666813349),
 (10, 0.10259783520851541),
 (11, 0.09464970485606021),
 (12, 0.09037128496931669),
 (13, 0.04499212706658476),
 (14, 0.12824729401064427),
 (15, 0.06282808624375433),
 (16, 0.07894736842105264),
 (17, 0.13977653617040256),
 (18, 0.09493290614465533),
 (19, 0.0830812984794528),
 (20, 0.058038100008800934),
 (21, 0.10968169942141635),
 (22, 0.0662266178532522),
 (23, 0.08740748201220976),
 (24, 0.0533380747062665),
 (25, 0.05101627678885769),
 (26, 0.15389675281277312),
 (27, 0.18693292157876878),
 (28, 0.116543309349613),
 (29, 0.065033247714309),
 (30, 0.06684847767323797),
 (31, 0.15907119074394446),
 (32, 0.08520286456846099),
 (33, 0.09733285267845754),
 (34, 0.0),
 (35, 0.09933992677987831),
 (36, 0.17316974359835272),


In [46]:
def recommend(movie):
    movie_index = movies[movies['title'] == movie].index[0]
    distances = similarity[movie_index]
    movies_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[1:6]
    
    for i in movies_list:
        print(movies.iloc[i[0]].title)

In [47]:
recommend("Schindler's List") # this recommends movies based on the movie 'Fight Club'

Black Book
Red Tails
Fort McCoy
Saints and Soldiers
Unbroken


In [48]:
# to fetch the name of the movies
movies.iloc[4763].title

'This Is Martin Bonner'

In [49]:
import pickle 

In [50]:
pickle.dump(movies.to_dict(), open('movies_dict.pkl', 'wb'))

In [51]:
pickle.dump(similarity, open('similarity.pkl', 'wb'))