In [1]:
import numpy as np
import pandas as pd
import ast  #to convert string to list
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.porter import PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
movies = pd.read_csv('../archive/tmdb_5000_movies.csv')
credits = pd.read_csv('../archive/tmdb_5000_credits.csv')

In [3]:
print(movies.shape)
movies.head(2)

(4803, 20)


Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500


In [4]:
print(credits.shape)
credits.head()

(4803, 4)


Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [5]:
movies = movies.merge(credits, on="title")
print(movies.shape)
movies.head(2)

(4809, 23)


Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,285,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


**Remove Unwanted Columns**

In [6]:
movies['original_language'].value_counts()[:5]  #as most of the movies are English, so this col will not make any difference

en    4510
fr      70
es      32
zh      27
de      27
Name: original_language, dtype: int64

In [7]:
movies = movies[['genres', 'id', 'keywords', 'title', 'overview', 'cast', 'crew']]    # 'release_date'
movies.head(1)

Unnamed: 0,genres,id,keywords,title,overview,cast,crew
0,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


**Missing & Duplicate Values**

In [8]:
movies.isnull().sum()

genres      0
id          0
keywords    0
title       0
overview    3
cast        0
crew        0
dtype: int64

In [9]:
movies.dropna(inplace=True)

In [10]:
movies.isnull().sum()

genres      0
id          0
keywords    0
title       0
overview    0
cast        0
crew        0
dtype: int64

In [11]:
movies.duplicated().sum()

0

**Column Format Preprocessing to Create paragraph for `tags` Column**

In [12]:
# genres, id, keywords, title, overview, cast, crew

In [13]:
movies.iloc[0].genres

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [14]:
# Have : '[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'
# Need : ["Action", "Adventure", "Fantasy", "Science Fiction"]

In [15]:
def extract_listof_tags(argobj):
    # convert the given string of list to actual list
    argobj = ast.literal_eval(argobj)
    # initialize empty list
    L = []
    # run loop through the list of dict
    for dictele in argobj:
        # to remove white-space from the string
        # i.e., to make "Abc Def" -> "AbcDef"
        tagname = dictele["name"].replace(" ", "")
        # append the tagname to list L
        L.append(tagname)
    return L
movies['genres'] = movies['genres'].apply(extract_listof_tags)
movies['keywords'] = movies['keywords'].apply(extract_listof_tags)

In [16]:
movies['cast'][0]
# movies['crew'][0]

'[{"cast_id": 242, "character": "Jake Sully", "credit_id": "5602a8a7c3a3685532001c9a", "gender": 2, "id": 65731, "name": "Sam Worthington", "order": 0}, {"cast_id": 3, "character": "Neytiri", "credit_id": "52fe48009251416c750ac9cb", "gender": 1, "id": 8691, "name": "Zoe Saldana", "order": 1}, {"cast_id": 25, "character": "Dr. Grace Augustine", "credit_id": "52fe48009251416c750aca39", "gender": 1, "id": 10205, "name": "Sigourney Weaver", "order": 2}, {"cast_id": 4, "character": "Col. Quaritch", "credit_id": "52fe48009251416c750ac9cf", "gender": 2, "id": 32747, "name": "Stephen Lang", "order": 3}, {"cast_id": 5, "character": "Trudy Chacon", "credit_id": "52fe48009251416c750ac9d3", "gender": 1, "id": 17647, "name": "Michelle Rodriguez", "order": 4}, {"cast_id": 8, "character": "Selfridge", "credit_id": "52fe48009251416c750ac9e1", "gender": 2, "id": 1771, "name": "Giovanni Ribisi", "order": 5}, {"cast_id": 7, "character": "Norm Spellman", "credit_id": "52fe48009251416c750ac9dd", "gender": 

In [17]:
def extract_3_cast_names(argobj):
    # convert the given string of list to actual list
    argobj = ast.literal_eval(argobj)
    # initialize empty list
    L = []
    # run loop through the list of dict
    for i in range(3):
        try:
            dictele = argobj[i]
            # to remove white-space from the string
            # i.e., to make "Abc Def" -> "AbcDef"
            castname = dictele["name"].replace(" ", "")
            # append the castname to list L
            L.append(castname)
        except:
            pass
    return L
def extract_director_name(argobj):
    # convert the given string of list to actual list
    argobj = ast.literal_eval(argobj)
    # initialize empty list
    L = []
    # run loop through the list of dict
    for dictele in argobj:
        if dictele['job']=='Director':
            # to remove white-space from the string
            # i.e., to make "Abc Def" -> "AbcDef"
            directorname = dictele["name"].replace(" ", "")
            # append the directorname to list L
            L.append(directorname)
            break
    return L

movies['cast'] = movies['cast'].apply(extract_3_cast_names)
movies['crew'] = movies['crew'].apply(extract_director_name)
movies['crew']

0           [JamesCameron]
1          [GoreVerbinski]
2              [SamMendes]
3       [ChristopherNolan]
4          [AndrewStanton]
               ...        
4804     [RobertRodriguez]
4805         [EdwardBurns]
4806          [ScottSmith]
4807          [DanielHsia]
4808     [BrianHerzlinger]
Name: crew, Length: 4806, dtype: object

In [18]:
movies['overview'] = movies['overview'].apply(lambda x : x.split())

In [19]:
movies.head(2)

Unnamed: 0,genres,id,keywords,title,overview,cast,crew
0,"[Action, Adventure, Fantasy, ScienceFiction]",19995,"[cultureclash, future, spacewar, spacecolony, ...",Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron]
1,"[Adventure, Fantasy, Action]",285,"[ocean, drugabuse, exoticisland, eastindiatrad...",Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[JohnnyDepp, OrlandoBloom, KeiraKnightley]",[GoreVerbinski]


**Now, Create `tags` column marging `genres, keywords, overview, cast, crew` columns**

In [20]:
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']
movies_newdf = movies[['id', 'title', 'tags']]
movies_newdf.head()

Unnamed: 0,id,title,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin..."
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d..."
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send..."
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney..."
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili..."


**Make a Paragraph (in lower case) from list of tags**

In [21]:
movies_newdf['tags'] = movies_newdf['tags'].apply(lambda x : " ".join(x).lower())
movies_newdf.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_newdf['tags'] = movies_newdf['tags'].apply(lambda x : " ".join(x).lower())


Unnamed: 0,id,title,tags
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha..."
2,206647,Spectre,a cryptic message from bond’s past sends him o...
3,49026,The Dark Knight Rises,following the death of district attorney harve...
4,49529,John Carter,"john carter is a war-weary, former military ca..."


**Stemming**

In [22]:
ps = PorterStemmer()
def stemparagraph(paragraph):
    res = []
    for word in paragraph.split():
        res.append(ps.stem(word))
    return " ".join(res)
movies_newdf['tags'] = movies_newdf['tags'].apply(stemparagraph)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_newdf['tags'] = movies_newdf['tags'].apply(stemparagraph)


In [23]:
movies_newdf['tags'][0]

'in the 22nd century, a parapleg marin is dispatch to the moon pandora on a uniqu mission, but becom torn between follow order and protect an alien civilization. action adventur fantasi sciencefict cultureclash futur spacewar spacecoloni societi spacetravel futurist romanc space alien tribe alienplanet cgi marin soldier battl loveaffair antiwar powerrel mindandsoul 3d samworthington zoesaldana sigourneyweav jamescameron'

**Vectorization Of Tags Paragraph**

In [24]:
ct_vec = CountVectorizer(max_features=5000, stop_words='english')
vectors = ct_vec.fit_transform(movies_newdf['tags']).toarray()
vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [25]:
# ct_vec.get_feature_names()

**Calculate Cosine Distance**

In [26]:
similarity_score = cosine_similarity(vectors)
similarity_score[0]

array([1.        , 0.08346223, 0.0860309 , ..., 0.04499213, 0.        ,
       0.        ])

In [27]:
sorted(list(enumerate(similarity_score[0])), reverse=True, key=lambda x : x[1])[1:6]

[(1216, 0.28676966733820225),
 (2409, 0.26901379342448517),
 (3730, 0.2605130246476754),
 (507, 0.255608593705383),
 (539, 0.25038669783359574)]

**Recommend Function**

In [28]:
def recommend(movie):
    movieindx = movies_newdf[ movies_newdf['title']==movie ].index[0]
    distances = similarity_score[movieindx]
    recommandationlist = sorted(list(enumerate(distances)), reverse=True, key=lambda x : x[1])[1:6]
    for rec_movie in recommandationlist:
        print(movies_newdf.iloc[rec_movie[0]].title)

In [29]:
recommend('Batman')

Batman
Batman & Robin
Batman Begins
Batman Returns
The R.M.


In [30]:
import pickle

In [31]:
pickle.dump(movies_newdf.to_dict(), open('moviesdict.pkl', 'wb'))

In [32]:
# movies_newdf.to_dict()

In [33]:
pickle.dump(similarity_score, open('simimat.pkl', 'wb'))