In [30]:
import numpy as np
import pandas as pd
import ast
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem import PorterStemmer
import pickle

In [31]:
import os
# Relative paths to the two input CSV files inside the local "dataset" folder.
# They are resolved against the kernel's current working directory.
MOVIE_DATASET_LINK = os.path.join("dataset", "tmdb_5000_movies.csv")
CREDIT_DATASET_LINK = os.path.join("dataset", "tmdb_5000_credits.csv")

In [32]:
# Load both CSV files into DataFrames.
movies = pd.read_csv(MOVIE_DATASET_LINK)
credits = pd.read_csv(CREDIT_DATASET_LINK)

# Sanity check: report the (rows, columns) of each table.
print("movies shape: ", movies.shape)
print("credits shape: ", credits.shape)

# List the column names so the join key and the feature columns can be chosen.
print("\nColumns of movies: ", movies.columns)
print("\nColumns of credits: ", credits.columns)

movies shape:  (4803, 20)
credits shape:  (4803, 4)

Columns of movies:  Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count'],
      dtype='object')

Columns of credits:  Index(['movie_id', 'title', 'cast', 'crew'], dtype='object')


In [33]:
# Give the credits table the same key column name ('id') that the movies table uses.
credits = credits.rename(columns={'movie_id': 'id'})

In [34]:
# Preview the first two rows of the movies table.
movies.head(2)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500


In [35]:
# Preview the first two rows of the credits table.
credits.head(2)

Unnamed: 0,id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


In [36]:
# Join credits onto movies by 'id', keep only the columns used to build the tags,
# and restore the plain 'title' name (the merge suffixes the movies title as 'title_x'
# because both tables carry a 'title' column).
movies = (
    movies
    .merge(credits, on='id')
    [['id', 'title_x', 'overview', 'genres', 'keywords', 'cast', 'crew']]
    .rename(columns={"title_x": "title"})
)
movies.head(2)

Unnamed: 0,id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


In [37]:
def convert_dict_list(d, k, lim=0, cond_dict=None):
    """Extract normalised values of key ``k`` from a stringified list of dicts.

    Parameters
    ----------
    d : str
        Python-literal text of a list of dicts; parsed with ``ast.literal_eval``.
    k : str
        Key whose value is collected from every kept entry.
    lim : int, default 0
        Maximum number of values to collect. ``0`` means "no limit".
    cond_dict : dict, optional
        Filter of the form ``{key: allowed}``, e.g. ``{"job": ["Director"]}``.
        ``allowed`` may be a single value (compared with ``==``) or a
        list/tuple/set of accepted values (membership test). An entry is kept
        when it satisfies at least one condition. ``None`` or an empty dict
        keeps every entry. An entry that lacks a condition key simply does not
        match that condition.

    Returns
    -------
    list of str
        The collected values with spaces removed and lower-cased, in input order.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when ``d`` is not a valid literal.
    KeyError
        When a kept entry has no key ``k``.
    """
    # None instead of a shared mutable default; an empty filter keeps everything.
    conditions = cond_dict or {}
    missing = object()  # sentinel that never equals a real value

    def _matches(entry):
        # True when no filter is given or when any single condition is satisfied.
        if not conditions:
            return True
        for key, allowed in conditions.items():
            actual = entry.get(key, missing)
            if isinstance(allowed, (str, bytes)) or not hasattr(allowed, "__contains__"):
                # Scalars are compared exactly; using ``in`` on a string would do
                # substring matching ("Director" in "Director of Photography").
                if actual == allowed:
                    return True
            elif actual in allowed or actual == allowed:
                return True
        return False

    collected = []
    for entry in ast.literal_eval(d):
        if not _matches(entry):
            continue
        # One append per entry, even if several conditions match it.
        collected.append(entry[k].replace(" ", "").lower())

        # The limit applies to the number of values actually collected,
        # not to the number of entries inspected.
        if lim != 0 and len(collected) == lim:
            break
    return collected

In [38]:
# Flatten the stringified list-of-dict columns into lists of lower-cased, space-free names.
movies['genres'] = movies['genres'].apply(lambda x: convert_dict_list(x, 'name'))
movies['keywords'] = movies['keywords'].apply(lambda x: convert_dict_list(x, 'name'))
# Only the first three cast entries are kept per movie.
movies['cast'] = movies['cast'].apply(lambda x: convert_dict_list(x, 'name', lim=3))

# we want only the directors & producers
cond_dict = {"job": ["Director", "Producer"]}
movies['crew'] = movies['crew'].apply(lambda x: convert_dict_list(x, 'name', lim=0, cond_dict = cond_dict))

# A missing overview is NaN, and str(NaN) is the literal text "nan", which would end up
# as a spurious word in the tag. Replace missing values with "" so they split to [].
movies['overview'] = movies['overview'].fillna('').apply(lambda x: str(x).split())

# The tag is the concatenation of all word lists, joined into one space-separated string.
movies['tag'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']
movies['tag'] = movies['tag'].apply(lambda x: " ".join(x))
movies.head(2)

Unnamed: 0,id,title,overview,genres,keywords,cast,crew,tag
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[action, adventure, fantasy, sciencefiction]","[cultureclash, future, spacewar, spacecolony, ...","[samworthington, zoesaldana, sigourneyweaver]","[jamescameron, jamescameron, jonlandau]","In the 22nd century, a paraplegic Marine is di..."
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[adventure, fantasy, action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[johnnydepp, orlandobloom, keiraknightley]","[goreverbinski, jerrybruckheimer, ericmcleod, ...","Captain Barbossa, long believed to be dead, ha..."


In [39]:
# Drop the source columns now that their content lives in 'tag'.
movies = movies.drop(columns=["overview", "genres", "keywords", "cast", "crew"])
movies.head(5)

Unnamed: 0,id,title,tag
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...
4,49529,John Carter,"John Carter is a war-weary, former military ca..."


## Stemming
Words such as 'LOVE', 'LOVING', and 'LOVED' share the same root, but without further processing they would be counted as different words. `Stemming` reduces each word to its root so that these variants are treated as the same term.

In [40]:
def stem_words(text):
    """Stem every whitespace-separated token of ``text`` with a Porter stemmer.

    The text is split on whitespace only, so punctuation stays attached to the
    token it touches. The stemmed tokens are re-joined with single spaces.

    Parameters
    ----------
    text : str
        Text to stem.

    Returns
    -------
    str
        The stemmed tokens joined by a single space.
    """
    stemmer = PorterStemmer()
    return " ".join(stemmer.stem(token) for token in text.split())

In [41]:
# Replace every tag with its stemmed form, then preview the result.
movies['tag'] = movies['tag'].map(stem_words)
movies.head(5)

Unnamed: 0,id,title,tag
0,19995,Avatar,"in the 22nd century, a parapleg marin is dispa..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believ to be dead, ha c..."
2,206647,Spectre,a cryptic messag from bond’ past send him on a...
3,49026,The Dark Knight Rises,follow the death of district attorney harvey d...
4,49529,John Carter,"john carter is a war-weary, former militari ca..."


In [42]:
# Bag-of-words counts over the tags: English stop words are removed and the
# vocabulary is capped at 5000 features.
cv = CountVectorizer(max_features=5000, stop_words='english')
# .toarray() turns the fitted count matrix into a dense array (one row per movie).
vector = cv.fit_transform(movies['tag']).toarray()
vector.shape

(4803, 5000)

In [43]:
# Cosine similarity between every pair of rows (movies) of the count matrix.
similarity = cosine_similarity(vector)
similarity

array([[1.        , 0.07808688, 0.07919455, ..., 0.04331481, 0.        ,
        0.        ],
       [0.07808688, 1.        , 0.05634362, ..., 0.02311251, 0.        ,
        0.02541643],
       [0.07919455, 0.05634362, 1.        , ..., 0.02344036, 0.        ,
        0.        ],
       ...,
       [0.04331481, 0.02311251, 0.02344036, ..., 1.        , 0.04003204,
        0.04229549],
       [0.        , 0.        , 0.        , ..., 0.04003204, 1.        ,
        0.08804509],
       [0.        , 0.02541643, 0.        , ..., 0.04229549, 0.08804509,
        1.        ]])

## Recommendation

In [44]:
# Index label of the first row whose title is exactly 'The Lego Movie'.
movies[movies['title'] == 'The Lego Movie'].index[0]

744

In [45]:
# (position, score) pairs for row 0 of the similarity matrix.
# NOTE(review): no later cell visible here reads this module-level name
# (recommend_movie builds its own local list) — confirm whether this scratch cell is still needed.
similarity_score = list(enumerate(similarity[0]))

In [46]:
def recommend_movie(data, data_similarity, movie_name, top=5):
    """Return the ``top`` movies most similar to ``movie_name``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``'title'`` column; row *positions* must line up with the
        rows/columns of ``data_similarity``.
    data_similarity : 2-D array-like
        Square similarity matrix; ``data_similarity[i][j]`` is the score between
        the movies at positions ``i`` and ``j`` of ``data``.
    movie_name : str
        Exact title to look up. If several rows share the title, the first is used.
    top : int, default 5
        Number of recommendations to return.

    Returns
    -------
    tuple (list of str, list of (int, float))
        The recommended titles, and the matching ``(position, score)`` pairs,
        both ordered from most to least similar.

    Raises
    ------
    IndexError
        If no row of ``data`` has the title ``movie_name``.
    """
    # Work with the row *position*, not the index label: the similarity matrix and
    # the ``iloc`` lookup below are positional, so labels only work by accident
    # when the frame happens to have a default 0..n-1 index.
    positions = np.flatnonzero((data['title'] == movie_name).to_numpy())
    if positions.size == 0:
        raise IndexError(f"movie {movie_name!r} not found in data['title']")
    idx = int(positions[0])

    # Exclude the queried movie explicitly instead of assuming it sorts first:
    # another movie with an equal score could otherwise take its place and the
    # queried movie itself would be recommended.
    candidates = [
        (pos, score)
        for pos, score in enumerate(data_similarity[idx])
        if pos != idx
    ]
    # Stable sort: ties keep their original positional order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    similarity_score = candidates[:max(top, 0)]

    # recommend movies
    recommend_movieidx = list(similarity_score)
    recommend_moviename = [data.iloc[pos]['title'] for pos, _ in similarity_score]
    return recommend_moviename, recommend_movieidx

In [47]:
# Recommendations for 'Batman Begins' using the default number of results (top=5).
recommend_movie(movies, similarity, 'Batman Begins')

(['The Dark Knight',
  'The Dark Knight Rises',
  'Batman',
  'Batman v Superman: Dawn of Justice',
  '10th & Wolf'],
 [(65, 0.420210157631365),
  (3, 0.3400921207590813),
  (1359, 0.3227486121839514),
  (9, 0.3045547950507524),
  (3293, 0.3014302822441094)])

In [48]:
# Six recommendations for 'The Lego Movie'.
recommend_movie(movies, similarity, 'The Lego Movie', top=6)

(['Curious George',
  'The Adventures of Rocky & Bullwinkle',
  'The Croods',
  'Percy Jackson: Sea of Monsters',
  'Penguins of Madagascar',
  'The Boxtrolls'],
 [(934, 0.28644594961577313),
  (503, 0.26609057819508725),
  (234, 0.2597621667330656),
  (368, 0.2594372608313854),
  (179, 0.2567762955065478),
  (742, 0.2549249642552304)])

In [49]:
# Persist the processed movie table and the similarity matrix.
# The `with` blocks guarantee each file is flushed and closed, even if pickling fails;
# the bare open(...) calls left both handles open.
with open('movies.pkl', 'wb') as movies_file:
    pickle.dump(movies, movies_file)
with open('movies_similarity_score.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)