# Keyword-based Filtering

Load the data first

In [20]:
import pandas as pd
import numpy as np
# Load the two TMDB 5000 CSV files; the paths are relative to the notebook's working directory.
# df1 holds the credits table and df2 the movies table.
df1 = pd.read_csv('./datasets/tmdb_5000_credits.csv')
df2 = pd.read_csv('./datasets/tmdb_5000_movies.csv')

In [21]:
# Preview the first three rows of the credits table.
df1.head(3)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."


<br>The first dataset contains four different features - movie_id, title, cast and crew<br>

In [22]:
# Preview the first three rows of the movies table.
df2.head(3)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466


<br>The second dataset contains 20 different features.<br>
Now merge the two datasets on the movie identifier they share: it is called `movie_id` in the first dataset and `id` in the second, so the first dataset's column is renamed to `id` before merging.<br>

In [23]:
# Rename df1's columns positionally so that its key column is called 'id',
# matching the name of the key column in df2.
df1.columns = ['id','title','cast','crew']
# Join the credits onto the movies by 'id' (DataFrame.merge defaults to an inner join).
# Both frames carry a 'title' column that is not part of the join key, so pandas
# suffixes them as 'title_x' / 'title_y' in the result.
# NOTE(review): this rebinds df2, so re-running the cell merges the credits in a
# second time and suffixes 'cast'/'crew' as well — run it once per kernel.
df2 = df2.merge(df1,on='id')

In [24]:
# Parse the stringified features into their corresponding Python objects.
from ast import literal_eval

# Columns whose cells are passed through literal_eval below.
features = ['cast', 'crew', 'keywords', 'genres']
for feature in features:
    # literal_eval only evaluates Python literals; it raises on anything else.
    # NOTE(review): assumes every cell in these columns is a well-formed string — confirm there are no NaNs.
    df2[feature] = df2[feature].apply(literal_eval)

<br>We need a function that extracts the director of each movie from its crew list, so that the director can be stored as a separate column in the dataset.<br>

In [25]:
def get_director(x):
    """Return the name of the first crew member whose job is 'Director'.

    Parameters
    ----------
    x : iterable of dict
        Crew entries; each entry is looked up by its 'job' key and, for the
        matching entry, by its 'name' key.

    Returns
    -------
    str or float
        The 'name' of the first entry with ``job == 'Director'``, or
        ``np.nan`` when no entry matches.
    """
    directors = (member['name'] for member in x if member['job'] == 'Director')
    # next() with a default stops at the first match, just like an early return.
    return next(directors, np.nan)

<br>Next, we define a function that returns the first three entries of a list-valued feature (actors, keywords or genres).<br>

In [26]:
def get_list(x, limit=3):
    """Return the 'name' of the first ``limit`` entries of a list of dicts.

    Parameters
    ----------
    x : list of dict, or any other object
        Entries to read; each entry inspected must have a 'name' key.
        Anything that is not a list yields an empty result.
    limit : int or None, default 3
        Maximum number of names to return (expected to be non-negative).
        ``None`` returns every name.

    Returns
    -------
    list
        At most ``limit`` names, in their original order; ``[]`` when ``x``
        is not a list.

    Raises
    ------
    KeyError
        If one of the first ``limit`` entries has no 'name' key.
    """
    if not isinstance(x, list):
        return []
    # Slice before extracting so entries beyond the limit are never touched.
    return [entry['name'] for entry in x[:limit]]

In [27]:
# Add a 'director' column by applying get_director to each movie's crew list.
df2['director'] = df2['crew'].apply(get_director)

# Columns that are reduced to a short list of names by get_list.
features = ['cast','keywords','genres']

for feature in features:
    df2[feature] = df2[feature].apply(get_list)

<br>The next step is to convert the names and keywords to lowercase and remove the spaces inside them, so that a multi-word name such as "James Cameron" becomes the single token "jamescameron" for the vectorizer.<br>

In [28]:
def clean_data(x):
    """Lower-case a string (or each string in a list) and remove its spaces.

    Parameters
    ----------
    x : list of str, str, or any other object

    Returns
    -------
    list of str or str
        A list of cleaned strings when ``x`` is a list, a cleaned string when
        ``x`` is a string, and ``''`` for any other input (e.g. NaN).
    """
    if isinstance(x, str):
        return x.replace(" ", "").lower()
    if not isinstance(x, list):
        return ''
    return [token.replace(" ", "").lower() for token in x]

In [29]:
# Columns to normalise with clean_data (lower-cased, spaces removed).
features = ['cast','director','keywords','genres']

for feature in features:
    df2[feature] = df2[feature].apply(clean_data)

<br>Create a single string per movie, combining its keywords, actors, director and genres, which we can feed to the vectorizer.<br>

In [30]:
def create_soup(x):
    """Build the space-separated metadata string for one movie.

    Parameters
    ----------
    x : mapping
        Must provide 'keywords', 'cast' and 'genres' (iterables of str) and
        'director' (str).

    Returns
    -------
    str
        Keywords, cast, director and genres, in that order, separated by
        single spaces.
    """
    parts = [
        ' '.join(x['keywords']),
        ' '.join(x['cast']),
        x['director'],
        ' '.join(x['genres']),
    ]
    return ' '.join(parts)
# Build the 'soup' column by calling create_soup once per row (axis=1 passes each row).
df2['soup'] = df2.apply(create_soup, axis=1)

<br>Vectorizing the metadata of all the movies and storing it into a matrix.<br>

In [31]:
from sklearn.feature_extraction.text import CountVectorizer
# Token-count vectorizer that drops scikit-learn's built-in English stop words.
clf = CountVectorizer(stop_words='english')
# Learn the vocabulary from the soup strings and build the count matrix (one row per movie in df2).
count_matrix = clf.fit_transform(df2['soup'])

<br>Construct a reverse map from movie titles to their row indices in the dataset.<br>

In [32]:
# Reverse map: original_title -> row label in df2.
indices = pd.Series(df2.index, index=df2['original_title'])
# Series.drop_duplicates() compares the *values* (the row labels, which are already
# unique), so it would leave repeated titles in the index and a lookup such as
# indices[title] would return a Series instead of a single label. De-duplicate on
# the index instead, keeping the first movie for each title.
indices = indices[~indices.index.duplicated(keep='first')]

<br>Design the function that recommends the movies closest to a given movie, measured by cosine similarity.<br>

<b>Note : </b> We use `CountVectorizer` instead of TF-IDF because we do not want to down-weight a director or actor simply because they directed or acted in more than one movie.<br>


In [34]:
# Calculate the pairwise cosine similarity between all rows of count_matrix.
# With a single argument, cosine_similarity compares the matrix with itself, so
# cosine_sim[i][j] is the similarity between movie i and movie j.

from sklearn.metrics.pairwise import cosine_similarity

cosine_sim = cosine_similarity(count_matrix)

In [35]:
def get_recommendations(title, cosine_sim=cosine_sim, top_n=10):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        A value of df2['original_title'].
    cosine_sim : 2-D array-like, optional
        Pairwise similarity matrix whose rows and columns follow the row
        order of df2. Defaults to the module-level ``cosine_sim``.
    top_n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.Series
        The 'original_title' of the ``top_n`` most similar movies, most
        similar first. The queried movie itself is never included.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    # Look up the row of the movie that matches the title.
    idx = indices[title]
    # A title shared by several movies makes the lookup return a Series;
    # fall back to the first match so idx is always a single row.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Pair every other movie's position with its similarity to the queried
    # movie. The queried movie is excluded explicitly instead of assuming it
    # sorts first: a tie with an earlier row, or an all-zero row, would
    # otherwise put it somewhere else.
    # NOTE(review): idx is a df2 index label used as a row position here; this
    # assumes df2 has a default 0..n-1 index — confirm if df2 is ever filtered.
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]

    # Most similar first; the sort is stable, so ties keep row order.
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    top_positions = [position for position, _ in sim_scores[:top_n]]

    # enumerate() yields row positions, so select positionally.
    return df2['original_title'].iloc[top_positions]

In [36]:
# Example: show the ten movies whose metadata is closest to 'Legal Eagles'.
get_recommendations('Legal Eagles')

3526                     The Sting
4247         Me You and Five Bucks
4638      Amidst the Devil's Wings
3391                 Dom Hemingway
2110    Madea's Witness Protection
2485                   The Cookout
3817                    Four Lions
4730                 Cheap Thrills
298        The Wolf of Wall Street
752            My Favorite Martian
Name: original_title, dtype: object