In [66]:
%%HTML
<h2>Feature : Developing a Movie Recommendation System using Content based Filtering Algorithm </h2>
    
    <p><font size="4">Description -:</font>
        <br>
        <br>
       1) Display the top 10 Movies  <br>
        &nbsp;&nbsp;&nbsp;    a) Select movies present in the 90th percentile <br>
        &nbsp;&nbsp;&nbsp;     b) Calculate weighted rating using IMDB's weighted rating formula <br>
        &nbsp;&nbsp;&nbsp;     c) Display the top movies. <br>
            <br>
       2) Recommend similar movies to the movie taken as input from the user depending upon the top three actors present, the director and the plot of the movie. <br> 
           a) Use Cosine Similarity to calculate similarity between two movies <br>
           b) Display 10 similar movies
    </p>


In [1]:
import pandas as pd
import numpy as np
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
import warnings; warnings.simplefilter('ignore')

In [2]:
# Reads the movies_metadata.csv file and displays the first 5 rows
movie_data = pd. read_csv('C:/Users/Priyanka Dabadge/Documents/Sem 1/Python/Project/movies_metadata.csv')
movie_data.head()

# Extracting the data from JSON format to a list containing the genres of each movie in the dataset
movie_data['genres'] = movie_data['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])



In [3]:
''' Using IMDB's weighted rating formula to find the rating for each movie to find top movies based upon the following variables -:
    v = number of votes for the movie
    m = minimum votes required to be listed in the chart
    R = average rating of the movie
    C = mean vote across the entire dataset
'''
# storing the count of votes for each movie
vote_counts = movie_data[movie_data['vote_count'].notnull()]['vote_count'].astype('int')

# storing the vote average for each movie 
vote_averages = movie_data[movie_data['vote_average'].notnull()]['vote_average'].astype('int')

# selecting movies present in the 90 th percentile for finding the top 10 all time movies.
m = vote_counts.quantile(0.90) 

# Finding mean of votes across the dataset
C = vote_averages.mean()


In [4]:
# Filtering out movies that qualify for the chart based on the vote count
movie_data['year'] = pd.to_datetime(movie_data['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)
top_movies = movie_data[(movie_data['vote_count'] >= m) & (movie_data['vote_count'].notnull()) & (movie_data['vote_average'].notnull())][['title', 'year', 'vote_count', 'vote_average', 'popularity', 'genres']]
top_movies['vote_count'] = top_movies['vote_count'].astype('int') 
top_movies['vote_average'] = top_movies['vote_average'].astype('int')
top_movies.shape

(4555, 6)

In [5]:
# Function to calculate the weighted rating using IMDB's formula
def weighted_rating(x):
    v = x['vote_count']
    R = x['vote_average']
    weighted_rating = (v/(v+m) * R) + (m/(m+v) * C)
    return weighted_rating

In [6]:
# stores the calculated weighted rating in a list
top_movies['wr'] = top_movies.apply(weighted_rating, axis=1)
# sorting based on the calculated weighted rating
top_movies = top_movies.sort_values('wr', ascending=False).head(250)
# Displays the top 10 all time movies
top_movies.head(10) 

Unnamed: 0,title,year,vote_count,vote_average,popularity,genres,wr
10309,Dilwale Dulhania Le Jayenge,1995,661,9,34.457,"[Comedy, Drama, Romance]",8.268189
15480,Inception,2010,14075,8,29.1081,"[Action, Thriller, Science Fiction, Mystery, A...",7.969033
12481,The Dark Knight,2008,12269,8,123.167,"[Drama, Action, Crime, Thriller]",7.964533
22879,Interstellar,2014,11187,8,32.2135,"[Adventure, Drama, Science Fiction]",7.961151
2843,Fight Club,1999,9678,8,63.8696,[Drama],7.955192
4863,The Lord of the Rings: The Fellowship of the Ring,2001,8892,8,32.0707,"[Adventure, Fantasy, Action]",7.951302
292,Pulp Fiction,1994,8670,8,140.95,"[Thriller, Crime]",7.950077
314,The Shawshank Redemption,1994,8358,8,51.6454,"[Drama, Crime]",7.948249
7000,The Lord of the Rings: The Return of the King,2003,8226,8,29.3244,"[Adventure, Fantasy, Action]",7.947434
351,Forrest Gump,1994,8147,8,48.3072,"[Comedy, Drama, Romance]",7.946934


In [12]:
# Load keywords and credits
credits = pd.read_csv('C:/Users/Priyanka Dabadge/Documents/Sem 1/Python/Project/credits.csv')
keywords = pd.read_csv('C:/Users/Priyanka Dabadge/Documents/Sem 1/Python/Project/keywords.csv')

# Remove rows with bad IDs.
movie_data = movie_data.drop([19730, 29503, 35587])

# Convert IDs to int. Required for merging
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
movie_data['id'] = movie_data['id'].astype('int')

# Merge keywords and credits into the main dataset
movie_data = movie_data.merge(credits, on='id')
movie_data = movie_data.merge(keywords, on='id')

# Printing the first two movies of the newly merged movie_data
movie_data.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,year,cast_x,crew_x,keywords_x,cast_y,crew_y,keywords_y,cast,crew,keywords
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[Animation, Comedy, Family]",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...","[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...","[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,False,,65000000,"[Adventure, Fantasy, Family]",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1...","[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1...","[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1..."


In [13]:
#Parse the stringified features into their corresponding python objects
movie_data['cast'] = movie_data['cast'].apply(literal_eval)
movie_data['crew'] = movie_data['crew'].apply(literal_eval)
movie_data['keywords'] = movie_data['keywords'].apply(literal_eval)
movie_data['cast_size'] = movie_data['cast'].apply(lambda x: len(x))
movie_data['crew_size'] = movie_data['crew'].apply(lambda x: len(x))

In [14]:
# Get the director's name from the crew feature. If director is not listed, return NaN
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

movie_data['director'] = movie_data['crew'].apply(get_director)



In [15]:
# To find the top 3 actors from cast
movie_data['cast'] = movie_data['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
movie_data['cast'] = movie_data['cast'].apply(lambda x: x[:3] if len(x) >=3 else x)

In [16]:
# to find keywords
movie_data['keywords'] = movie_data['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [17]:
# strip spaces and converting to lowercase
movie_data['cast'] = movie_data['cast'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])
# Mention Director 3 times to give it more weight relative to the entire cast
movie_data['director'] = movie_data['director'].astype('str').apply(lambda x: str.lower(x.replace(" ", "")))
movie_data['director'] = movie_data['director'].apply(lambda x: [x,x, x])

In [18]:
# calculating frequency count of every keyword that appears in the dataset
fc = movie_data.apply(lambda x: pd.Series(x['keywords']),axis=1).stack().reset_index(level=1, drop=True)
fc.name = 'keyword'
fc = fc.value_counts()
fc[:5]

woman director      3817
independent film    2265
murder              1564
based on novel      1032
biography            965
Name: keyword, dtype: int64

In [19]:
# Removing keywords appearing only once
fc = fc[fc > 1]

In [20]:
# converting every word to its stem word
stemmer = SnowballStemmer('english')
stemmer.stem('dogs')

'dog'

In [40]:
# Filtering keywords
def filter_keywords(x):
    words = []
    for i in x:
        if i in fc:
            words.append(i)
    
    return words


In [23]:
movie_data['keywords'] = movie_data['keywords'].apply(filter_keywords)
# converting every word to its stem word
movie_data['keywords'] = movie_data['keywords'].apply(lambda x: [stemmer.stem(i) for i in x])
# to convert all strings to lower case and strip names of spaces
movie_data['keywords'] = movie_data['keywords'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

In [24]:
# creating a new soup which contains a string that contains all the data that has to be sent to the vectorizer
movie_data['soup'] = movie_data['keywords'] + movie_data['cast'] + movie_data['director'] + movie_data['genres']
movie_data['soup'] = movie_data['soup'].apply(lambda x: ' '.join(x))

In [26]:
#creating the count matrix
count = CountVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
count_matrix = count.fit_transform(movie_data['soup'])

In [27]:
# calculating cosine similarity
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [28]:
# Resetting the index of your main DataFrame and construct reverse mapping
movie_data = movie_data.reset_index()
titles = movie_data['title']
indices = pd.Series(movie_data.index, index=movie_data['title'])

In [29]:
# Function to recommend the top 10 movies. The input is the name of movie and 
# the output is the 10 most similar movies to the movie name provided by the user

def get_recommendations(movie_name, cosine_sim=cosine_sim):
    # Storing the index of the movie name
    idx = indices[movie_name]

    # Storing the pairwise similarity scores for all movies with respect to the movie name provided.
    similarity_scores = list(enumerate(cosine_sim[idx]))

    # Sorting the movies based on cosine similarity scores
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    similarity_scores = similarity_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in similarity_scores]

    # Return the top 10 most similar movies
    return movie_data['title'].iloc[movie_indices]

In [38]:
# top 10 similar movies to The Shawshank Redemption
get_recommendations('The Shawshank Redemption').head(10)

5625              The Majestic
12489             Buried Alive
3537            The Green Mile
14237                 The Mist
3535          Cradle Will Rock
11421                   Bopha!
4976                     Ariel
23936               You and Me
40568     Duffy of San Quentin
34312    Women in Cell Block 7
Name: title, dtype: object