# Movie Recommender

In [305]:
import numpy as np
import pandas as pd

from ast import literal_eval

import warnings
warnings.simplefilter('ignore')

In [306]:
md = pd. read_csv('movie-dataset/movies_metadata.csv')
md.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


### Filtering Genres from the data

In [307]:
md['genres'].head()

0    [{'id': 16, 'name': 'Animation'}, {'id': 35, '...
1    [{'id': 12, 'name': 'Adventure'}, {'id': 14, '...
2    [{'id': 10749, 'name': 'Romance'}, {'id': 35, ...
3    [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...
4                       [{'id': 35, 'name': 'Comedy'}]
Name: genres, dtype: object

In [308]:
# "literal_eval" checks if the data is a single datatype, else throws an exception | lambda function extracts genre from the data.

md['genres'] = md['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [309]:
md['genres'].head()

0     [Animation, Comedy, Family]
1    [Adventure, Fantasy, Family]
2               [Romance, Comedy]
3        [Comedy, Drama, Romance]
4                        [Comedy]
Name: genres, dtype: object

### Converting Date to Year

In [310]:
md['release_date'].head()

0    1995-10-30
1    1995-12-15
2    1995-12-22
3    1995-12-22
4    1995-02-10
Name: release_date, dtype: object

In [311]:
md['year'] = pd.to_datetime(md['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)
md['year'].head()

0    1995
1    1995
2    1995
3    1995
4    1995
Name: year, dtype: object

## Calculating Weighted Rating

Weighted Rating (WR) = $(\frac{v}{v + m} . R) + (\frac{m}{v + m} . C)$
where,
* *v* is the number of votes for the movie
* *m* is the minimum votes required to be listed in the chart
* *R* is the average rating of the movie
* *C* is the mean vote across the whole report

We will use **95th percentile** as our cutoff for *m*

In [154]:
def weighted_rating(x):
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+m) * R) + (m/(m+v) * C)

## Top Movies of All Time

In [155]:
#Collect Non - Null data
vote_counts = md[md['vote_count'].notnull()]['vote_count'].astype('int')
#Collect Non - Null data
vote_averages = md[md['vote_average'].notnull()]['vote_average'].astype('int')
C = vote_averages.mean()
m = vote_counts.quantile(0.95)
topmat = md[(md['vote_count'] >= m) & (md['vote_count'].notnull()) & (md['vote_average'].notnull())][['title', 'year', 'vote_count', 'vote_average', 'popularity', 'genres']]
topmat['vote_count'] = topmat['vote_count'].astype('int')
topmat['vote_average'] = topmat['vote_average'].astype('int')
topmat.shape

(2274, 6)

In [156]:
topmat['wr'] = topmat.apply(weighted_rating, axis=1)
topmat = topmat.sort_values('wr', ascending=False)

In [157]:
top250 = topmat.head(250)
top250.head()

Unnamed: 0,title,year,vote_count,vote_average,popularity,genres,wr
15480,Inception,2010,14075,8,29.108149,"[Action, Thriller, Science Fiction, Mystery, A...",7.917588
12481,The Dark Knight,2008,12269,8,123.167259,"[Drama, Action, Crime, Thriller]",7.905871
22879,Interstellar,2014,11187,8,32.213481,"[Adventure, Drama, Science Fiction]",7.897107
2843,Fight Club,1999,9678,8,63.869599,[Drama],7.881753
4863,The Lord of the Rings: The Fellowship of the Ring,2001,8892,8,32.070725,"[Adventure, Fantasy, Action]",7.871787


In [313]:
#Function for Top N Movies
def topnm (md, n):
    vote_counts = md[md['vote_count'].notnull()]['vote_count'].astype('int')
    #Collect Non - Null data
    vote_averages = md[md['vote_average'].notnull()]['vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(0.95)
    topmat = md[(md['vote_count'] >= m) & (md['vote_count'].notnull()) & (md['vote_average'].notnull())][['title', 'year', 'vote_count', 'vote_average', 'popularity', 'genres']]
    topmat['vote_count'] = topmat['vote_count'].astype('int')
    topmat['vote_average'] = topmat['vote_average'].astype('int')
    topmat['wr'] = topmat.apply(weighted_rating, axis=1)
    topmat = topmat.sort_values('wr', ascending=False)

    return topmat.head(n)


In [314]:
topnm(md,5)

Unnamed: 0,title,year,vote_count,vote_average,popularity,genres,wr
15480,Inception,2010,14075,8,29.108149,"[Action, Thriller, Science Fiction, Mystery, A...",7.917588
12481,The Dark Knight,2008,12269,8,123.167259,"[Drama, Action, Crime, Thriller]",7.905871
22879,Interstellar,2014,11187,8,32.213481,"[Adventure, Drama, Science Fiction]",7.897107
2843,Fight Club,1999,9678,8,63.869599,[Drama],7.881753
4863,The Lord of the Rings: The Fellowship of the Ring,2001,8892,8,32.070725,"[Adventure, Fantasy, Action]",7.871787


## Genre based charts

In [158]:
s = md.apply(lambda x: pd.Series(x['genres']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'genre'
s.head()

0    Animation
0       Comedy
0       Family
1    Adventure
1      Fantasy
Name: genre, dtype: object

In [159]:
#New dataframe for genre classified movies (For each genre for a movie a seperate entry is created)
gen_md = md.drop('genres', axis=1).join(s)
gen_md.head()

Unnamed: 0,adult,belongs_to_collection,budget,homepage,id,imdb_id,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,year,genre
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,...,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,1995,Animation
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,...,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,1995,Comedy
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,...,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,1995,Family
1,False,,65000000,,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,...,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,1995,Adventure
1,False,,65000000,,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,...,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,1995,Fantasy


In [303]:
def build_chart(genres, n, percentile=0.85):

    final_list = pd.DataFrame(columns = ['title', 'year', 'vote_count', 'vote_average', 'popularity','wr'])

    for genre in genres:
        df = gen_md[gen_md['genre'] == genre]
        vote_counts = df[df['vote_count'].notnull()]['vote_count'].astype('int') #Series of Vote Counts
        vote_averages = df[df['vote_average'].notnull()]['vote_average'].astype('int') #Series of Vote Averages
        C = vote_averages.mean() #Mean of vote averages
        m = vote_counts.quantile(percentile) # Get the 'percentile' value of vote counts
        #Creating a new dataframe for the requested chart
        qualified = df[(df['vote_count'] >= m) & (df['vote_count'].notnull()) & (df['vote_average'].notnull())][['title', 'year', 'vote_count', 'vote_average', 'popularity']]
        qualified['vote_count'] = qualified['vote_count'].astype('int')
        qualified['vote_average'] = qualified['vote_average'].astype('int')
        #Calculating the weighted rating for each movie
        qualified['wr'] = qualified.apply(lambda x: (x['vote_count']/(x['vote_count']+m) * x['vote_average']) + (m/(m+x['vote_count']) * C), axis=1)
        #Select the top 250 Movies
        qualified = qualified.sort_values('wr', ascending=False).head(n)

        final_list = pd.concat([ final_list,qualified])

        #TODO : The final list may contain duplicate values of some entries (We may need to merge the duplicates by taking mean of wr)

    final_list = final_list.sort_values('wr',ascending=False)
    final_list.reset_index(drop=True, inplace=True)
    
    return (final_list.head(n))

In [304]:
#Give the list of top 'n' movies in the list of genre based of wr rating.
build_chart(['Action','Romance'],250)

Unnamed: 0,title,year,vote_count,vote_average,popularity,wr
0,Dilwale Dulhania Le Jayenge,1995,661,9,34.457024,8.565285
1,Forrest Gump,1994,8147,8,48.307194,7.971357
2,Inception,2010,14075,8,29.108149,7.955099
3,The Dark Knight,2008,12269,8,123.167259,7.948610
4,The Lord of the Rings: The Fellowship of the Ring,2001,8892,8,32.070725,7.929579
...,...,...,...,...,...,...
245,House of Flying Daggers,2004,452,7,9.3002,6.732801
246,The Phantom of the Opera,2004,448,7,7.682562,6.730818
247,Pay It Forward,2000,447,7,10.479723,6.730318
248,Ip Man,2008,1309,7,6.004257,6.729747
