# Data Science Project - Movies Recommender System

### Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise        import linear_kernel, cosine_similarity
from nltk.stem.snowball              import SnowballStemmer
from nltk.stem.wordnet               import WordNetLemmatizer 
from nltk.corpus                     import wordnet
from surprise                        import Reader, Dataset, SVD
from surprise.model_selection        import cross_validate, KFold


In [2]:
# importing the dataset
md = pd.read_csv('movies_metadata.csv')
md.head()

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [3]:
md['genres'] = md['genres'].fillna('[]')

In [4]:
# converting the strings in genres column in to lists 

md['genres'] = md['genres'].apply(literal_eval)

In [5]:
# extracting only the genre names from the list and excluding all the id's

md['genres'] = md['genres'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [6]:
md['genres'].head()

0     [Animation, Comedy, Family]
1    [Adventure, Fantasy, Family]
2               [Romance, Comedy]
3        [Comedy, Drama, Romance]
4                        [Comedy]
Name: genres, dtype: object

In [7]:
md.shape

(45466, 24)

In [8]:
# dectecting null value 'vote_count' rows

md[md['vote_count'].notnull()].shape

(45460, 24)

In [9]:
vote_count = md[md['vote_count'].notnull()]['vote_count'].astype('int')
vote_count

0        5415
1        2413
2          92
3          34
4         173
         ... 
45461       1
45462       3
45463       6
45464       0
45465       0
Name: vote_count, Length: 45460, dtype: int32

In [10]:
vote_average = md[md['vote_average'].notnull()]['vote_average'].astype('int')
vote_average

0        7
1        6
2        6
3        6
4        5
        ..
45461    4
45462    9
45463    3
45464    0
45465    0
Name: vote_average, Length: 45460, dtype: int32

In [11]:
top_movies = md.copy()

In [12]:
# for General Recommendations, getting the top movies by considering the vote_average

top_movies1 = top_movies.sort_values('vote_average', ascending=False).head(250)
top_movies1[['original_title','vote_average','vote_count']].head()

Unnamed: 0,original_title,vote_average,vote_count
21642,Ice Age Columbus: Who Were the First Americans?,10.0,1.0
15710,If God Is Willing and da Creek Don't Rise,10.0,1.0
22396,Meat the Truth,10.0,1.0
22395,Marvin Hamlisch: What He Did For Love,10.0,1.0
35343,Elaine Stritch: At Liberty,10.0,1.0


### The above general recommendation is not ideal, since the vote_count for each of them is only 1, so there is clear bias and cannot be used as suitable recommendation logic.

In [13]:
# General Recommendations now also considering the vote_count greater than 5000

top_movies2 = top_movies[top_movies['vote_count']>5000].sort_values('vote_average', ascending=False).head(250)
top_movies2[['original_title','vote_average','vote_count']].head()

Unnamed: 0,original_title,vote_average,vote_count
314,The Shawshank Redemption,8.5,8358.0
834,The Godfather,8.5,6024.0
292,Pulp Fiction,8.3,8670.0
12481,The Dark Knight,8.3,12269.0
2843,Fight Club,8.3,9678.0


### Here theses top movies can be presented as general recommendation.

## Getting new recommendations by applying IMDB formula

In [14]:
C = vote_average.mean()
C

5.244896612406511

In [15]:
m = vote_count.quantile(0.95)
m

434.0

In [16]:
# creating a year cloumn

top_movies['year'] = pd.to_datetime(top_movies['release_date'], 
                                    errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)
top_movies['year'].sample(5)

148      1995
18056    2007
25034    1973
39801    1995
18391    2008
Name: year, dtype: object

In [17]:
top_movies3 = top_movies[(top_movies['vote_count'] >= m) & 
                         (top_movies['vote_count'].notnull()) & 
                         (top_movies['vote_average'].notnull())][['title','year','vote_count',
                                                                  'vote_average','popularity','genres']]
top_movies3['vote_count'] = top_movies3['vote_count'].astype('int')
top_movies3['vote_average'] = top_movies3['vote_average'].astype('int')
top_movies3.shape

(2274, 6)

In [18]:
# defining a function to apply imdb formula on the above dataset

def weighted_rating(x):
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+m)*R + (m/(v+m))*C)

In [19]:
# creating a 'weight_rate' column

top_movies3['weight_rate'] = top_movies3.apply(weighted_rating, axis=1)

In [20]:
# sorting the new dataset using 'weight_rate'

top_movies3 = top_movies3.sort_values('weight_rate', ascending=False).head(10)
top_movies3

Unnamed: 0,title,year,vote_count,vote_average,popularity,genres,weight_rate
15480,Inception,2010,14075,8,29.108149,"[Action, Thriller, Science Fiction, Mystery, A...",7.917588
12481,The Dark Knight,2008,12269,8,123.167259,"[Drama, Action, Crime, Thriller]",7.905871
22879,Interstellar,2014,11187,8,32.213481,"[Adventure, Drama, Science Fiction]",7.897107
2843,Fight Club,1999,9678,8,63.869599,[Drama],7.881753
4863,The Lord of the Rings: The Fellowship of the Ring,2001,8892,8,32.070725,"[Adventure, Fantasy, Action]",7.871787
292,Pulp Fiction,1994,8670,8,140.950236,"[Thriller, Crime]",7.86866
314,The Shawshank Redemption,1994,8358,8,51.645403,"[Drama, Crime]",7.864
7000,The Lord of the Rings: The Return of the King,2003,8226,8,29.324358,"[Adventure, Fantasy, Action]",7.861927
351,Forrest Gump,1994,8147,8,48.307194,"[Comedy, Drama, Romance]",7.860656
5814,The Lord of the Rings: The Two Towers,2002,7641,8,29.423537,"[Adventure, Fantasy, Action]",7.851924


### These are the top movies scooped using the imdb weightage formula.

# Top Movies in accordance with Genres

In [21]:
# Seperating the genres.

genre_TM = top_movies.apply(lambda x: pd.Series(x['genres']), axis=1).stack().reset_index(level=1, drop=True)
genre_TM.name = 'genre'
genre_top_movies = top_movies.drop('genres', axis=1).join(genre_TM)
genre_top_movies[['title','year','vote_count','vote_average','popularity','genre']].head()

  genre_TM = top_movies.apply(lambda x: pd.Series(x['genres']), axis=1).stack().reset_index(level=1, drop=True)


Unnamed: 0,title,year,vote_count,vote_average,popularity,genre
0,Toy Story,1995,5415.0,7.7,21.946943,Animation
0,Toy Story,1995,5415.0,7.7,21.946943,Comedy
0,Toy Story,1995,5415.0,7.7,21.946943,Family
1,Jumanji,1995,2413.0,6.9,17.015539,Adventure
1,Jumanji,1995,2413.0,6.9,17.015539,Fantasy


In [22]:
def build_chart(genre, percentile=0.85):
    df = genre_top_movies[genre_top_movies['genre'] == genre]
    vote_counts = df[df['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = df[df['vote_average'].notnull()]['vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(percentile)
    
    qualified = df[(df['vote_count'] >= m) & 
                   (df['vote_count'].notnull()) & 
                   (df['vote_average'].notnull())][['title','year','vote_count','vote_average','popularity','genre']]
    qualified['vote_count']   = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    
    qualified['wr'] = qualified.apply(weighted_rating, axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(250)
    
    return qualified

# imdb formula
def weighted_rating(x):
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+m)*R + (m/(v+m))*C)

### top movies according to the user specified genres

In [23]:
build_chart('Animation').head(10)

Unnamed: 0,title,year,vote_count,vote_average,popularity,genre,wr
359,The Lion King,1994,5520,8,21.605761,Animation,7.799175
5481,Spirited Away,2001,3968,8,41.048867,Animation,7.72837
9698,Howl's Moving Castle,2004,2049,8,16.136048,Animation,7.518439
2884,Princess Mononoke,1997,2041,8,17.166725,Animation,7.516883
5833,My Neighbor Totoro,1988,1730,8,13.507299,Animation,7.447452
40251,Your Name.,2016,1030,8,34.461252,Animation,7.183255
5553,Grave of the Fireflies,1988,974,8,0.010902,Animation,7.150771
19901,Paperman,2012,734,8,7.198633,Animation,6.976272
13724,Up,2009,7048,7,19.330884,Animation,6.898194
30315,Inside Out,2015,6737,7,23.985587,Animation,6.893778


In [24]:
build_chart('Family').head(10)

Unnamed: 0,title,year,vote_count,vote_average,popularity,genre,wr
1225,Back to the Future,1985,6239,8,25.778509,Family,7.820813
359,The Lion King,1994,5520,8,21.605761,Family,7.799175
5481,Spirited Away,2001,3968,8,41.048867,Family,7.72837
5833,My Neighbor Totoro,1988,1730,8,13.507299,Family,7.447452
926,It's a Wonderful Life,1946,1103,8,15.031588,Family,7.222046
19901,Paperman,2012,734,8,7.198633,Family,6.976272
4766,Harry Potter and the Philosopher's Stone,2001,7188,7,38.187238,Family,6.900064
13724,Up,2009,7048,7,19.330884,Family,6.898194
30315,Inside Out,2015,6737,7,23.985587,Family,6.893778
15472,Despicable Me,2010,6595,7,22.274502,Family,6.891633


In [25]:
build_chart('Action').head(10)

Unnamed: 0,title,year,vote_count,vote_average,popularity,genre,wr
15480,Inception,2010,14075,8,29.108149,Action,7.917588
12481,The Dark Knight,2008,12269,8,123.167259,Action,7.905871
4863,The Lord of the Rings: The Fellowship of the Ring,2001,8892,8,32.070725,Action,7.871787
7000,The Lord of the Rings: The Return of the King,2003,8226,8,29.324358,Action,7.861927
5814,The Lord of the Rings: The Two Towers,2002,7641,8,29.423537,Action,7.851924
256,Star Wars,1977,6778,8,42.149697,Action,7.834205
1154,The Empire Strikes Back,1980,5998,8,19.470959,Action,7.814099
4135,Scarface,1983,3017,8,11.299673,Action,7.653516
9430,Oldboy,2003,2000,8,10.616859,Action,7.508745
1910,Seven Samurai,1954,892,8,15.01777,Action,7.098254


# Content based Recommender

In [40]:
links_small = pd.read_csv('links_small.csv')
links_small = links_small[links_small['tmdbId'].notnull()]['tmdbId'].astype(int)

In [41]:
# excluding few rows using index values

top_movies = top_movies.drop([19730,29503,35587])

In [42]:
top_movies['id'] = top_movies['id'].astype('int')

In [44]:
top_movies4 = top_movies[top_movies['id'].isin(links_small)]

In [45]:
top_movies4.shape

(9099, 25)

#### The dataset has been reduced significantly using the selected id's from links_small dataset

## Movies Description Based Recommender

##### Here we are using movie description and taglines. 

In [52]:
top_movies4.loc['tagline'] = top_movies4['tagline'].fillna('')
# creating 'description' column
top_movies4.loc['description'] = top_movies4['overview'] + top_movies4['tagline']
top_movies4.loc['description'] = top_movies4['description'].fillna('')

In [53]:
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(top_movies4['description'])

In [54]:
tfidf_matrix

<9101x268124 sparse matrix of type '<class 'numpy.float64'>'
	with 540591 stored elements in Compressed Sparse Row format>

In [55]:
tfidf_matrix.shape

(9101, 268124)

In [56]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [57]:
cosine_sim

array([[1.        , 0.00680489, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.00680489, 1.        , 0.01531088, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.01531088, 1.        , ..., 0.00472191, 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.00472191, ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [58]:
cosine_sim[0]

array([1.        , 0.00680489, 0.        , ..., 0.        , 0.        ,
       0.        ])

In [59]:
top_movies4 = top_movies4.reset_index()
titles = top_movies4['title']
indices = pd.Series(top_movies4.index, index=top_movies4['title'])

In [60]:
# function that looks for movies with similar title or premise as of the title specified by the user using the prepared tfid_matrix
def get_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

In [64]:
get_recommendations('GoldenEye').head(10)

5172                          Octopussy
2398                   Live and Let Die
4309                      Casino Royale
5175              Never Say Never Again
2396                 For Your Eyes Only
1883                   A View to a Kill
2901        The Man with the Golden Gun
5171                You Only Live Twice
2397                    Licence to Kill
2896    On Her Majesty's Secret Service
Name: title, dtype: object

In [65]:
get_recommendations('The Dark Knight').head(10)

7931                      The Dark Knight Rises
132                              Batman Forever
1113                             Batman Returns
8227    Batman: The Dark Knight Returns, Part 2
7565                 Batman: Under the Red Hood
524                                      Batman
7901                           Batman: Year One
2579               Batman: Mask of the Phantasm
2696                                        JFK
8165    Batman: The Dark Knight Returns, Part 1
Name: title, dtype: object

In [66]:
get_recommendations('The Godfather').head(10)

973      The Godfather: Part II
8387                 The Family
3509                       Made
4196         Johnny Dangerously
29               Shanghai Triad
5667                       Fury
2412             American Movie
1582    The Godfather: Part III
4221                    8 Women
2159              Summer of Sam
Name: title, dtype: object