# <center> Recommendation System on Movie Lens Dataset

## 2. Content Based Recommendation System

1. Description based
2. Metadata based

### 2.2 Metadata Based Recommendation

**(i) Read Data**

In [12]:
# Necessary imports

%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings; warnings.simplefilter('ignore')

from scipy import stats
from sklearn.cross_validation import train_test_split
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from math import sqrt
from ast import literal_eval

In [2]:
# Jupyter Notebook with Matplotlib Inline
%matplotlib notebook

# Importing necessary modules
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as plt
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import metrics
from subprocess import check_output

from sklearn.metrics import pairwise_distances

In [42]:
# Reading the full dataset
credits = pd.read_csv('data_full/credits.csv')
keywords = pd.read_csv('data_full/keywords.csv')

In [43]:
# Metadata dataset [contains genre and movie description details]
metadata = pd.read_csv('data_full/movies_metadata.csv')

In [44]:
credits.head(2)

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844


In [45]:
keywords.head(2)

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."


In [46]:
metadata.head(1)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0


In [47]:
type(metadata.id[0])

str

In [48]:
# Converting string to int
def str_to_int(x):
    try:
        return int(x)
    except:
        return np.nan

In [49]:
# Applying str_to_int()
metadata['id'] = metadata['id'].apply(str_to_int)

In [50]:
# Checking for null values
metadata[metadata['id'].isnull()]['id']

19730   NaN
29503   NaN
35587   NaN
Name: id, dtype: float64

In [51]:
# Index of null values
list(metadata[metadata['id'].isnull()]['id'].index)

[19730, 29503, 35587]

In [52]:
# Dropping null values using index
metadata = metadata.drop(list(metadata[metadata['id'].isnull()]['id'].index))

In [53]:
# Converting string to int
metadata['id'] = metadata['id'].astype('int')

In [54]:
# Merging credits and keywords to metadata_full 
metadata_full = metadata.merge(credits, on = 'id')
metadata_full = metadata_full.merge(keywords, on = 'id')

In [55]:
metadata_full.shape

(46628, 27)

**(ii) Data Pre-Processing**

In [56]:
# Sample version of the full dataset
links_small = pd.read_csv('data_full/links_small.csv')

In [57]:
links_small.head(1)

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0


In [58]:
# Null values in tmdbId
links_small.tmdbId.isnull().sum()

13

In [59]:
# Removing null records in 'tmdbid'
links_small = links_small[links_small['tmdbId'].notnull()]

In [60]:
# Converting float tmdbid to int 
links_small['tmdbId'] = links_small['tmdbId'].astype(int)

In [61]:
# Links small new
links_small_new = links_small.copy(deep = True)

In [62]:
# Extracting director from 'crew'
def director(x):
    for i in literal_eval(x):
        if i['job'] == 'Director':
            return i['name']
    return np.nan

In [63]:
# Apply 'director' method
metadata_full['director'] = metadata_full['crew'].apply(director)

In [64]:
# Converting genre dictionary to list
def dict_to_list(x):
    ls = []
    for i in literal_eval(x):
        ls.append(i['name'])
    return ls

In [65]:
# Apply 'dict_to_list' method
for col in ['cast', 'crew', 'keywords']:
    metadata_full[col] = metadata_full[col].apply(dict_to_list)

In [94]:
metadata_full['genres'] = metadata_full['genres'].apply(dict_to_list)

In [67]:
# Null values
links_small['tmdbId'].isnull().sum()

0

In [68]:
# DataType of tmdbID entries 
type(links_small['tmdbId'][0])

numpy.int32

In [69]:
# Dataframe containing metadata with tmdbIds in 'links_small' 
links_small_new = metadata_full[metadata_full['id'].isin(links_small['tmdbId'])]
links_small_new.shape

(9219, 28)

In [70]:
links_small_new.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count', 'cast', 'crew', 'keywords', 'director'],
      dtype='object')

In [71]:
# Null values in tagline = 2137
print(links_small_new['tagline'].isnull().sum())

# Null values in tagline = 12
print(links_small_new['overview'].isnull().sum())

2137
12


**Note:** Since there are null values in 'tagline' and 'overview', we cannot simply join them together to create a new columns ('description'). 

**Solution:** Strip off the white spaces

In [72]:
# Strip off white spaces from 'tagline'
links_small_new['tagline'] = links_small_new['tagline'].fillna('')

# Create new column 'description' = 'overview' + 'tagline'
links_small_new['description'] = links_small_new['overview'] + links_small_new['tagline']

# Strip off white spaces from 'description', if any
links_small_new['description'] = links_small_new['description'].fillna('')

**Note:** So far, `links_small_new` has cast, crew, credits and genres. But we do not need all the data in them. To efficiently use them, I clean each column further.

In [73]:
# Creating new features 'cast_size' and 'crew size'
links_small_new['cast_size'] = links_small_new['cast'].apply(lambda x: len(x))
links_small_new['crew_size'] = links_small_new['crew'].apply(lambda x: len(x))

In [74]:
# Cast of a movie
links_small_new['cast'][0]

['Tom Hanks',
 'Tim Allen',
 'Don Rickles',
 'Jim Varney',
 'Wallace Shawn',
 'John Ratzenberger',
 'Annie Potts',
 'John Morris',
 'Erik von Detten',
 'Laurie Metcalf',
 'R. Lee Ermey',
 'Sarah Freeman',
 'Penn Jillette']

**Note:** Cast can include actors and actress that are both famous and infamous. However, famous artists are most likely to play a significant role in affecting the users opinion than others. 

**Solution:** Select 4 artists [lead actor 1, lead actor 2, supporting actor 1, supporting actor 2] rather than considering all. 

In [75]:
# Selecting top 4 artists
links_small_new['cast'] = links_small_new['cast'].apply(lambda x: x[:4] if len(x) >= 4 else x)

In [76]:
links_small_new['cast'][1]

['Robin Williams', 'Jonathan Hyde', 'Kirsten Dunst', 'Bradley Pierce']

These are steps I follow in the preparation of genres and credits data:
1. **Strip Spaces and Convert to Lowercase** from all our features. This way, engine will not confuse between **Johnny Depp** and **Johnny Galecki.** 
2. **Mention Director 2 times** to give it more weight relative to the entire cast.

In [77]:
# Strip spaces from 'cast' and convert to lowercase
links_small_new['cast'] = links_small_new['cast'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

In [78]:
# Strip spaces from 'director'
links_small_new['director'] = links_small_new['director'].astype('str').apply(lambda x: str.lower(x.replace(" ", "")))

# Adding weight to 'director'
links_small_new['director'] = links_small_new['director'].apply(lambda x: [x,x])

#### Keywords

We will do a small amount of pre-processing of our keywords before putting them to any use. As a first step, we calculate the frequenct counts of every keyword that appears in the dataset.

In [79]:
links_small_new['keywords'][:3]

0    [jealousy, toy, boy, friendship, friends, riva...
1    [board game, disappearance, based on children'...
2    [fishing, best friend, duringcreditsstinger, o...
Name: keywords, dtype: object

In [82]:
# Stacking all words from 'keywords'
w = links_small_new.apply(lambda x: pd.Series(x['keywords']), axis = 1).stack().reset_index(level = 1, drop = True)
w.name = 'keyword'

In [83]:
# Value counts
w = w.value_counts()
w[:5]

independent film        610
woman director          550
murder                  399
duringcreditsstinger    327
based on novel          318
Name: keyword, dtype: int64

**Note:** Keywords occur in frequencies ranging from 1 to 610. We do not have any use for keywords that occur only once. <br>
**Interpretation:** Keywords that occur just once.

In [84]:
w = w[w > 1]

**Stemming:** Play, Played, Playing words can be stemmed to *stem*.

Convert every word to its stem.

In [85]:
# Initialize stemmer object
stemmer = SnowballStemmer('english')

In [86]:
# Function to filter keywords
def filter_keywords(x):
    words = []
    for i in x:                     # For each word in input
        if i in w:                  # If that word in keywords
            words.append(i)         # append i to words 
    return words

In [87]:
# Apply filter_keywords to 'keywords'
links_small_new['keywords'] = links_small_new['keywords'].apply(filter_keywords)

# Stem keywords
links_small_new['keywords'] = links_small_new['keywords'].apply(lambda x: [stemmer.stem(i) for i in x])

# Convert string to lower case and strip spaces
links_small_new['keywords'] = links_small_new['keywords'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

In [88]:
links_small_new['keywords'][1]

['boardgam',
 'disappear',
 "basedonchildren'sbook",
 'newhom',
 'reclus',
 'giantinsect']

**Soup:** Soup is the metadata of `genres`, `director`, `cast` and `keywords`. 

In [95]:
links_small_new['genres'] = links_small_new['genres'].apply(dict_to_list)

In [96]:
# Soup = 'keywords' + 'cast' + 'director' + 'genres'
links_small_new['soup'] = links_small_new['keywords'] + links_small_new['cast'] + links_small_new['director'] + links_small_new['genres']

In [97]:
links_small_new['soup'][1]

['boardgam',
 'disappear',
 "basedonchildren'sbook",
 'newhom',
 'reclus',
 'giantinsect',
 'robinwilliams',
 'jonathanhyde',
 'kirstendunst',
 'bradleypierce',
 'joejohnston',
 'joejohnston',
 'Adventure',
 'Fantasy',
 'Family']

In [98]:
# Remove quotations ('') and commas (,) from soup 
links_small_new['soup'] = links_small_new['soup'].apply(lambda x: ' '.join(x))

In [99]:
links_small_new['soup'][1]

"boardgam disappear basedonchildren'sbook newhom reclus giantinsect robinwilliams jonathanhyde kirstendunst bradleypierce joejohnston joejohnston Adventure Fantasy Family"

**Count Vectorizer** Create a count matrix and calculate the cosine similarities to find movies that are most similar.

In [100]:
# Count Vectorizer
count = CountVectorizer(analyzer = 'word', ngram_range = (1, 2), min_df = 0, stop_words = 'english')

# Build a count matrix by fitting and transforming 'soup'
count_matrix = count.fit_transform(links_small_new['soup'])

In [101]:
# Calculating cosine similarity of count matrix
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [102]:
# Reset Index
links_small_new = links_small_new.reset_index()

# Titles
titles = links_small_new['title']

# Indices
indices = pd.Series(links_small_new.index, index = links_small_new['title'])

We will reuse the get_recommendations function that we had written earlier. Since our cosine similarity scores have changed, we expect it to give us different (and probably better) results. Let us check for **The Dark Knight** again and see what recommendations I get this time around.

In [103]:
# Recommendation Engine 
def recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:10]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

In [104]:
recommendations('The Dark Knight').head(10)

8031         The Dark Knight Rises
6218                 Batman Begins
7659    Batman: Under the Red Hood
6623                  The Prestige
1134                Batman Returns
5943                      Thursday
8927       Kidnapping Mr. Heineken
1260                Batman & Robin
2085                     Following
Name: title, dtype: object

In [105]:
recommendations('Pulp Fiction').head(10)

1381         Jackie Brown
8905    The Hateful Eight
5200    Kill Bill: Vol. 2
4595                Basic
4764             S.W.A.T.
898        Reservoir Dogs
6939              Cleaner
4903    Kill Bill: Vol. 1
231         Kiss of Death
Name: title, dtype: object

**Limitation:** This recommendation system returns only the movies based on soup. It does not consider `popularity`.

**Solution:** We use the results returned from our Count Vectorizer (indices) and return the movies that are popular based on the IMDB's weighted average. Additionally, I use three different criteria to cut-off the movies (75% percentile, Mean and No Cut-Off criteria)

**Weighted Average**

**IMDB's *weighted rating* formula:**

[Weighted Rating](https://math.stackexchange.com/questions/169032/understanding-the-imdb-weighted-rating-function-for-usage-on-my-own-website) (WR) = $(\frac{v}{v + m} . R) + (\frac{m}{v + m} . C)$

where,
* *v* - number of ratings for the movie
* *m* - number of ratings needed to qualify (usually mean)
* *R* - average rating of the movie
* *C* - mean rating of the population (whole dataset)

Before we could use the above weighted average formula, `m` and `C` should be determined.

In [106]:
# Claculation of c 
vote_counts = metadata[metadata['vote_count'].notnull()]['vote_count'].astype('int')
vote_averages = metadata[metadata['vote_average'].notnull()]['vote_average'].astype('int')
C = vote_averages.mean()
C

5.244896612406511

In [107]:
## Claculation of m
m = vote_counts.quantile(0.95)
m

434.0

In [108]:
# Function to calculate 'weighted_rating'
def weighted_rating(x):
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+m) * R) + (m/(m+v) * C)

I try three different cutoff criteria: 
1. 95th percentile
2. Mean
3. No cut-off

In [None]:
# Apply weighted rating method to qualified_perc, qualified_mean, new_qualified, sm_df, metadata
for df in [qualified_perc, qualified_mean, new_qualified, sm_df, metadata, metadata_full]:
    df['weighted_rating'] = df.apply(weighted_rating, axis=1)

In [None]:
# Columns for qualified movies
col_list = ['title', 'release_date', 'vote_count', 'vote_average', 'popularity', 'genres']

# qualification criteria
qualified_perc = metadata[(metadata_full['vote_count'] >= m) 
                 & (metadata_full['vote_count'].notnull()) 
                 & (metadata_full['vote_average'].notnull())][col_list]

# converting vote_count and vote_average colums to integer
qualified_perc['vote_count'] = qualified['vote_count'].astype('int')
qualified_perc['vote_average'] = qualified['vote_average'].astype('int')

qualified_perc.shape

#### Getting qualified movies (cutoff: 95%)

In [111]:
# Better recommendation engine
def better_recommendations_percentile_popularity(title):
    idx = indices[title]                                    # Considers indices of the previous recommedation system
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:51]
    movie_indices = [i[0] for i in sim_scores]
    
    improved_movies = links_small_new.iloc[movie_indices][['title', 'vote_count', 'vote_average']]
    vote_counts = improved_movies[improved_movies['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = improved_movies[improved_movies['vote_average'].notnull()]['vote_average'].astype('int')
    
    C = vote_averages.mean()
    m = vote_counts.quantile(0.75)
    
    qualified = improved_movies[(improved_movies['vote_count'] >= m) & (improved_movies['vote_count'].notnull()) & (improved_movies['vote_average'].notnull())]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    qualified['wr'] = qualified.apply(weighted_rating, axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(10)
    return qualified

In [112]:
# Better recommendations
better_recommendations_percentile_popularity('The Dark Knight')

Unnamed: 0,title,vote_count,vote_average,wr
7648,Inception,14075,8,7.917588
6623,The Prestige,4510,8,7.758148
8031,The Dark Knight Rises,9263,7,6.921448
6218,Batman Begins,7511,7,6.904127
7583,Kick-Ass,4747,7,6.852979
1134,Batman Returns,1706,6,5.846862
4145,Insomnia,1181,6,5.797081
8970,Hitman: Agent 47,1183,5,5.06573
132,Batman Forever,1529,5,5.054144
9162,London Has Fallen,1656,5,5.050854


#### Getting qualified movies (cutoff: mean)

In [113]:
vote_counts.mean()

109.89733831940167

In [114]:
# Better recommendation engine
def better_recommendations_mean_popularity(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:51]
    movie_indices = [i[0] for i in sim_scores]
    
    improved_movies = links_small_new.iloc[movie_indices][['title', 'vote_count', 'vote_average']]
    vote_counts = improved_movies[improved_movies['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = improved_movies[improved_movies['vote_average'].notnull()]['vote_average'].astype('int')
    
    C = vote_averages.mean()
    m = vote_counts.mean()
    
    qualified = improved_movies[(improved_movies['vote_count'] >= m) & (improved_movies['vote_count'].notnull()) & (improved_movies['vote_average'].notnull())]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    qualified['wr'] = qualified.apply(weighted_rating, axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(10)
    return qualified

In [115]:
# Better recommendations
better_recommendations_mean_popularity('The Dark Knight')

Unnamed: 0,title,vote_count,vote_average,wr
7648,Inception,14075,8,7.917588
6623,The Prestige,4510,8,7.758148
8031,The Dark Knight Rises,9263,7,6.921448
6218,Batman Begins,7511,7,6.904127
7583,Kick-Ass,4747,7,6.852979
1134,Batman Returns,1706,6,5.846862
132,Batman Forever,1529,5,5.054144
9162,London Has Fallen,1656,5,5.050854
9163,London Has Fallen,1656,5,5.050854
9024,Batman v Superman: Dawn of Justice,7189,5,5.013943


#### Getting qualified movies (cutoff: none)

In [116]:
# Better recommendation engine
def better_recommendations_no_cutoff(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:51]
    movie_indices = [i[0] for i in sim_scores]
    
    improved_movies = links_small_new.iloc[movie_indices][['title', 'vote_count', 'vote_average']]
    vote_counts = improved_movies[improved_movies['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = improved_movies[improved_movies['vote_average'].notnull()]['vote_average'].astype('int')
    
    C = vote_averages.mean()
    m = vote_counts.quantile(0.60)
    
    qualified = improved_movies[(improved_movies['vote_count'].notnull()) & (improved_movies['vote_average'].notnull())]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    qualified['wr'] = qualified.apply(weighted_rating, axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(10)
    return qualified

In [117]:
# Better recommendations
better_recommendations_no_cutoff('The Dark Knight')

Unnamed: 0,title,vote_count,vote_average,wr
7648,Inception,14075,8,7.917588
6623,The Prestige,4510,8,7.758148
8031,The Dark Knight Rises,9263,7,6.921448
6218,Batman Begins,7511,7,6.904127
7583,Kick-Ass,4747,7,6.852979
7659,Batman: Under the Red Hood,459,7,6.147016
2085,Following,363,7,6.044272
8001,Batman: Year One,255,7,5.894463
2952,Magnum Force,251,7,5.888007
1134,Batman Returns,1706,6,5.846862
