# <center> Recommendation System on Movie Lens Dataset

## 2. Content Based Recommendation System

1. Description based
2. Metadata based

In [8]:
# Necessary imports

%matplotlib inline
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings; warnings.simplefilter('ignore')

from scipy import stats
from sklearn.cross_validation import train_test_split
from sklearn.metrics.pairwise import pairwise_distances
from sklearn import metrics

from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from subprocess import check_output


from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from math import sqrt
from ast import literal_eval

**(i) Removing 'NaN' values and Converting tmdbId to numeric**

In [2]:
# Sample version of the full dataset
links_small = pd.read_csv('data_full/links_small.csv')

In [3]:
links_small.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [4]:
# Null values in tmdbId
links_small.tmdbId.isnull().sum()

13

In [5]:
# Removing null records in 'tmdbid'
links_small = links_small[links_small['tmdbId'].notnull()]

In [6]:
# Converting float tmdbid to int 
links_small['tmdbId'] = links_small['tmdbId'].astype(int)

**(ii) Converting genre dictionary to list**

In [9]:
# Metadata dataset [contains genre and movie description details]
metadata = pd.read_csv('data_full/movies_metadata.csv')

In [10]:
metadata.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0


In [11]:
# Genres are saved in the form of a dictionary
metadata.genres[0]

"[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]"

In [12]:
# Using literval_eval
literal_eval(metadata.genres[0])

[{'id': 16, 'name': 'Animation'},
 {'id': 35, 'name': 'Comedy'},
 {'id': 10751, 'name': 'Family'}]

In [13]:
# Converting genre dictionary to list
def dict_to_list(x):
    ls = []
    for i in literal_eval(x):
        ls.append(i['name'])
    return ls

In [14]:
# Cleaning 'genres' column
metadata['genres'] = metadata['genres'].apply(dict_to_list)

In [15]:
metadata.genres.head(2)

0     [Animation, Comedy, Family]
1    [Adventure, Fantasy, Family]
Name: genres, dtype: object

In [16]:
# Format 'title' i.e. remove 'year' from title
metadata['release_date'] = metadata['release_date'].astype(str).str[:4]

In [17]:
metadata['release_date'].value_counts().head()

2014    1974
2015    1905
2013    1889
2012    1722
2011    1667
Name: release_date, dtype: int64

**(iii) Type-Casting**

In [18]:
# Ids are of type string
type(metadata.id[0])

str

In [19]:
# Converting string to int
def str_to_int(x):
    try:
        return int(x)
    except:
        return np.nan

In [20]:
# Applying str_to_int()
metadata['id'] = metadata['id'].apply(str_to_int)

In [21]:
# Checking for null values
metadata[metadata['id'].isnull()]['id']

19730   NaN
29503   NaN
35587   NaN
Name: id, dtype: float64

In [22]:
# Index of null values
list(metadata[metadata['id'].isnull()]['id'].index)

[19730, 29503, 35587]

In [23]:
# Dropping null values using index
metadata = metadata.drop(list(metadata[metadata['id'].isnull()]['id'].index))

In [24]:
# Converting string to int
metadata['id'] = metadata['id'].astype('int')

### 2.1 Description Based Recommendation

We use three columns for our description based recommendation:
- overview
- tagline
- description (overview + tagline)

**Reading the movielens dataset**

In [27]:
# Columns for sample dataset 
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
m_cols = ['movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url','unknown', 'Action', 'Adventure',
'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy','Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

In [28]:
# Users, Ratings and Movies datasets
users_1 = pd.read_csv("data/u.user", sep = '|', names = u_cols)
ratings = pd.read_csv('data/u.data',sep = '\t', names = r_cols)
movies = pd.read_csv('data/u.item', sep = '|', names = m_cols, encoding = 'latin-1')

In [30]:
# Users, Ratings and Movies merged into Movielens dataset
zz = pd.merge(users_1 , ratings)
zz = pd.merge(zz, movies)
zz.head(3)

Unnamed: 0,user_id,age,sex,occupation,zip_code,movie_id,rating,timestamp,title,release_date,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,24,M,technician,85711,61,4,878542420,Three Colors: White (1994),01-Jan-1994,...,0,0,0,0,0,0,0,0,0,0
1,13,47,M,educator,29206,61,4,882140552,Three Colors: White (1994),01-Jan-1994,...,0,0,0,0,0,0,0,0,0,0
2,18,35,F,other,37212,61,4,880130803,Three Colors: White (1994),01-Jan-1994,...,0,0,0,0,0,0,0,0,0,0


In [38]:
# Format 'title' i.e. remove 'year' from title
zz['title'] = zz['title'].astype(str).str[:-7]

In [39]:
# Recommendation Engine 
def recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:10]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

**(i) metadata['overview']**

In [40]:
zz_metadata = metadata[metadata['id'].isin(zz['movie_id'])]

In [41]:
# tf-idf vectorizer
tf = TfidfVectorizer(analyzer = 'word', ngram_range = (1, 2), min_df = 0, stop_words = 'english') 
tfidf_matrix = tf.fit_transform(zz_metadata['overview'])                               # Fit Transform 'overview'
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)                                 # Cosine Similarity of td-idf matrix

zz_metadata_1 = zz_metadata.reset_index()                                              # Reset Index
titles = zz_metadata_1['title']                                                        # Titles
indices = pd.Series(zz_metadata_1.index, index = zz_metadata_1['title'])               # Indices

In [42]:
# Recommendation Engine 
def recommendations_overview(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:10]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

In [43]:
recommendations_overview('The Dark Knight')

21                   Batman Forever
233                  Batman Returns
71                           Batman
427                             JFK
843                   Batman Begins
248                  Batman & Robin
324                  A Few Good Men
435    Teenage Mutant Ninja Turtles
261             Tomorrow Never Dies
Name: title, dtype: object

**Interpretation:** This model provides robust recommendations using metadata['overview']. 

**Limitation:** But there are few not-so meaningful recommendations. Example: (Teenage Mutant Ninja Turtles, Tomorrow Never Dies)

**(ii) metadata['tagline']**

In [45]:
zz_metadata['tagline'].isnull().sum()

244

In [46]:
# Dropping null values using index
zz_metadata = zz_metadata.drop(list(zz_metadata[zz_metadata['tagline'].isnull()]['id'].index))

In [47]:
# tf-idf vectorizer
tf = TfidfVectorizer(analyzer = 'word', ngram_range = (1, 2), min_df = 0, stop_words = 'english') 
tfidf_matrix = tf.fit_transform(zz_metadata['tagline'])                              # Fit Transform 'overview'
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)                               # Cosine Similarity of td-idf matrix
    
zz_metadata_1 = zz_metadata.reset_index()                                            # Reset Index
titles = zz_metadata_1['title']                                                      # Titles
indices = pd.Series(zz_metadata_1.index, index=zz_metadata_1['title'])               # Indices

In [48]:
# Recommendation Engine 
def recommendations_tagline(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:10]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

In [49]:
recommendations_tagline('The Dark Knight')

1                    GoldenEye
2             Cutthroat Island
3                       Casino
4                   Four Rooms
5            Leaving Las Vegas
6    The City of Lost Children
7               Twelve Monkeys
8                   To Die For
9                        Se7en
Name: title, dtype: object

**Interpretation:** The model built with respect to 'tagline' is not as robust as the previous model. It is apparent that the first model (using metadata['overview']) provides highly similar movies than the model using 'taglines'. 

**(iii) metadata['description'] = metadata['overview'] + metadata['tagline']**

In [50]:
# Filling nans with empty strings
zz_metadata['tagline'] = zz_metadata['tagline'].fillna('')

# Create a new column 'description' = 'overview' + 'tagline'
zz_metadata['description'] = zz_metadata['overview'] + zz_metadata['tagline']

# Filling nans with empty strings 
zz_metadata['description'] = zz_metadata['description'].fillna('')

In [51]:
# tf-idf vectorizer
tf = TfidfVectorizer(analyzer = 'word', ngram_range = (1, 2), min_df = 0, stop_words = 'english')  
tfidf_matrix = tf.fit_transform(zz_metadata['description'])                                # Fit Transform 'overview'
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)                                     # Cosine Similarity of td-idf matrix
    
zz_metadata_1 = zz_metadata.reset_index()                                                  # Reset Index
titles = zz_metadata_1['title']                                                            # Titles
indices = pd.Series(zz_metadata_1.index, index=zz_metadata_1['title'])                     # Indices

In [52]:
# Recommendation Engine
def recommendations_description(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:10]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

In [53]:
recommendations_description('The Dark Knight')

19                   Batman Forever
210                  Batman Returns
61                           Batman
392                             JFK
697                   Batman Begins
223                  Batman & Robin
399    Teenage Mutant Ninja Turtles
236             Tomorrow Never Dies
506                         48 Hrs.
Name: title, dtype: object

**Interpretation:** This model provides similar recommendations to that of the initial model (using metadata['overview']). We can infer that 'tagline' is not the best feature to consider to build a recommendation system.