# <center> Recommendation System on Movie Lens Dataset

In [1]:
# Necessary imports

%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings; warnings.simplefilter('ignore')

from sklearn.cross_validation import train_test_split
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from math import sqrt
from ast import literal_eval



In [2]:
# Set height, width, maximum rows and columns
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

**Note:** In this project, due to limited computational resources, I use the full Movielens dataset (4 Million records), extract the metadata and keywords, merge with the smaller Movielens dataset (100k records) to build a recommendation system.

### (i) Reading the full dataset

In [2]:
# Credits dataset [contains cast and crew details] 
credits = pd.read_csv('data_full/credits.csv')

# Keywords dataset [contains important keywords of the movie] 
keywords = pd.read_csv('data_full/keywords.csv')

In [82]:
# Links dataset [contains ids such as moviedId, imdbId, tmdbId]
links = pd.read_csv('data_full/links.csv')

# Sample version of the full dataset
links_small = pd.read_csv('data_full/links_small.csv')

In [344]:
links_small.head(1)

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862


In [243]:
# Metadata dataset [contains genre and movie description details]
metadata = pd.read_csv('data_full/movies_metadata.csv')

In [None]:
# Ratings dataset [contains ratings of each user]
new_ratings = pd.read_csv('data_full/ratings.csv')
new_ratings_small = pd.read_csv('data_full/ratings_small.csv')

### (ii) Reading the sample dataset

In [110]:
# Columns for sample dataset 
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
m_cols = ['movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url','unknown', 'Action', 'Adventure',
'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy','Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

In [None]:
# Users, Ratings and Movies datasets
users_1 = pd.read_csv("data/u.user", sep = '|', names = u_cols)
ratings = pd.read_csv('data/u.data',sep = '\t', names = r_cols)
movies = pd.read_csv('data/u.item', sep = '|', names = m_cols, encoding = 'latin-1')

In [111]:
# Users, Ratings and Movies merged into Movielens dataset
movielens = pd.merge(users_1 , ratings)
movielens = pd.merge(movielens, movies)
movielens.head(3)

Unnamed: 0,user_id,age,sex,occupation,zip_code,movie_id,rating,timestamp,title,release_date,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,24,M,technician,85711,61,4,878542420,Three Colors: White (1994),01-Jan-1994,...,0,0,0,0,0,0,0,0,0,0
1,13,47,M,educator,29206,61,4,882140552,Three Colors: White (1994),01-Jan-1994,...,0,0,0,0,0,0,0,0,0,0
2,18,35,F,other,37212,61,4,880130803,Three Colors: White (1994),01-Jan-1994,...,0,0,0,0,0,0,0,0,0,0


In [112]:
# 'zz' is a copy of Movielens data (sample) to build a basic recommendation system
zz = movielens.copy(deep = True)

In [113]:
# Dropping unimportant columns
zz.drop(['sex', 'zip_code', 'timestamp', 'video_release_date', 'imdb_url'], axis = 1, inplace = True)

In [114]:
zz.head(1)

Unnamed: 0,user_id,age,occupation,movie_id,rating,title,release_date,unknown,Action,Adventure,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,24,technician,61,4,Three Colors: White (1994),01-Jan-1994,0,0,0,...,0,0,0,0,0,0,0,0,0,0


**Note:** Genre columns is dummied already.

### (iii) Transformations

**Transformation 1:** Format `title`

In [115]:
# Format 'title' i.e. remove 'year' from title
zz['title'] = zz['title'].astype(str).str[:-7]

**Transformation 2:** Categorize `rating`

In [116]:
# Categorize 'rating'
zz['rating_cat'] = zz['rating']

In [117]:
# Function to categorize 'rating'
def transformation_1(df):
    df['rating_cat'].replace([1, 2, 3, 4, 5],
                      ['below_avg', 'below_avg', 'avg', 'above_avg', 'above_avg'], 
                      inplace = True)

In [118]:
# Apply transformation_1
transformation_1(zz)

In [119]:
# Updated column
zz.rating_cat.value_counts()

above_avg    55375
avg          27145
below_avg    17480
Name: rating_cat, dtype: int64

**Transformation 3:** Categorize `occupation`

In [120]:
# Categorize 'occupation'
zz['occupation_cat'] = zz['occupation']

In [121]:
# Function to categorize 'occupation'
def transformation_3(df):
    df['occupation_cat'].replace(['student', 'other', 'educator', 'engineer', 'programmer', 'administrator', 'writer', 'librarian', 'technician', 'executive', 'healthcare', 'artist', 'entertainment', 'scientist', 'marketing', 'retired', 'lawyer', 'none', 'salesman', 'doctor', 'homemaker'],
                         ['category_1', 'category_2', 'category_2', 'category_2', 'category_2', 'category_2', 'category_3', 'category_3', 'category_4', 'category_4', 'category_4', 'category_4', 'category_4', 'category_4', 'category_5', 'category_5', 'category_5', 'category_5', 'category_5', 'category_5', 'category_5'], 
                      inplace = True)

In [122]:
# Apply transformation_3
transformation_3(zz)

In [123]:
# Updated column
zz.occupation_cat.value_counts()

category_2    43560
category_1    21957
category_4    16174
category_3    10809
category_5     7500
Name: occupation_cat, dtype: int64

## 1. Simple Recommendation System (Popularity based - Ratings)

In [14]:
# Ratings dataset
ratings = ratings_small.copy(deep = True)

In [26]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0


In [27]:
# Ratings matrix with movie_id as columns and user_id as rows and ratings as values
ratings_matrix = ratings.pivot_table(index = ['movie_id'], columns = ['user_id'], values = 'rating').reset_index(drop = True)

# Fill nans with 0
ratings_matrix.fillna(0, inplace = True)
ratings_matrix.head()

user_id,1,2,3,4,5,6,7,8,9,10,...,662,663,664,665,666,667,668,669,670,671
0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,4.0,0.0,...,0.0,4.0,3.5,0.0,0.0,0.0,0.0,0.0,4.0,5.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
# Cosine similarity between the movies
cosine_movie_similarity = 1 - pairwise_distances(ratings_matrix.as_matrix(), metric = "cosine")

# Filling diagonals with 0s [Helps in sorting the movies based on similarity]
np.fill_diagonal(cosine_movie_similarity, 0) 

In [31]:
# Cosine similarity matrix 
cosine_similarity_matrix = pd.DataFrame(cosine_movie_similarity)
cosine_similarity_matrix.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9056,9057,9058,9059,9060,9061,9062,9063,9064,9065
0,0.0,0.394511,0.306516,0.133614,0.245102,0.377086,0.278629,0.063031,0.117499,0.310689,...,0.055829,0.031902,0.079755,0.079755,0.079755,0.079755,0.079755,0.0,0.0,0.055829
1,0.394511,0.0,0.217492,0.164651,0.278476,0.222003,0.207299,0.223524,0.113669,0.418124,...,0.0,0.055038,0.068797,0.082557,0.082557,0.137594,0.068797,0.0,0.0,0.0
2,0.306516,0.217492,0.0,0.177012,0.370732,0.247499,0.435648,0.127574,0.306717,0.191255,...,0.0,0.0,0.0,0.116226,0.116226,0.0,0.0,0.0,0.0,0.0
3,0.133614,0.164651,0.177012,0.0,0.179556,0.072518,0.184626,0.501513,0.25463,0.111447,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.245102,0.278476,0.370732,0.179556,0.0,0.272645,0.388476,0.194113,0.367941,0.246846,...,0.0,0.176845,0.0,0.117897,0.117897,0.0,0.0,0.0,0.0,0.0


In [38]:
# Reading movies dataset
m_cols = ['movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url','unknown', 'Action', 'Adventure',
'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy','Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

movies = pd.read_csv('data/u.item', sep='|', names=m_cols, encoding='latin-1')

In [40]:
# Format 'title' i.e. remove 'year' from title
movies['title'] = movies['title'].astype(str).str[:-7]

In [69]:
# Popularity based Recommendation System
def pop_rec_system(user_input):
    
    # If movie is already in the existing dataframe
    if (any(movies.title == user_input)):
        
        # Index of the user imput (movie)
        inp = movies[movies['title']==user_inp].index.tolist()
        inp = inp[0]

        # similar Movies [dataframe with id, title]
        similar_movies = movies[['movie_id', 'title']]
        # 'similarity' column contains cosine values of each movie with user input
        similar_movies['similarity'] = cosine_similarity_matrix.iloc[inp]
        # rename columns
        similar_movies.columns = ['movie_id', 'title', 'similarity']

        # Reccommended Movies 
        print("Reccommended movies")
        print("------------------")
        print(similar_movies.sort_values( ["similarity"], ascending = False )[1:10])
        
    # If movie is not in existing dataframe
    else:
        print("Movie doesn't exist in the database")

In [339]:
# Cosine Similarity Matrix
cosine_similarity_matrix[:4]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9056,9057,9058,9059,9060,9061,9062,9063,9064,9065
0,0.0,0.394511,0.306516,0.133614,0.245102,0.377086,0.278629,0.063031,0.117499,0.310689,...,0.055829,0.031902,0.079755,0.079755,0.079755,0.079755,0.079755,0.0,0.0,0.055829
1,0.394511,0.0,0.217492,0.164651,0.278476,0.222003,0.207299,0.223524,0.113669,0.418124,...,0.0,0.055038,0.068797,0.082557,0.082557,0.137594,0.068797,0.0,0.0,0.0
2,0.306516,0.217492,0.0,0.177012,0.370732,0.247499,0.435648,0.127574,0.306717,0.191255,...,0.0,0.0,0.0,0.116226,0.116226,0.0,0.0,0.0,0.0,0.0
3,0.133614,0.164651,0.177012,0.0,0.179556,0.072518,0.184626,0.501513,0.25463,0.111447,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [76]:
# Reccommended movies for 'Golden Eye'
pop_rec_system('GoldenEye')

Recommended movies
------------------
      movie_id                   title  similarity
321        322          Murder at 1600    0.564534
644        645        Paris Is Burning    0.562946
1019      1020                Gaslight    0.548023
1024      1025         Fire Down Below    0.536700
427        428        Harold and Maude    0.535197
966        967  Little Lord Fauntleroy    0.529334
266        267                            0.526862
1253      1254            Gone Fishin'    0.521122
284        285          Secrets & Lies    0.518560


**Limitation:** The recommendation system suggests movies only based on the popularity and IRRSPECTIVE OF USER PREFERENCES. 

## 2. Content Based Recommendation System

1. Description based
2. Metadata based

**(i) Removing 'NaN' values and Converting tmdbId to numeric**

In [83]:
links_small.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [84]:
# Null values in tmdbId
links_small.tmdbId.isnull().sum()

13

In [85]:
# Removing null records in 'tmdbid'
links_small = links_small[links_small['tmdbId'].notnull()]

In [86]:
# Converting float tmdbid to int 
links_small['tmdbId'] = links_small['tmdbId'].astype(int)

**(ii) Converting genre dictionary to list**

In [89]:
metadata.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0


In [92]:
# Genres are saved in the form of a dictionary
metadata.genres[0]

"[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]"

In [93]:
# Using literval_eval
literal_eval(metadata.genres[0])

[{'id': 16, 'name': 'Animation'},
 {'id': 35, 'name': 'Comedy'},
 {'id': 10751, 'name': 'Family'}]

In [244]:
# Converting genre dictionary to list
def dict_to_list(x):
    ls = []
    for i in literal_eval(x):
        ls.append(i['name'])
    return ls

In [245]:
# Cleaning 'genres' column
metadata['genres'] = metadata['genres'].apply(dict_to_list)

In [246]:
metadata.genres.head(2)

0     [Animation, Comedy, Family]
1    [Adventure, Fantasy, Family]
Name: genres, dtype: object

In [247]:
# Format 'title' i.e. remove 'year' from title
metadata['release_date'] = metadata['release_date'].astype(str).str[:4]

In [248]:
metadata['release_date'].value_counts().head()

2014    1974
2015    1905
2013    1889
2012    1722
2011    1667
Name: release_date, dtype: int64

In [99]:
# Ids are of type string
type(metadata.id[0])

str

In [249]:
# Converting string to int
def str_to_int(x):
    try:
        return int(x)
    except:
        return np.nan

In [250]:
# Applying str_to_int()
metadata['id'] = metadata['id'].apply(str_to_int)

In [251]:
# Checking for null values
metadata[metadata['id'].isnull()]['id']

19730   NaN
29503   NaN
35587   NaN
Name: id, dtype: float64

In [252]:
# Index of null values
list(metadata[metadata['id'].isnull()]['id'].index)

[19730, 29503, 35587]

In [253]:
# Dropping null values using index
metadata = metadata.drop(list(metadata[metadata['id'].isnull()]['id'].index))

In [254]:
# Converting string to int
metadata['id'] = metadata['id'].astype('int')

### 2.1 Description Based Recommendation

We use three columns for our description based recommendation:
- overview
- tagline
- description (overview + tagline)

In [493]:
# Recommendation Engine 
def recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:10]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

**(i) metadata['overview']**

In [124]:
zz_metadata = metadata[metadata['id'].isin(zz['movie_id'])]

dfs = [zz_metadata['overview'], zz_metadata['tagline'], zz_metadata['description']]

In [390]:
# tf-idf vectorizer
tf = TfidfVectorizer(analyzer = 'word', ngram_range = (1, 2), min_df = 0, stop_words = 'english') 
tfidf_matrix = tf.fit_transform(zz_metadata['overview'])                               # Fit Transform 'overview'
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)                                 # Cosine Similarity of td-idf matrix

zz_metadata_1 = zz_metadata.reset_index()                                              # Reset Index
titles = zz_metadata_1['title']                                                        # Titles
indices = pd.Series(zz_metadata_1.index, index = zz_metadata_1['title'])               # Indices

In [391]:
# Recommendation Engine 
def recommendations_overview(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:10]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

In [392]:
recommendations_overview('The Dark Knight')

21                   Batman Forever
233                  Batman Returns
71                           Batman
427                             JFK
843                   Batman Begins
248                  Batman & Robin
324                  A Few Good Men
435    Teenage Mutant Ninja Turtles
261             Tomorrow Never Dies
Name: title, dtype: object

**Interpretation:** This model provides robust recommendations using metadata['overview']. However, there are some exceptions (possibly) with some of the recommendations provided by the model (Teenage Mutant Ninja Turtles is recommended for The Dark Knight) 

**(ii) metadata['tagline']**

In [393]:
# tf-idf vectorizer
tf = TfidfVectorizer(analyzer = 'word', ngram_range = (1, 2), min_df = 0, stop_words = 'english') 
tfidf_matrix = tf.fit_transform(zz_metadata['tagline'])                              # Fit Transform 'overview'
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)                               # Cosine Similarity of td-idf matrix
    
zz_metadata_1 = zz_metadata.reset_index()                                            # Reset Index
titles = zz_metadata_1['title']                                                      # Titles
indices = pd.Series(zz_metadata_1.index, index=zz_metadata_1['title'])               # Indices

In [394]:
# Recommendation Engine 
def recommendations_tagline(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:10]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

In [395]:
recommendations_tagline('The Dark Knight')

1                         Heat
2                    GoldenEye
3             Cutthroat Island
4                       Casino
5                   Four Rooms
6            Leaving Las Vegas
7    The City of Lost Children
8               Twelve Monkeys
9             Dead Man Walking
Name: title, dtype: object

** Interpretation:** The model built with respect to 'tagline' is not as robust as the previous model. It is apparent that the first model (using metadata['overview']) provides highly similar movies than the model using 'taglines'. 

**(iii) metadata['description'] = metadata['overview'] + metadata['tagline']**

In [396]:
# Filling nans with empty strings
zz_metadata['tagline'] = zz_metadata['tagline'].fillna('')

# Create a new column 'description' = 'overview' + 'tagline'
zz_metadata['description'] = zz_metadata['overview'] + zz_metadata['tagline']

# Filling nans with empty strings 
zz_metadata['description'] = zz_metadata['description'].fillna('')

In [398]:
# tf-idf vectorizer
tf = TfidfVectorizer(analyzer = 'word', ngram_range = (1, 2), min_df = 0, stop_words = 'english')  
tfidf_matrix = tf.fit_transform(zz_metadata['description'])                                # Fit Transform 'overview'
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)                                     # Cosine Similarity of td-idf matrix
    
zz_metadata_1 = zz_metadata.reset_index()                                                  # Reset Index
titles = zz_metadata_1['title']                                                            # Titles
indices = pd.Series(zz_metadata_1.index, index=zz_metadata_1['title'])                     # Indices

In [399]:
# Recommendation Engine
def recommendations_description(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:10]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

In [400]:
recommendations_description('The Dark Knight')

21                   Batman Forever
233                  Batman Returns
71                           Batman
427                             JFK
843                   Batman Begins
248                  Batman & Robin
324                  A Few Good Men
435    Teenage Mutant Ninja Turtles
261             Tomorrow Never Dies
Name: title, dtype: object

**Interpretation:** This model provides similar recommendations to that of the initial model (using metadata['overview']). We can infer that 'tagline' is not the best feature to consider to build a recommendation system.

### 2.2 Metadata Based Recommendation

**(i) Read Data**

In [77]:
# Reading the full dataset
credits = pd.read_csv('data_full/credits.csv')
keywords = pd.read_csv('data_full/keywords.csv')

In [78]:
credits.head(2)

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844


In [79]:
keywords.head(2)

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."


In [402]:
metadata.head(1)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,weighted_rating
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[Animation, Comedy, Family]",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,7.51783


In [257]:
# Merging credits and keywords to metadata_full 
metadata_full = metadata.merge(credits, on = 'id')
metadata_full = metadata_full.merge(keywords, on = 'id')

In [258]:
metadata_full.shape

(46628, 27)

**(ii) Data Pre-Processing**

In [260]:
# Converting genre dictionary to list
def dict_to_list(x):
    ls = []
    for i in literal_eval(x):
        ls.append(i['name'])
    return ls

In [263]:
# Apply 'dict_to_list' method
for col in ['cast', 'crew', 'keywords']:
    metadata_full[col] = metadata_full[col].apply(dict_to_list)

In [261]:
# Extracting director from 'crew'
def director(x):
    for i in literal_eval(x):
        if i['job'] == 'Director':
            return i['name']
    return np.nan

In [262]:
# Apply 'director' method
metadata_full['director'] = metadata_full['crew'].apply(director)

In [446]:
# Null values
links_small['tmdbId'].isnull().sum()

0

In [439]:
# DataType of tmdbID entries 
type(links_small['tmdbId'][0])

numpy.int32

In [447]:
# Dataframe containing metadata with tmdbIds in 'links_small' 
links_small_new = metadata_full[metadata_full['id'].isin(links_small['tmdbId'])]
links_small_new.shape

(9219, 29)

In [448]:
links_small_new.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count', 'cast', 'crew', 'keywords', 'director',
       'weighted_rating'],
      dtype='object')

In [455]:
# Null values in tagline = 2137
print(links_small_new['tagline'].isnull().sum())

# Null values in tagline = 12
print(links_small_new['overview'].isnull().sum())

2137
12


**Note:** Since there are null values in 'tagline' and 'overview', we cannot simply join them together to create a new columns ('description'). 

**Solution:** Strip off the white spaces

In [449]:
# Strip off white spaces from 'tagline'
links_small_new['tagline'] = links_small_new['tagline'].fillna('')

# Create new column 'description' = 'overview' + 'tagline'
links_small_new['description'] = links_small_new['overview'] + links_small_new['tagline']

# Strip off white spaces from 'description', if any
links_small_new['description'] = links_small_new['description'].fillna('')

**Note:** So far, `links_small_new` has cast, crew, credits and genres. But we do not need all the data in them. To efficiently use them, I clean each column further.

In [279]:
# Creating new features 'cast_size' and 'crew size'
links_small_new['cast_size'] = links_small_new['cast'].apply(lambda x: len(x))
links_small_new['crew_size'] = links_small_new['crew'].apply(lambda x: len(x))

In [457]:
# Cast of a movie
links_small_new['cast'][0]

['Tom Hanks',
 'Tim Allen',
 'Don Rickles',
 'Jim Varney',
 'Wallace Shawn',
 'John Ratzenberger',
 'Annie Potts',
 'John Morris',
 'Erik von Detten',
 'Laurie Metcalf',
 'R. Lee Ermey',
 'Sarah Freeman',
 'Penn Jillette']

**Note:** Cast can include actors and actress that are both famous and infamous. However, famous artists are most likely to play a significant role in affecting the users opinion than others. 

**Solution:** Select 4 artists [lead actor 1, lead actor 2, supporting actor 1, supporting actor 2] rather than considering all. 

In [461]:
# Selecting top 4 artists
links_small_new['cast'] = links_small_new['cast'].apply(lambda x: x[:4] if len(x) >= 4 else x)

In [463]:
links_small_new['cast'][1]

['Robin Williams', 'Jonathan Hyde', 'Kirsten Dunst', 'Bradley Pierce']

These are steps I follow in the preparation of genres and credits data:
1. **Strip Spaces and Convert to Lowercase** from all our features. This way, engine will not confuse between **Johnny Depp** and **Johnny Galecki.** 
2. **Mention Director 2 times** to give it more weight relative to the entire cast.

In [477]:
# Strip spaces from 'cast' and convert to lowercase
links_small_new['cast'] = links_small_new['cast'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

In [478]:
# Strip spaces from 'director'
links_small_new['director'] = links_small_new['director'].astype('str').apply(lambda x: str.lower(x.replace(" ", "")))

# Adding weight to 'director'
links_small_new['director'] = links_small_new['director'].apply(lambda x: [x,x])

#### Keywords

We will do a small amount of pre-processing of our keywords before putting them to any use. As a first step, we calculate the frequenct counts of every keyword that appears in the dataset.

In [470]:
links_small_new['keywords'][:3]

0    [jealousy, toy, boy, friendship, friends, riva...
1    [board game, disappearance, based on children'...
2    [fishing, best friend, duringcreditsstinger, o...
Name: keywords, dtype: object

In [471]:
# Stacking all words from 'keywords'
s = links_small_new.apply(lambda x: pd.Series(x['keywords']), axis = 1).stack().reset_index(level = 1, drop = True)
s.name = 'keyword'

In [479]:
# Value counts
s = s.value_counts()
s[:5]

independent film        610
woman director          550
murder                  399
duringcreditsstinger    327
based on novel          318
Name: keyword, dtype: int64

**Note:** Keywords occur in frequencies ranging from 1 to 610. We do not have any use for keywords that occur only once. <br>
**Interpretation:** Keywords that occur just once.

In [480]:
s = s[s > 1]

**Stemming:** Play, Played, Playing words can be stemmed to *stem*.

Convert every word to its stem.

In [481]:
# Initialize stemmer object
stemmer = SnowballStemmer('english')

In [482]:
# Function to filter keywords
def filter_keywords(x):
    words = []
    for i in x:                     # For each word in input
        if i in s:                  # If that word in keywords
            words.append(i)         # append i to words 
    return words

In [484]:
# Apply filter_keywords to 'keywords'
links_small_new['keywords'] = links_small_new['keywords'].apply(filter_keywords)

# Stem keywords
links_small_new['keywords'] = links_small_new['keywords'].apply(lambda x: [stemmer.stem(i) for i in x])

# Convert string to lower case and strip spaces
links_small_new['keywords'] = links_small_new['keywords'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

In [487]:
links_small_new['keywords'][1]

['boardgam',
 'disappear',
 "basedonchildren'sbook",
 'newhom',
 'reclus',
 'giantinsect']

**Soup:** Soup is the metadata of `genres`, `director`, `cast` and `keywords`. 

In [488]:
# Soup = 'keywords' + 'cast' + 'director' + 'genres'
links_small_new['soup'] = links_small_new['keywords'] + links_small_new['cast'] + links_small_new['director'] + links_small_new['genres']

In [489]:
links_small_new['soup'][1]

['boardgam',
 'disappear',
 "basedonchildren'sbook",
 'newhom',
 'reclus',
 'giantinsect',
 'robinwilliams',
 'jonathanhyde',
 'kirstendunst',
 'bradleypierce',
 'joejohnston',
 'joejohnston',
 'Adventure',
 'Fantasy',
 'Family']

In [490]:
# Remove quotations ('') and commas (,) from soup 
links_small_new['soup'] = links_small_new['soup'].apply(lambda x: ' '.join(x))

In [491]:
links_small_new['soup'][1]

"boardgam disappear basedonchildren'sbook newhom reclus giantinsect robinwilliams jonathanhyde kirstendunst bradleypierce joejohnston joejohnston Adventure Fantasy Family"

**Count Vectorizer** Create a count matrix and calculate the cosine similarities to find movies that are most similar.

In [492]:
# Count Vectorizer
count = CountVectorizer(analyzer = 'word', ngram_range = (1, 2), min_df = 0, stop_words = 'english')

# Build a count matrix by fitting and transforming 'soup'
count_matrix = count.fit_transform(links_small_new['soup'])

In [494]:
# Calculating cosine similarity of count matrix
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [498]:
# Reset Index
links_small_new = links_small_new.reset_index()

# Titles
titles = links_small_new['title']

# Indices
indices = pd.Series(links_small_new.index, index = links_small_new['title'])

We will reuse the get_recommendations function that we had written earlier. Since our cosine similarity scores have changed, we expect it to give us different (and probably better) results. Let us check for **The Dark Knight** again and see what recommendations I get this time around.

In [493]:
# Recommendation Engine 
def recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:10]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

In [295]:
recommendations('The Dark Knight').head(10)

6218                 Batman Begins
8031         The Dark Knight Rises
1134                Batman Returns
7659    Batman: Under the Red Hood
8927       Kidnapping Mr. Heineken
3647               An Innocent Man
1260                Batman & Robin
5943                      Thursday
6623                  The Prestige
Name: title, dtype: object

In [296]:
recommendations('Pulp Fiction').head(10)

898           Reservoir Dogs
8905       The Hateful Eight
1381            Jackie Brown
5200       Kill Bill: Vol. 2
6788             Death Proof
7280    Inglourious Basterds
17                Four Rooms
231            Kiss of Death
4313          Billy Bathgate
Name: title, dtype: object

**Limitation:** This recommendation system returns only the movies based on soup. It does not consider `popularity`.

**Solution:** We use the results returned from our Count Vectorizer (indices) and return the movies that are popular based on the IMDB's weighted average. Additionally, I use three different criteria to cut-off the movies (75% percentile, Mean and No Cut-Off criteria)

**Weighted Average**

##### IMDB's *weighted rating* formula

[Weighted Rating](https://math.stackexchange.com/questions/169032/understanding-the-imdb-weighted-rating-function-for-usage-on-my-own-website) (WR) = $(\frac{v}{v + m} . R) + (\frac{m}{v + m} . C)$

where,
* *v* - number of ratings for the movie
* *m* - number of ratings needed to qualify (usually mean)
* *R* - average rating of the movie
* *C* - mean rating of the population (whole dataset)

Before we could use the above weighted average formula, `m` and `C` should be determined.

In [297]:
# Claculation of c 
vote_counts = metadata[metadata['vote_count'].notnull()]['vote_count'].astype('int')
vote_averages = metadata[metadata['vote_average'].notnull()]['vote_average'].astype('int')
C = vote_averages.mean()
C

5.244896612406511

In [298]:
## Claculation of m
m = vote_counts.quantile(0.95)
m

434.0

In [214]:
# Function to calculate 'weighted_rating'
def weighted_rating(x):
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+m) * R) + (m/(m+v) * C)

I try three different cutoff criteria: 
1. 95th percentile
2. Mean
3. No cut-off

In [308]:
# Apply weighted rating method to qualified_perc, qualified_mean, new_qualified, sm_df, metadata
for df in [qualified_perc, qualified_mean, new_qualified, sm_df, metadata, metadata_full]:
    df['weighted_rating'] = df.apply(weighted_rating, axis=1)

In [304]:
# Columns for qualified movies
col_list = ['title', 'release_date', 'vote_count', 'vote_average', 'popularity', 'genres']

# qualification criteria
qualified_perc = metadata[(metadata_full['vote_count'] >= m) 
                 & (metadata_full['vote_count'].notnull()) 
                 & (metadata_full['vote_average'].notnull())][col_list]

# converting vote_count and vote_average colums to integer
qualified_perc['vote_count'] = qualified['vote_count'].astype('int')
qualified_perc['vote_average'] = qualified['vote_average'].astype('int')

qualified_perc.shape

(2285, 6)

#### Getting qualified movies (cutoff: 95%)

In [502]:
# Better recommendation engine
def better_recommendations_percentile_popularity(title):
    idx = indices[title]                                    # Considers indices of the previous recommedation system
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:51]
    movie_indices = [i[0] for i in sim_scores]
    
    improved_movies = links_small_new.iloc[movie_indices][['title', 'vote_count', 'vote_average']]
    vote_counts = improved_movies[improved_movies['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = improved_movies[improved_movies['vote_average'].notnull()]['vote_average'].astype('int')
    
    C = vote_averages.mean()
    m = vote_counts.quantile(0.75)
    
    qualified = improved_movies[(improved_movies['vote_count'] >= m) & (improved_movies['vote_count'].notnull()) & (improved_movies['vote_average'].notnull())]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    qualified['wr'] = qualified.apply(weighted_rating, axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(10)
    return qualified

In [503]:
# Better recommendations
better_recommendations_percentile_popularity('The Dark Knight')

Unnamed: 0,title,vote_count,vote_average,wr
7648,Inception,14075,8,7.917588
6623,The Prestige,4510,8,7.758148
8031,The Dark Knight Rises,9263,7,6.921448
6218,Batman Begins,7511,7,6.904127
7583,Kick-Ass,4747,7,6.852979
1134,Batman Returns,1706,6,5.846862
4145,Insomnia,1181,6,5.797081
8970,Hitman: Agent 47,1183,5,5.06573
132,Batman Forever,1529,5,5.054144
9162,London Has Fallen,1656,5,5.050854


#### Getting qualified movies (cutoff: mean)

In [300]:
vote_counts.mean()

109.89733831940167

In [504]:
# Better recommendation engine
def better_recommendations_mean_popularity(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:51]
    movie_indices = [i[0] for i in sim_scores]
    
    improved_movies = links_small_new.iloc[movie_indices][['title', 'vote_count', 'vote_average']]
    vote_counts = improved_movies[improved_movies['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = improved_movies[improved_movies['vote_average'].notnull()]['vote_average'].astype('int')
    
    C = vote_averages.mean()
    m = vote_counts.mean()
    
    qualified = improved_movies[(improved_movies['vote_count'] >= m) & (improved_movies['vote_count'].notnull()) & (improved_movies['vote_average'].notnull())]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    qualified['wr'] = qualified.apply(weighted_rating, axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(10)
    return qualified

In [505]:
# Better recommendations
better_recommendations_mean_popularity('The Dark Knight')

Unnamed: 0,title,vote_count,vote_average,wr
7648,Inception,14075,8,7.917588
6623,The Prestige,4510,8,7.758148
8031,The Dark Knight Rises,9263,7,6.921448
6218,Batman Begins,7511,7,6.904127
7583,Kick-Ass,4747,7,6.852979
1134,Batman Returns,1706,6,5.846862
132,Batman Forever,1529,5,5.054144
9162,London Has Fallen,1656,5,5.050854
9163,London Has Fallen,1656,5,5.050854
9024,Batman v Superman: Dawn of Justice,7189,5,5.013943


#### Getting qualified movies (cutoff: none)

In [506]:
# Better recommendation engine
def better_recommendations_no_cutoff(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:51]
    movie_indices = [i[0] for i in sim_scores]
    
    improved_movies = links_small_new.iloc[movie_indices][['title', 'vote_count', 'vote_average']]
    vote_counts = improved_movies[improved_movies['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = improved_movies[improved_movies['vote_average'].notnull()]['vote_average'].astype('int')
    
    C = vote_averages.mean()
    m = vote_counts.quantile(0.60)
    
    qualified = improved_movies[(improved_movies['vote_count'].notnull()) & (improved_movies['vote_average'].notnull())]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    qualified['wr'] = qualified.apply(weighted_rating, axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(10)
    return qualified

In [507]:
# Better recommendations
better_recommendations_no_cutoff('The Dark Knight')

Unnamed: 0,title,vote_count,vote_average,wr
7648,Inception,14075,8,7.917588
6623,The Prestige,4510,8,7.758148
8031,The Dark Knight Rises,9263,7,6.921448
6218,Batman Begins,7511,7,6.904127
7583,Kick-Ass,4747,7,6.852979
7659,Batman: Under the Red Hood,459,7,6.147016
2085,Following,363,7,6.044272
8001,Batman: Year One,255,7,5.894463
2952,Magnum Force,251,7,5.888007
1134,Batman Returns,1706,6,5.846862


## 3. Collaborative Filtering

In [2]:
# Read 'ratings' data
# ratings = pd.read_csv('data_full/ratings.csv')
ratings_small = pd.read_csv('data_full/ratings_small.csv')

**Note**: Below code snippet pertains to 'ratings' dataset (2 Mil). For this project, I consider only the smaller dataset.  

ratings.columns = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = ratings.drop(['timestamp'], axis = 1)
ratings.head()

In [3]:
ratings_small.columns = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings_small = ratings_small.drop(['timestamp'], axis = 1)
ratings_small.head()

Unnamed: 0,user_id,movie_id,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0


In [4]:
# Fill NaN values in user_id and movie_id column with 0
ratings_small['user_id'] = ratings_small['user_id'].fillna(0)
ratings_small['movie_id'] = ratings_small['movie_id'].fillna(0)

In [5]:
# Replace NaN values in rating column with average of all values
ratings_small['rating'] = ratings_small['rating'].fillna(ratings_small['rating'].mean())

In [6]:
ratings_small.head()

Unnamed: 0,user_id,movie_id,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0


In [7]:
print(ratings_small.shape)

(100004, 3)


**Note:** Python throws 'Memory' Error when I use the full dataset. Hence, I pick 25% of the dataset and perform collborative filtering on it.

In [8]:
# Randomly sample 25% of the ratings dataset
small_data = ratings_small.sample(frac=0.25)

In [9]:
# Check the sample info
print(small_data.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25001 entries, 40662 to 14056
Data columns (total 3 columns):
user_id     25001 non-null int64
movie_id    25001 non-null int64
rating      25001 non-null float64
dtypes: float64(1), int64(2)
memory usage: 781.3 KB
None


In [10]:
train_data, test_data = train_test_split(small_data, test_size=0.2)

In [11]:
# Test and Train data matrix
train_data_matrix = train_data.as_matrix(columns = ['user_id', 'movie_id', 'rating'])
test_data_matrix = test_data.as_matrix(columns = ['user_id', 'movie_id', 'rating'])

In [12]:
train_data.head()

Unnamed: 0,user_id,movie_id,rating
36850,265,1150,2.0
47899,353,1073,2.0
30170,213,118997,1.0
12670,79,49530,3.5
29983,213,70336,3.0


In [13]:
train_data_matrix[:4]

array([[2.65000e+02, 1.15000e+03, 2.00000e+00],
       [3.53000e+02, 1.07300e+03, 2.00000e+00],
       [2.13000e+02, 1.18997e+05, 1.00000e+00],
       [7.90000e+01, 4.95300e+04, 3.50000e+00]])

**Idea behind user and item similarity:**

User similarity can be calculated by measuring 'pairwise distances' between ratings datset.

However, if you have to calculate the 'item similarity', we have to transpose the 'ratings' data and then calculate the pairwise distances.

In [16]:
# User Similarity Matrix
user_correlation = 1 - pairwise_distances(train_data, metric='correlation')
user_correlation[np.isnan(user_correlation)] = 0
print(user_correlation[:4, :4])

[[1.         0.70454029 0.78791522 0.68269215]
 [0.70454029 1.         0.99211756 0.99953994]
 [0.78791522 0.99211756 1.         0.98786048]
 [0.68269215 0.99953994 0.98786048 1.        ]]


In [26]:
# Item Similarity Matrix (Train_data_matrix.Transpose)
item_correlation = 1 - pairwise_distances(train_data_matrix.T, metric='correlation')
item_correlation[np.isnan(item_correlation)] = 0
print(item_correlation[:4, :4])

[[ 1.          0.0136781   0.00742387]
 [ 0.0136781   1.         -0.0216893 ]
 [ 0.00742387 -0.0216893   1.        ]]


In [37]:
train_data.head()

Unnamed: 0,user_id,movie_id,rating
90716,602,783,4.0
3655,21,466,3.0
69220,480,2797,4.0
52058,380,81847,4.0
3430,19,1225,5.0


In [36]:
train_data.T.head()

Unnamed: 0,90716,3655,69220,52058,3430,87698,35374,921,17022,94911,...,90928,4487,33998,43512,13969,66920,96030,15203,91589,66064
user_id,602.0,21.0,480.0,380.0,19.0,584.0,253.0,13.0,111.0,624.0,...,605.0,23.0,243.0,311.0,91.0,468.0,638.0,99.0,607.0,468.0
movie_id,783.0,466.0,2797.0,81847.0,1225.0,4699.0,3745.0,3396.0,457.0,94777.0,...,608.0,4641.0,442.0,647.0,54286.0,74545.0,1378.0,2085.0,2174.0,2053.0
rating,4.0,3.0,4.0,4.0,5.0,4.0,2.0,3.5,4.0,2.5,...,3.0,4.0,3.0,4.0,5.0,3.0,4.0,3.0,3.5,1.5


In [24]:
user_correlation[:1]

array([[1.        , 0.70454029, 0.78791522, ..., 0.70915885, 0.85219278,
        0.82267086]])

In [27]:
item_correlation[:1]

array([[1.        , 0.0136781 , 0.00742387]])

In [52]:
# Function to predict ratings
def predict(ratings, similarity, type='user'):
    
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        # Use np.newaxis so that mean_user_rating has same format as ratings
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    
    return pred

In [53]:
# Predict ratings on the training data with both similarity score
user_prediction = predict(train_data_matrix, user_correlation, type='user')
item_prediction = predict(train_data_matrix, item_correlation, type='item')

In [54]:
# Function to calculate RMSE
def rmse(pred, actual):
    # Ignore nonzero terms.
    pred = pred[actual.nonzero()].flatten()
    actual = actual[actual.nonzero()].flatten()
    return sqrt(mean_squared_error(pred, actual))

In [55]:
# RMSE on the test data
print('User-based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix)))
print('Item-based CF RMSE: ' + str(rmse(item_prediction, test_data_matrix)))

User-based CF RMSE: 17596.17284084236
Item-based CF RMSE: 20855.26385843227
