# Context Based Filtering

In [1]:
!pip install pandas --upgrade --quiet
import os

import pandas as pd
import numpy as np

import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt

from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')


**Reading the Data**

In [2]:
movies_df = pd.read_csv("movies_metadata.csv")
keywords_df = pd.read_csv("keywords.csv")
credits_df = pd.read_csv("credits.csv")

In [3]:
movies_df.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


**Selecting movies which have more than 55 votes**

In [4]:
movies_df = movies_df[movies_df['vote_count']>=55]
movies_df = movies_df[['id','original_title','overview','genres']]

In [5]:
#creating a duplicate column for movie titles
movies_df['title'] = movies_df['original_title'].copy()

In [6]:
movies_df.reset_index(inplace = True, drop = True)
movies_df.head()

Unnamed: 0,id,original_title,overview,genres,title
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...","[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",Toy Story
1,8844,Jumanji,When siblings Judy and Peter discover an encha...,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",Jumanji
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",Grumpier Old Men
3,11862,Father of the Bride Part II,Just when George Banks has recovered from his ...,"[{'id': 35, 'name': 'Comedy'}]",Father of the Bride Part II
4,949,Heat,"Obsessive master thief, Neil McCauley leads a ...","[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",Heat


**We take Genres, Original Title, Overview and id columns from movies_df to work on**


**Keywords**



In [7]:
keywords_df.head()

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


**In keywords_df, we use keywords to retrieve keywords and id to merge the dataframe**

**Credits**

In [8]:
credits_df.head()

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


**In credits_df, we use cast to get the actors names and id to merge the dataframe**

In [9]:
credits_df = credits_df[['id','cast']]

### Data cleaning and Preprocessing

In [10]:
#Merging all the dataframe into one signle dataframe
movies_df['id'] = movies_df['id'].astype(int)

#Merging keywords and resetting the index
df = pd.merge(movies_df, keywords_df, on = 'id', how = 'left')
df.reset_index(inplace = True, drop = True)

#Merging credits and resetting the index
df = pd.merge(df, credits_df, on = 'id', how = 'left')
df.reset_index(inplace = True, drop = True)

In [11]:
df.head()

Unnamed: 0,id,original_title,overview,genres,title,keywords,cast
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...","[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",Toy Story,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...","[{'cast_id': 14, 'character': 'Woody (voice)',..."
1,8844,Jumanji,When siblings Judy and Peter discover an encha...,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",Jumanji,"[{'id': 10090, 'name': 'board game'}, {'id': 1...","[{'cast_id': 1, 'character': 'Alan Parrish', '..."
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",Grumpier Old Men,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392...","[{'cast_id': 2, 'character': 'Max Goldman', 'c..."
3,11862,Father of the Bride Part II,Just when George Banks has recovered from his ...,"[{'id': 35, 'name': 'Comedy'}]",Father of the Bride Part II,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n...","[{'cast_id': 1, 'character': 'George Banks', '..."
4,949,Heat,"Obsessive master thief, Neil McCauley leads a ...","[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",Heat,"[{'id': 642, 'name': 'robbery'}, {'id': 703, '...","[{'cast_id': 25, 'character': 'Lt. Vincent Han..."


In [12]:
#Cleaning movies metadata by fetching genre list
df['genres'] = df['genres'].apply(lambda x: [i['name'] for i in eval(x)])

#Replacing spaces and making it a string
df['genres'] = df['genres'].apply(lambda x: ' '.join([i.replace(" ","") for i in x]))

In [13]:
#Replacing null values
df['keywords'].fillna('[]', inplace=True)

In [14]:
#Cleaning the keywords dataframe by fetching keyword list from the column
df['keywords'] = df['keywords'].apply(lambda x: [i['name'] for i in eval(x)])

In [15]:
#Removing empty spaces
df['keywords'] = df['keywords'].apply(lambda x: ' '.join([i.replace(" ",'') for i in x]))

In [16]:
#Replacing null values
df['cast'].fillna('[]', inplace=True)

In [17]:
#Cleaning the cast dataframe by fetching cast list from the column
df['cast'] = df['cast'].apply(lambda x: [i['name'] for i in eval(x)])


In [18]:
df['cast'] = df['cast'].apply(lambda x: ' '.join([i.replace(" ",'') for i in x]))

In [19]:
df.head()

Unnamed: 0,id,original_title,overview,genres,title,keywords,cast
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...",Animation Comedy Family,Toy Story,jealousy toy boy friendship friends rivalry bo...,TomHanks TimAllen DonRickles JimVarney Wallace...
1,8844,Jumanji,When siblings Judy and Peter discover an encha...,Adventure Fantasy Family,Jumanji,boardgame disappearance basedonchildren'sbook ...,RobinWilliams JonathanHyde KirstenDunst Bradle...
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...,Romance Comedy,Grumpier Old Men,fishing bestfriend duringcreditsstinger oldmen,WalterMatthau JackLemmon Ann-Margret SophiaLor...
3,11862,Father of the Bride Part II,Just when George Banks has recovered from his ...,Comedy,Father of the Bride Part II,baby midlifecrisis confidence aging daughter m...,SteveMartin DianeKeaton MartinShort KimberlyWi...
4,949,Heat,"Obsessive master thief, Neil McCauley leads a ...",Action Crime Drama Thriller,Heat,robbery detective bank obsession chase shootin...,AlPacino RobertDeNiro ValKilmer JonVoight TomS...


**Merging the content as a single feature**

In [20]:
df['tags'] = df['overview'] + ' ' + df['genres'] +  ' ' + df['original_title'] + ' ' + df['keywords'] + ' ' + df['cast']

Deleting columns that are not necessary

In [21]:
df.drop(columns=['genres','overview','original_title','keywords','cast'], inplace=True)

In [22]:
df.head()

Unnamed: 0,id,title,tags
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ..."
1,8844,Jumanji,When siblings Judy and Peter discover an encha...
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...
3,11862,Father of the Bride Part II,Just when George Banks has recovered from his ...
4,949,Heat,"Obsessive master thief, Neil McCauley leads a ..."


In [23]:
df.isnull().sum()

id        0
title     0
tags     35
dtype: int64

**Removing all the null values from the data**

In [24]:
df.drop(df[df['tags'].isnull()].index, inplace=True)
df.head()

Unnamed: 0,id,title,tags
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ..."
1,8844,Jumanji,When siblings Judy and Peter discover an encha...
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...
3,11862,Father of the Bride Part II,Just when George Banks has recovered from his ...
4,949,Heat,"Obsessive master thief, Neil McCauley leads a ..."


In [25]:
df.shape

(8735, 3)

In [26]:
df.drop_duplicates(inplace = True)
df.shape

(8595, 3)

### Converting this data into Vectors
We convert the textual data into numerical values(feature vector) and then find which movies are similar by giving them a similarity score

To suggest similar movies, we use Cosine similarty. This algorithm is used to find similarity between vectors


In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [28]:
tfidf = TfidfVectorizer(max_features=5000)

# Transform the data
vectorized_df = tfidf.fit_transform(df['tags'].values)

In [29]:
vectorized_df

<8595x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 375635 stored elements in Compressed Sparse Row format>

In [30]:
vectorized_data = pd.DataFrame(vectorized_df.toarray(), index=df['tags'].index.tolist())

**We are performing Dimension Reduction**

In [31]:
from sklearn.decomposition import TruncatedSVD

In [32]:
#Intitalizing a PCA object
svd = TruncatedSVD(n_components=3000)

In [33]:
#Fit transform the data
reduced_data = svd.fit_transform(vectorized_data)


In [34]:
reduced_data.shape

(8595, 3000)

In [35]:
svd.explained_variance_ratio_.cumsum()

array([0.00470896, 0.01167374, 0.01736822, ..., 0.92190004, 0.92197319,
       0.92204617])

### Computing Cosine Similarity on Vectors

In [36]:
from sklearn.metrics.pairwise import cosine_similarity

In [37]:
similarity = cosine_similarity(reduced_data)

In [54]:
sorted(list(enumerate(similarity[0])), reverse = True, key = lambda x:x[1])[1:6]

[(1549, 0.595728933274662),
 (5222, 0.5252227047204967),
 (3792, 0.3267832095790518),
 (6462, 0.29503989186293905),
 (7118, 0.2909454989920591)]

## Movie recommendations based on given movie title

In [58]:
def recommend(movie_title):
    id_of_movie = df[df['title']==movie_title].index[0]
    distances = similarity[id_of_movie]
    movie_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x:x[1])[1:10]
    
    for i in movie_list:
        print(df.iloc[i[0]].title)


In [59]:
movie_title = input("Movies similar to: ")
print(recommend(movie_title))

Movies similar to: The Matrix
The Matrix Revisited
The Matrix Revolutions
The Matrix Reloaded
The Animatrix
Commando
Terminator 3: Rise of the Machines
GHOST IN THE SHELL
Hackers
Who Am I - Kein System ist sicher
None


In [40]:
import pickle

In [41]:
pickle.dump(df.to_dict(),open('movies_dict.pkl', 'wb'))

In [42]:
df

Unnamed: 0,id,title,tags
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ..."
1,8844,Jumanji,When siblings Judy and Peter discover an encha...
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...
3,11862,Father of the Bride Part II,Just when George Banks has recovered from his ...
4,949,Heat,"Obsessive master thief, Neil McCauley leads a ..."
...,...,...,...
8765,430365,À bras ouverts,Jean-Étienne Fougerole is an intellectual bohe...
8766,248705,Les Visiteurs: La Révolution,"Stuck in the corridors of time, Godefroy de Mo..."
8767,44918,Titanic II,On the 100th anniversary of the original voyag...
8768,455661,In a Heartbeat,A closeted boy runs the risk of being outed by...


In [43]:
df['title'].values

array(['Toy Story', 'Jumanji', 'Grumpier Old Men', ..., 'Titanic II',
       'In a Heartbeat', 'Cadet Kelly'], dtype=object)

In [44]:
pickle.dump(similarity,open('similarity.pkl','wb'))

In [48]:
movies_meta = pd.read_csv("movies_metadata.csv")