# Building A Movie Recommender System using Content Based Filtering

### 1. Imports

In [80]:
import pandas as pd
import ast
import string 
import nltk
import numpy as np
import pickle

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from gensim.models import Word2Vec

import warnings
warnings.simplefilter('ignore')

### 2. Loading Data
Dataset being used : [The Movies Dataset](https://www.kaggle.com/datasets/rounakbanik/the-movies-dataset)

In [61]:
movies = pd.read_csv('archive/movies_metadata.csv')# load movies

  movies = pd.read_csv('archive/movies_metadata.csv')# load movies


In [63]:
credits = pd.read_csv('archive/credits.csv')

In [64]:
credits.head()

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


In [65]:
keywords = pd.read_csv('archive/keywords.csv')

In [66]:
keywords.head()

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


### 3. Explore Data

In [67]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45466 non-null  object 
 1   belongs_to_collection  4494 non-null   object 
 2   budget                 45466 non-null  object 
 3   genres                 45466 non-null  object 
 4   homepage               7782 non-null   object 
 5   id                     45466 non-null  object 
 6   imdb_id                45449 non-null  object 
 7   original_language      45455 non-null  object 
 8   original_title         45466 non-null  object 
 9   overview               44512 non-null  object 
 10  popularity             45461 non-null  object 
 11  poster_path            45080 non-null  object 
 12  production_companies   45463 non-null  object 
 13  production_countries   45463 non-null  object 
 14  release_date           45379 non-null  object 
 15  re

In [68]:
credits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45476 entries, 0 to 45475
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   cast    45476 non-null  object
 1   crew    45476 non-null  object
 2   id      45476 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 1.0+ MB


In [69]:
keywords.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46419 entries, 0 to 46418
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        46419 non-null  int64 
 1   keywords  46419 non-null  object
dtypes: int64(1), object(1)
memory usage: 725.4+ KB


### 4. Cleaning The Data

In [11]:
#removing unncessary columns and storing it to a dataframe named 'df'
df = movies[['id','title','original_title', 'overview', 'genres']] 

In [12]:
df['id'] = pd.to_numeric(df['id'], errors = 'coerce', downcast='integer')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['id'] = pd.to_numeric(df['id'], errors = 'coerce', downcast='integer')


In [13]:
df = df.dropna()

In [14]:
df['id'] = df['id'].astype(int)

In [15]:
df['id']

0           862
1          8844
2         15602
3         31357
4         11862
          ...  
45461    439050
45462    111109
45463     67758
45464    227506
45465    461257
Name: id, Length: 44506, dtype: int64

In [16]:
df = pd.merge(df, keywords, on='id')
df = pd.merge(df, credits, on = 'id')

In [17]:
df.isna().sum()

id                0
title             0
original_title    0
overview          0
genres            0
keywords          0
cast              0
crew              0
dtype: int64

In [18]:
print(len(df))

45629


In [19]:
#checking if there are any duplicated rows
df.duplicated().sum()

1147

In [20]:
df = df.drop_duplicates()

In [21]:
len(df)

44482

In [22]:
df.head()

Unnamed: 0,id,title,original_title,overview,genres,keywords,cast,crew
0,862,Toy Story,Toy Story,"Led by Woody, Andy's toys live happily in his ...","[{'id': 16, 'name': 'Animation'}, {'id': 35, '...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...","[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de..."
1,8844,Jumanji,Jumanji,When siblings Judy and Peter discover an encha...,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...","[{'id': 10090, 'name': 'board game'}, {'id': 1...","[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de..."
2,15602,Grumpier Old Men,Grumpier Old Men,A family wedding reignites the ancient feud be...,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...","[{'id': 1495, 'name': 'fishing'}, {'id': 12392...","[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de..."
3,31357,Waiting to Exhale,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...","[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...","[{'id': 818, 'name': 'based on novel'}, {'id':...","[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de..."
4,11862,Father of the Bride Part II,Father of the Bride Part II,Just when George Banks has recovered from his ...,"[{'id': 35, 'name': 'Comedy'}]","[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n...","[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de..."


**We can observe that 'keywords' , 'cast', 'crew' columns are in a dictionary format**

In [23]:
df['keywords'][0]

"[{'id': 931, 'name': 'jealousy'}, {'id': 4290, 'name': 'toy'}, {'id': 5202, 'name': 'boy'}, {'id': 6054, 'name': 'friendship'}, {'id': 9713, 'name': 'friends'}, {'id': 9823, 'name': 'rivalry'}, {'id': 165503, 'name': 'boy next door'}, {'id': 170722, 'name': 'new toy'}, {'id': 187065, 'name': 'toy comes to life'}]"

In [24]:
# we convert these dictionaries to a suitable format.
print(type(df['keywords'][0]))

# literal_eval converts the string format of original to dictionaries.
ast.literal_eval(df['keywords'][0])

<class 'str'>


[{'id': 931, 'name': 'jealousy'},
 {'id': 4290, 'name': 'toy'},
 {'id': 5202, 'name': 'boy'},
 {'id': 6054, 'name': 'friendship'},
 {'id': 9713, 'name': 'friends'},
 {'id': 9823, 'name': 'rivalry'},
 {'id': 165503, 'name': 'boy next door'},
 {'id': 170722, 'name': 'new toy'},
 {'id': 187065, 'name': 'toy comes to life'}]

In [25]:
def convert_keywords(text):
    l = []
    for i in ast.literal_eval(text):
        l.append(i['name'])
    return l

In [26]:
convert_keywords(df['keywords'][0])

['jealousy',
 'toy',
 'boy',
 'friendship',
 'friends',
 'rivalry',
 'boy next door',
 'new toy',
 'toy comes to life']

In [27]:
df['keywords'] = df['keywords'].apply(convert_keywords) #applying for all the rows

In [28]:
df.head()

Unnamed: 0,id,title,original_title,overview,genres,keywords,cast,crew
0,862,Toy Story,Toy Story,"Led by Woody, Andy's toys live happily in his ...","[{'id': 16, 'name': 'Animation'}, {'id': 35, '...","[jealousy, toy, boy, friendship, friends, riva...","[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de..."
1,8844,Jumanji,Jumanji,When siblings Judy and Peter discover an encha...,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...","[board game, disappearance, based on children'...","[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de..."
2,15602,Grumpier Old Men,Grumpier Old Men,A family wedding reignites the ancient feud be...,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...","[fishing, best friend, duringcreditsstinger, o...","[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de..."
3,31357,Waiting to Exhale,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...","[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...","[based on novel, interracial relationship, sin...","[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de..."
4,11862,Father of the Bride Part II,Father of the Bride Part II,Just when George Banks has recovered from his ...,"[{'id': 35, 'name': 'Comedy'}]","[baby, midlife crisis, confidence, aging, daug...","[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de..."


In [29]:
#now let's do it for cast column.
ast.literal_eval(df['cast'][0])

[{'cast_id': 14,
  'character': 'Woody (voice)',
  'credit_id': '52fe4284c3a36847f8024f95',
  'gender': 2,
  'id': 31,
  'name': 'Tom Hanks',
  'order': 0,
  'profile_path': '/pQFoyx7rp09CJTAb932F2g8Nlho.jpg'},
 {'cast_id': 15,
  'character': 'Buzz Lightyear (voice)',
  'credit_id': '52fe4284c3a36847f8024f99',
  'gender': 2,
  'id': 12898,
  'name': 'Tim Allen',
  'order': 1,
  'profile_path': '/uX2xVf6pMmPepxnvFWyBtjexzgY.jpg'},
 {'cast_id': 16,
  'character': 'Mr. Potato Head (voice)',
  'credit_id': '52fe4284c3a36847f8024f9d',
  'gender': 2,
  'id': 7167,
  'name': 'Don Rickles',
  'order': 2,
  'profile_path': '/h5BcaDMPRVLHLDzbQavec4xfSdt.jpg'},
 {'cast_id': 17,
  'character': 'Slinky Dog (voice)',
  'credit_id': '52fe4284c3a36847f8024fa1',
  'gender': 2,
  'id': 12899,
  'name': 'Jim Varney',
  'order': 3,
  'profile_path': '/eIo2jVVXYgjDtaHoF19Ll9vtW7h.jpg'},
 {'cast_id': 18,
  'character': 'Rex (voice)',
  'credit_id': '52fe4284c3a36847f8024fa5',
  'gender': 2,
  'id': 12900,
 

In [30]:
#let's make a dictionary to save all the actor names and their ids.

actor_id = {}

def convert_cast(text):
    l = []
    ctr = 0
    text= ast.literal_eval(text)
    sorted_text = sorted(text, key = lambda x : x['order'])
    for i in sorted_text:
        if ctr == 5:
            break
        l.append(i['name'])
        actor_id[i['name']] = i['id']
    return l

In [31]:
df['cast'] = df['cast'].apply(convert_cast)

In [32]:
df.head()

Unnamed: 0,id,title,original_title,overview,genres,keywords,cast,crew
0,862,Toy Story,Toy Story,"Led by Woody, Andy's toys live happily in his ...","[{'id': 16, 'name': 'Animation'}, {'id': 35, '...","[jealousy, toy, boy, friendship, friends, riva...","[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de..."
1,8844,Jumanji,Jumanji,When siblings Judy and Peter discover an encha...,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...","[board game, disappearance, based on children'...","[Robin Williams, Jonathan Hyde, Kirsten Dunst,...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de..."
2,15602,Grumpier Old Men,Grumpier Old Men,A family wedding reignites the ancient feud be...,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...","[fishing, best friend, duringcreditsstinger, o...","[Walter Matthau, Jack Lemmon, Ann-Margret, Sop...","[{'credit_id': '52fe466a9251416c75077a89', 'de..."
3,31357,Waiting to Exhale,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...","[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...","[based on novel, interracial relationship, sin...","[Whitney Houston, Angela Bassett, Loretta Devi...","[{'credit_id': '52fe44779251416c91011acb', 'de..."
4,11862,Father of the Bride Part II,Father of the Bride Part II,Just when George Banks has recovered from his ...,"[{'id': 35, 'name': 'Comedy'}]","[baby, midlife crisis, confidence, aging, daug...","[Steve Martin, Diane Keaton, Martin Short, Kim...","[{'credit_id': '52fe44959251416c75039ed7', 'de..."


In [33]:
ast.literal_eval(df['crew'][0])

[{'credit_id': '52fe4284c3a36847f8024f49',
  'department': 'Directing',
  'gender': 2,
  'id': 7879,
  'job': 'Director',
  'name': 'John Lasseter',
  'profile_path': '/7EdqiNbr4FRjIhKHyPPdFfEEEFG.jpg'},
 {'credit_id': '52fe4284c3a36847f8024f4f',
  'department': 'Writing',
  'gender': 2,
  'id': 12891,
  'job': 'Screenplay',
  'name': 'Joss Whedon',
  'profile_path': '/dTiVsuaTVTeGmvkhcyJvKp2A5kr.jpg'},
 {'credit_id': '52fe4284c3a36847f8024f55',
  'department': 'Writing',
  'gender': 2,
  'id': 7,
  'job': 'Screenplay',
  'name': 'Andrew Stanton',
  'profile_path': '/pvQWsu0qc8JFQhMVJkTHuexUAa1.jpg'},
 {'credit_id': '52fe4284c3a36847f8024f5b',
  'department': 'Writing',
  'gender': 2,
  'id': 12892,
  'job': 'Screenplay',
  'name': 'Joel Cohen',
  'profile_path': '/dAubAiZcvKFbboWlj7oXOkZnTSu.jpg'},
 {'credit_id': '52fe4284c3a36847f8024f61',
  'department': 'Writing',
  'gender': 0,
  'id': 12893,
  'job': 'Screenplay',
  'name': 'Alec Sokolow',
  'profile_path': '/v79vlRYi94BZUQnkkyzn

In [34]:
director_id = {}

def convert_crew(text):
    l = []
    text=  ast.literal_eval(text)
    for i in text:
        if i['job'] == 'Director':
            l.append(i['name'])
            director_id[i['name']] = i['id']
    
    return l

In [35]:
df['director'] = df['crew'].apply(convert_crew)
df = df.drop('crew', axis = 1)

In [36]:
director_id

{'John Lasseter': 7879,
 'Joe Johnston': 4945,
 'Howard Deutch': 26502,
 'Forest Whitaker': 2178,
 'Charles Shyer': 56106,
 'Michael Mann': 638,
 'Sydney Pollack': 2226,
 'Peter Hewitt': 18357,
 'Peter Hyams': 37710,
 'Martin Campbell': 10702,
 'Rob Reiner': 3026,
 'Mel Brooks': 14639,
 'Simon Wells': 21879,
 'Oliver Stone': 1152,
 'Renny Harlin': 16938,
 'Martin Scorsese': 1032,
 'Ang Lee': 1614,
 'Allison Anders': 3110,
 'Alexandre Rockwell': 3111,
 'Robert Rodriguez': 2294,
 'Quentin Tarantino': 138,
 'Steve Oedekerk': 4489,
 'Joseph Ruben': 52629,
 'Barry Sonnenfeld': 5174,
 'Jon Amiel': 15148,
 'Richard Donner': 7187,
 'Victor Salva': 56349,
 'Mike Figgis': 6111,
 'Oliver Parker': 56710,
 'Lesli Linka Glatter': 15798,
 'Roger Michell': 7017,
 'Jean-Pierre Jeunet': 2419,
 'Marc Caro': 13680,
 'Zhang Yimou': 607,
 'John N. Smith': 60295,
 'Terry Gilliam': 280,
 'Chris Noonan': 58137,
 'Christopher Hampton': 12952,
 'Tim Robbins': 504,
 'Stephen Low': 77867,
 'Andy Tennant': 17167,
 

In [37]:
df.head()

Unnamed: 0,id,title,original_title,overview,genres,keywords,cast,director
0,862,Toy Story,Toy Story,"Led by Woody, Andy's toys live happily in his ...","[{'id': 16, 'name': 'Animation'}, {'id': 35, '...","[jealousy, toy, boy, friendship, friends, riva...","[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...",[John Lasseter]
1,8844,Jumanji,Jumanji,When siblings Judy and Peter discover an encha...,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...","[board game, disappearance, based on children'...","[Robin Williams, Jonathan Hyde, Kirsten Dunst,...",[Joe Johnston]
2,15602,Grumpier Old Men,Grumpier Old Men,A family wedding reignites the ancient feud be...,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...","[fishing, best friend, duringcreditsstinger, o...","[Walter Matthau, Jack Lemmon, Ann-Margret, Sop...",[Howard Deutch]
3,31357,Waiting to Exhale,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...","[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...","[based on novel, interracial relationship, sin...","[Whitney Houston, Angela Bassett, Loretta Devi...",[Forest Whitaker]
4,11862,Father of the Bride Part II,Father of the Bride Part II,Just when George Banks has recovered from his ...,"[{'id': 35, 'name': 'Comedy'}]","[baby, midlife crisis, confidence, aging, daug...","[Steve Martin, Diane Keaton, Martin Short, Kim...",[Charles Shyer]


In [38]:
df['genres'][0]

"[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]"

In [39]:
def convert_genres(text):
    l = []
    text=  ast.literal_eval(text)
    for i in text:
        l.append(i['name'])
    return l

In [40]:
df['genres'] = df['genres'].apply(convert_genres)

In [41]:
df.head()

Unnamed: 0,id,title,original_title,overview,genres,keywords,cast,director
0,862,Toy Story,Toy Story,"Led by Woody, Andy's toys live happily in his ...","[Animation, Comedy, Family]","[jealousy, toy, boy, friendship, friends, riva...","[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...",[John Lasseter]
1,8844,Jumanji,Jumanji,When siblings Judy and Peter discover an encha...,"[Adventure, Fantasy, Family]","[board game, disappearance, based on children'...","[Robin Williams, Jonathan Hyde, Kirsten Dunst,...",[Joe Johnston]
2,15602,Grumpier Old Men,Grumpier Old Men,A family wedding reignites the ancient feud be...,"[Romance, Comedy]","[fishing, best friend, duringcreditsstinger, o...","[Walter Matthau, Jack Lemmon, Ann-Margret, Sop...",[Howard Deutch]
3,31357,Waiting to Exhale,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...","[Comedy, Drama, Romance]","[based on novel, interracial relationship, sin...","[Whitney Houston, Angela Bassett, Loretta Devi...",[Forest Whitaker]
4,11862,Father of the Bride Part II,Father of the Bride Part II,Just when George Banks has recovered from his ...,[Comedy],"[baby, midlife crisis, confidence, aging, daug...","[Steve Martin, Diane Keaton, Martin Short, Kim...",[Charles Shyer]


In [42]:
def rem_spaces(l):
    l1 = []
    for i in l:
        l1.append(i.replace(" ",""))
    return l1

In [43]:
df['cast'] = df['cast'].apply(rem_spaces)
df['genres'] = df['genres'].apply(rem_spaces)
df['keywords'] = df['keywords'].apply(rem_spaces)
df['director'] = df['director'].apply(rem_spaces)


In [44]:
df.head()

Unnamed: 0,id,title,original_title,overview,genres,keywords,cast,director
0,862,Toy Story,Toy Story,"Led by Woody, Andy's toys live happily in his ...","[Animation, Comedy, Family]","[jealousy, toy, boy, friendship, friends, riva...","[TomHanks, TimAllen, DonRickles, JimVarney, Wa...",[JohnLasseter]
1,8844,Jumanji,Jumanji,When siblings Judy and Peter discover an encha...,"[Adventure, Fantasy, Family]","[boardgame, disappearance, basedonchildren'sbo...","[RobinWilliams, JonathanHyde, KirstenDunst, Br...",[JoeJohnston]
2,15602,Grumpier Old Men,Grumpier Old Men,A family wedding reignites the ancient feud be...,"[Romance, Comedy]","[fishing, bestfriend, duringcreditsstinger, ol...","[WalterMatthau, JackLemmon, Ann-Margret, Sophi...",[HowardDeutch]
3,31357,Waiting to Exhale,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...","[Comedy, Drama, Romance]","[basedonnovel, interracialrelationship, single...","[WhitneyHouston, AngelaBassett, LorettaDevine,...",[ForestWhitaker]
4,11862,Father of the Bride Part II,Father of the Bride Part II,Just when George Banks has recovered from his ...,[Comedy],"[baby, midlifecrisis, confidence, aging, daugh...","[SteveMartin, DianeKeaton, MartinShort, Kimber...",[CharlesShyer]


In [45]:
df['overview'] = df['overview'].apply(lambda x : x.split())

In [46]:
print(df['overview'][0])

['Led', 'by', 'Woody,', "Andy's", 'toys', 'live', 'happily', 'in', 'his', 'room', 'until', "Andy's", 'birthday', 'brings', 'Buzz', 'Lightyear', 'onto', 'the', 'scene.', 'Afraid', 'of', 'losing', 'his', 'place', 'in', "Andy's", 'heart,', 'Woody', 'plots', 'against', 'Buzz.', 'But', 'when', 'circumstances', 'separate', 'Buzz', 'and', 'Woody', 'from', 'their', 'owner,', 'the', 'duo', 'eventually', 'learns', 'to', 'put', 'aside', 'their', 'differences.']


In [47]:
df['tags'] = df['overview'] + df['genres'] + df['keywords'] + df['cast'] + df['director']

In [48]:
new_df = df.drop(columns = ['overview', 'genres', 'keywords', 'cast','director'])

In [49]:
new_df.head()

Unnamed: 0,id,title,original_title,tags
0,862,Toy Story,Toy Story,"[Led, by, Woody,, Andy's, toys, live, happily,..."
1,8844,Jumanji,Jumanji,"[When, siblings, Judy, and, Peter, discover, a..."
2,15602,Grumpier Old Men,Grumpier Old Men,"[A, family, wedding, reignites, the, ancient, ..."
3,31357,Waiting to Exhale,Waiting to Exhale,"[Cheated, on,, mistreated, and, stepped, on,, ..."
4,11862,Father of the Bride Part II,Father of the Bride Part II,"[Just, when, George, Banks, has, recovered, fr..."


In [50]:
new_df

Unnamed: 0,id,title,original_title,tags
0,862,Toy Story,Toy Story,"[Led, by, Woody,, Andy's, toys, live, happily,..."
1,8844,Jumanji,Jumanji,"[When, siblings, Judy, and, Peter, discover, a..."
2,15602,Grumpier Old Men,Grumpier Old Men,"[A, family, wedding, reignites, the, ancient, ..."
3,31357,Waiting to Exhale,Waiting to Exhale,"[Cheated, on,, mistreated, and, stepped, on,, ..."
4,11862,Father of the Bride Part II,Father of the Bride Part II,"[Just, when, George, Banks, has, recovered, fr..."
...,...,...,...,...
45624,439050,Subdue,رگ خواب,"[Rising, and, falling, between, a, man, and, w..."
45625,111109,Century of Birthing,Siglo ng Pagluluwal,"[An, artist, struggles, to, finish, his, work,..."
45626,67758,Betrayal,Betrayal,"[When, one, of, her, hits, goes, wrong,, a, pr..."
45627,227506,Satan Triumphant,Satana likuyushchiy,"[In, a, small, town, live, two, brothers,, one..."


In [51]:
new_df['tags'] = new_df['tags'].apply(lambda x : " ".join(x))
new_df.head(3)

Unnamed: 0,id,title,original_title,tags
0,862,Toy Story,Toy Story,"Led by Woody, Andy's toys live happily in his ..."
1,8844,Jumanji,Jumanji,When siblings Judy and Peter discover an encha...
2,15602,Grumpier Old Men,Grumpier Old Men,A family wedding reignites the ancient feud be...


### 5. Encoding the texts

We can use several text encoders like BagofWords, TF-IDF, Word2Vec etc.

In [52]:
nltk.download('punkt')
stemmer = PorterStemmer()

[nltk_data] Downloading package punkt to /home/tillu_25/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [53]:
# first let's preprocess all the tags
stop_words = set(stopwords.words('english'))

def preprocess(text):
    text = text.lower()
    text = ''.join([word for word in text if word not in string.punctuation])
    tokens = word_tokenize(text)
    tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

In [54]:
new_df['tags'] = new_df['tags'].apply(preprocess)

In [55]:
new_df.head()

Unnamed: 0,id,title,original_title,tags
0,862,Toy Story,Toy Story,led woodi andi toy live happili room andi birt...
1,8844,Jumanji,Jumanji,sibl judi peter discov enchant board game open...
2,15602,Grumpier Old Men,Grumpier Old Men,famili wed reignit ancient feud nextdoor neigh...
3,31357,Waiting to Exhale,Waiting to Exhale,cheat mistreat step women hold breath wait elu...
4,11862,Father of the Bride Part II,Father of the Bride Part II,georg bank recov daughter wed receiv news she ...


In [56]:
new_df['tags'][0]

'led woodi andi toy live happili room andi birthday bring buzz lightyear onto scene afraid lose place andi heart woodi plot buzz circumst separ buzz woodi owner duo eventu learn put asid differ anim comedi famili jealousi toy boy friendship friend rivalri boynextdoor newtoy toycomestolif tomhank timallen donrickl jimvarney wallaceshawn johnratzenberg anniepott johnmorri erikvondetten lauriemetcalf rleeermey sarahfreeman pennjillett johnlasset'

In [57]:
len(director_id)

19400

In [58]:
len(actor_id)

200414

In [59]:
new_df.head()

Unnamed: 0,id,title,original_title,tags
0,862,Toy Story,Toy Story,led woodi andi toy live happili room andi birt...
1,8844,Jumanji,Jumanji,sibl judi peter discov enchant board game open...
2,15602,Grumpier Old Men,Grumpier Old Men,famili wed reignit ancient feud nextdoor neigh...
3,31357,Waiting to Exhale,Waiting to Exhale,cheat mistreat step women hold breath wait elu...
4,11862,Father of the Bride Part II,Father of the Bride Part II,georg bank recov daughter wed receiv news she ...


### 6. Using Bag of Words

In [109]:
cv = CountVectorizer(stop_words='english')

In [110]:
count_matrix = cv.fit_transform(new_df['tags'])

In [111]:
count_matrix = count_matrix.astype(np.int32)

In [112]:
count_matrix.shape

(44482, 292618)

In [113]:
cosine_similarity(count_matrix[0], count_matrix)[0][0:10]

array([1.        , 0.03822354, 0.03045725, 0.03360514, 0.01536644,
       0.        , 0.01274118, 0.08231545, 0.        , 0.        ])

In [114]:
def recommend(movie):
    index = new_df[new_df['title'] == movie].index[0]
    distances = sorted(list(enumerate(cosine_similarity(count_matrix[index], count_matrix)[0])), reverse = True, key = lambda x : x[1])
    
    for i in distances[1:10]:
        print(new_df.iloc[i[0]].title)

In [115]:
recommend('Toy Story')

Toy Story 2
Toy Story 3
Small Fry
Toy Story of Terror!
Andy Kaufman Plays Carnegie Hall
Andy Hardy's Blonde Trouble
The Champ
Toy Story That Time Forgot
Partysaurus Rex


In [117]:
recommend('The Avengers')

Black Plague
Thithi
The Love Letter
Mio in the Land of Faraway
The Wind and the Lion
Detective Dee and the Mystery of the Phantom Flame
The Extraordinary Adventures of Adèle Blanc-Sec
Journey to the Center of the Earth
The Blue Light


### 7. Using TF-IDF

In [118]:
new_df['tags'].head()

0    led woodi andi toy live happili room andi birt...
1    sibl judi peter discov enchant board game open...
2    famili wed reignit ancient feud nextdoor neigh...
3    cheat mistreat step women hold breath wait elu...
4    georg bank recov daughter wed receiv news she ...
Name: tags, dtype: object

In [119]:
tfidf = TfidfVectorizer(stop_words='english')

In [120]:
tfidf_matrix = tfidf.fit_transform(new_df['tags'])

In [121]:
tfidf_matrix.shape

(44482, 292618)

In [122]:
tfidf_matrix = tfidf_matrix.astype(np.float32)

In [123]:
def recommend(movie):
    index = new_df[new_df['title'] == movie].index[0]
    distances = sorted(list(enumerate(cosine_similarity(tfidf_matrix[index], tfidf_matrix)[0])), reverse = True, key = lambda x : x[1])
    
    for i in distances[1:10]:
        print(new_df.iloc[i[0]].title)

In [86]:
recommend('Toy Story')

Toy Story 2
Toy Story 3
Small Fry
Toy Story of Terror!
Andy Kaufman Plays Carnegie Hall
Hot Splash
Andy Hardy's Blonde Trouble
Toy Story That Time Forgot
Toy Reanimator


In [124]:
recommend('Iron Man')

True Legend
Second Nature
The Master
Legendary Weapons of China
Bodyguards and Assassins
1911
Sharpshooter
Born to Defend
The Assassin


In [126]:
recommend('The Avengers')

Black Plague
River Queen
The Love Letter
Red Bells Part I: Mexico on Fire
The Bunker
Urban Explorer
Daylight
The Tunnel
Thithi


### 7. Using Word2Vec

In [90]:
# let's use gensim to train a Word2Vec model
sentences = [sentence.split() for sentence in new_df['tags']]
w2v_model = Word2Vec(sentences, vector_size = 100, window = 5, min_count = 5, workers = 4)

In [91]:
def vectorize(sentence):
    words = sentence.split()
    words_vecs = [w2v_model.wv[word] for word in words if word in w2v_model.wv]
    if len(words_vecs) == 0:
        return np.zeros(100)
    words_vecs = np.array(words_vecs)
    return words_vecs.mean(axis = 0)

In [92]:
vectorized = np.array([vectorize(sentence) for sentence in new_df['tags']], dtype = np.float32)

In [93]:
print(vectorized[0]) #vectorized format of the first movie tags.

[ 0.17642124 -0.06909949  0.26839343 -0.0931903  -0.06861958 -0.7102529
  0.1369236   0.68807244 -0.3168909  -0.1663051  -0.20656107 -1.0407706
 -0.08958281 -0.04563318  0.11224592 -0.41853103 -0.43262058 -0.37028384
  0.33340704 -0.66129774 -0.02065405  0.00145412 -0.30782735 -0.17042066
 -0.25647262 -0.21214968 -0.32188898 -0.28250095  0.1452709   0.04689407
 -0.06926138  0.17029694 -0.02076747 -0.19171359 -0.5358299   0.29811805
  0.07535798 -0.17797269 -0.29884982 -0.50582206  0.38713756 -0.19014308
 -0.5706599   0.05979044  0.0663032   0.0757366  -0.28861132 -0.19858475
 -0.18826999  0.2708731   0.6384262  -0.35936624 -0.04829096  0.138439
 -0.37721944 -0.14897855  0.25262338  0.05789236 -0.11117521  0.02915125
 -0.23163186  0.0103796  -0.24800634  0.25479165 -0.4501942   0.63585114
  0.38980028  0.18221647 -0.56411415  0.68191123 -0.49229476  0.01102745
  0.77019894 -0.35474172  0.6431975   0.3985249   0.02505084 -0.38104326
 -0.24246359  0.60820925 -0.57393277 -0.15775497 -0.354

In [94]:
print(vectorized.shape) #shape of vectorized format of tags

(44482, 100)


In [95]:
vectorized = vectorized.astype('float32')

In [105]:
cosine_similarity(vectorized[0].reshape(1,-1), vectorized)

array([[1.        , 0.9433594 , 0.9493986 , ..., 0.79311097, 0.90224975,
        0.8391418 ]], dtype=float32)

In [106]:
def recommend(movie):
    index = new_df[new_df['title'] == movie].index[0]
    distances = sorted(list(enumerate(cosine_similarity(vectorized[index].reshape(1,-1), vectorized)[0])), reverse = True, key = lambda x : x[1])
    
    for i in distances[1:10]:
        print(new_df.iloc[i[0]].title)

In [108]:
recommend('Iron Man')

The Battalion
Defiance
The Hunley
Olga
Battle For Haditha
Brother's War
The Taking of Tiger Mountain
In Tranzit
47 Ronin
