# Content Based Filtering

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Reading movies file
movies = pd.read_csv('movies.csv', sep=',', encoding='latin-1', usecols=['title', 'genres'])
print(movies.shape)
movies.head()

(9742, 2)


Unnamed: 0,title,genres
0,Toy Story,Adventure|Animation|Children|Comedy|Fantasy
1,Jumanji,Adventure|Children|Fantasy
2,Grumpier Old Men,Comedy|Romance
3,Waiting to Exhale,Comedy|Drama|Romance
4,Father of the Bride Part II,Comedy


In [3]:
# Break up the big genre string into a string array
movies['genres'] = movies['genres'].str.split('|')
movies.head()

Unnamed: 0,title,genres
0,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]"
1,Jumanji,"[Adventure, Children, Fantasy]"
2,Grumpier Old Men,"[Comedy, Romance]"
3,Waiting to Exhale,"[Comedy, Drama, Romance]"
4,Father of the Bride Part II,[Comedy]


In [4]:
# Convert genres to string value
movies['genres'] = movies['genres'].fillna("").astype('str')
movies.head()

Unnamed: 0,title,genres
0,Toy Story,"['Adventure', 'Animation', 'Children', 'Comedy..."
1,Jumanji,"['Adventure', 'Children', 'Fantasy']"
2,Grumpier Old Men,"['Comedy', 'Romance']"
3,Waiting to Exhale,"['Comedy', 'Drama', 'Romance']"
4,Father of the Bride Part II,['Comedy']


In [5]:
# movies['genres'][0]
#movies.iloc[0, 1]
movies.loc[0, 'genres']

"['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy']"

# Recommendation based on Genre

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(movies['genres'])
tfidf_matrix.shape

(9742, 177)

In [7]:
9742*177

1724334

In [8]:
print(tf.vocabulary_)
len(tf.vocabulary_)

{'adventure': 17, 'animation': 33, 'children': 46, 'comedy': 59, 'fantasy': 108, 'adventure animation': 18, 'animation children': 34, 'children comedy': 47, 'comedy fantasy': 63, 'adventure children': 19, 'children fantasy': 51, 'romance': 160, 'comedy romance': 68, 'drama': 96, 'comedy drama': 62, 'drama romance': 103, 'action': 0, 'crime': 73, 'thriller': 168, 'action crime': 5, 'crime thriller': 84, 'action adventure': 1, 'adventure thriller': 30, 'horror': 128, 'comedy horror': 64, 'adventure romance': 28, 'crime drama': 75, 'action comedy': 4, 'comedy crime': 60, 'drama thriller': 105, 'mystery': 147, 'drama horror': 99, 'horror mystery': 131, 'mystery thriller': 151, 'sci': 166, 'fi': 119, 'drama sci': 104, 'sci fi': 167, 'children drama': 50, 'adventure drama': 23, 'drama fantasy': 97, 'fantasy mystery': 113, 'mystery sci': 150, 'fi thriller': 121, 'war': 172, 'drama war': 106, 'adventure fantasy': 24, 'musical': 139, 'drama musical': 101, 'musical romance': 142, 'crime mystery'

177

In [9]:
tfidf_matrix

<9742x177 sparse matrix of type '<class 'numpy.float64'>'
	with 36628 stored elements in Compressed Sparse Row format>

In [None]:
print(tfidf_matrix)

In [11]:
dense = tfidf_matrix.todense()
dense.shape

(9742, 177)

In [17]:
dense[1000,:]

matrix([[0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.39302394, 0.        , 0.        , 0.        ,
         0.63316708, 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0

In [19]:
dense[0]

matrix([[0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.26110809, 0.40088628, 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.32335863, 0.38369483,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.31623031, 0.3681885 , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.16761358,
         0.        , 0.        , 0.        , 0.40514303, 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0

In [26]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_similarity(tfidf_matrix[3], tfidf_matrix[2]).round(3)

array([[0.352]])

In [28]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix).round(3)
print(cosine_sim.shape)
cosine_sim[:8, :8]

(9742, 9742)


array([[1.   , 0.314, 0.061, 0.053, 0.168, 0.   , 0.061, 0.279],
       [0.314, 1.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.728],
       [0.061, 0.   , 1.   , 0.352, 0.365, 0.   , 1.   , 0.   ],
       [0.053, 0.   , 0.352, 1.   , 0.314, 0.   , 0.352, 0.   ],
       [0.168, 0.   , 0.365, 0.314, 1.   , 0.   , 0.365, 0.   ],
       [0.   , 0.   , 0.   , 0.   , 0.   , 1.   , 0.   , 0.   ],
       [0.061, 0.   , 1.   , 0.352, 0.365, 0.   , 1.   , 0.   ],
       [0.279, 0.728, 0.   , 0.   , 0.   , 0.   , 0.   , 1.   ]])

In [30]:
# Build a 1-dimensional array with movie titles
titles = movies['title'] # titles series - index and title of a movie

# building a indices "Series", reversing the index & title - this will help to get an index, given a movie title
indices = pd.Series(movies.index, index=movies['title'])  # index is 'title' and value is the index in the titles series above

In [31]:
print(titles.head())

0                      Toy Story 
1                        Jumanji 
2               Grumpier Old Men 
3              Waiting to Exhale 
4    Father of the Bride Part II 
Name: title, dtype: object


In [34]:
titles[5]

'Heat '

In [35]:
indices['Heat ']

5

In [None]:
print(indices.head())

In [None]:
mn = titles[3]
print(mn)
indices[mn]

In [None]:
titles[2]

In [43]:
idx = indices['Jumanji ']
sim_scores = cosine_sim[idx]
print(sim_scores)
sim_scores = list(enumerate(sim_scores))
print(sim_scores[0:10])
sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
print(sim_scores[0:10])
movie_indices = [i[0] for i in sim_scores]
print(movie_indices[0:5])
print(titles[movie_indices[0:10]])

[0.314 1.    0.    ... 0.    0.    0.   ]
[(0, 0.314), (1, 1.0), (2, 0.0), (3, 0.0), (4, 0.0), (5, 0.0), (6, 0.0), (7, 0.728), (8, 0.0), (9, 0.109)]
[(1, 1.0), (53, 1.0), (109, 1.0), (767, 1.0), (1514, 1.0), (1556, 1.0), (1617, 1.0), (1618, 1.0), (1799, 1.0), (3574, 1.0)]
[1, 53, 109, 767, 1514]
1                                                Jumanji 
53                           Indian in the Cupboard, The 
109                           NeverEnding Story III, The 
767                             Escape to Witch Mountain 
1514                  Darby O'Gill and the Little People 
1556                                        Return to Oz 
1617                              NeverEnding Story, The 
1618         NeverEnding Story II: The Next Chapter, The 
1799                              Santa Claus: The Movie 
3574    Harry Potter and the Sorcerer's Stone (a.k.a. ...
Name: title, dtype: object


In [44]:
# Get the vector for the genre info of each movie
# create a cosine similarity matrix for each movie to all the movies (matrix size 9742 x 9742)
# create 2 series objects 'titles' and 'indices'
# Function that get movie recommendations based on the cosine similarity score of movie genres
def genre_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:21]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

In [45]:
genre_recommendations('Toy Story ')

1706                                                Antz 
2355                                         Toy Story 2 
2809             Adventures of Rocky and Bullwinkle, The 
3000                           Emperor's New Groove, The 
3568                                      Monsters, Inc. 
6194                                           Wild, The 
6486                                     Shrek the Third 
6948                             Tale of Despereaux, The 
7760    Asterix and the Vikings (AstÃ©rix et les Vikin...
8219                                               Turbo 
8927                                   The Good Dinosaur 
9430                                               Moana 
5490    Twelve Tasks of Asterix, The (Les douze travau...
6448                 TMNT (Teenage Mutant Ninja Turtles) 
8357                                      The Lego Movie 
3194                                               Shrek 
7530                                     Gnomeo & Juliet 
7805          

In [46]:
movies['title'][3]

'Waiting to Exhale '

In [47]:
genre_recommendations('Waiting to Exhale ')

10                  American President, The 
47                         Mighty Aphrodite 
52               Postman, The (Postino, Il) 
83                          Beautiful Girls 
165                 Something to Talk About 
191                        Don Juan DeMarco 
198    Eat Drink Man Woman (Yin shi nan nu) 
243                           Nobody's Fool 
309                        Corrina, Corrina 
317                     I Like It Like That 
319                  It Could Happen to You 
330                           Reality Bites 
472                    Sleepless in Seattle 
475                              Son in Law 
631                     Walking and Talking 
650                                    Emma 
658                                 Tin Cup 
676                           Twelfth Night 
680                 Philadelphia Story, The 
691                          Apartment, The 
Name: title, dtype: object

# Recommendation based on Title

In [48]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(movies['title'])
tfidf_matrix.shape

(9742, 20558)

In [49]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
cosine_sim[:4, :4]

array([[1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.]])

In [50]:
# Build a 1-dimensional array with movie titles
titles = movies['title']
indices = pd.Series(movies.index, index=movies['title'])

# Function that get movie recommendations based on the cosine similarity score of movie genres
def title_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:21]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

In [51]:
title_recommendations('Dark Knight ')

7768                     Dark Knight Rises, The 
8032    Batman: The Dark Knight Returns, Part 1 
8080    Batman: The Dark Knight Returns, Part 2 
140                                First Knight 
2417                         Cry in the Dark, A 
5778                          Alone in the Dark 
7375                             Knight and Day 
3576                               Black Knight 
3190                           Knight's Tale, A 
6858                       Alone in the Dark II 
4242                                  Dark Blue 
5060                                  Dark Days 
1305                                  Dark City 
5483                                  Dark Star 
6815                      Batman: Gotham Knight 
5934                                 Dark Water 
4749                        Shot in the Dark, A 
7877                               Dark Shadows 
8766                            The Dark Valley 
6690                      Taxi to the Dark Side 
Name: title, dtype: 

In [53]:
title_recommendations('Shrek the Third ')

5160                                             Shrek 2 
6486                                     Shrek the Third 
7360    Shrek Forever After (a.k.a. Shrek: The Final C...
6915                                     Shrek the Halls 
0                                              Toy Story 
1                                                Jumanji 
2                                       Grumpier Old Men 
3                                      Waiting to Exhale 
4                            Father of the Bride Part II 
5                                                   Heat 
6                                                Sabrina 
7                                           Tom and Huck 
8                                           Sudden Death 
9                                              GoldenEye 
10                               American President, The 
11                           Dracula: Dead and Loving It 
12                                                 Balto 
13            

In [52]:
movies['title'][6486]

'Shrek the Third '