In [14]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity

# Toy corpus: three short sentences that share a handful of words
corpus = [
    'Julie loves me more than Linda loves me',
    'Jane likes me more than Julie loves me',
    'harry likes kiwi fruit',
]

# Bag-of-words model: raw term counts, English stop words dropped
vectorizer = CountVectorizer(stop_words='english')
vectors = vectorizer.fit_transform(corpus)

# Vocabulary learned from the corpus, used below as the column labels
feature_names = vectorizer.get_feature_names_out()

# Dense view of the count matrix: one row per sentence, one column per word
df = pd.DataFrame(data=vectors.toarray(), columns=feature_names)
print("\n---Mapping of vectors to words---\n")
print(df)

# Pairwise similarity between the sentences, computed two ways.
# Passing a single matrix compares it against itself.
# NOTE: linear_kernel is a plain dot product. The count vectors are not
# normalised, so its result differs from the cosine similarity here
# (compare the two printed matrices).
cosine_sim = cosine_similarity(vectors)
cosine_sim2 = linear_kernel(vectors)
print('\n--Cosine similarity using cosine similarity function:--\n',cosine_sim)
print('\n--Cosine similarity using Linear Kernel function:--\n',cosine_sim2)


---Mapping of vectors to words---

   fruit  harry  jane  julie  kiwi  likes  linda  loves
0      0      0     0      1     0      0      1      2
1      0      0     1      1     0      1      0      1
2      1      1     0      0     1      1      0      0

--Cosine similarity using cosine similarity function:--
 [[1.         0.61237244 0.        ]
 [0.61237244 1.         0.25      ]
 [0.         0.25       1.        ]]

--Cosine similarity using Linear Kernel function:--
 [[6. 3. 0.]
 [3. 4. 1.]
 [0. 1. 4.]]


In [15]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity

# Same toy corpus as in the count-vector example
corpus = [
    'Julie loves me more than Linda loves me',
    'Jane likes me more than Julie loves me',
    'harry likes kiwi fruit',
]

# TF-IDF model: term weights instead of raw counts, English stop words dropped
vectorizer = TfidfVectorizer(stop_words='english')
vectors = vectorizer.fit_transform(corpus)

# Vocabulary learned from the corpus, used below as the column labels
feature_names = vectorizer.get_feature_names_out()

# Dense view of the TF-IDF matrix: one row per sentence, one column per word
df = pd.DataFrame(data=vectors.toarray(), columns=feature_names)
print("\n---Mapping of vectors to words---\n")
print(df)

# Pairwise similarity between the sentences, computed two ways.
# Passing a single matrix compares it against itself.
# With these TF-IDF vectors the dot product (linear_kernel) and the cosine
# similarity print the same matrix, unlike the raw-count case.
cosine_sim = cosine_similarity(vectors)
cosine_sim2 = linear_kernel(vectors)
print('\n--Cosine similarity using cosine similarity function:--\n',cosine_sim)
print('\n--Cosine similarity using Linear Kernel function:--\n',cosine_sim2)


---Mapping of vectors to words---

      fruit     harry      jane     julie      kiwi     likes    linda  \
0  0.000000  0.000000  0.000000  0.385503  0.000000  0.000000  0.50689   
1  0.000000  0.000000  0.604652  0.459854  0.000000  0.459854  0.00000   
2  0.528635  0.528635  0.000000  0.000000  0.528635  0.402040  0.00000   

      loves  
0  0.771006  
1  0.459854  
2  0.000000  

--Cosine similarity using cosine similarity function:--
 [[1.         0.53182464 0.        ]
 [0.53182464 1.         0.18487962]
 [0.         0.18487962 1.        ]]

--Cosine similarity using Linear Kernel function:--
 [[1.         0.53182464 0.        ]
 [0.53182464 1.         0.18487962]
 [0.         0.18487962 1.        ]]


In [16]:
# Import Pandas
import pandas as pd
#Import TfIdfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer
# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel

# Read the movies metadata table from the CSV next to the notebook
metadata = pd.read_csv('./movies_metadata.csv', low_memory=False)
# Show the first three rows of the full table
print(metadata.iloc[:3])
# Show the plot overview text of the first five movies
print(metadata['overview'].iloc[:5])


   adult                              belongs_to_collection    budget  \
0  False  {'id': 10194, 'name': 'Toy Story Collection', ...  30000000   
1  False                                                NaN  65000000   
2  False  {'id': 119050, 'name': 'Grumpy Old Men Collect...         0   

                                              genres  \
0  [{'id': 16, 'name': 'Animation'}, {'id': 35, '...   
1  [{'id': 12, 'name': 'Adventure'}, {'id': 14, '...   
2  [{'id': 10749, 'name': 'Romance'}, {'id': 35, ...   

                               homepage     id    imdb_id original_language  \
0  http://toystory.disney.com/toy-story    862  tt0114709                en   
1                                   NaN   8844  tt0113497                en   
2                                   NaN  15602  tt0113228                en   

     original_title                                           overview  ...  \
0         Toy Story  Led by Woody, Andy's toys live happily in his ...  ...   
1      

In [17]:
# Replace missing overviews (NaN) with an empty string so that every entry in
# the column is text before it is handed to the TF-IDF vectorizer.
# Note: this overwrites the 'overview' column of `metadata` in place.
metadata['overview'] = metadata['overview'].fillna('')

In [19]:
# TF-IDF vectorizer that ignores English stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')
# Learn the vocabulary from the overviews and build the TF-IDF matrix
# (one row per movie, one column per vocabulary term)
tfidf_matrix = tfidf.fit_transform(metadata['overview'])
# Report the matrix dimensions
print(tfidf_matrix.shape)
# Peek at ten vocabulary terms; position in this array is the column index
vocabulary = tfidf.get_feature_names_out()
print(vocabulary[5000:5010])

(45466, 75827)
['avails' 'avaks' 'avalanche' 'avalanches' 'avallone' 'avalon' 'avant'
 'avanthika' 'avanti' 'avaracious']


In [20]:
# Compute the cosine similarity matrix.
# The TF-IDF rows are L2-normalised by default, so the plain dot product from
# linear_kernel equals the cosine similarity.
# NOTE: the result is a dense (n_movies x n_movies) array, which is large.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
print('cosine similiarity matrix shape:', cosine_sim.shape)
# Construct a reverse map from movie title to row number.
# Series.drop_duplicates() compares the *values* (the row numbers, which are
# always unique), so it never removed repeated titles. De-duplicate on the
# index (the titles) instead, keeping the first occurrence, so that
# indices[title] is always a single integer rather than a Series.
indices = pd.Series(metadata.index, index=metadata['title'])
indices = indices[~indices.index.duplicated(keep='first')]
print(indices[:10])

cosine similiarity matrix shape: (45466, 45466)
title
Toy Story                      0
Jumanji                        1
Grumpier Old Men               2
Waiting to Exhale              3
Father of the Bride Part II    4
Heat                           5
Sabrina                        6
Tom and Huck                   7
Sudden Death                   8
GoldenEye                      9
dtype: int64


In [23]:
# Function that takes in movie title as input and outputs most similar movies
def get_recommendations(title, cosine_sim=cosine_sim, top_n=10):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Key looked up in the module-level ``indices`` Series.
    cosine_sim : 2-D array-like, optional
        Pairwise similarity matrix; row ``i`` holds the scores of movie ``i``
        against every movie. Defaults to the matrix that existed when this
        function was defined.
    top_n : int, optional
        Number of recommendations to return (default 10).

    Returns
    -------
    pandas.Series
        Entries of ``metadata['title']`` for the best matches, highest
        similarity first. The queried movie itself is never included.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    # Get the index of the movie that matches the title
    idx = indices[title]
    # A title that occurs more than once makes ``indices[title]`` a Series
    # instead of a scalar; use its first occurrence so that the row lookup
    # below stays one-dimensional.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    # Pair every *other* movie's position with its similarity score. The
    # queried movie is removed by position rather than by assuming it sorts
    # first, which does not hold when another row ties with it.
    sim_scores = [
        (pos, score)
        for pos, score in enumerate(cosine_sim[idx])
        if pos != idx
    ]
    # Sort by score, best first (stable, so ties keep their row order)
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)
    # Positions of the top_n most similar movies
    movie_indices = [pos for pos, _ in sim_scores[:top_n]]
    # Positions are used with .iloc, i.e. as row numbers of ``metadata``
    return metadata['title'].iloc[movie_indices]

get_recommendations('Father of the Bride Part II')

6793        Father of the Bride
6571                      Kuffs
6306            North to Alaska
19801                   Babbitt
34466         You're Killing Me
13611       The Magic of Méliès
5005                    Wendigo
27974          I Start Counting
43887    George of the Jungle 2
7097         The Out of Towners
Name: title, dtype: object