In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from scipy.sparse import csr_matrix  # as TF-IDF matrix isn't already sparse, we are using this library
from sklearn.metrics.pairwise import cosine_similarity

<h2>Load Cleaned Movies Dataset</h2>

In [2]:
df = pd.read_pickle("../datasets/clean/movies_df.pkl")  

In [3]:
df.shape

(31516, 28)

In [4]:
df

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,status,tagline,title,video,vote_average,vote_count,cast,director,producer,text_corpus
0,False,Toy Story Collection,30000000,"Animation, Comedy, Family",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,Released,,Toy Story,False,7.7,5415.0,"Tom Hanks, Tim Allen, Don Rickles, Jim Varney,...",John Lasseter,"Bonnie Arnold, Ralph Guggenheim","Toy Story, Animation, Comedy, Family, Tom Hank..."
1,False,,65000000,"Adventure, Fantasy, Family",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,"Robin Williams, Jonathan Hyde, Kirsten Dunst, ...",Joe Johnston,"Scott Kroopf, William Teitler","Jumanji, Adventure, Fantasy, Family, Robin Wil..."
2,False,Grumpy Old Men Collection,0,"Romance, Comedy",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,"Walter Matthau, Jack Lemmon, Ann-Margret, Soph...",Howard Deutch,,"Grumpier Old Men, Romance, Comedy, Walter Matt..."
3,False,,16000000,"Comedy, Drama, Romance",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,"Whitney Houston, Angela Bassett, Loretta Devin...",Forest Whitaker,"Ronald Bass, Ezra Swerdlow, Deborah Schindler,...","Waiting to Exhale, Comedy, Drama, Romance, Whi..."
4,False,Father of the Bride Collection,0,Comedy,,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,"Steve Martin, Diane Keaton, Martin Short, Kimb...",Charles Shyer,Nancy Meyers,"Father of the Bride Part II, Comedy, Steve Mar..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45531,False,,0,Science Fiction,,222848,tt0112613,en,Caged Heat 3000,It's the year 3000 AD. The world's most danger...,...,Released,,Caged Heat 3000,False,3.5,1.0,"Lisa Boyle, Kena Land, Zaneta Polard, Don Yana...",Aaron Osborne,Mike Upton,"Caged Heat 3000, Science Fiction, Lisa Boyle, ..."
45532,False,,0,"Drama, Action, Romance",,30840,tt0102797,en,Robin Hood,"Yet another version of the classic epic, with ...",...,Released,,Robin Hood,False,5.7,26.0,"Patrick Bergin, Uma Thurman, David Morrissey, ...",John Irvin,Sarah Radclyffe,"Robin Hood, Drama, Action, Romance, Patrick Be..."
45535,False,,0,"Action, Drama, Thriller",,67758,tt0303758,en,Betrayal,"When one of her hits goes wrong, a professiona...",...,Released,A deadly game of wits.,Betrayal,False,3.8,6.0,"Erika Eleniak, Adam Baldwin, Julie du Page, Ja...",Mark L. Lester,,"Betrayal, Action, Drama, Thriller, Erika Eleni..."
45536,False,,0,,,227506,tt0008536,en,Satana likuyushchiy,"In a small town live two brothers, one a minis...",...,Released,,Satan Triumphant,False,0.0,0.0,"Iwan Mosschuchin, Nathalie Lissenko, Pavel Pav...",Yakov Protazanov,Joseph N. Ermolieff,"Satan Triumphant, , Iwan Mosschuchin, Nathalie..."


In [5]:
df['text_corpus'].head()

0    Toy Story, Animation, Comedy, Family, Tom Hank...
1    Jumanji, Adventure, Fantasy, Family, Robin Wil...
2    Grumpier Old Men, Romance, Comedy, Walter Matt...
3    Waiting to Exhale, Comedy, Drama, Romance, Whi...
4    Father of the Bride Part II, Comedy, Steve Mar...
Name: text_corpus, dtype: object

<h2>Reference</h2>

Ref: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

In [6]:
# example
from sklearn.feature_extraction.text import CountVectorizer
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names_out())
print(vectorizer.vocabulary_)
print(X.shape)

# Get the feature names (terms)
terms = vectorizer.get_feature_names_out()

# Convert the sparse matrix to a dense array
dense_matrix = X.toarray()

# Create a Pandas DataFrame for better visualization
df_count = pd.DataFrame(dense_matrix, columns=terms)
df_count.index = [f'Document {i+1}' for i in range(len(corpus))]

print("\nCount Vectorizer Output:")
print(df_count)


['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']
{'this': 8, 'is': 3, 'the': 6, 'first': 2, 'document': 1, 'second': 5, 'and': 0, 'third': 7, 'one': 4}
(4, 9)

Count Vectorizer Output:
            and  document  first  is  one  second  the  third  this
Document 1    0         1      1   1    0       0    1      0     1
Document 2    0         2      0   1    0       1    1      0     1
Document 3    1         0      0   1    1       0    1      1     1
Document 4    0         1      1   1    0       0    1      0     1


Ref: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

In [7]:
# example
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]
# create instance # will be used to learn the vocabulary and calculate the TF-IDF vectors
vectorizer = TfidfVectorizer()
# fit method analyze the corpus to learn the vocabulary (unique words) and calculate the IDF values for each word
# A simplified vocabulary might look like:  {'this': 0, 'is': 1, 'the': 2, 'first': 3, 'document': 4, 'second': 5, 'and': 6, 'third': 7, 'one': 8}.  (The numbers are just indices)
# transform method takes the corpus and coverts each documents into a TF-IDF vector based on the vocabulary and IDF values learned during the fit() step
# The transform() step calculates TF-IDF scores for each word in each document.  For example, the first document "This is the first document" would be represented by a vector where the values at indices 0, 1, 2, 3, and 4 would have non-zero TF-IDF scores, and the other indices would be zero
X = vectorizer.fit_transform(corpus)
# it returns the list of words that the vectorizer has learned from corpus
print(vectorizer.get_feature_names_out())
# key: unique words found in corpus, value: it associated with each word are integer indices (e.g. the word 'this' has an index of 8)
print(vectorizer.vocabulary_)
# (number_of_documents, number_of_unique_words)
# The number of documents is 4 (the length of corpus) 
# The number of unique words depends on corpus. Let's say, after removing punctuation and converting to lowercase, we have 9 unique words.
print(X.shape)

# Get the feature names (terms)
terms = vectorizer.get_feature_names_out()

# Convert the sparse matrix to a dense array
dense_matrix = X.toarray()

# Create a Pandas DataFrame for better visualization
df_tfidf = pd.DataFrame(dense_matrix, columns=terms)
df_tfidf.index = [f'Document {i+1}' for i in range(len(corpus))]

print("\nTF-IDF Vectorizer Output:")
print(df_tfidf)



['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']
{'this': 8, 'is': 3, 'the': 6, 'first': 2, 'document': 1, 'second': 5, 'and': 0, 'third': 7, 'one': 4}
(4, 9)

TF-IDF Vectorizer Output:
                 and  document     first        is       one    second  \
Document 1  0.000000  0.469791  0.580286  0.384085  0.000000  0.000000   
Document 2  0.000000  0.687624  0.000000  0.281089  0.000000  0.538648   
Document 3  0.511849  0.000000  0.000000  0.267104  0.511849  0.000000   
Document 4  0.000000  0.469791  0.580286  0.384085  0.000000  0.000000   

                 the     third      this  
Document 1  0.384085  0.000000  0.384085  
Document 2  0.281089  0.000000  0.281089  
Document 3  0.267104  0.511849  0.267104  
Document 4  0.384085  0.000000  0.384085  


Ref: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.linear_kernel.html
<br/><br/>
Linear Kernel calculates the similarity between two data points as the dot product of the points.
<ul>
<li>Dot Product:  If you have two vectors (data points), their dot product is calculated by multiplying the corresponding components and summing the results.
<br/>
For example, if a = [a1, a2, ..., an] and b = [b1, b2, ..., bn], then their dot product is:
<br/>
a · b = a1*b1 + a2*b2 + ... + an*bn
</li>
<li>Linear Kernel: The linear kernel simply returns this dot product as the similarity measure.</li>
</ul>

In [8]:
# example
from sklearn.metrics.pairwise import linear_kernel
# a matrix representing two data points.  Each row is a data point (a vector)
X = [[0, 0, 0], [1, 1, 1]]
# another matrix, also representing two data points
Y = [[1, 0, 0], [1, 1, 0]]
# calculates the linear kernel between all pairs of data points in X and Y
lk_matrix = linear_kernel(X, Y)
print(lk_matrix.shape) # Since X has 2 data points and Y has 2 data points, the resulting matrix will be 2x2
print(lk_matrix) # print kernel matrix

(2, 2)
[[0. 0.]
 [1. 2.]]


The linear_kernel function takes two matrices (or vectors) and calculates the dot product between all pairs of data points, returning a matrix of similarity scores.  This is a fundamental operation in many machine learning algorithms, especially those that rely on kernel methods.
<br/>
the values in the X matrix that gets printed:
<br/>
X[0, 0] (dot product of [0, 0, 0] and [1, 0, 0]): (0*1) + (0*0) + (0*0) = 0<br/>
X[0, 1] (dot product of [0, 0, 0] and [1, 1, 0]): (0*1) + (0*1) + (0*0) = 0<br/>
X[1, 0] (dot product of [1, 1, 1] and [1, 0, 0]): (1*1) + (1*0) + (1*0) = 1<br/>
X[1, 1] (dot product of [1, 1, 1] and [1, 1, 0]): (1*1) + (1*1) + (1*0) = 2

Ref: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.cosine_similarity.html

In [9]:
# example
from sklearn.metrics.pairwise import cosine_similarity
X = [[0, 0, 0], [1, 1, 1]]
Y = [[1, 0, 0], [1, 1, 0]]
cs_matrix = cosine_similarity(X, Y)
print(cs_matrix.shape) 
print(cs_matrix)

(2, 2)
[[0.         0.        ]
 [0.57735027 0.81649658]]


It computes the cosine of the angle between vectors in X and vectors in Y.
cosine_similarity(A,B) = A⋅B / ∥A∥ ⋅ ∥B∥
Where:
A⋅B is the dot product of vectors A and B
∥A∥ and ∥B∥ are the Euclidean norms (magnitudes) of A and B

<h2>Recommendations with traditional approaches</h2>

<h3>TF-IDF</h3>

In [10]:
tfidf_vector = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vector.fit_transform(df['text_corpus'])
tfidf_matrix.shape

(31516, 123164)

In [11]:
# Verify idf` exists
if hasattr(tfidf_vector, 'idf_'):
    print("idf exists before saving.")
else:
    print("idf is missing before saving.")

idf exists before saving.


In [12]:

# Inspect the vectorizer attributes
print("Vocabulary:", tfidf_vector.vocabulary_)
print("Vocabulary size:", len(tfidf_vector.vocabulary_))
print("Feature names:", tfidf_vector.get_feature_names_out())
print("idf_ attribute exists:", hasattr(tfidf_vector, 'idf_'))
print("shape:", tfidf_matrix.shape)

Vocabulary size: 123164
Feature names: ['00' '000' '000km' ... '莫玛' '连姆' '高桥一生']
idf_ attribute exists: True
shape: (31516, 123164)


In [13]:
tfidf_vector.get_feature_names_out()[5000:5010]

array(['antisocial', 'antista', 'antiterrorism', 'antithesis',
       'antitrust', 'antivero', 'antiviral', 'antiwar', 'antje',
       'antlers'], dtype=object)

In [14]:
# Convert to CSR sparse matrix 
tfidf_matrix_sparse = csr_matrix(tfidf_matrix)

In [15]:
linear_kernel_matrix = linear_kernel(tfidf_matrix_sparse)
print(linear_kernel_matrix.shape)

(31516, 31516)


In [16]:
cosine_similarity_matrix = cosine_similarity(tfidf_matrix_sparse)
print(cosine_similarity_matrix.shape)

(31516, 31516)


In [17]:
cosine_similarity_matrix

array([[1.        , 0.02460155, 0.01097632, ..., 0.00979532, 0.00475992,
        0.        ],
       [0.02460155, 1.        , 0.02748083, ..., 0.02098806, 0.01500929,
        0.00525394],
       [0.01097632, 0.02748083, 1.        , ..., 0.        , 0.01013821,
        0.009939  ],
       ...,
       [0.00979532, 0.02098806, 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.00475992, 0.01500929, 0.01013821, ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.00525394, 0.009939  , ..., 0.        , 0.        ,
        1.        ]])

<h3>Count Vectorizer</h3>

In [18]:
count_vector = CountVectorizer(stop_words='english')
count_matrix = count_vector.fit_transform(df['text_corpus'])
print(count_matrix.shape)

(31516, 123164)


In [19]:
cosine_similarity_matrix_2 = cosine_similarity(count_matrix, count_matrix)
print(cosine_similarity_matrix_2.shape)

(31516, 31516)


In [20]:
cosine_similarity_matrix_2

array([[1.        , 0.03851142, 0.06991127, ..., 0.02904046, 0.00984231,
        0.        ],
       [0.03851142, 1.        , 0.03755873, ..., 0.07800765, 0.02643813,
        0.01754656],
       [0.06991127, 0.03755873, 1.        , ..., 0.        , 0.01279844,
        0.02548236],
       ...,
       [0.02904046, 0.07800765, 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.00984231, 0.02643813, 0.01279844, ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.01754656, 0.02548236, ..., 0.        , 0.        ,
        1.        ]])

##### Recommendations by given movie title with similarity score using linear kernel or consine similarity of TF-IDF or Count Vectorizer

In [21]:
def get_recommendations(movie_title, matrix=cosine_similarity_matrix, n=5):

    try:
        # get index from dataframe
        index = df[df['title'] == movie_title].index[0]
        # print(index)
        
        # sort top n similar movies     
        similar_movies = sorted(list(enumerate(matrix[index])), reverse=True, key=lambda x: x[1]) 
        # print(similar_movies[1:6])
        
        # extract names from dataframe and return movie names
        recommendations = []
        for i in similar_movies[1:n+1]:
            similar_movie_title = df.iloc[i[0]].title
            similarity_score = i[1]
            recommendations.append((similar_movie_title, similarity_score))
            
        return recommendations
    
    except IndexError:
        return f"Movie '{movie_title}' not found in the dataset."

In [22]:
# using tf-idf linear kernel
get_recommendations('Toy Story', linear_kernel_matrix)

[('Toy Story 2', 0.4749337022705545),
 ('Toy Story 3', 0.36463412674469725),
 ('Small Fry', 0.22560878224550626),
 ('Toy Story of Terror!', 0.17699142355055816),
 ('Hawaiian Vacation', 0.1708619790688143)]

In [23]:
# using tf-idf consine similarity
get_recommendations('Toy Story', cosine_similarity_matrix)

[('Toy Story 2', 0.4749337022705545),
 ('Toy Story 3', 0.36463412674469725),
 ('Small Fry', 0.22560878224550632),
 ('Toy Story of Terror!', 0.1769914235505582),
 ('Hawaiian Vacation', 0.1708619790688143)]

In [24]:
# using count vectorizer consine similarity
get_recommendations('Toy Story', cosine_similarity_matrix_2)

[('Toy Story 2', 0.5456800796984003),
 ('Toy Story 3', 0.4634434265388623),
 ('Small Fry', 0.29137276844995375),
 ('Partysaurus Rex', 0.25001476973977005),
 ('Sleep', 0.24942329796316196)]

In [25]:
# checking lowercase letter
get_recommendations('toy story')

"Movie 'toy story' not found in the dataset."

In [26]:
# get_recommendations('The Dark Knight')

##### Recommendations by given any keyword with similarity score using linear kernel or consine similarity of TF-IDF or Count Vectorizer

In [27]:
def get_recommendations_by_keyword(keywords, vector=tfidf_vector, matrix=tfidf_matrix, n=5):

    try:
        keywords = keywords.split()
        keywords = " ".join(keywords)
        
        # transform the string to vector representation
        key_matrix = vector.transform([keywords]) 
        
        # compute cosine similarity    
        result = cosine_similarity(key_matrix, matrix)
        # print(result)

        # sort top n similar movies   
        similar_movies = sorted(list(enumerate(result[0])), reverse=True, key=lambda x: x[1])
        # print(similar_movies[1:6])
        
        # extract names from dataframe and return movie names
        recommendations = []
        for i in similar_movies[1:n+1]:
            similar_movie_title = df.iloc[i[0]].title
            similarity_score = i[1]
            recommendations.append((similar_movie_title, similarity_score))
            
        return recommendations
    
    except IndexError:
        return f"Keyword '{keywords}' not found in the dataset."

In [28]:
# using tfidf_vector and tfidf_matrix
get_recommendations_by_keyword('toy story')

[('Silent Night, Deadly Night 5: The Toy Maker', 0.3820793935505063),
 ('Tin Toy', 0.33084382056139966),
 ('Toy Story That Time Forgot', 0.32936315928497806),
 ('The Christmas Toy', 0.32081757869589556),
 ('Toy Story 2', 0.309486398470801)]

In [29]:
# using count_vector and count_matrix
get_recommendations_by_keyword('toy story', count_vector, count_matrix)

[('Toy Story That Time Forgot', 0.3621429841700741),
 ('The Sea That Thinks', 0.35856858280031806),
 ('Ten Canoes', 0.3517987723651459),
 ('If These Knishes Could Talk: The Story of the NY Accent',
  0.3333333333333333),
 ('Grand Theft Parsons', 0.28426762180748055)]

In [30]:
# using tfidf_vector and tfidf_matrix
get_recommendations_by_keyword('The Dark Knight Rises')

[('The Batman Shootings', 0.21336318408852634),
 ('Dr. Phibes Rises Again', 0.20492392152909703),
 ('Sword of the Valiant: The Legend of Sir Gawain and the Green Knight',
  0.18459546328679338),
 ('The Dark Below', 0.18305746581742885),
 ('Black Angel', 0.16305158650936977)]

In [31]:
# using tfidf_vector and tfidf_matrix
get_recommendations_by_keyword('joker')

[('Batman Beyond: Return of the Joker', 0.31516737970338876),
 ('Batman Unlimited: Monster Mayhem', 0.20066486925931895),
 ('The Defiant Ones', 0.19738118526040943),
 ('Lego Batman: The Movie - DC Super Heroes Unite', 0.18496762448969511),
 ('Laughter in Paradise', 0.18454923952895036)]

In [32]:
# using tfidf_vector and tfidf_matrix
get_recommendations_by_keyword('Christopher Nolan')

[('Interstellar', 0.2528548463087272),
 ('The Many Faces of Christopher Lee', 0.252208361629631),
 ('Inception', 0.24643957199522087),
 ('The Prestige', 0.23323580524987994),
 ('The Dark Knight Rises', 0.23027485040632467)]

In [33]:
# using tfidf_vector and tfidf_matrix
get_recommendations_by_keyword('chris hemsworth')

[('Team Thor', 0.27853759475960577),
 ('Empire State', 0.24296530433308566),
 ('Triangle', 0.19470764022884313),
 ('Chris Tucker Live', 0.18203206004956396),
 ('Kill Me Three Times', 0.17683026956473769)]

<h2>Saving Computed Matrix for Streamlit</h2>

In [34]:
import joblib

In [35]:
np.save('../models/cosine_similarity_matrix.npy', cosine_similarity_matrix)

In [36]:
joblib.dump(tfidf_vector, '../models/tfidf_vectorizer.joblib')
joblib.dump(tfidf_matrix, '../models/tfidf_matrix.joblib')

['../models/tfidf_matrix.joblib']

<h2>Saving Dataframe for Streamlit</h2>

In [37]:
streamlit_movies_df = df[['id', 'title']]
streamlit_movies_df.shape

(31516, 2)

In [38]:
streamlit_movies_df.to_pickle("../datasets/clean/streamlit_movies_df.pkl") 