# Importing the data

In [1]:
!pip install rake-nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rake-nltk
  Downloading rake_nltk-1.0.6-py3-none-any.whl (9.1 kB)
Installing collected packages: rake-nltk
Successfully installed rake-nltk-1.0.6


In [2]:
import numpy as np
import pandas as pd
from rake_nltk import Rake
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [29]:
data = pd.read_csv('/content/train.csv')

# Data Pre-processing

In [4]:
data.head()

Unnamed: 0,title,wiki_plot,imdb_plot
0,The Godfather,"On the day of his only daughters wedding, Vito...","In late summer , guests are gathered for the w..."
1,The Shawshank Redemption,"In , banker Andy Dufresne is convicted of murd...","In , Andy Dufresne (Tim Robbins), a banker in ..."
2,Schindler's List,"In , the Germans move Polish Jews into the Kra...",The relocation of Polish Jews from surrounding...
3,Raging Bull,"In a brief scene in , an aging, overweight Ita...","The film opens in , where an older and fatter ..."
4,Casablanca,It is early December . American expatriate Ric...,"In the early years of World War II, December ,..."


In [5]:
data.shape

(90, 3)

In [6]:
data.info()
print('missing values: ',data.isnull().sum().values.sum())
data.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90 entries, 0 to 89
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   title      90 non-null     object
 1   wiki_plot  90 non-null     object
 2   imdb_plot  90 non-null     object
dtypes: object(3)
memory usage: 2.2+ KB
missing values:  0


title        0
wiki_plot    0
imdb_plot    0
dtype: int64

In [7]:
data.describe()

Unnamed: 0,title,wiki_plot,imdb_plot
count,90,90,90
unique,90,90,90
top,The Godfather,"On the day of his only daughters wedding, Vito...","In late summer , guests are gathered for the w..."
freq,1,1,1


In [30]:
data['wiki_plot']=data['wiki_plot'].str.lower()
data.head()

Unnamed: 0,title,wiki_plot,imdb_plot
0,The Godfather,"on the day of his only daughters wedding, vito...","In late summer , guests are gathered for the w..."
1,The Shawshank Redemption,"in , banker andy dufresne is convicted of murd...","In , Andy Dufresne (Tim Robbins), a banker in ..."
2,Schindler's List,"in , the germans move polish jews into the kra...",The relocation of Polish Jews from surrounding...
3,Raging Bull,"in a brief scene in , an aging, overweight ita...","The film opens in , where an older and fatter ..."
4,Casablanca,it is early december . american expatriate ric...,"In the early years of World War II, December ,..."


In [31]:
data['imdb_plot']=data['imdb_plot'].str.lower()
data.head()

Unnamed: 0,title,wiki_plot,imdb_plot
0,The Godfather,"on the day of his only daughters wedding, vito...","in late summer , guests are gathered for the w..."
1,The Shawshank Redemption,"in , banker andy dufresne is convicted of murd...","in , andy dufresne (tim robbins), a banker in ..."
2,Schindler's List,"in , the germans move polish jews into the kra...",the relocation of polish jews from surrounding...
3,Raging Bull,"in a brief scene in , an aging, overweight ita...","the film opens in , where an older and fatter ..."
4,Casablanca,it is early december . american expatriate ric...,"in the early years of world war ii, december ,..."


In [10]:
def remove_whitespace(text):
    return  " ".join(text.split())

data['wiki_plot']=data['wiki_plot'].apply(remove_whitespace)
data['imdb_plot']=data['imdb_plot'].apply(remove_whitespace)

In [26]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [32]:
from nltk import word_tokenize

data['wiki_plot']=data['wiki_plot'].apply(lambda X: word_tokenize(X))
data.head()

Unnamed: 0,title,wiki_plot,imdb_plot
0,The Godfather,"[on, the, day, of, his, only, daughters, weddi...","in late summer , guests are gathered for the w..."
1,The Shawshank Redemption,"[in, ,, banker, andy, dufresne, is, convicted,...","in , andy dufresne (tim robbins), a banker in ..."
2,Schindler's List,"[in, ,, the, germans, move, polish, jews, into...",the relocation of polish jews from surrounding...
3,Raging Bull,"[in, a, brief, scene, in, ,, an, aging, ,, ove...","the film opens in , where an older and fatter ..."
4,Casablanca,"[it, is, early, december, ., american, expatri...","in the early years of world war ii, december ,..."


In [13]:
data['imdb_plot']=data['imdb_plot'].apply(lambda X: word_tokenize(X))
data.head()

Unnamed: 0,title,wiki_plot,imdb_plot
0,The Godfather,"[on, the, day, of, his, only, daughters, weddi...","[in, late, summer, ,, guests, are, gathered, f..."
1,The Shawshank Redemption,"[in, ,, banker, andy, dufresne, is, convicted,...","[in, ,, andy, dufresne, (, tim, robbins, ), ,,..."
2,Schindler's List,"[in, ,, the, germans, move, polish, jews, into...","[the, relocation, of, polish, jews, from, surr..."
3,Raging Bull,"[in, a, brief, scene, in, ,, an, aging, ,, ove...","[the, film, opens, in, ,, where, an, older, an..."
4,Casablanca,"[it, is, early, december, ., american, expatri...","[in, the, early, years, of, world, war, ii, ,,..."


In [34]:
import re
def remove_tag(wiki_plot):
    
    wiki_plot=' '.join(wiki_plot)
    html_pattern = re.compile('<.*?>')
    return html_pattern.sub(r'', wiki_plot)
data['wiki_plot'] = data['wiki_plot'].apply(remove_tag)
data.head()

Unnamed: 0,title,wiki_plot,imdb_plot
0,The Godfather,"on the day of hi onli daughter wed , vito corl...","in late summer , guests are gathered for the w..."
1,The Shawshank Redemption,"in , banker andi dufresn is convict of murder ...","in , andy dufresne (tim robbins), a banker in ..."
2,Schindler's List,"in , the german move polish jew into the krakó...",the relocation of polish jews from surrounding...
3,Raging Bull,"in a brief scene in , an age , overweight ital...","the film opens in , where an older and fatter ..."
4,Casablanca,it is earli decemb . american expatri rick bla...,"in the early years of world war ii, december ,..."


In [33]:
from nltk.stem import PorterStemmer

def stemming(wiki_plot):
    porter = PorterStemmer()
    
    result=[]
    for word in wiki_plot:
        result.append(porter.stem(word))
    return result
data['wiki_plot']=data['wiki_plot'].apply(stemming)
data.head()

Unnamed: 0,title,wiki_plot,imdb_plot
0,The Godfather,"[on, the, day, of, hi, onli, daughter, wed, ,,...","in late summer , guests are gathered for the w..."
1,The Shawshank Redemption,"[in, ,, banker, andi, dufresn, is, convict, of...","in , andy dufresne (tim robbins), a banker in ..."
2,Schindler's List,"[in, ,, the, german, move, polish, jew, into, ...",the relocation of polish jews from surrounding...
3,Raging Bull,"[in, a, brief, scene, in, ,, an, age, ,, overw...","the film opens in , where an older and fatter ..."
4,Casablanca,"[it, is, earli, decemb, ., american, expatri, ...","in the early years of world war ii, december ,..."


In [14]:
!pip install spellchecker

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting spellchecker
  Downloading spellchecker-0.4.tar.gz (3.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.9/3.9 MB[0m [31m35.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting inexactsearch
  Downloading inexactsearch-1.0.2.tar.gz (21 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting soundex>=1.0
  Downloading soundex-1.1.3.tar.gz (9.1 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting silpa_common>=0.3
  Downloading silpa_common-0.3.tar.gz (9.4 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: spellchecker, inexactsearch, silpa_common, soundex
  Building wheel for spellchecker (setup.py) ... [?25l[?25hdone
  Created wheel for spellchecker: filename=spellchecker-0.4-py3-none-any.whl size=3966514 sha256=bceef4e2f9f09380503d4

In [15]:
!pip install pyspellchecker

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspellchecker
  Downloading pyspellchecker-0.7.1-py3-none-any.whl (2.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m25.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyspellchecker
Successfully installed pyspellchecker-0.7.1


In [16]:
from spellchecker import SpellChecker
def spell_check(wiki_plot):
    
    result = []
    spell = SpellChecker()
    for word in wiki_plot:
        correct_word = spell.correction(word)
        result.append(correct_word)
    
    return result

data['wiki_plot'] = data['wiki_plot'].apply(spell_check)
data.head()

Unnamed: 0,title,wiki_plot,imdb_plot
0,The Godfather,"[on, the, day, of, his, only, daughters, weddi...","[in, late, summer, ,, guests, are, gathered, f..."
1,The Shawshank Redemption,"[in, ,, banker, andy, dufresne's, is, convicte...","[in, ,, andy, dufresne, (, tim, robbins, ), ,,..."
2,Schindler's List,"[in, ,, the, germans, move, polish, jews, into...","[the, relocation, of, polish, jews, from, surr..."
3,Raging Bull,"[in, a, brief, scene, in, ,, an, aging, ,, ove...","[the, film, opens, in, ,, where, an, older, an..."
4,Casablanca,"[it, is, early, december, ., american, expatri...","[in, the, early, years, of, world, war, ii, ,,..."


In [17]:
from spellchecker import SpellChecker
def spell_check(imdb_plot):
    
    result = []
    spell = SpellChecker()
    for word in imdb_plot:
        correct_word = spell.correction(word)
        result.append(correct_word)
    
    return result

data['imdb_plot'] = data['imdb_plot'].apply(spell_check)
data.head()

Unnamed: 0,title,wiki_plot,imdb_plot
0,The Godfather,"[on, the, day, of, his, only, daughters, weddi...","[in, late, summer, ,, guests, are, gathered, f..."
1,The Shawshank Redemption,"[in, ,, banker, andy, dufresne's, is, convicte...","[in, ,, andy, dufresne's, (, tim, robbins, ), ..."
2,Schindler's List,"[in, ,, the, germans, move, polish, jews, into...","[the, relocation, of, polish, jews, from, surr..."
3,Raging Bull,"[in, a, brief, scene, in, ,, an, aging, ,, ove...","[the, film, opens, in, ,, where, an, older, an..."
4,Casablanca,"[it, is, early, december, ., american, expatri...","[in, the, early, years, of, world, war, ii, ,,..."


In [18]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [19]:
from nltk.corpus import stopwords
print(stopwords.words('english'))
en_stopwords = stopwords.words('english')

def remove_stopwords(wiki_plot):
    result = []
    for token in wiki_plot:
        if token not in en_stopwords:
            result.append(token)
            
    return result
data['wiki_plot'] = data['wiki_plot'].apply(remove_stopwords)
data.head()

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

Unnamed: 0,title,wiki_plot,imdb_plot
0,The Godfather,"[day, daughters, wedding, ,, vito, corleone, h...","[in, late, summer, ,, guests, are, gathered, f..."
1,The Shawshank Redemption,"[,, banker, andy, dufresne's, convicted, murde...","[in, ,, andy, dufresne's, (, tim, robbins, ), ..."
2,Schindler's List,"[,, germans, move, polish, jews, kraken, ghett...","[the, relocation, of, polish, jews, from, surr..."
3,Raging Bull,"[brief, scene, ,, aging, ,, overweight, italia...","[the, film, opens, in, ,, where, an, older, an..."
4,Casablanca,"[early, december, ., american, expatriate, ric...","[in, the, early, years, of, world, war, ii, ,,..."


In [20]:
from nltk.corpus import stopwords
print(stopwords.words('english'))
en_stopwords = stopwords.words('english')

def remove_stopwords(imdb_plot):
    result = []
    for token in imdb_plot:
        if token not in en_stopwords:
            result.append(token)
            
    return result
data['imdb_plot'] = data['imdb_plot'].apply(remove_stopwords)
data.head()

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

Unnamed: 0,title,wiki_plot,imdb_plot
0,The Godfather,"[day, daughters, wedding, ,, vito, corleone, h...","[late, summer, ,, guests, gathered, wedding, r..."
1,The Shawshank Redemption,"[,, banker, andy, dufresne's, convicted, murde...","[,, andy, dufresne's, (, tim, robbins, ), ,, b..."
2,Schindler's List,"[,, germans, move, polish, jews, kraken, ghett...","[relocation, polish, jews, surrounding, areas,..."
3,Raging Bull,"[brief, scene, ,, aging, ,, overweight, italia...","[film, opens, ,, older, fatter, jake, lotta, (..."
4,Casablanca,"[early, december, ., american, expatriate, ric...","[early, years, world, war, ii, ,, december, ,,..."


# Data Representation

In [35]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import spacy

nlp = spacy.load("en_core_web_sm")


print("With Count Vectorizer")
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(data)
print(vectorizer.get_feature_names_out())
print(X.toarray())
print("\n")
print(cosine_similarity(X))


print("\n\nWith Count Vectorizer and removing stop words")
vectorizer = CountVectorizer(stop_words=nlp.Defaults.stop_words)
X = vectorizer.fit_transform(data)
print(vectorizer.get_feature_names_out())
print(X.toarray())
print("\n")
print(cosine_similarity(X))


print("\n\nWith TFIDFVectorizer")
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data)
print(vectorizer.get_feature_names_out())
print(X.toarray())
print("\n")
print(cosine_similarity(X))


print("\n\nWith TFIDFVectorizer and removing stop words")
vectorizer = TfidfVectorizer(stop_words=nlp.Defaults.stop_words)
X = vectorizer.fit_transform(data)
print(vectorizer.get_feature_names_out())
print(X.toarray())
print("\n")
print(cosine_similarity(X))



With Count Vectorizer
['imdb_plot' 'title' 'wiki_plot']
[[0 1 0]
 [0 0 1]
 [1 0 0]]


[[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]


With Count Vectorizer and removing stop words
['imdb_plot' 'title' 'wiki_plot']
[[0 1 0]
 [0 0 1]
 [1 0 0]]


[[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]


With TFIDFVectorizer
['imdb_plot' 'title' 'wiki_plot']
[[0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]]


[[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]


With TFIDFVectorizer and removing stop words
['imdb_plot' 'title' 'wiki_plot']
[[0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]]


[[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]




In [36]:
import pandas as pd
import gensim.downloader as api
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load pre-trained word2vec model
w2v_model = api.load('word2vec-google-news-300')



In [37]:
# Create a TfidfVectorizer object to tokenize and count the words in the plot column
vectorizer = TfidfVectorizer(stop_words='english')

In [38]:
wiki_plot_matrix = vectorizer.fit_transform(data['wiki_plot'])

In [39]:
# Get the user query
query = input('Enter a query: ')

Enter a query: thriller and action


In [42]:
# Convert the query into a vector using TF-IDF
imdb_plot_vector = vectorizer.transform([query]).toarray().flatten()
# Calculate the cosine similarity between the query vector and the movie plot vectors
cosine_similarities_tfidf = cosine_similarity([imdb_plot_vector], wiki_plot_matrix).flatten()
# Get the indices of the top 10 movies with the highest cosine similarity to the query
top_indices1 = cosine_similarities_tfidf.argsort()[:-11:-1]
# Get the movie titles and plot descriptions for the top 10 movies
top_movies1 = data.iloc[top_indices1][['title', 'wiki_plot']]


In [43]:
# Print the top 10 movie recommendations to the user
print('Top 10 movie recommendations:')
for i, row in top_movies1.iterrows():
    print('\nTitle:', row['title'])


Top 10 movie recommendations:

Title: Saving Private Ryan

Title: On the Waterfront

Title: Doctor Zhivago

Title: American Graffiti

Title: A Place in the Sun

Title: Sunset Blvd.

Title: Braveheart

Title: Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb

Title: The Apartment

Title: It's a Wonderful Life


In [44]:
# Tokenize and clean the query
query_words = query.split()
query_words = [w for w in query_words if w in w2v_model.vocab]

# Calculate the mean vector for the query using word2vec
imdb_plot_vector_w2v = np.mean([w2v_model[w] for w in query_words], axis=0)

# Convert the query into a vector using TF-IDF
imdb_plot_vector_tfidf = vectorizer.transform([query]).toarray().flatten()

# Concatenate the two vectors into a single representation
imdb_plot_vector = np.concatenate((imdb_plot_vector_tfidf, imdb_plot_vector_w2v))

# Calculate the cosine similarity between the query vector and the movie plot vectors
wiki_plot_vectors = [np.concatenate((wiki_plot_matrix[i].toarray().flatten(), np.mean([w2v_model[w] for w in p.split() if w in w2v_model.vocab], axis=0))) for i, p in enumerate(data['wiki_plot'])]
cosine_similarities = cosine_similarity([imdb_plot_vector], wiki_plot_vectors).flatten()

# Get the indices of the top 10 movies with the highest cosine similarity to the query
top_indices = cosine_similarities.argsort()[:-11:-1]

# Get the movie titles and plot descriptions for the top 10 movies
top_movies = data.iloc[top_indices][['title', 'wiki_plot']]


In [45]:
# Print the top 10 movie recommendations to the user
print('Top 10 movie recommendations:')
for i, row in top_movies.iterrows():
    print('\nTitle:', row['title'])

Top 10 movie recommendations:

Title: On the Waterfront

Title: Singin' in the Rain

Title: Rocky

Title: Jaws

Title: The Lord of the Rings: The Return of the King

Title: Unforgiven

Title: Sunset Blvd.

Title: Taxi Driver

Title: Saving Private Ryan

Title: Gladiator


In [46]:
# Convert the query into a vector using word2vec
imdb_plot_vec = np.zeros(300)
num_words = 0
for word in query.split():
    if word in w2v_model:
        imdb_plot_vec += w2v_model[word]
        num_words += 1
if num_words > 0:
    imdb_plot_vec /= num_words


In [48]:
# Calculate the cosine similarity between the query vector and the movie plot vectors
wiki_plot_matrix = np.zeros((len(data), 300))
for i, wiki_plot in enumerate(data['wiki_plot']):
    wiki_plot_vec = np.zeros(300)
    num_words = 0
    for word in wiki_plot.split():
        if word in w2v_model:
            wiki_plot_vec += w2v_model[word]
            num_words += 1
    if num_words > 0:
        wiki_plot_vec /= num_words
    wiki_plot_matrix[i] = wiki_plot_vec

cosine_similarities_w2v = cosine_similarity([imdb_plot_vec], wiki_plot_matrix).flatten()
# Get the indices of the top 10 movies with the highest cosine similarity to the query
top_indices2 = cosine_similarities_w2v.argsort()[:-11:-1]

In [49]:
# Get the movie titles and plot descriptions for the top 10 movies
for i, index in enumerate(top_indices2):
    movie_title = data.loc[index, 'title']
    print(f"Recommendation {i+1}: {movie_title}\n")


Recommendation 1: Singin' in the Rain

Recommendation 2: Rocky

Recommendation 3: The Lord of the Rings: The Return of the King

Recommendation 4: On the Waterfront

Recommendation 5: Unforgiven

Recommendation 6: Jaws

Recommendation 7: Gladiator

Recommendation 8: Butch Cassidy and the Sundance Kid

Recommendation 9: Taxi Driver

Recommendation 10: The Treasure of the Sierra Madre

