Step 1: Importing the libraries

In [18]:
#Importing the libraries
import pandas as pd 
import numpy as np 
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk import pos_tag

nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\princ\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\princ\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\princ\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

Step 2: Load the dataset

In [2]:
#Loading the dataset
movie_df = pd.read_csv("C:\\Users\\princ\\Documents\\Machine Learning Projects\\Summer 24\\Natural Language Processing\\Movie Plot Similarity using IMDB dataset\\wiki_movie_plots_deduped.csv")

Step 3: Preprocess the data

In [3]:
#Displaying the top 5 records
movie_df.head(5)

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr..."
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed..."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...


In [4]:
#Picking only the most recent record to reduce the size and complexity of the dataset
movie_df = movie_df[ movie_df['Release Year'] >= 2017 ].reset_index()

In [5]:
#Displaying the top 5 records again to see if the year is changed and index is reset
movie_df.head(5)

Unnamed: 0,index,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,17164,2017,Underworld: Blood Wars,American,Anna Foerster,Anna Foerster (director); Cory Goodman (screen...,"action, horror",https://en.wikipedia.org/wiki/Underworld:_Bloo...,The remaining vampire covens are on the verge ...
1,17165,2017,Monster Trucks,American,Chris Wedge,"Chris Wedge (director); Jonathan Aibel, Glenn ...","animation, adventure, sci-fi",https://en.wikipedia.org/wiki/Monster_Trucks_(...,Terravex Oil is in the midst of a fracking ope...
2,17166,2017,The Bye Bye Man,American,Stacy Title,Stacy Title (director); Jonathan Penner (scree...,horror,https://en.wikipedia.org/wiki/The_Bye_Bye_Man,"In 1969, a mass murder occurs in which a man k..."
3,17167,2017,Sleepless,American,Baran bo Odar,Baran bo Odar (director); Andrea Berloff (scre...,"action, thriller",https://en.wikipedia.org/wiki/Sleepless_(2017_...,"In Las Vegas, vice LVMPD policemen Vincent Dow..."
4,17168,2017,100 Streets,American,Jim O'Hanlon,Jim O'Hanlon (director); Leon F. Butler (scree...,drama,https://en.wikipedia.org/wiki/100_Streets,The film centers on three characters who have ...


In [7]:
#All the preprocessing - Removal of numbers from the word, Removal of stopwords, Stemming the word, 
def isAlpha( word ):
    for character in word:
        if character.isdigit():
            return False
    
    return True

def remove_numbers( plot ):
    words = plot.split()

    number_check_regex = [ word for word in words if isAlpha( word ) ]

    return " ".join(number_check_regex)

def remove_stopwords( plot ):
    words = plot.split()

    ans = [ word for word in words if word not in stopwords.words('english') ]

    return " ".join(ans)


In [8]:
#Testing the functionality of the preprocessing function
print(remove_numbers("Prince i2s a goo0d boy"))
print(isAlpha( "Prin3ce" ))
print(remove_stopwords(" My life is awesome and I am living my dream "))

'My life awesome I living dream'

In [12]:
#Stemming the data
def stem( plot ):

    sentences = sent_tokenize( plot )

    stemmed_words = []

    for sentence in sentences:
        words = word_tokenize( sentence )
        pos_tags = pos_tag(words)

        for word, pos in pos_tags:
            # print(f"{word} -> {pos}")   
            # if pos == 'NN':
            stemmed_words.append( word )


    return " ".join(stemmed_words)

In [9]:
#Initializing the PorterStemmer()
stemmer = PorterStemmer()

In [10]:
#The main preprocessing function that calls the other functionalities
def preprocess( plot ):

    plot = remove_numbers( plot )
    plot = remove_stopwords( plot )
    plot = stem( plot )

    return plot   


In [13]:
#A testing run to check the preprocess()
preprocess("Lets try to party till morning. Since we do not know how long we will stay here.")

'Lets try party till morning . Since know long stay here .'

In [14]:
#The funciton where the preprocessing is called for the dataset
i = 0
while i < len(movie_df):
    plot = movie_df['Plot'][i]
    movie_df.loc[i]['Plot'] = preprocess( plot )
    i += 1
    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_df.loc[i]['Plot'] = preprocess( plot )


Step 4: Extracting the features using TfidfVectorizer

In [15]:
#Initializing and calling the TfIdfVectorizer() 
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer( max_df=0.8, min_df=0.2, max_features=800000, use_idf=True, ngram_range=(1,3) )

tfidf_matrix = tfidf_vectorizer.fit_transform( [ plot for plot in movie_df['Plot'] ] )

In [16]:
#Displaying the Tfidf Matrix
tfidf_matrix

<805x175 sparse matrix of type '<class 'numpy.float64'>'
	with 48743 stored elements in Compressed Sparse Row format>

Step 5: Clustering the Records

In [17]:
#Clustering the dataset
from sklearn.cluster import KMeans

km = KMeans( n_clusters = 5 )

km.fit( tfidf_matrix )

clusters = km.labels_.tolist()

movie_df['cluster'] = clusters

movie_df['cluster'].value_counts()

cluster
1    240
2    192
3    161
0    128
4     84
Name: count, dtype: int64