In [None]:
# Lookup table from TMDB numeric genre id to its display name; used to turn a
# movie's `genre_ids` list into readable genre labels.
GENRE_MAP = dict([
    (28, "Action"),
    (12, "Adventure"),
    (16, "Animation"),
    (35, "Comedy"),
    (80, "Crime"),
    (99, "Documentary"),
    (18, "Drama"),
    (10751, "Family"),
    (14, "Fantasy"),
    (36, "History"),
    (27, "Horror"),
    (10402, "Music"),
    (9648, "Mystery"),
    (10749, "Romance"),
    (878, "Science Fiction"),
    (10770, "TV Movie"),
    (53, "Thriller"),
    (10752, "War"),
    (37, "Western"),
])


In [None]:
import os

# TMDB API key. Taken from the TMDB_API_KEY environment variable when it is
# set; otherwise falls back to the value that used to be hardcoded here so
# existing runs keep working.
# NOTE(review): a key committed inside a notebook should be treated as leaked;
# rotate it and remove the fallback once TMDB_API_KEY is configured.
API_KEY = os.environ.get("TMDB_API_KEY", "8265bd1679663a7ea12ac168da84d2e8")

In [None]:
import requests
import pandas as pd

In [None]:
def fetch_movies(pages=471, api_key=None):
    """Download TMDB "top rated" movies and return them as a list of records.

    Parameters
    ----------
    pages : int, default 471
        Number of result pages to request, starting at page 1.
    api_key : str, optional
        TMDB API key. When omitted, the notebook-level ``API_KEY`` is used.

    Returns
    -------
    list of dict
        One dict per movie with the keys ``movie_title`` (the original title),
        ``description`` (the overview) and ``movie_genre`` (genre names joined
        with ", "; ids missing from ``GENRE_MAP`` become "Unknown").

    Raises
    ------
    requests.HTTPError
        If TMDB answers a page request with an error status.
    """
    endpoint = "https://api.themoviedb.org/3/movie/top_rated"
    key = API_KEY if api_key is None else api_key
    movies = []

    for page in range(1, pages + 1):
        # Send the loop's page number. The URL previously hardcoded page=1, so
        # every iteration re-downloaded the same results and the dataset was
        # one page repeated `pages` times.
        params = {"api_key": key, "language": "en-US", "page": page}
        response = requests.get(endpoint, params=params, timeout=30)
        # Fail loudly on HTTP errors instead of hitting a KeyError on "results".
        response.raise_for_status()
        payload = response.json()

        for movie in payload.get("results", []):
            genre_ids = movie.get("genre_ids", [])
            genres = ", ".join(
                GENRE_MAP.get(genre_id, "Unknown") for genre_id in genre_ids
            )

            movies.append({
                "movie_title": movie.get("original_title"),
                "description": movie.get("overview"),
                "movie_genre": genres
            })

    return movies


In [None]:
# Fetch the movie records, load them into a DataFrame and persist them as a
# CSV (without the index column) in the current working directory.
data = fetch_movies(pages=471)  # requests 471 result pages

df = pd.DataFrame(data)
df.to_csv("movie_dataset.csv", index=False)

print("Dataset saved successfully!")

Dataset saved successfully!


In [None]:
# Reload the dataset from the same relative path that `df.to_csv` wrote it to,
# instead of an absolute "/content/..." path that only exists when the working
# directory happens to be /content.
df = pd.read_csv("movie_dataset.csv")

In [None]:
df.head()

Unnamed: 0,movie_title,description,movie_genre
0,The Shawshank Redemption,Imprisoned in the 1940s for the double murder ...,"Drama, Crime"
1,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...","Drama, Crime"
2,The Godfather Part II,In the continuing saga of the Corleone crime f...,"Drama, Crime"
3,Schindler's List,The true story of how businessman Oskar Schind...,"Drama, History, War"
4,12 Angry Men,The defense and the prosecution have rested an...,Drama


In [None]:
df.shape

(9420, 3)

In [None]:
df['description'][300].lower()

'imprisoned in the 1940s for the double murder of his wife and her lover, upstanding banker andy dufresne begins a new life at the shawshank prison, where he puts his accounting skills to work for an amoral warden. during his long stretch in prison, dufresne comes to be admired by the other inmates -- including an older prisoner named red -- for his integrity and unquenchable sense of hope.'

In [None]:
# lowercasing
# From here on `df` is the lowercased `description` column (a Series), not the
# full DataFrame. Empty descriptions are read back from the CSV as NaN, which
# the string-only cleaning helpers applied later cannot handle, so they are
# replaced with empty strings before lowercasing.
df = df['description'].fillna('').str.lower()

In [None]:
df

Unnamed: 0,description
0,imprisoned in the 1940s for the double murder ...
1,"spanning the years 1945 to 1955, a chronicle o..."
2,in the continuing saga of the corleone crime f...
3,the true story of how businessman oskar schind...
4,the defense and the prosecution have rested an...
...,...
9415,a man with a low iq has accomplished great thi...
9416,"the true story of henry hill, a half-irish, ha..."
9417,a samurai answers a village's request for prot...
9418,"in the final months of world war ii, 14-year-o..."


In [None]:
# removing URLs, in case the text contains any
import re
# Compiled once instead of on every call. IGNORECASE lets the bare-domain
# branch match "www." as well as "WWW."; the descriptions are lowercased before
# this helper is applied, so an uppercase-only branch could never match.
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+', re.IGNORECASE)


def remove_urls(text):
  """Return `text` with URL-like substrings deleted.

  A URL here is either "http://" / "https://" followed by non-whitespace
  characters, or "www." followed by non-whitespace characters (any letter
  case). Each match is replaced by an empty string; surrounding whitespace is
  left untouched.
  """
  return _URL_PATTERN.sub('', text)

In [None]:
# Run remove_urls on every description and rebind `df` to the cleaned result.
df = df.apply(remove_urls)

In [None]:
# removing punctuation
import string,time
# The ASCII punctuation characters to strip from the descriptions
# (string.punctuation); the bare name echoes the string for inspection.
exclude = string.punctuation
exclude

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [None]:
# removing punctuation
# Deletion table for every character in string.punctuation (the same characters
# as the notebook's `exclude`), built once rather than on every call.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def remove_punc(text):
  """Return `text` with all ASCII punctuation characters deleted.

  Characters are removed, not replaced by spaces, so hyphenated words are
  joined together (e.g. "would-be" becomes "wouldbe").
  """
  return text.translate(_PUNCT_TABLE)

In [None]:
# Run remove_punc on every description and rebind `df` to the cleaned result.
df = df.apply(remove_punc)

In [None]:
print(df[321])

spanning the years 1945 to 1955 a chronicle of the fictional italianamerican corleone crime family when organized crime family patriarch vito corleone barely survives an attempt on his life his youngest son michael steps in to take care of the wouldbe killers launching a campaign of bloody revenge


In [None]:
# there are no shorthand words

In [None]:
# spelling correction
from textblob import TextBlob
def correct_spell(text):
  """Return `text` after passing it through TextBlob's spelling correction.

  The text is wrapped in a TextBlob, `.correct()` is called on it, and the
  result is converted back to a plain `str`.

  Note: in this notebook's sample run the correction also rewrote words that
  were not misspelled (e.g. "dufresne" -> "duquesne", "amoral warden" ->
  "moral garden").
  """
  blob = TextBlob(text)
  corrected = blob.correct()
  return str(corrected)

In [None]:
# Pick one cleaned description (label 300) to try the spell checker on, and
# echo it for comparison with the corrected version.
sample = df[300]
sample

'imprisoned in the 1940s for the double murder of his wife and her lover upstanding banker andy dufresne begins a new life at the shawshank prison where he puts his accounting skills to work for an amoral warden during his long stretch in prison dufresne comes to be admired by the other inmates  including an older prisoner named red  for his integrity and unquenchable sense of hope'

In [None]:
correct_spell(sample)

'imprisoned in the 1940s for the double murder of his wife and her lover standing banker andy duquesne begins a new life at the shawshank prison where he puts his accounting skill to work for an moral garden during his long stretch in prison duquesne comes to be admired by the other inmates  including an older prisoner named red  for his integrity and unquenchable sense of hope'

In [None]:
# removing stopwords
import nltk
from nltk.corpus import stopwords
# Make sure the NLTK stopword corpus is available locally.
nltk.download('stopwords')
# NOTE(review): this rebinds the name `stopwords` from the imported corpus
# object to a plain list of English stopwords; afterwards `stopwords.words`
# is no longer reachable through this name.
stopwords = stopwords.words('english')
stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [None]:
# removing stopwords
def remove_stopwords(text, stop_words=None):
    """Return `text` with stopwords removed.

    The text is split on whitespace, every word found in the stopword
    collection is dropped, and the remaining words are joined with single
    spaces.

    Parameters
    ----------
    text : str
        Text to filter.
    stop_words : iterable of str, optional
        Words to drop. When omitted, the notebook-level `stopwords` list is
        used.

    Returns
    -------
    str
        The remaining words separated by single spaces ("" if none remain).
    """
    # The notebook-level `stopwords` is a list; converting to a set makes each
    # membership test constant-time instead of a scan of the whole list.
    lookup = set(stopwords if stop_words is None else stop_words)
    return " ".join(w for w in text.split() if w not in lookup)

In [None]:
df

Unnamed: 0,description
0,imprisoned in the 1940s for the double murder ...
1,spanning the years 1945 to 1955 a chronicle of...
2,in the continuing saga of the corleone crime f...
3,the true story of how businessman oskar schind...
4,the defense and the prosecution have rested an...
...,...
9415,a man with a low iq has accomplished great thi...
9416,the true story of henry hill a halfirish halfs...
9417,a samurai answers a villages request for prote...
9418,in the final months of world war ii 14yearold ...


In [None]:
# Run remove_stopwords on every description and rebind `df` to the result.
df = df.apply(remove_stopwords)

In [None]:
df[300]

'imprisoned 1940s double murder wife lover upstanding banker andy dufresne begins new life shawshank prison puts accounting skills work amoral warden long stretch prison dufresne comes admired inmates including older prisoner named red integrity unquenchable sense hope'

In [None]:
df

Unnamed: 0,description
0,imprisoned 1940s double murder wife lover upst...
1,spanning years 1945 1955 chronicle fictional i...
2,continuing saga corleone crime family young vi...
3,true story businessman oskar schindler saved t...
4,defense prosecution rested jury filing jury ro...
...,...
9415,man low iq accomplished great things life pres...
9416,true story henry hill halfirish halfsicilian b...
9417,samurai answers villages request protection fa...
9418,final months world war ii 14yearold seita sist...


In [None]:
# tokenization
# Download the NLTK resources needed by the following cells.
# presumably 'punkt' / 'punkt_tab' back the tokenizers imported below and
# 'wordnet' backs the WordNet lemmatizer used later — TODO confirm
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize, sent_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Tokenize every cleaned description with word_tokenize.
sample_df = df.apply(word_tokenize) # result: one list of word tokens per description

In [None]:
sample_df

Unnamed: 0,description
0,"[imprisoned, 1940s, double, murder, wife, love..."
1,"[spanning, years, 1945, 1955, chronicle, ficti..."
2,"[continuing, saga, corleone, crime, family, yo..."
3,"[true, story, businessman, oskar, schindler, s..."
4,"[defense, prosecution, rested, jury, filing, j..."
...,...
9415,"[man, low, iq, accomplished, great, things, li..."
9416,"[true, story, henry, hill, halfirish, halfsici..."
9417,"[samurai, answers, villages, request, protecti..."
9418,"[final, months, world, war, ii, 14yearold, sei..."


In [None]:
# lemmatization
from nltk.stem import WordNetLemmatizer
# Single lemmatizer instance, used by the lemmatization demo below.
wordnet_lemmatizer = WordNetLemmatizer()

In [None]:
punctuations = '?:!.,;'

# Take the token list for row 300 from the tokenized Series (`sample_df`).
# Indexing `df` here returns the raw description string, which made the loop
# below walk it one character at a time, and rebinding `sample_df` to that
# string threw away the tokenized data. Punctuation tokens are filtered by
# building a new list, since a list must not be mutated while it is iterated
# (and a `str` has no `.remove` at all).
sample_tokens = [word for word in sample_df[300] if word not in punctuations]

# printing style
print("{0:20}{1:20}".format("Word","Lemma"))
for word in sample_tokens:
  # pos='v' asks the lemmatizer to treat every token as a verb.
  print("{0:20}{1:20}".format(word,wordnet_lemmatizer.lemmatize(word,pos='v')))

Word                Lemma               
i                   i                   
m                   m                   
p                   p                   
r                   r                   
i                   i                   
s                   s                   
o                   o                   
n                   n                   
e                   e                   
d                   d                   
                                        
1                   1                   
9                   9                   
4                   4                   
0                   0                   
s                   s                   
                                        
d                   d                   
o                   o                   
u                   u                   
b                   b                   
l                   l                   
e                   e                   
                

In [None]:
sample_df

Unnamed: 0,0
0,"[imprisoned, 1940s, double, murder, wife, love..."
1,"[spanning, year, 1945, 1955, chronicle, fictio..."
2,"[continuing, saga, corleone, crime, family, yo..."
3,"[true, story, businessman, oskar, schindler, s..."
4,"[defense, prosecution, rested, jury, filing, j..."
...,...
9415,"[man, low, iq, accomplished, great, thing, lif..."
9416,"[true, story, henry, hill, halfirish, halfsici..."
9417,"[samurai, answer, village, request, protection..."
9418,"[final, month, world, war, ii, 14yearold, seit..."
