In [1]:
# !pip install requests
# !pip install textblob
# !pip install emoji
# !pip install spacy==3.7.2
# !pip install pydantic==1.9.0
# !python -m spacy download en_core_web_sm

In [2]:
import requests
import json
import ast
import re
import pandas as pd
import string,time
import emoji
import spacy
import string

from textblob import TextBlob
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize

In [3]:
df = pd.read_csv('data/movie_review_data_with_genre_from_api.csv')

In [4]:
df.tail()

Unnamed: 0.1,Unnamed: 0,original_title,overview,genre_ids,genre_names
9995,9995,The Last Airbender,"The story follows the adventures of Aang, a yo...","[28, 12, 14]","Action, Adventure, Fantasy"
9996,9996,From Dusk Till Dawn 2: Texas Blood Money,A bank-robbing gang of misfits heads to Mexico...,"[80, 28, 27, 53]","Crime, Action, Horror, Thriller"
9997,9997,Cage Dive,Three friends from California are filming an a...,"[27, 18, 53]","Horror, Drama, Thriller"
9998,9998,Street Fighter,Col. Guile and various other martial arts hero...,"[28, 12, 35, 53]","Action, Adventure, Comedy, Thriller"
9999,9999,Devil's Due,An unexpected pregnancy takes a terrifying tur...,[27],Horror


In [5]:
df.shape

(10000, 5)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Unnamed: 0      10000 non-null  int64 
 1   original_title  10000 non-null  object
 2   overview        9999 non-null   object
 3   genre_ids       10000 non-null  object
 4   genre_names     9997 non-null   object
dtypes: int64(1), object(4)
memory usage: 390.8+ KB


In [7]:
df['overview'][3].lower()

'the true story of how businessman oskar schindler saved over a thousand jewish lives from the nazis while they worked as slaves in his factory during world war ii.'

## 1. Remove Punctuations

In [8]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [9]:
exclude = string.punctuation

def remove_punc(text):
    if not isinstance(text, str):
        return ""
    return text.translate(str.maketrans('', '', exclude))

df['description_corrected'] = df['overview'].apply(remove_punc)

In [10]:
df['description_corrected'][5]

'A young girl Chihiro becomes trapped in a strange new world of spirits When her parents undergo a mysterious transformation she must call upon the courage she never knew she had to free her family'

## 2. Chat Word Treatment

In [11]:
# Open and read the chat_words.txt file
file_path = 'D:/oishee/jobs/2025/NLP/kaggle_movie_review/datasets/slang.txt'

# Create an empty dictionary
chat_words = {}

# Read and process the file
with open(file_path, 'r', encoding='utf-8') as file:
     for line in file:
        if line.strip():
            parts = re.split(r'\s*[:=–—\-]{1,2}>\s*|\s*[:=–—\-]\s*', line.strip(), maxsplit=1)
            if len(parts) == 2:
                key, value = parts
                chat_words[key.strip().upper()] = value.strip()

In [12]:
print(list(chat_words.items())[:5])  # Show first 5 key-value pairs

[('AFAIK', 'As Far As I Know'), ('AFK', 'Away From Keyboard'), ('ASAP', 'As Soon As Possible'), ('ATK', 'At The Keyboard'), ('ATM', 'At The Moment')]


In [13]:
def chat_conversion(text):
    new_text = []
    for w in text.split():
        if w.upper() in chat_words:
            new_text.append(chat_words[w.upper()])
        else:
            new_text.append(w)
    return " ".join(new_text)

df['description_corrected'] = df['description_corrected'].apply(chat_conversion)

In [14]:
df['description_corrected'][1000]

'A young girl had her voice magically taken away so that she would never hurt people with it but her outlook changes when she encounters music and friendship Will Naruse be able to convey the anthem of her heart'

## 3. Lower Casing

In [15]:
df['description_corrected'] = df['description_corrected'].str.lower()

df

Unnamed: 0.1,Unnamed: 0,original_title,overview,genre_ids,genre_names,description_corrected
0,0,The Shawshank Redemption,Imprisoned in the 1940s for the double murder ...,"[18, 80]","Drama, Crime",imprisoned in the 1940s for the double murder ...
1,1,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...","[18, 80]","Drama, Crime",spanning the years 1945 to 1955 a chronicle of...
2,2,The Godfather Part II,In the continuing saga of the Corleone crime f...,"[18, 80]","Drama, Crime",in the continuing saga of the corleone crime f...
3,3,Schindler's List,The true story of how businessman Oskar Schind...,"[18, 36, 10752]","Drama, History, War",the true story of how businessman oskar schind...
4,4,12 Angry Men,The defense and the prosecution have rested an...,[18],Drama,the defense and the prosecution have rested an...
...,...,...,...,...,...,...
9995,9995,The Last Airbender,"The story follows the adventures of Aang, a yo...","[28, 12, 14]","Action, Adventure, Fantasy",the story follows the adventures of aang a you...
9996,9996,From Dusk Till Dawn 2: Texas Blood Money,A bank-robbing gang of misfits heads to Mexico...,"[80, 28, 27, 53]","Crime, Action, Horror, Thriller",a bankrobbing gang of misfits heads to mexico ...
9997,9997,Cage Dive,Three friends from California are filming an a...,"[27, 18, 53]","Horror, Drama, Thriller",three friends from california are filming an a...
9998,9998,Street Fighter,Col. Guile and various other martial arts hero...,"[28, 12, 35, 53]","Action, Adventure, Comedy, Thriller",col guile and various other martial arts heroe...


## 4. Spelling Correction

In [16]:
# def correct_text(text):
#     try:
#         return (TextBlob(text).correct().astype(str))
#     except:
#         return text

# df['description_corrected'] = df['description_corrected'].apply(correct_text)

In [17]:
df['description_corrected'][5000]

'island farmer banana joe helps the local community by trading his bananas for goods when gangsters arrive with plans to construct a banana processing plant joe kicks them out but the mob boss discovers that joe is operating without a license after the mob tips off the authorities and joes boat is impounded he ventures into a big city for the first tears in my eyes to seek help'

## 5. Removing Stopwords

In [18]:
# import nltk
# nltk.download('stopwords')

In [19]:
print(list(stopwords.words('english')[:5]))  # Show first 5 key-value pairs

['a', 'about', 'above', 'after', 'again']


In [20]:
def remove_stopwords(text):
    new_text = []
    
    for word in text.split():
        if word in stopwords.words('english'):
            new_text.append('')
        else:
            new_text.append(word)
    x = new_text[:]
    new_text.clear()
    return " ".join(x)

df['description_corrected'] = df['description_corrected'].apply(remove_stopwords)

In [21]:
df['description_corrected'][9999]

' unexpected pregnancy takes  terrifying turn  newlyweds zach  samantha mccall'

## 6. Handling Emojis

In [22]:
df['description_corrected'] = df['description_corrected'].apply(emoji.demojize)

df['description_corrected'][9998]

'col guile  various  martial arts heroes fight   tyranny  dictator  bison   cohorts'

## 7. Tokenization

In [23]:
nlp = spacy.load('en_core_web_sm')

df['description_tokenized'] = df['description_corrected'].apply(nlp)

df['description_tokenized']

0       (imprisoned,   , 1940s,   , double, murder,   ...
1       (spanning,  , years, 1945,  , 1955,  , chronic...
2       (  , continuing, saga,   , corleone, crime, fa...
3       ( , true, story,   , businessman, oskar, schin...
4       ( , defense,   , prosecution,  , rested,   , j...
                              ...                        
9995    ( , story, follows,  , adventures,  , aang,  ,...
9996    ( , bankrobbing, gang,  , misfits, heads,  , m...
9997    (three, friends,  , california,  , filming,  ,...
9998    (col, guile,  , various,  , martial, arts, her...
9999    ( , unexpected, pregnancy, takes,  , terrifyin...
Name: description_tokenized, Length: 10000, dtype: object

In [24]:
df['description_tokenized'][5000]

island farmer banana joe helps  local community  trading  bananas  goods  gangsters arrive  plans  construct  banana processing plant joe kicks     mob boss discovers  joe  operating without  license   mob tips   authorities  joes boat  impounded  ventures   big city   first tears   eyes  seek help

## 8. Stemming

In [25]:
# Initialize the Porter Stemmer
ps = PorterStemmer()

def stem_words(text):
    if not isinstance(text, str):
        return ""
    tokens = word_tokenize(text)
    stems = [ps.stem(word) for word in tokens]
    return " ".join(stems).strip()

In [26]:
df['description_corrected_stemming'] = df['description_corrected'].apply(stem_words)

In [27]:
df['description_corrected_stemming'][5000]

'island farmer banana joe help local commun trade banana good gangster arriv plan construct banana process plant joe kick mob boss discov joe oper without licens mob tip author joe boat impound ventur big citi first tear eye seek help'

## Lemmatization

In [28]:
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

In [29]:
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    if not isinstance(text, str):
        return ""
    tokens = word_tokenize(text)
    lemmatized = [lemmatizer.lemmatize(word) for word in tokens]
    return " ".join(lemmatized)

In [30]:
df['description_corrected_lemmatize'] = df['description_corrected'].apply(lemmatize_text)

df['description_corrected_lemmatize'][10]

'burgerloving hit man philosophical partner drugaddled gangster moll washedup boxer converge sprawling comedic crime caper adventure unfurl three story ingeniously trip back forth tear eye'

In [31]:
df['description_corrected_lemmatize'][5000]

'island farmer banana joe help local community trading banana good gangster arrive plan construct banana processing plant joe kick mob bos discovers joe operating without license mob tip authority joes boat impounded venture big city first tear eye seek help'

In [32]:
df.to_csv("data/preprocessed_data.csv")