# Text Preprocessing

## Data Import

In [1]:
import pandas as pd
# Load the raw IMDB review CSV (expected in the working directory).
data = pd.read_csv("IMDB Dataset.csv")
# `data` is already a DataFrame; wrapping it gives `df` as a separate
# object that the cleaning steps below reassign columns on.
df = pd.DataFrame(data)
# Preview the first five rows.
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


## Lowercasing

In [2]:
# Lowercase every review with the vectorized string accessor, overwriting
# the 'review' column.
df['review'] = df['review'].str.lower()
# Display the frame so the effect is visible.
df

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. <br /><br />the...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive
...,...,...
49995,i thought this movie did a down right good job...,positive
49996,"bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,i am a catholic taught in parochial elementary...,negative
49998,i'm going to have to disagree with the previou...,negative


## Remove HTML Tags

In [3]:
import re
# Compiled once: one or more consecutive tags, each a non-greedy `<...>` span.
_HTML_TAG_RUN = re.compile(r'(?:<.*?>)+')


def remove_html_tags(text):
    """Strip HTML tags from ``text``, leaving a single space in their place.

    Each run of adjacent tags (e.g. ``<br /><br />``) is replaced by one
    space rather than the empty string. Replacing with ``''`` glues the
    words on either side together ("me.<br /><br />the" -> "me.the"), and a
    later punctuation-stripping step then fuses them into one bogus token.

    Trade-off: an inline tag in the middle of a word ("un<i>real</i>") now
    splits that word; this is far rarer than tags between sentences.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with every tag run replaced by a single space. Surrounding
        whitespace is not collapsed.
    """
    return _HTML_TAG_RUN.sub(' ', text)

In [4]:
# Run remove_html_tags on every review, overwriting the 'review' column.
df['review'] = df['review'].apply(remove_html_tags)
# Display the frame so the effect is visible.
df

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive
...,...,...
49995,i thought this movie did a down right good job...,positive
49996,"bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,i am a catholic taught in parochial elementary...,negative
49998,i'm going to have to disagree with the previou...,negative


## Remove URLs

In [5]:
# Compiled once. A URL is either a scheme-qualified address (http:// or
# https://) or a bare "www." host; both must start at a word boundary.
_URL_RE = re.compile(r'\b(?:https?://\S+|www\.\S+)', re.IGNORECASE)


def remove_urls(text):
    """Delete URLs from ``text``.

    Matching is anchored at a word boundary and requires either ``://``
    after the scheme or a dot after ``www``. The previous pattern
    (``http\\S+|www\\S+|https\\S+``) was unanchored, so it also removed the
    tail of ordinary words containing "www" (e.g. "awwww" -> "a") and any
    token that merely started with "http"; its ``https`` branch could never
    match because ``http\\S+`` always matched first.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with each URL removed (replaced by the empty string).
    """
    return _URL_RE.sub('', text)

In [6]:
# Run remove_urls on every review, overwriting the 'review' column.
df['review'] = df['review'].apply(remove_urls)
# Display the frame so the effect is visible.
df

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive
...,...,...
49995,i thought this movie did a down right good job...,positive
49996,"bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,i am a catholic taught in parochial elementary...,negative
49998,i'm going to have to disagree with the previou...,negative


## Remove Punctuation Marks

In [7]:
import string
# Show the ASCII punctuation characters that the next cells strip out.
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [8]:
# Characters to delete from each review; read by remove_punc below.
exclude = string.punctuation

In [9]:
def remove_punc(text):
    """Return ``text`` with every character listed in ``exclude`` deleted.

    Characters are removed outright, not replaced by a space, so words
    separated only by punctuation end up joined.
    """
    # Third argument of maketrans = characters mapped to None (deleted).
    deletion_table = str.maketrans('', '', exclude)
    stripped = text.translate(deletion_table)
    return stripped

In [10]:
# Run remove_punc on every review, overwriting the 'review' column.
df['review'] = df['review'].apply(remove_punc)
# Display the frame so the effect is visible.
df

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production the filming tech...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically theres a family where a little boy j...,negative
4,petter matteis love in the time of money is a ...,positive
...,...,...
49995,i thought this movie did a down right good job...,positive
49996,bad plot bad dialogue bad acting idiotic direc...,negative
49997,i am a catholic taught in parochial elementary...,negative
49998,im going to have to disagree with the previous...,negative


## Chat Word Treatment

In [11]:
# Lookup table: lowercase chat abbreviation -> plain-English expansion.
# NOTE(review): several keys are also ordinary English words or common
# abbreviations ("rip", "sec", "cos", "sup", "bro", "ya"); confirm that
# expanding them is desirable for this corpus.
chat_words = {
    "u": "you",
    "ur": "your",
    "r": "are",
    "im": "i am",
    "idk": "i do not know",
    "brb": "be right back",
    "btw": "by the way",
    "b4": "before",
    "bc": "because",
    "cya": "see you",
    "gr8": "great",
    "l8r": "later",
    "nvm": "never mind",
    "omg": "oh my god",
    "lol": "laughing out loud",
    "lmao": "laughing my ass off",
    "rofl": "rolling on the floor laughing",
    "smh": "shaking my head",
    "tbh": "to be honest",
    "idc": "i do not care",
    "ikr": "i know right",
    "np": "no problem",
    "omw": "on my way",
    "pls": "please",
    "plz": "please",
    "ppl": "people",
    "thx": "thanks",
    "ty": "thank you",
    "u2": "you too",
    "wtf": "what the fuck",
    "wth": "what the hell",
    "afaik": "as far as i know",
    "bff": "best friends forever",
    "dm": "direct message",
    "ftw": "for the win",
    "fyi": "for your information",
    "gtg": "got to go",
    "hbu": "how about you",
    "hbd": "happy birthday",
    "ily": "i love you",
    "msg": "message",
    "ngl": "not going to lie",
    "oic": "oh i see",
    "roflmao": "rolling on the floor laughing my ass off",
    "sup": "what is up",
    "tba": "to be announced",
    "tbc": "to be continued",
    "tbd": "to be decided",
    "tfw": "that feeling when",
    "tmi": "too much information",
    "ttyl": "talk to you later",
    "xoxo": "hugs and kisses",
    "ya": "yeah",
    "yw": "you are welcome",
    "zzz": "sleeping",
    "asap": "as soon as possible",
    "atm": "at the moment",
    "bbl": "be back later",
    "bf": "boyfriend",
    "gf": "girlfriend",
    "bday": "birthday",
    "cos": "because",
    "cuz": "because",
    "lmk": "let me know",
    "imo": "in my opinion",
    "imho": "in my humble opinion",
    "nah": "no",
    "obv": "obviously",
    "sec": "second",
    "sis": "sister",
    "bro": "brother",
    "fam": "family",
    "gg": "good game",
    "rip": "rest in peace",
    "stfu": "shut the fuck up",
    "wbu": "what about you",
    "yolo": "you only live once",
}

In [12]:
def normalize_chat_words(text, chat_words):
    """Expand chat abbreviations in ``text`` using the ``chat_words`` mapping.

    Abbreviations are matched as whole words, case-insensitively, and
    replaced by the mapped expansion.

    Args:
        text: The string to normalize.
        chat_words: Mapping of abbreviation -> expansion. Keys may be in any
            case; they are compared case-insensitively.

    Returns:
        ``text`` with every whole-word abbreviation replaced. Returned
        unchanged when ``chat_words`` is empty.
    """
    # An empty mapping would build the pattern r'\b()\b', which matches the
    # empty string at every word boundary and then fails the lookup.
    if not chat_words:
        return text

    # Matches are lowercased before lookup, so the keys must be lowercase
    # too; otherwise a key such as "LOL" matches but raises KeyError.
    lookup = {key.lower(): value for key, value in chat_words.items()}

    # Longest keys first so a short key can never pre-empt a longer one
    # when keys contain non-word characters.
    alternation = '|'.join(
        re.escape(key) for key in sorted(lookup, key=len, reverse=True)
    )
    pattern = re.compile(r'\b(?:' + alternation + r')\b', flags=re.IGNORECASE)
    return pattern.sub(lambda match: lookup[match.group().lower()], text)

In [13]:
# Expand chat abbreviations in every review.
df['review_clean'] = df['review'].astype(str).apply(lambda x: normalize_chat_words(x, chat_words))
# Feed the result back into 'review': every later step reads and writes
# 'review', and 'review_clean' is dropped before saving, so without this
# assignment the chat-word expansion never reaches the saved output.
df['review'] = df['review_clean']

## Spelling Correction

In [14]:
from symspellpy.symspellpy import SymSpell, Verbosity
import pkg_resources
import swifter


# Spell checker allowing up to two edits per word.
sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
# Locate the English frequency dictionary shipped inside the symspellpy package.
dictionary_path = pkg_resources.resource_filename(
    "symspellpy", "frequency_dictionary_en_82_765.txt"
)
# load_dictionary returns False instead of raising when the file is missing.
# Fail loudly here: otherwise the long spell-correction pass below would run
# against an empty dictionary and silently correct nothing.
if not sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1):
    raise FileNotFoundError(
        f"SymSpell frequency dictionary not found: {dictionary_path}"
    )


# Memo of word -> corrected word. Reviews share most of their vocabulary, so
# each distinct word is looked up in SymSpell only once.
_SPELL_CACHE = {}


def fast_symspell(text):
    """Spell-correct ``text`` word by word with the module-level ``sym_spell``.

    The text is split on whitespace and each purely alphabetic token is
    replaced by SymSpell's closest suggestion (edit distance <= 2), or kept
    when there is no suggestion. Tokens containing digits or symbols
    ("10", "b4", emoji) are passed through untouched: SymSpell would
    otherwise "correct" them to unrelated dictionary words.

    Args:
        text: The string to correct.

    Returns:
        The corrected tokens joined by single spaces.
    """
    corrected = []
    for w in text.split():
        if not w.isalpha():
            # Not a plain word; leave it exactly as written.
            corrected.append(w)
            continue
        fixed = _SPELL_CACHE.get(w)
        if fixed is None:
            suggestion = sym_spell.lookup(w, Verbosity.CLOSEST, max_edit_distance=2)
            fixed = suggestion[0].term if suggestion else w
            _SPELL_CACHE[w] = fixed
        corrected.append(fixed)
    return " ".join(corrected)


# Spell-correct every review through swifter's apply accessor, overwriting
# the 'review' column.
df['review'] = df['review'].astype(str).swifter.apply(fast_symspell)

  import pkg_resources


Pandas Apply:   0%|          | 0/50000 [00:00<?, ?it/s]

## Remove Stopwords

In [15]:
from nltk.corpus import stopwords
# Display NLTK's English stop-word list.
# Assumes the NLTK 'stopwords' corpus is already downloaded.
stopwords.words('english')

['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [16]:
# Built on first use and then reused: stopwords.words() reloads the corpus
# on every call, which is wasteful when repeated for each of 50,000 reviews.
_STOP_WORDS = None


def remove_stopwords(text):
    """Drop NLTK English stop words from ``text``.

    The text is split on whitespace; tokens that exactly match a stop word
    are removed and the rest are joined with single spaces.

    Args:
        text: The string to filter.

    Returns:
        ``text`` without stop-word tokens.
    """
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return " ".join(word for word in text.split() if word not in _STOP_WORDS)

# Remove stop words from every review through swifter's apply accessor,
# overwriting the 'review' column.
df['review'] = df['review'].astype(str).swifter.apply(remove_stopwords)

Pandas Apply:   0%|          | 0/50000 [00:00<?, ?it/s]

In [17]:
# Inspect the 'review' column after stop-word removal.
df['review']

0        one reviewers mentioned watching episode hooke...
1        wonderful little production filming technique ...
2        thought wonderful way spend time hot summer we...
3        basically family little boy jake thinks zombie...
4        petter matters love time money visually stunni...
                               ...                        
49995    thought movie right good job want creative ori...
49996    bad plot bad dialogue bad acting idiotic direc...
49997    catholic taught parochial elementary schools n...
49998    going disagree previous comment side martin on...
49999    one expects star trek movies high art fans exp...
Name: review, Length: 50000, dtype: object

## Handling Emojis

In [18]:
import emoji
def handle_emojis(text):
    """Return ``text`` after passing it through ``emoji.demojize``."""
    demojized = emoji.demojize(text)
    return demojized
# Run handle_emojis on every review through swifter's apply accessor,
# overwriting the 'review' column.
df['review'] = df['review'].astype(str).swifter.apply(handle_emojis)

Pandas Apply:   0%|          | 0/50000 [00:00<?, ?it/s]

## Stemming

In [19]:
from nltk.stem.porter import PorterStemmer
# Single Porter stemmer instance shared by every call to stemming().
ps = PorterStemmer()


def stemming(text):
    """Stem each whitespace-separated token of ``text`` with ``ps``.

    Returns the stemmed tokens joined by single spaces.
    """
    return " ".join(ps.stem(token) for token in text.split())

# Stem every review through swifter's apply accessor, overwriting the
# 'review' column.
df['review'] = df['review'].astype(str).swifter.apply(stemming)

Pandas Apply:   0%|          | 0/50000 [00:00<?, ?it/s]

In [20]:
# Remove the helper column before saving. errors='ignore' makes the cell safe
# to re-run: without it a second run raises KeyError because the column is
# already gone. Rebinding `df` replaces the in-place mutation.
df = df.drop(columns=['review_clean'], errors='ignore')
# Preview the final, fully preprocessed frame.
df.head()

Unnamed: 0,review,sentiment
0,one review mention watch episod hook right exa...,positive
1,wonder littl product film techniqu unassum old...,positive
2,thought wonder way spend time hot summer weeke...,positive
3,basic famili littl boy jake think zombi closet...,negative
4,petter matter love time money visual stun film...,positive


In [None]:
import pandas as pd


# Save to CSV
# Write the preprocessed frame to a CSV in the working directory.
output_path = "preprocessed_sentiment_data.csv"
# index=False keeps the row index out of the file.
df.to_csv(output_path, index=False, encoding="utf-8")

# Confirm where the file was written.
print(f"✅ Preprocessed data saved to {output_path}")

✅ Preprocessed data saved to preprocessed_sentiment_data.csv
