# 🎯 Text Preprocessing Notebook
### Comprehensive and Detailed Preprocessing 

This notebook focuses exclusively on **text preprocessing**. It prepares the dataset for machine learning and deep learning models using rigorous linguistic and statistical preprocessing.

In [5]:
import os, re, string, nltk, spacy, pandas as pd, numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm
from sklearn.model_selection import train_test_split

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nlp = spacy.load('en_core_web_sm')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ravina/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/ravina/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/ravina/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


## 1️⃣ Load Dataset

In [6]:
def load_imdb_dataset(base_dir):
    data = []
    for split in ['train', 'test']:
        for sentiment in ['pos', 'neg']:
            folder = os.path.join(base_dir, split, sentiment)
            for file_name in os.listdir(folder):
                with open(os.path.join(folder, file_name), 'r', encoding='utf-8') as f:
                    text = f.read()
                    data.append({'review': text, 'sentiment': sentiment, 'split': split})
    return pd.DataFrame(data)

df = load_imdb_dataset('../data/aclImdb')
df.head()

Unnamed: 0,review,sentiment,split
0,For a movie that gets no respect there sure ar...,pos,train
1,Bizarre horror movie filled with famous faces ...,pos,train
2,"A solid, if unremarkable film. Matthau, as Ein...",pos,train
3,It's a strange feeling to sit alone in a theat...,pos,train
4,"You probably all already know this by now, but...",pos,train


## 2️⃣ Clean and Normalize Text

In [8]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'<.*?>', ' ', text)
    text = re.sub(r'http\S+|www\S+', ' ', text)
    text = re.sub(r'[^a-z\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df['clean_text'] = df['review'].apply(clean_text)
df.head()

Unnamed: 0,review,sentiment,split,clean_text
0,For a movie that gets no respect there sure ar...,pos,train,for a movie that gets no respect there sure ar...
1,Bizarre horror movie filled with famous faces ...,pos,train,bizarre horror movie filled with famous faces ...
2,"A solid, if unremarkable film. Matthau, as Ein...",pos,train,a solid if unremarkable film matthau as einste...
3,It's a strange feeling to sit alone in a theat...,pos,train,it s a strange feeling to sit alone in a theat...
4,"You probably all already know this by now, but...",pos,train,you probably all already know this by now but ...


## 3️⃣ Lemmatization, Stopword Removal, and POS Filtering

In [9]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
tqdm.pandas()

def preprocess_text(text):
    doc = nlp(text)
    tokens = []
    for token in doc:
        if token.is_alpha and token.text not in stop_words:
            tokens.append(token.lemma_.lower())
    return ' '.join(tokens)

df['processed_text'] = df['clean_text'].progress_apply(preprocess_text)
df.head()

100%|██████████| 50000/50000 [5:19:42<00:00,  2.61it/s]      


Unnamed: 0,review,sentiment,split,clean_text,processed_text
0,For a movie that gets no respect there sure ar...,pos,train,for a movie that gets no respect there sure ar...,movie get respect sure lot memorable quote lis...
1,Bizarre horror movie filled with famous faces ...,pos,train,bizarre horror movie filled with famous faces ...,bizarre horror movie fill famous face steal cr...
2,"A solid, if unremarkable film. Matthau, as Ein...",pos,train,a solid if unremarkable film matthau as einste...,solid unremarkable film matthau einstein wonde...
3,It's a strange feeling to sit alone in a theat...,pos,train,it s a strange feeling to sit alone in a theat...,strange feeling sit alone theater occupy paren...
4,"You probably all already know this by now, but...",pos,train,you probably all already know this by now but ...,probably already know additional episode never...


## 4️⃣ Balance Dataset

In [10]:
sentiment_counts = df['sentiment'].value_counts()
min_count = sentiment_counts.min()
df_balanced = df.groupby('sentiment').sample(min_count, random_state=42).reset_index(drop=True)
df_balanced['sentiment'].value_counts()

sentiment
neg    25000
pos    25000
Name: count, dtype: int64

## 5️⃣ Train-Test Split

In [11]:
X_train, X_test, y_train, y_test = train_test_split(
    df_balanced['processed_text'], df_balanced['sentiment'],
    test_size=0.2, random_state=42, stratify=df_balanced['sentiment']
)
print('Train samples:', len(X_train))
print('Test samples:', len(X_test))

Train samples: 40000
Test samples: 10000


## 6️⃣ Save Preprocessed Data

In [None]:
os.makedirs('..data/processed', exist_ok=True)
df_balanced.to_csv('..data/processed/imdb_preprocessed.csv', index=False)
print('✅ Saved to ..data/processed/imdb_preprocessed.csv')

✅ Saved to ../processed/imdb_preprocessed.csv
