### Setup & Imports

In [66]:
import os
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\baqui\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\baqui\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\baqui\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Load dataset

In [67]:
df = pd.read_csv('before-preprocess_reviews.csv')
df.head()

Unnamed: 0,label,review
0,CG,"Love this! Well made, sturdy, and very comfor..."
1,CG,"love it, a great upgrade from the original. I..."
2,CG,This pillow saved my back. I love the look and...
3,CG,"Missing information on how to use it, but it i..."
4,CG,Very nice set. Good quality. We have had the s...


### Lowercase, Punctuation, Emoji Removal

In [68]:
def clean_text(text):
    # Convert to lowercase
    text = text.lower()
    
    # Remove punctuation and emojis using regex
    text = re.sub(r'[^\w\s]', '', text)  # Keep only words and spaces
    
    return text

### Tokenization

In [69]:
def tokenize_text(text):
    return word_tokenize(text)

### Stopword Removal

In [70]:
stop_words = set(stopwords.words('english'))

def remove_stopwords(tokens):
    return [word for word in tokens if word not in stop_words]

### Lemmatization

In [71]:
lemmatizer = WordNetLemmatizer()

def lemmatize_tokens(tokens):
    return [lemmatizer.lemmatize(word) for word in tokens]

### Preprocessing Function

In [72]:
def preprocess_text(text):
    # 1. Lowercase & clean
    text = clean_text(text)
    
    # 2. Tokenize
    tokens = tokenize_text(text)
    
    # 3. Remove stopwords
    tokens = remove_stopwords(tokens)
    
    # 4. Lemmatize
    tokens = lemmatize_tokens(tokens)
    
    return tokens

### Apply to Dataset

In [73]:
df['processed_review'] = df['review'].apply(preprocess_text)

# Preview results
df[['review', 'processed_review']].head()

Unnamed: 0,review,processed_review
0,"Love this! Well made, sturdy, and very comfor...","[love, well, made, sturdy, comfortable, love, ..."
1,"love it, a great upgrade from the original. I...","[love, great, upgrade, original, ive, mine, co..."
2,This pillow saved my back. I love the look and...,"[pillow, saved, back, love, look, feel, pillow]"
3,"Missing information on how to use it, but it i...","[missing, information, use, great, product, pr..."
4,Very nice set. Good quality. We have had the s...,"[nice, set, good, quality, set, two, month]"


### Save Results

In [74]:
df.to_csv('processed_reviews.csv', index=False)
print("Processed dataset saved to processed_reviews.csv")

Processed dataset saved to processed_reviews.csv
