In [9]:
import pandas as pd
import numpy as np
from pandarallel import pandarallel
from tokenizers import Tokenizer
from bs4 import BeautifulSoup 
import re
from functools import partial
import emoji

# Change dataset paths below if needed

CONFIG = {
    "max_seq_len": 768,
    "data_path": "Amazon/Movies_and_TV_reviews.parquet",
    "tokenizer_path": "movie_review_tokenizer.json",
    "padded_token_ids": "Amazon/A_padded_token_ids.pt",
    "padded_attention_masks": "Amazon/A_padded_attention_masks.pt",
    "sentiment_labels": "Amazon/A_sentiment_labels.pt",
    "nmb_workers": 8
}

# =======================
# 1. Reading and cleaning data
# =======================

df = pd.read_parquet(CONFIG["data_path"])

def clean_text(text, bs4_parser, regex_module, emoji_module):
    text = str(text)
    
    # 1. Lowercase
    text = text.lower()
    
    # 2. Remove HTML tags
    text = bs4_parser(text, "html.parser").get_text()
    
    # 3. Remove URLs
    text = regex_module.sub(r'http\S+|www\S+|https\S+', '', text, flags=regex_module.MULTILINE)

    # 4. Remove emojis
    text = emoji_module.replace_emoji(text, replace='')
    
    # 5. Remove problematic chars (keep emoticons)
    text = regex_module.sub(r'[\x00-\x1F\x7F-\x9F\u2000-\u200F\u2028-\u202F]', '', text)
    text = regex_module.sub(
        r'[^\w\s.,!?\':)(/-=;]', 
        '', 
        str(text)
    )
    text = regex_module.sub(r'(?<!\w)[:=/(](?!\w)', '', text)
    
    # 6. Remove extra whitespace
    text = regex_module.sub(r'\s+', ' ', text).strip()
    
    return text

# Create a pre-configured cleaning function and apply in parallel
clean_text_optimized = partial(clean_text, bs4_parser=BeautifulSoup, regex_module=re, emoji_module=emoji)
pandarallel.initialize(progress_bar=True, nb_workers=CONFIG["nmb_workers"])
df["cleaned_text"] = df["text"].parallel_apply(clean_text_optimized)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.

https://nalepae.github.io/pandarallel/troubleshooting/


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=3125), Label(value='0 / 3125'))), …

In [10]:
# =======================
# 2. Tokenizing data
# =======================

def token_get(review, tokenizer):
    return tokenizer.encode(review, add_special_tokens=True).ids

# Parallel tokenization and length calculation
tokenizer = Tokenizer.from_file(CONFIG["tokenizer_path"])
token_get_tokenizer = partial(token_get, tokenizer=tokenizer)
df["tokenized_text"] = df["cleaned_text"].parallel_apply(token_get_tokenizer)
df["token_length"] = df["tokenized_text"].str.len()

# Get lengths as list
lengths = df["token_length"].tolist()

# Calculate statistics
print(f"Max length: {max(lengths)}")
print(f"Mean length: {np.mean(lengths)}")
print(f"Median length: {np.median(lengths)}")
print(f"95th percentile: {np.percentile(lengths, 95)}")
print(f"99th percentile: {np.percentile(lengths, 99)}")

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=3125), Label(value='0 / 3125'))), …

Max length: 6181
Mean length: 68.78504
Median length: 27.0
95th percentile: 276.0
99th percentile: 640.0099999999984


In [11]:
# =======================
# 3. Creating attention masks and padding
# =======================
import torch
from torch.nn.utils.rnn import pad_sequence

# Create attention masks (1 for real tokens, 0 for padding)
def attention(ids):
    return [1] * len(ids)

df["attention_mask"] = df["tokenized_text"].parallel_apply(attention)

# Pad sequences to max_length
def padding(series, max_len=CONFIG["max_seq_len"], pad_value=tokenizer.token_to_id("[PAD]")):
    tensors = []
    for x in series:
        truncated = x[:max_len]
        padded = torch.nn.functional.pad(
            torch.tensor(truncated),
            (0, max_len - len(truncated)),  # Pad right side
            value=pad_value
        )
        tensors.append(padded)
    return torch.stack(tensors)
    
padded_ids = padding(df["tokenized_text"])
padded_masks = padding(df["attention_mask"])
labels = torch.tensor(df["label"].values, dtype=torch.long)

# Verify data shapes
assert padded_ids.shape == (len(df), CONFIG["max_seq_len"])
assert padded_masks.shape == (len(df), CONFIG["max_seq_len"])
assert len(labels) == len(df)

# =======================
# 4. Save prepared data
# =======================

torch.save(padded_ids, CONFIG["padded_token_ids"])
torch.save(padded_masks, CONFIG["padded_attention_masks"])
torch.save(labels, CONFIG["sentiment_labels"])

print(f"Data prepared - Samples: {len(labels)}, \nShapes:")
print(f"Token IDs: {padded_ids.shape}, \nMasks: {padded_masks.shape}")
print(f"Class balance: 0={sum(labels==0)}, 1={sum(labels==1)}")


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=3125), Label(value='0 / 3125'))), …

Data prepared - Samples: 25000, 
Shapes:
Token IDs: torch.Size([25000, 768]), 
Masks: torch.Size([25000, 768])
Class balance: 0=12500, 1=12500
