In [4]:
# This script builds the tokenizer

import pandas as pd
import numpy as np
from pandarallel import pandarallel
from tokenizers import Tokenizer
from bs4 import BeautifulSoup 
import re
from functools import partial
import emoji

# Notebook-wide settings, read by the cells below.
CONFIG = {
    "tokenizer_path": "movie_review_tokenizer.json",  # file the trained tokenizer is saved to
    "nmb_workers": 8,  # passed to pandarallel as nb_workers
}

# Load the IMDb train and unsupervised files
df1 = pd.read_parquet("data/IMDb/raw/train-00000-of-00001.parquet")
df2 = pd.read_parquet("data/IMDb/raw/unsupervised-00000-of-00001.parquet")

# ignore_index=True gives the combined frame a fresh 0..n-1 index. The two
# frames come from separate files and are indexed independently, so a plain
# concat can leave duplicate row labels, which makes label-based alignment
# and .loc lookups on `df` ambiguous.
df = pd.concat([df1, df2], ignore_index=True)

def clean_text(text, bs4_parser, regex_module, emoji_module):
    """Normalize one raw review into lowercase plain text.

    The helper modules are received as arguments instead of being read from
    module globals; the caller binds them once with ``functools.partial``.

    Args:
        text: Raw review. Coerced with ``str()``, so non-string values are
            cleaned via their string form.
        bs4_parser: ``BeautifulSoup``-compatible callable;
            ``bs4_parser(markup, "html.parser")`` must return an object
            exposing ``get_text(separator=...)``.
        regex_module: ``re``-compatible module providing ``sub``.
        emoji_module: Module providing ``replace_emoji(text, replace=...)``.

    Returns:
        The cleaned string: lowercased, without HTML tags, URLs, emojis or
        control/format characters, restricted to word characters, whitespace
        and the punctuation ``. , ! ? ' : ) ( / - = ;``, with whitespace
        runs collapsed to single spaces and the ends trimmed.
    """
    # 1. Lowercase
    text = str(text).lower()

    # 2. Remove HTML tags. A space separator keeps the text on either side
    #    of a tag apart ("end.<br /><br />next" must not become "end.next").
    text = bs4_parser(text, "html.parser").get_text(separator=" ")

    # 3. Remove URLs. "http\S+" already covers "https...", so no separate
    #    branch is needed; the pattern has no anchors, so no flags either.
    text = regex_module.sub(r'http\S+|www\S+', '', text)

    # 4. Remove emojis
    text = emoji_module.replace_emoji(text, replace='')

    # 5. Remove problematic chars.
    #    Whitespace is turned into plain spaces *first*: tabs, newlines and
    #    the Unicode spaces fall inside the ranges deleted below, and
    #    deleting them outright would glue the neighbouring words together.
    text = regex_module.sub(r'\s+', ' ', text)
    text = regex_module.sub(r'[\x00-\x1F\x7F-\x9F\u2000-\u200F\u2028-\u202F]', '', text)
    #    The hyphen is escaped on purpose: an unescaped "/-=" inside a
    #    character class is the range U+002F..U+003D, which drops "-" and
    #    lets "<" through.
    text = regex_module.sub(r'[^\w\s.,!?\':)(/\-=;]', '', text)
    #    Drop ":", "=", "/" and "(" when they touch no word character.
    #    NOTE(review): this also removes the ":" of ":)" and all of ":(",
    #    which seems at odds with keeping emoticons - confirm the intent.
    text = regex_module.sub(r'(?<!\w)[:=/(](?!\w)', '', text)

    # 6. Remove extra whitespace
    return regex_module.sub(r'\s+', ' ', text).strip()

# Bind the helper modules to clean_text once, so the column apply below only
# has to supply the text of each row.
clean_text_optimized = partial(
    clean_text,
    bs4_parser=BeautifulSoup,
    regex_module=re,
    emoji_module=emoji,
)

# Start the pandarallel workers, then clean the "text" column in parallel.
pandarallel.initialize(
    nb_workers=CONFIG["nmb_workers"],
    progress_bar=True,
)
df["cleaned_text"] = df["text"].parallel_apply(clean_text_optimized)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.

https://nalepae.github.io/pandarallel/troubleshooting/


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=9375), Label(value='0 / 9375'))), …

In [5]:
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, Regex

# Initialize a WordPiece tokenizer and the trainer that builds its vocabulary
tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))
trainer = trainers.WordPieceTrainer(
    vocab_size=30_000,
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
    min_frequency=10,
)

# Split tokens on whitespace, punctuation and any other non-word character
tokenizer.pre_tokenizer = pre_tokenizers.Sequence([
    pre_tokenizers.Whitespace(),  # Split on whitespace
    pre_tokenizers.Punctuation(),  # Split punctuation
    pre_tokenizers.Split(
        pattern=Regex(r"[^\w\s]"),  # Use tokenizers.Regex
        behavior="isolated"
    )
])

# Materialize the cleaned column as a plain list of strings. The list is what
# gets trained on, and its known size is handed to the trainer as `length`
# so training progress can be reported against a total.
texts = df["cleaned_text"].tolist()

# Train on cleaned text
tokenizer.train_from_iterator(texts, trainer=trainer, length=len(texts))
tokenizer.save(CONFIG["tokenizer_path"])
print("Tokenizer is built")

Tokenizer is built
