# Basic data cleaning and tokenization

In [21]:
import pandas as pd
from tqdm import tqdm

In [26]:
# Split the raw CSV into two plain-text corpora, one body per line:
#   * rows whose `c_parent` is a string          -> data/replies.txt
#   * rows whose `c_parent` is a float           -> data/thoughts.txt
#     (presumably NaN, i.e. no parent -- TODO confirm against the CSV schema)
# Rows whose `body` is not a string, or is empty / whitespace-only, are skipped.
#
# NOTE: both files are opened in append mode, so re-running this cell adds the
# same lines again; delete the output files first for a clean re-run.
chunk_size = 1000

# Open each output once instead of once per row (the per-row open/close
# dominated the runtime), and pin UTF-8 so emoji-bearing bodies are written
# identically regardless of the platform's default encoding.
with open("data/replies.txt", "a", encoding="utf-8") as replies_out, \
        open("data/thoughts.txt", "a", encoding="utf-8") as thoughts_out:
    # Stream the CSV in chunks so the whole file never has to fit in memory.
    for chunk in tqdm(pd.read_csv("data/all_thoughts_.csv", chunksize=chunk_size)):
        for parent, body in zip(chunk["c_parent"], chunk["body"]):
            # Skip missing bodies (NaN is a float) and blank bodies of any
            # length, not just the '', ' ' and '  ' special cases.
            if not isinstance(body, str) or not body.strip():
                continue
            if isinstance(parent, str):
                replies_out.write(body + "\n")
            elif isinstance(parent, float):
                thoughts_out.write(body + "\n")


1761it [02:37, 11.16it/s]


In [29]:
!pip install demoji

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/Applications/Xcode.app/Contents/Developer/usr/bin/python3 -m pip install --upgrade pip[0m


In [30]:
import demoji
# Presumably fetches demoji's emoji-code table before first use.
# NOTE(review): the cell output below looks like the tail of a warning that
# echoes this very line; recent demoji releases reportedly ship the codes with
# the package and deprecate this call -- confirm against the installed version.
demoji.download_codes()

  demoji.download_codes()


In [37]:
# Two-line sample text ending in two emoji, used to try out replace_emoji below.
sample = "Happy birthday to me\nCheers to new adventures and life 🤝🧿"

def replace_emoji(text):
    """Replace every emoji in ``text`` with its textual description.

    Each emoji reported by ``demoji.findall`` (a mapping of emoji ->
    description) is substituted by ``" :<description>: "``, i.e. the
    description wrapped in colons and padded with one space on each side.

    Args:
        text: The string to process.

    Returns:
        ``text`` with all detected emoji replaced; unchanged if none are found.
    """
    emojis = demoji.findall(text)
    # Replace the longest emoji sequences first. Multi-code-point emoji (skin
    # tone variants, ZWJ sequences, ...) contain shorter emoji as substrings;
    # replacing the short one first would tear the long one apart and leave
    # stray modifier characters behind. `sorted` is stable, so emoji of equal
    # length keep the order in which demoji reported them.
    for emoji in sorted(emojis, key=len, reverse=True):
        text = text.replace(emoji, " :" + emojis[emoji] + ": ")
    return text

# Quick check on the sample: the displayed result should show each emoji
# replaced by " :description: ".
replace_emoji(sample)

'Happy birthday to me\nCheers to new adventures and life  :handshake:  :nazar amulet: '

In [None]:
# demojied text
#
# Same split as the plain-text export, but every body is passed through
# replace_emoji() before being written:
#   * rows whose `c_parent` is a string -> data/demojied_replies.txt
#   * rows whose `c_parent` is a float  -> data/demojied_thoughts.txt
#     (presumably NaN, i.e. no parent -- TODO confirm against the CSV schema)
# Rows whose `body` is not a string, or is empty / whitespace-only, are skipped.
#
# NOTE: both files are opened in append mode, so re-running this cell (or
# resuming after an interrupted run) appends duplicate lines; delete the
# output files first for a clean re-run.

chunk_size = 1000

# Open each output once instead of once per row, and pin UTF-8 so the result
# does not depend on the platform's default encoding.
with open("data/demojied_replies.txt", "a", encoding="utf-8") as replies_out, \
        open("data/demojied_thoughts.txt", "a", encoding="utf-8") as thoughts_out:
    # Stream the CSV in chunks so the whole file never has to fit in memory.
    for chunk in tqdm(pd.read_csv("data/all_thoughts_.csv", chunksize=chunk_size)):
        for parent, body in zip(chunk["c_parent"], chunk["body"]):
            # Skip missing bodies (NaN is a float) and blank bodies of any
            # length, not just the '', ' ' and '  ' special cases.
            if not isinstance(body, str) or not body.strip():
                continue
            if isinstance(parent, str):
                replies_out.write(replace_emoji(body) + "\n")
            elif isinstance(parent, float):
                thoughts_out.write(replace_emoji(body) + "\n")

220it [02:16,  2.48it/s]

## Cleaning

Some simple, regex-based cleaning is performed on train and dev datasets, e.g. to remove HTML tags from Wikipedia articles, non-verbal cues from subtitles, or even to correct I’s that were incorrectly recognized as l’s in OCR’ed uppercase text.

In [1]:
from pathlib import Path
from mrclean import *

In [2]:
# Root directory that the `data/` tree is resolved against.
DATA_ROOT = Path("./")

# Legacy parameter: it is still handed to every cleanup function,
# but it does not affect cleaning.
SEQ_LENGTH = 128

# Dataset splits to clean; each names a sub-directory of DATA_ROOT / "data".
DATA_SPLITS = ["babylm_10M", "babylm_dev"]

# Corpus name (the file stem, e.g. "qed" for "qed.train") -> cleanup routine.
# The cleanup_* callables presumably come from the `mrclean` star import.
CLEANUP_FUNCTIONS = dict(
    aochildes=cleanup_aochildes,
    bnc_spoken=cleanup_bnc_spoken,
    cbt=cleanup_cbt,
    children_stories=cleanup_children_stories,
    gutenberg=cleanup_gutenberg,
    open_subtitles=cleanup_open_subtitles,
    qed=cleanup_qed,
    simple_wikipedia=cleanup_simple_wikipedia,
    switchboard=cleanup_switchboard,
    wikipedia=cleanup_wikipedia,
)


In [4]:
# Clean every corpus file of every split and write the result to a sibling
# "<split>_clean" directory under the same file name.
for split in DATA_SPLITS:
    INPUT_DIR = DATA_ROOT / 'data' / split
    OUTPUT_DIR = DATA_ROOT / 'data' / f'{split}_clean'

    # parents=True so the cell also works if intermediate directories are missing.
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    # Despite the name, this holds both '.train' and '.dev' files. Sorted so the
    # processing order (and the log below) is the same on every run; iterdir()
    # yields entries in arbitrary, filesystem-dependent order.
    train_files = sorted(f for f in INPUT_DIR.iterdir() if f.is_file() and f.suffix in ['.train', '.dev'])

    for file in train_files:
        # Pin UTF-8 explicitly: without it the platform's default encoding is
        # used, which can fail or mangle non-ASCII text on some systems.
        text = file.read_text(encoding="utf-8")
        # The file stem selects the cleanup routine; an unknown stem raises KeyError.
        cleaned_text = CLEANUP_FUNCTIONS[file.stem](text, SEQ_LENGTH)
        (OUTPUT_DIR / file.name).write_text(cleaned_text, encoding="utf-8")
        print(f"🧹 Cleaned '{file.name}' (size {len(text)} -> {len(cleaned_text)}) in {split}")


🧹 Cleaned 'open_subtitles.train' (size 16433872 -> 16431168) in babylm_10M
🧹 Cleaned 'qed.train' (size 5781992 -> 5682028) in babylm_10M
🧹 Cleaned 'bnc_spoken.train' (size 4493818 -> 4463546) in babylm_10M
🧹 Cleaned 'wikipedia.train' (size 6065862 -> 6008082) in babylm_10M
🧹 Cleaned 'gutenberg.train' (size 5745126 -> 5745126) in babylm_10M
🧹 Cleaned 'aochildes.train' (size 1900547 -> 1820547) in babylm_10M
🧹 Cleaned 'simple_wikipedia.train' (size 9087222 -> 9064456) in babylm_10M
🧹 Cleaned 'children_stories.train' (size 1797174 -> 1797174) in babylm_10M
🧹 Cleaned 'cbt.train' (size 2627694 -> 2554890) in babylm_10M
🧹 Cleaned 'switchboard.train' (size 601050 -> 601050) in babylm_10M
🧹 Cleaned 'simple_wikipedia.dev' (size 9396525 -> 9368481) in babylm_dev
🧹 Cleaned 'switchboard.dev' (size 670013 -> 670013) in babylm_dev
🧹 Cleaned 'qed.dev' (size 5390732 -> 5295754) in babylm_dev
🧹 Cleaned 'open_subtitles.dev' (size 15724635 -> 15722583) in babylm_dev
🧹 Cleaned 'gutenberg.dev' (size 489396

## Training a tokenizer

In [4]:
from pathlib import Path
from tokenizers import (Tokenizer, decoders, models, pre_tokenizers,
                        processors, trainers)
from tokenizers.normalizers import NFKC

In [7]:
# Only the cleaned *train* split feeds the tokenizer; dev files are left out.
data_dir = Path("./data/babylm_10M_clean/")

# Keep regular files with a ".train" suffix (explicitly skipping .DS_Store)
# and hand them over as plain strings.
paths = [
    str(candidate)
    for candidate in data_dir.glob("*")
    if candidate.is_file()
    and not candidate.name.endswith(".DS_Store")
    and candidate.suffix in [".train"]
]

# Report how many files were picked up and fail early if there are none.
print(len(paths))
assert len(paths) > 0, 'No data files found'

10


In [8]:
# Assemble a BPE tokenizer whose normalizer is NFKC and whose pre-tokenizer,
# post-processor and decoder are all the ByteLevel variants.
tokenizer = Tokenizer(models.BPE())

tokenizer.normalizer = NFKC()
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)
tokenizer.decoder = decoders.ByteLevel()

In [9]:
# Configure the BPE trainer (vocabulary size, minimum frequency, special
# tokens) and train the tokenizer on the collected files.
trainer = trainers.BpeTrainer(
    vocab_size=16000,
    min_frequency=2,
    special_tokens=["<pad>", "<s>", "</s>"],
)
tokenizer.train(paths, trainer)






In [14]:
# Persist the trained tokenizer as pretty-printed JSON under DATA_ROOT/models.
tokenizer_path = DATA_ROOT / "models/gpt-clean-16000.json"
# Nothing earlier in the notebook creates the models directory, so create it
# here if needed; otherwise the save can fail on a fresh checkout.
tokenizer_path.parent.mkdir(parents=True, exist_ok=True)
tokenizer.save(str(tokenizer_path), pretty=True)

## Testing the tokenizer

In [18]:

# Reload the tokenizer from disk so the check exercises the saved file.
tokenizer = Tokenizer.from_file(str(tokenizer_path))

# Alternative test input with non-Latin characters:
# text = 'Shiro Okada (岡田志郎, "Okada Shirō", June 9, 1949; Hirakata, Osaka {age 71} - ) is a Japanese guitarist who participate in the Group Sound band, the Ox. His nickname was Shiro (シロー) and his real name is Shiro Okamoto (岡田史郎).'
text = "The quick brown fox jumps over the lazy dog."

# Round trip: text -> tokens/ids -> text.
encoded = tokenizer.encode(text)
decoded = tokenizer.decode(encoded.ids)

print(f"Encoded String: {encoded.tokens}")
print(f"Encoded IDs: {encoded.ids}")
print(f"Decoded String: {decoded}")


Encoded String: ['ĠThe', 'Ġquick', 'Ġbrown', 'Ġfox', 'Ġj', 'umps', 'Ġover', 'Ġthe', 'Ġlazy', 'Ġdog', '.']
Encoded IDs: [295, 1993, 4629, 9277, 366, 8876, 574, 210, 12889, 2415, 16]
Decoded String:  The quick brown fox jumps over the lazy dog.
