In [1]:
import os
from dotenv import load_dotenv
from transformers import AutoTokenizer

load_dotenv()

HF_TOKEN = os.getenv("HF_TOKEN")
tokenizer = AutoTokenizer.from_pretrained("babylm-seqlen/tokenizer")


  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


### Creating Raw Dataset

In [2]:
def tokenize_and_chunk(examples, seq_len):
    """
    Tokenizes and chunks text data to fixed-length sequences.
    
    Args:
        examples: A batch of text examples from the dataset
        seq_len: The length of the sequences to chunk the text into
        
    Returns:
        Dictionary containing chunked token sequences of length SEQ_LEN
    """
    tokens = []
    # Process each text example in the batch
    for text in examples['text']:
        # Convert text to token IDs
        _tokens = tokenizer.encode(text)
        # Add EOS token to mark the end of each text example
        _tokens.append(tokenizer.eos_token_id)
        # Accumulate all tokens in a flat list
        tokens.extend(_tokens)

    # Split the accumulated tokens into chunks of SEQ_LEN
    chunks = [tokens[i:i + seq_len] for i in range(0, len(tokens), seq_len)]
    
    # Discard the last chunk if it's shorter than SEQ_LEN to ensure uniform sequence length
    if len(chunks[-1]) < seq_len:
        chunks = chunks[:-1]
        
    return {'input_ids': chunks}

In [3]:
from datasets import Dataset

In [4]:
import os
data_root_path = 'data/raw/train_100M'

raw_data_list= []

def raw_data_iterator(data_root_path):
    for file in os.listdir(data_root_path):
        print("Processing file: ", file)
        with open(os.path.join(data_root_path, file), 'r') as f:
            document = []
            for line in f:
                if line == '\n':
                    raw_data_list.append({'text': ' '.join(document)})
                    document = []
                else:
                    document.append(line)

data_iterator = raw_data_iterator(data_root_path)

Processing file:  simple_wiki.train
Processing file:  bnc_spoken.train
Processing file:  childes.train
Processing file:  open_subtitles.train
Processing file:  switchboard.train
Processing file:  gutenberg.train


In [5]:
raw_dataset = Dataset.from_list(raw_data_list)
raw_dataset = raw_dataset.shuffle(seed=420)

In [6]:
from huggingface_hub import HfApi
api = HfApi()

In [7]:
from functools import partial

In [8]:
# NOTE: SINGLE_SHUFFLE means only shuffle the dataset once at the document-level; otherwise, shuffle the dataset a second-time at the tokenized example-level
SINGLE_SHUFFLE=False

### Seq Len - 64 Dataset

In [9]:

tokenize_and_chunk_64 = partial(tokenize_and_chunk, seq_len=64)

tokenized_dataset_64 = raw_dataset.map(
    tokenize_and_chunk_64,
    batched=True,
    batch_size=500,
    num_proc=8,
    remove_columns=raw_dataset.column_names
)
if not SINGLE_SHUFFLE:
    tokenized_dataset_64 = tokenized_dataset_64.shuffle(seed=42)

Map (num_proc=8): 100%|██████████| 131035/131035 [02:05<00:00, 1044.59 examples/s]


In [10]:
parquet_path = 'data/processed/train_100M_64'
if SINGLE_SHUFFLE:
    parquet_path += '_single_shuffle'
parquet_path += '.parquet'
tokenized_dataset_64.to_parquet(parquet_path)

Creating parquet from Arrow format:   0%|          | 0/2557 [00:00<?, ?ba/s]

Creating parquet from Arrow format: 100%|██████████| 2557/2557 [00:37<00:00, 69.10ba/s]


664665560

In [14]:
repo_id = "babylm-seqlen/train_100M_64"
if SINGLE_SHUFFLE:
    repo_id += "_single_shuffle"

api.create_repo(repo_id, private=False, exist_ok=True, token=HF_TOKEN, repo_type="dataset")

api.upload_file(
    path_or_fileobj=parquet_path,
    repo_id=repo_id,
    repo_type='dataset',
    path_in_repo='train_100M_64_single_shuffle.parquet' if SINGLE_SHUFFLE else 'train_100M_64.parquet',
    token=HF_TOKEN
)

train_100M_64.parquet: 100%|██████████| 328M/328M [00:11<00:00, 29.8MB/s] 


CommitInfo(commit_url='https://huggingface.co/datasets/babylm-seqlen/train_100M_64/commit/637997e2668e570527955fd7b747759726239113', commit_message='Upload .parquet with huggingface_hub', commit_description='', oid='637997e2668e570527955fd7b747759726239113', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/babylm-seqlen/train_100M_64', endpoint='https://huggingface.co', repo_type='dataset', repo_id='babylm-seqlen/train_100M_64'), pr_revision=None, pr_num=None)

### Seq Len - 128 Dataset

In [15]:
tokenize_and_chunk_128 = partial(tokenize_and_chunk, seq_len=128)

tokenized_dataset_128 = raw_dataset.map(
    tokenize_and_chunk_128,
    batched=True,
    batch_size=500,
    num_proc=8,
    remove_columns=raw_dataset.column_names
)
if not SINGLE_SHUFFLE:
    tokenized_dataset_128 = tokenized_dataset_128.shuffle(seed=42)

Map (num_proc=8): 100%|██████████| 131035/131035 [01:59<00:00, 1097.04 examples/s]


In [16]:
parquet_path = 'data/processed/train_100M_128'
if SINGLE_SHUFFLE:
    parquet_path += '_single_shuffle'
parquet_path += '.parquet'
tokenized_dataset_128.to_parquet(parquet_path)

Creating parquet from Arrow format:   0%|          | 0/1279 [00:00<?, ?ba/s]

Creating parquet from Arrow format: 100%|██████████| 1279/1279 [00:21<00:00, 59.62ba/s]


659515080

In [None]:
repo_id = "babylm-seqlen/train_100M_128"
if SINGLE_SHUFFLE:
    repo_id += "_single_shuffle"
api.create_repo(repo_id, private=False, exist_ok=True, token=HF_TOKEN, repo_type="dataset")

api.upload_file(
    path_or_fileobj=parquet_path,
    repo_id=repo_id,
    repo_type='dataset',
    path_in_repo='train_100M_128_single_shuffle.parquet' if SINGLE_SHUFFLE else 'train_100M_128.parquet',
    token=HF_TOKEN
)

CommitInfo(commit_url='https://huggingface.co/datasets/babylm-seqlen/train_100M_128/commit/60a8c27e080391fdbd99e3d7250a1554c4abd938', commit_message='Upload .parquet with huggingface_hub', commit_description='', oid='60a8c27e080391fdbd99e3d7250a1554c4abd938', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/babylm-seqlen/train_100M_128', endpoint='https://huggingface.co', repo_type='dataset', repo_id='babylm-seqlen/train_100M_128'), pr_revision=None, pr_num=None)

### Seq Len - 256 Dataset

In [43]:
tokenize_and_chunk_256 = partial(tokenize_and_chunk, seq_len=256)

tokenized_dataset_256 = raw_dataset.map(
    tokenize_and_chunk_256,
    batched=True,
    batch_size=500,
    num_proc=8,
    remove_columns=raw_dataset.column_names
)
if not SINGLE_SHUFFLE:
    tokenized_dataset_256 = tokenized_dataset_256.shuffle(seed=42)

Map (num_proc=8): 100%|██████████| 131035/131035 [02:06<00:00, 1037.80 examples/s]


In [44]:
parquet_path = 'data/processed/train_100M_256'  
if SINGLE_SHUFFLE:
    parquet_path += '_single_shuffle'
parquet_path += '.parquet'
tokenized_dataset_256.to_parquet(parquet_path)

Creating parquet from Arrow format: 100%|██████████| 640/640 [00:05<00:00, 110.50ba/s]


656894056

In [None]:
repo_id = "babylm-seqlen/train_100M_256"    
if SINGLE_SHUFFLE:
    repo_id += "_single_shuffle"
api.create_repo(repo_id, private=False, exist_ok=True, token=HF_TOKEN, repo_type="dataset")

api.upload_file(
    path_or_fileobj=parquet_path,
    repo_id=repo_id,
    repo_type='dataset',
    path_in_repo='train_100M_256_single_shuffle.parquet' if SINGLE_SHUFFLE else 'train_100M_256.parquet',
    token=HF_TOKEN
)

train_100M_256_single_shuffle.parquet: 100%|██████████| 269M/269M [00:09<00:00, 26.9MB/s] 


CommitInfo(commit_url='https://huggingface.co/datasets/babylm-seqlen/train_100M_256_single_shuffle/commit/7d873f0c4c6f48e85dce54e922175b3b3434353f', commit_message='Upload train_100M_256_single_shuffle with huggingface_hub', commit_description='', oid='7d873f0c4c6f48e85dce54e922175b3b3434353f', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/babylm-seqlen/train_100M_256_single_shuffle', endpoint='https://huggingface.co', repo_type='dataset', repo_id='babylm-seqlen/train_100M_256_single_shuffle'), pr_revision=None, pr_num=None)

### Seq Len - 512 Dataset

In [49]:
tokenize_and_chunk_512 = partial(tokenize_and_chunk, seq_len=512)

tokenized_dataset_512 = raw_dataset.map(
    tokenize_and_chunk_512,
    batched=True,
    batch_size=500,
    num_proc=8,
    remove_columns=raw_dataset.column_names
)
if not SINGLE_SHUFFLE:
    tokenized_dataset_512 = tokenized_dataset_512.shuffle(seed=42)

Map (num_proc=8): 100%|██████████| 131035/131035 [02:06<00:00, 1033.95 examples/s]


In [50]:
parquet_path = 'data/processed/train_100M_512'
if SINGLE_SHUFFLE:
    parquet_path += '_single_shuffle'
parquet_path += '.parquet'
tokenized_dataset_512.to_parquet(parquet_path)

Creating parquet from Arrow format: 100%|██████████| 320/320 [00:05<00:00, 54.27ba/s]


655480620

In [None]:
repo_id = "babylm-seqlen/train_100M_512"
if SINGLE_SHUFFLE:
    repo_id += "_single_shuffle"
api.create_repo(repo_id, private=False, exist_ok=True, token=HF_TOKEN, repo_type="dataset")


api.upload_file(
    path_or_fileobj=parquet_path,
    repo_id=repo_id,
    repo_type='dataset',
    path_in_repo='train_100M_512_single_shuffle.parquet' if SINGLE_SHUFFLE else 'train_100M_512.parquet',
    token=HF_TOKEN
)

train_100M_512_single_shuffle.parquet: 100%|██████████| 259M/259M [00:10<00:00, 25.7MB/s] 


CommitInfo(commit_url='https://huggingface.co/datasets/babylm-seqlen/train_100M_512_single_shuffle/commit/27964866816815f03d18cde54eb95a14894f88e8', commit_message='Upload train_100M_512.parquet with huggingface_hub', commit_description='', oid='27964866816815f03d18cde54eb95a14894f88e8', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/babylm-seqlen/train_100M_512_single_shuffle', endpoint='https://huggingface.co', repo_type='dataset', repo_id='babylm-seqlen/train_100M_512_single_shuffle'), pr_revision=None, pr_num=None)

### Seq Len - 1024 Dataset

In [52]:
tokenize_and_chunk_1024 = partial(tokenize_and_chunk, seq_len=1024)

tokenized_dataset_1024 = raw_dataset.map(
    tokenize_and_chunk_1024,
    batched=True,
    batch_size=500,
    num_proc=8,
    remove_columns=raw_dataset.column_names
)
if not SINGLE_SHUFFLE:
    tokenized_dataset_1024 = tokenized_dataset_1024.shuffle(seed=42)

Map (num_proc=8): 100%|██████████| 131035/131035 [02:02<00:00, 1067.24 examples/s]


In [53]:
parquet_path = 'data/processed/train_100M_1024'
if SINGLE_SHUFFLE:
    parquet_path += '_single_shuffle'
parquet_path += '.parquet'
tokenized_dataset_1024.to_parquet(parquet_path)

Creating parquet from Arrow format: 100%|██████████| 160/160 [00:06<00:00, 26.56ba/s]


654589600

In [None]:
repo_id = "babylm-seqlen/train_100M_1024"
if SINGLE_SHUFFLE:
    repo_id += "_single_shuffle"
api.create_repo(repo_id, private=False, exist_ok=True, token=HF_TOKEN, repo_type="dataset")

api.upload_file(
    path_or_fileobj=parquet_path,
    repo_id=repo_id,
    repo_type='dataset',
    path_in_repo='train_100M_1024_single_shuffle.parquet' if SINGLE_SHUFFLE else 'train_100M_1024.parquet',
    token=HF_TOKEN
)

train_100M_1024_single_shuffle.parquet: 100%|██████████| 252M/252M [00:06<00:00, 40.3MB/s] 


CommitInfo(commit_url='https://huggingface.co/datasets/babylm-seqlen/train_100M_1024_single_shuffle/commit/3da02c5ac9104714b1b110614caed758e8ac4431', commit_message='Upload train_100M_1024_single_shuffle with huggingface_hub', commit_description='', oid='3da02c5ac9104714b1b110614caed758e8ac4431', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/babylm-seqlen/train_100M_1024_single_shuffle', endpoint='https://huggingface.co', repo_type='dataset', repo_id='babylm-seqlen/train_100M_1024_single_shuffle'), pr_revision=None, pr_num=None)

### Seq Len - 2048 Dataset

In [55]:
tokenize_and_chunk_2048 = partial(tokenize_and_chunk, seq_len=2048)

tokenized_dataset_2048 = raw_dataset.map(
    tokenize_and_chunk_2048,
    batched=True,
    batch_size=500,
    num_proc=8,
    remove_columns=raw_dataset.column_names
)
if not SINGLE_SHUFFLE:
    tokenized_dataset_2048 = tokenized_dataset_2048.shuffle(seed=42)

Map (num_proc=8): 100%|██████████| 131035/131035 [02:02<00:00, 1068.15 examples/s]


In [56]:
parquet_path = 'data/processed/train_100M_2048'
if SINGLE_SHUFFLE:
    parquet_path += '_single_shuffle'
parquet_path += '.parquet'
tokenized_dataset_2048.to_parquet(parquet_path)

Creating parquet from Arrow format: 100%|██████████| 80/80 [00:06<00:00, 12.33ba/s]


653721156

In [None]:
repo_id = "babylm-seqlen/train_100M_2048"   
if SINGLE_SHUFFLE:
    repo_id += "_single_shuffle"
api.create_repo(repo_id, private=False, exist_ok=True, token=HF_TOKEN, repo_type="dataset")

api.upload_file(
    path_or_fileobj=parquet_path,
    repo_id=repo_id,
    repo_type='dataset',
    path_in_repo='train_100M_2048_single_shuffle.parquet' if SINGLE_SHUFFLE else 'train_100M_2048.parquet',
    token=HF_TOKEN
)

train_100M_2048_single_shuffle.parquet: 100%|██████████| 248M/248M [00:09<00:00, 27.1MB/s] 


CommitInfo(commit_url='https://huggingface.co/datasets/babylm-seqlen/train_100M_2048_single_shuffle/commit/33a8cad8d689d508a605a5f2fd798c34b3bac363', commit_message='Upload train_100M_2048_single_shuffle with huggingface_hub', commit_description='', oid='33a8cad8d689d508a605a5f2fd798c34b3bac363', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/babylm-seqlen/train_100M_2048_single_shuffle', endpoint='https://huggingface.co', repo_type='dataset', repo_id='babylm-seqlen/train_100M_2048_single_shuffle'), pr_revision=None, pr_num=None)

### Seq Len - 4096 Dataset

In [58]:
tokenize_and_chunk_4096 = partial(tokenize_and_chunk, seq_len=4096)

tokenized_dataset_4096 = raw_dataset.map(
    tokenize_and_chunk_4096,
    batched=True,
    batch_size=500,
    num_proc=8,
    remove_columns=raw_dataset.column_names
)
if not SINGLE_SHUFFLE:
    tokenized_dataset_4096 = tokenized_dataset_4096.shuffle(seed=42)

Map (num_proc=8): 100%|██████████| 131035/131035 [02:01<00:00, 1079.07 examples/s]


In [59]:
parquet_path = 'data/processed/train_100M_4096' 
if SINGLE_SHUFFLE:
    parquet_path += '_single_shuffle'
parquet_path += '.parquet'
tokenized_dataset_4096.to_parquet(parquet_path)

Creating parquet from Arrow format: 100%|██████████| 40/40 [00:05<00:00,  6.87ba/s]


652471832

In [None]:
repo_id = "babylm-seqlen/train_100M_4096"
if SINGLE_SHUFFLE:
    repo_id += "_single_shuffle"
api.create_repo(repo_id, private=False, exist_ok=True, token=HF_TOKEN, repo_type="dataset")

api.upload_file(
    path_or_fileobj=parquet_path,
    repo_id=repo_id,
    repo_type='dataset',
    path_in_repo='train_100M_4096_single_shuffle.parquet' if SINGLE_SHUFFLE else 'train_100M_4096.parquet',
    token=HF_TOKEN
)

train_100M_4096_single_shuffle.parquet: 100%|██████████| 245M/245M [00:07<00:00, 30.8MB/s] 


CommitInfo(commit_url='https://huggingface.co/datasets/babylm-seqlen/train_100M_4096_single_shuffle/commit/9fe08028cf157abfc2f08895ee728af288fa5430', commit_message='Upload train_100M_4096_single_shuffle with huggingface_hub', commit_description='', oid='9fe08028cf157abfc2f08895ee728af288fa5430', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/babylm-seqlen/train_100M_4096_single_shuffle', endpoint='https://huggingface.co', repo_type='dataset', repo_id='babylm-seqlen/train_100M_4096_single_shuffle'), pr_revision=None, pr_num=None)

### Seq Len - 8192 Dataset

In [61]:
tokenize_and_chunk_8192 = partial(tokenize_and_chunk, seq_len=8192)

tokenized_dataset_8192 = raw_dataset.map(
    tokenize_and_chunk_8192,
    batched=True,
    batch_size=500,
    num_proc=8,
    remove_columns=raw_dataset.column_names
)
if not SINGLE_SHUFFLE:
    tokenized_dataset_8192 = tokenized_dataset_8192.shuffle(seed=42)

Map (num_proc=8): 100%|██████████| 131035/131035 [02:01<00:00, 1079.03 examples/s]


In [62]:
parquet_path = 'data/processed/train_100M_8192'
if SINGLE_SHUFFLE:
    parquet_path += '_single_shuffle'
parquet_path += '.parquet'
tokenized_dataset_8192.to_parquet(parquet_path)

Creating parquet from Arrow format: 100%|██████████| 20/20 [00:05<00:00,  3.49ba/s]


650327568

In [None]:
repo_id = "babylm-seqlen/train_100M_8192"
if SINGLE_SHUFFLE:
    repo_id += "_single_shuffle"
api.create_repo(repo_id, private=False, exist_ok=True, token=HF_TOKEN, repo_type="dataset")

api.upload_file(
    path_or_fileobj=parquet_path,
    repo_id=repo_id,
    repo_type='dataset',
    path_in_repo='train_100M_8192_single_shuffle.parquet' if SINGLE_SHUFFLE else 'train_100M_8192.parquet',
    token=HF_TOKEN
)

train_100M_8192_single_shuffle.parquet: 100%|██████████| 242M/242M [00:06<00:00, 34.7MB/s] 


CommitInfo(commit_url='https://huggingface.co/datasets/babylm-seqlen/train_100M_8192_single_shuffle/commit/a0bba81e3d1159b21000211c0ffc5a566af287f6', commit_message='Upload train_100M_8192_single_shuffle with huggingface_hub', commit_description='', oid='a0bba81e3d1159b21000211c0ffc5a566af287f6', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/babylm-seqlen/train_100M_8192_single_shuffle', endpoint='https://huggingface.co', repo_type='dataset', repo_id='babylm-seqlen/train_100M_8192_single_shuffle'), pr_revision=None, pr_num=None)

### Seq Len - 16384 Dataset


In [64]:
tokenize_and_chunk_16384 = partial(tokenize_and_chunk, seq_len=16384)

tokenized_dataset_16384 = raw_dataset.map(
    tokenize_and_chunk_16384,
    batched=True,
    batch_size=500,
    num_proc=8,
    remove_columns=raw_dataset.column_names
)
if not SINGLE_SHUFFLE:
    tokenized_dataset_16384 = tokenized_dataset_16384.shuffle(seed=42)


Map (num_proc=8): 100%|██████████| 131035/131035 [01:59<00:00, 1097.39 examples/s]


In [65]:
parquet_path = 'data/processed/train_100M_16384'
if SINGLE_SHUFFLE:
    parquet_path += '_single_shuffle'
parquet_path += '.parquet'
tokenized_dataset_16384.to_parquet(parquet_path)

Creating parquet from Arrow format: 100%|██████████| 10/10 [00:05<00:00,  1.80ba/s]


646421020

In [None]:
repo_id = "babylm-seqlen/train_100M_16384"
if SINGLE_SHUFFLE:
    repo_id += "_single_shuffle"
api.create_repo(repo_id, private=False, exist_ok=True, token=HF_TOKEN, repo_type="dataset")

api.upload_file(
    path_or_fileobj=parquet_path,
    repo_id=repo_id,
    repo_type='dataset',
    path_in_repo='train_100M_16384_single_shuffle.parquet' if SINGLE_SHUFFLE else 'train_100M_16384.parquet',
    token=HF_TOKEN
)

train_100M_16384_single_shuffle.parquet: 100%|██████████| 241M/241M [00:06<00:00, 36.1MB/s] 


CommitInfo(commit_url='https://huggingface.co/datasets/babylm-seqlen/train_100M_16384_single_shuffle/commit/1ddb02c402ef3abe47c9357089ab42405f5160da', commit_message='Upload train_100M_16384_single_shuffle with huggingface_hub', commit_description='', oid='1ddb02c402ef3abe47c9357089ab42405f5160da', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/babylm-seqlen/train_100M_16384_single_shuffle', endpoint='https://huggingface.co', repo_type='dataset', repo_id='babylm-seqlen/train_100M_16384_single_shuffle'), pr_revision=None, pr_num=None)