In [60]:
import os
from dotenv import load_dotenv
from transformers import AutoTokenizer

load_dotenv()

HF_TOKEN = os.getenv("HF_TOKEN")
tokenizer = AutoTokenizer.from_pretrained("babylm-seqlen/tokenizer")


### Creating Raw Dataset

In [2]:
def tokenize_and_chunk(examples, seq_len):
    """
    Tokenizes and chunks text data to fixed-length sequences.
    
    Args:
        examples: A batch of text examples from the dataset
        seq_len: The length of the sequences to chunk the text into
        
    Returns:
        Dictionary containing chunked token sequences of length SEQ_LEN
    """
    tokens = []
    # Process each text example in the batch
    for text in examples['text']:
        # Convert text to token IDs
        _tokens = tokenizer.encode(text)
        # Add EOS token to mark the end of each text example
        _tokens.append(tokenizer.eos_token_id)
        # Accumulate all tokens in a flat list
        tokens.extend(_tokens)

    # Split the accumulated tokens into chunks of SEQ_LEN
    chunks = [tokens[i:i + seq_len] for i in range(0, len(tokens), seq_len)]
    
    # Discard the last chunk if it's shorter than SEQ_LEN to ensure uniform sequence length
    if len(chunks[-1]) < seq_len:
        chunks = chunks[:-1]
        
    return {'input_ids': chunks}

In [22]:
from datasets import Dataset

In [40]:
import os
data_root_path = 'data/raw/train_100M'

raw_data_list= []

def raw_data_iterator(data_root_path):
    for file in os.listdir(data_root_path):
        print("Processing file: ", file)
        with open(os.path.join(data_root_path, file), 'r') as f:
            document = []
            for line in f:
                if line == '\n':
                    raw_data_list.append({'text': ' '.join(document)})
                    document = []
                else:
                    document.append(line)

data_iterator = raw_data_iterator(data_root_path)

Processing file:  simple_wiki.train


Processing file:  bnc_spoken.train
Processing file:  childes.train
Processing file:  open_subtitles.train
Processing file:  switchboard.train
Processing file:  gutenberg.train


In [51]:
raw_dataset = Dataset.from_list(raw_data_list)
raw_dataset = raw_dataset.shuffle(seed=420)

In [57]:
from huggingface_hub import HfApi
api = HfApi()

In [None]:
from functools import partial

### Seq Len - 64 Dataset

In [54]:

tokenize_and_chunk_64 = partial(tokenize_and_chunk, seq_len=64)

tokenized_dataset_64 = raw_dataset.map(
    tokenize_and_chunk_64,
    batched=True,
    batch_size=500,
    num_proc=8,
    remove_columns=raw_dataset.column_names
)
tokenized_dataset_64 = tokenized_dataset_64.shuffle(seed=42)

Map (num_proc=8): 100%|██████████| 131035/131035 [02:09<00:00, 1011.15 examples/s]


In [None]:
tokenized_dataset_64.to_parquet('data/processed/train_100M_64.parquet')

In [63]:
api.create_repo("babylm-seqlen/train_100M_64", private=True, exist_ok=True, token=HF_TOKEN, repo_type="dataset")

api.upload_file(
    path_or_fileobj='data/processed/train_100M_64.parquet',
    repo_id='babylm-seqlen/train_100M_64',
    repo_type='dataset',
    path_in_repo='train_100M_64.parquet',
    token=HF_TOKEN
)

train_100M_64.parquet: 100%|██████████| 328M/328M [00:09<00:00, 32.9MB/s] 


CommitInfo(commit_url='https://huggingface.co/datasets/babylm-seqlen/train_100M_64/commit/0b426d8d2341f6a4cc4237806ede056a5b3522fc', commit_message='Upload train_100M_64.parquet with huggingface_hub', commit_description='', oid='0b426d8d2341f6a4cc4237806ede056a5b3522fc', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/babylm-seqlen/train_100M_64', endpoint='https://huggingface.co', repo_type='dataset', repo_id='babylm-seqlen/train_100M_64'), pr_revision=None, pr_num=None)

### Seq Len - 128 Dataset

In [87]:
tokenize_and_chunk_128 = partial(tokenize_and_chunk, seq_len=128)

tokenized_dataset_128 = raw_dataset.map(
    tokenize_and_chunk_128,
    batched=True,
    batch_size=500,
    num_proc=8,
    remove_columns=raw_dataset.column_names
)
tokenized_dataset_128 = tokenized_dataset_128.shuffle(seed=42)

Map (num_proc=8): 100%|██████████| 131035/131035 [02:07<00:00, 1030.98 examples/s]


In [88]:
tokenized_dataset_128.to_parquet('data/processed/train_100M_128.parquet')

Creating parquet from Arrow format: 100%|██████████| 1279/1279 [00:22<00:00, 56.42ba/s]


659515080

In [89]:
api.create_repo("babylm-seqlen/train_100M_128", private=True, exist_ok=True, token=HF_TOKEN, repo_type="dataset")

api.upload_file(
    path_or_fileobj='data/processed/train_100M_128.parquet',
    repo_id='babylm-seqlen/train_100M_128',
    repo_type='dataset',
    path_in_repo='train_100M_128.parquet',
    token=HF_TOKEN
)

train_100M_128.parquet: 100%|██████████| 307M/307M [00:07<00:00, 40.9MB/s] 


CommitInfo(commit_url='https://huggingface.co/datasets/babylm-seqlen/train_100M_128/commit/3aa00e5b28b6e91fd357e44d00afef6beeae7d88', commit_message='Upload train_100M_128.parquet with huggingface_hub', commit_description='', oid='3aa00e5b28b6e91fd357e44d00afef6beeae7d88', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/babylm-seqlen/train_100M_128', endpoint='https://huggingface.co', repo_type='dataset', repo_id='babylm-seqlen/train_100M_128'), pr_revision=None, pr_num=None)

### Seq Len - 256 Dataset

In [65]:
tokenize_and_chunk_256 = partial(tokenize_and_chunk, seq_len=256)

tokenized_dataset_256 = raw_dataset.map(
    tokenize_and_chunk_256,
    batched=True,
    batch_size=500,
    num_proc=8,
    remove_columns=raw_dataset.column_names
)
tokenized_dataset_256 = tokenized_dataset_256.shuffle(seed=42)

Map (num_proc=8): 100%|██████████| 131035/131035 [02:06<00:00, 1032.04 examples/s]


In [66]:
tokenized_dataset_256.to_parquet('data/processed/train_100M_256.parquet')

Creating parquet from Arrow format: 100%|██████████| 640/640 [00:15<00:00, 42.63ba/s]


656894056

In [67]:
api.create_repo("babylm-seqlen/train_100M_256", private=True, exist_ok=True, token=HF_TOKEN, repo_type="dataset")

api.upload_file(
    path_or_fileobj='data/processed/train_100M_256.parquet',
    repo_id='babylm-seqlen/train_100M_256',
    repo_type='dataset',
    path_in_repo='train_100M_256.parquet',
    token=HF_TOKEN
)

train_100M_256.parquet: 100%|██████████| 287M/287M [00:09<00:00, 30.8MB/s] 


CommitInfo(commit_url='https://huggingface.co/datasets/babylm-seqlen/train_100M_256/commit/d2719101569f761b53b402dbe1e3b86afb8d69e4', commit_message='Upload train_100M_256.parquet with huggingface_hub', commit_description='', oid='d2719101569f761b53b402dbe1e3b86afb8d69e4', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/babylm-seqlen/train_100M_256', endpoint='https://huggingface.co', repo_type='dataset', repo_id='babylm-seqlen/train_100M_256'), pr_revision=None, pr_num=None)

### Seq Len - 512 Dataset

In [84]:
tokenize_and_chunk_512 = partial(tokenize_and_chunk, seq_len=512)

tokenized_dataset_512 = raw_dataset.map(
    tokenize_and_chunk_512,
    batched=True,
    batch_size=500,
    num_proc=8,
    remove_columns=raw_dataset.column_names
)
tokenized_dataset_512 = tokenized_dataset_512.shuffle(seed=42)

Map (num_proc=8): 100%|██████████| 131035/131035 [02:07<00:00, 1030.75 examples/s]


In [85]:
tokenized_dataset_512.to_parquet('data/processed/train_100M_512.parquet')

Creating parquet from Arrow format: 100%|██████████| 320/320 [00:10<00:00, 31.05ba/s]


655480620

In [86]:
api.create_repo("babylm-seqlen/train_100M_512", private=True, exist_ok=True, token=HF_TOKEN, repo_type="dataset")

api.upload_file(
    path_or_fileobj='data/processed/train_100M_512.parquet',
    repo_id='babylm-seqlen/train_100M_512',
    repo_type='dataset',
    path_in_repo='train_100M_512.parquet', 
    token=HF_TOKEN
)

train_100M_512.parquet: 100%|██████████| 272M/272M [00:15<00:00, 17.9MB/s] 


CommitInfo(commit_url='https://huggingface.co/datasets/babylm-seqlen/train_100M_512/commit/a1f77804849b6676d5a93c57053fc94f2f45a6a1', commit_message='Upload train_100M_512.parquet with huggingface_hub', commit_description='', oid='a1f77804849b6676d5a93c57053fc94f2f45a6a1', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/babylm-seqlen/train_100M_512', endpoint='https://huggingface.co', repo_type='dataset', repo_id='babylm-seqlen/train_100M_512'), pr_revision=None, pr_num=None)

### Seq Len - 1024 Dataset

In [68]:
tokenize_and_chunk_1024 = partial(tokenize_and_chunk, seq_len=1024)

tokenized_dataset_1024 = raw_dataset.map(
    tokenize_and_chunk_1024,
    batched=True,
    batch_size=500,
    num_proc=8,
    remove_columns=raw_dataset.column_names
)
tokenized_dataset_1024 = tokenized_dataset_1024.shuffle(seed=42)

Map (num_proc=8): 100%|██████████| 131035/131035 [02:06<00:00, 1036.57 examples/s]


In [69]:
tokenized_dataset_1024.to_parquet('data/processed/train_100M_1024.parquet')

Creating parquet from Arrow format: 100%|██████████| 160/160 [00:08<00:00, 19.60ba/s]


654589600

In [70]:
api.create_repo("babylm-seqlen/train_100M_1024", private=True, exist_ok=True, token=HF_TOKEN, repo_type="dataset")

api.upload_file(
    path_or_fileobj='data/processed/train_100M_1024.parquet',
    repo_id='babylm-seqlen/train_100M_1024',
    repo_type='dataset',
    path_in_repo='train_100M_1024.parquet', 
    token=HF_TOKEN
)

train_100M_1024.parquet: 100%|██████████| 262M/262M [00:07<00:00, 36.9MB/s] 


CommitInfo(commit_url='https://huggingface.co/datasets/babylm-seqlen/train_100M_1024/commit/0680905b063dd89f77005aa4acb7582d92694a6e', commit_message='Upload train_100M_1024.parquet with huggingface_hub', commit_description='', oid='0680905b063dd89f77005aa4acb7582d92694a6e', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/babylm-seqlen/train_100M_1024', endpoint='https://huggingface.co', repo_type='dataset', repo_id='babylm-seqlen/train_100M_1024'), pr_revision=None, pr_num=None)

### Seq Len - 2048 Dataset

In [80]:
tokenize_and_chunk_2048 = partial(tokenize_and_chunk, seq_len=2048)

tokenized_dataset_2048 = raw_dataset.map(
    tokenize_and_chunk_2048,
    batched=True,
    batch_size=500,
    num_proc=8,
    remove_columns=raw_dataset.column_names
)
tokenized_dataset_2048 = tokenized_dataset_2048.shuffle(seed=42)

Map (num_proc=8): 100%|██████████| 131035/131035 [02:03<00:00, 1059.14 examples/s]


In [81]:
tokenized_dataset_2048.to_parquet('data/processed/train_100M_2048.parquet')

Creating parquet from Arrow format: 100%|██████████| 80/80 [00:06<00:00, 11.63ba/s]


653721156

In [82]:
api.create_repo("babylm-seqlen/train_100M_2048", private=True, exist_ok=True, token=HF_TOKEN, repo_type="dataset")

api.upload_file(
    path_or_fileobj='data/processed/train_100M_2048.parquet',
    repo_id='babylm-seqlen/train_100M_2048',
    repo_type='dataset',
    path_in_repo='train_100M_2048.parquet', 
    token=HF_TOKEN
)

train_100M_2048.parquet: 100%|██████████| 256M/256M [00:08<00:00, 31.9MB/s] 


CommitInfo(commit_url='https://huggingface.co/datasets/babylm-seqlen/train_100M_2048/commit/7fc8b25065991a3c8d4dd9e5e6fdb0aabee434e2', commit_message='Upload train_100M_2048.parquet with huggingface_hub', commit_description='', oid='7fc8b25065991a3c8d4dd9e5e6fdb0aabee434e2', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/babylm-seqlen/train_100M_2048', endpoint='https://huggingface.co', repo_type='dataset', repo_id='babylm-seqlen/train_100M_2048'), pr_revision=None, pr_num=None)

### Seq Len - 4096 Dataset

In [71]:
tokenize_and_chunk_4096 = partial(tokenize_and_chunk, seq_len=4096)

tokenized_dataset_4096 = raw_dataset.map(
    tokenize_and_chunk_4096,
    batched=True,
    batch_size=500,
    num_proc=8,
    remove_columns=raw_dataset.column_names
)
tokenized_dataset_4096 = tokenized_dataset_4096.shuffle(seed=42)

Map (num_proc=8): 100%|██████████| 131035/131035 [02:01<00:00, 1074.45 examples/s]


In [72]:
tokenized_dataset_4096.to_parquet('data/processed/train_100M_4096.parquet')

Creating parquet from Arrow format: 100%|██████████| 40/40 [00:06<00:00,  6.51ba/s]


652471832

In [73]:
api.create_repo("babylm-seqlen/train_100M_4096", private=True, exist_ok=True, token=HF_TOKEN, repo_type="dataset")

api.upload_file(
    path_or_fileobj='data/processed/train_100M_4096.parquet',
    repo_id='babylm-seqlen/train_100M_4096',
    repo_type='dataset',
    path_in_repo='train_100M_4096.parquet', 
    token=HF_TOKEN
)

train_100M_4096.parquet: 100%|██████████| 252M/252M [00:09<00:00, 26.8MB/s] 


CommitInfo(commit_url='https://huggingface.co/datasets/babylm-seqlen/train_100M_4096/commit/bb19e62af7b58d3f5a0714f08ca36ca8a6e7f5ef', commit_message='Upload train_100M_4096.parquet with huggingface_hub', commit_description='', oid='bb19e62af7b58d3f5a0714f08ca36ca8a6e7f5ef', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/babylm-seqlen/train_100M_4096', endpoint='https://huggingface.co', repo_type='dataset', repo_id='babylm-seqlen/train_100M_4096'), pr_revision=None, pr_num=None)

### Seq Len - 8192 Dataset

In [77]:
tokenize_and_chunk_8192 = partial(tokenize_and_chunk, seq_len=8192)

tokenized_dataset_8192 = raw_dataset.map(
    tokenize_and_chunk_8192,
    batched=True,
    batch_size=500,
    num_proc=8,
    remove_columns=raw_dataset.column_names
)
tokenized_dataset_8192 = tokenized_dataset_8192.shuffle(seed=42)

Map (num_proc=8): 100%|██████████| 131035/131035 [02:05<00:00, 1046.75 examples/s]


In [78]:
tokenized_dataset_8192.to_parquet('data/processed/train_100M_8192.parquet')

Creating parquet from Arrow format: 100%|██████████| 20/20 [00:05<00:00,  3.70ba/s]


650327568

In [79]:
api.create_repo("babylm-seqlen/train_100M_8192", private=True, exist_ok=True, token=HF_TOKEN, repo_type="dataset")

api.upload_file(
    path_or_fileobj='data/processed/train_100M_8192.parquet',
    repo_id='babylm-seqlen/train_100M_8192',
    repo_type='dataset',
    path_in_repo='train_100M_8192.parquet', 
    token=HF_TOKEN
)

train_100M_8192.parquet: 100%|██████████| 249M/249M [00:09<00:00, 26.8MB/s] 


CommitInfo(commit_url='https://huggingface.co/datasets/babylm-seqlen/train_100M_8192/commit/61dd4531f1bf3348c502f03ee5417e81c1eaf01b', commit_message='Upload train_100M_8192.parquet with huggingface_hub', commit_description='', oid='61dd4531f1bf3348c502f03ee5417e81c1eaf01b', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/babylm-seqlen/train_100M_8192', endpoint='https://huggingface.co', repo_type='dataset', repo_id='babylm-seqlen/train_100M_8192'), pr_revision=None, pr_num=None)

### Seq Len - 16384 Dataset


In [74]:
tokenize_and_chunk_16384 = partial(tokenize_and_chunk, seq_len=16384)

tokenized_dataset_16384 = raw_dataset.map(
    tokenize_and_chunk_16384,
    batched=True,
    batch_size=500,
    num_proc=8,
    remove_columns=raw_dataset.column_names
)
tokenized_dataset_16384 = tokenized_dataset_16384.shuffle(seed=42)


Map (num_proc=8): 100%|██████████| 131035/131035 [02:03<00:00, 1060.08 examples/s]


In [75]:
tokenized_dataset_16384.to_parquet('data/processed/train_100M_16384.parquet')

Creating parquet from Arrow format: 100%|██████████| 10/10 [00:05<00:00,  1.81ba/s]


646421020

In [76]:
api.create_repo("babylm-seqlen/train_100M_16384", private=True, exist_ok=True, token=HF_TOKEN, repo_type="dataset")

api.upload_file(
    path_or_fileobj='data/processed/train_100M_16384.parquet',
    repo_id='babylm-seqlen/train_100M_16384',
    repo_type='dataset',
    path_in_repo='train_100M_16384.parquet', 
    token=HF_TOKEN
)

train_100M_16384.parquet: 100%|██████████| 244M/244M [00:09<00:00, 26.0MB/s] 


CommitInfo(commit_url='https://huggingface.co/datasets/babylm-seqlen/train_100M_16384/commit/76c26ad39d01bda6046911b7e421df1420cebb2a', commit_message='Upload train_100M_16384.parquet with huggingface_hub', commit_description='', oid='76c26ad39d01bda6046911b7e421df1420cebb2a', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/babylm-seqlen/train_100M_16384', endpoint='https://huggingface.co', repo_type='dataset', repo_id='babylm-seqlen/train_100M_16384'), pr_revision=None, pr_num=None)