In [1]:
from datasets import load_dataset
from tqdm import tqdm
import random

In [3]:
def chunk_corpus(dataset, max_bytes=1024, min_bytes=256):
    """Lazily yield byte-bounded text chunks for every document in ``dataset``.

    Args:
        dataset: Object indexable by the column names ``'text'`` and
            ``'meta'``; the two columns are paired positionally with ``zip``.
            Each ``meta`` entry must provide a ``'pile_set_name'`` key.
        max_bytes: Chunk size limit forwarded to ``split_utf8``.
        min_bytes: Threshold forwarded to ``is_enough_length``; chunks that
            fail that check are dropped.

    Yields:
        dict with keys ``'pile_idx'`` (position of the document in the
        dataset), ``'chunk_idx'`` (position of the chunk among *all* chunks of
        that document, counted before the length filter, so indices of kept
        chunks may have gaps), ``'text'`` and ``'pile_set_name'``.
    """
    documents = zip(dataset['text'], dataset['meta'])
    for pile_idx, (text, meta) in tqdm(enumerate(documents)):
        for chunk_idx, chunk in enumerate(split_utf8(text, max_bytes)):
            # Skip fragments that are too short (typically the document tail).
            if not is_enough_length(chunk, min_bytes):
                continue
            yield {
                'pile_idx': pile_idx,
                'chunk_idx': chunk_idx,
                'text': chunk,
                'pile_set_name': meta['pile_set_name']
            }

def split_utf8_rec(text, max_bytes=1024):
    """Split ``text`` into pieces of at most ``max_bytes`` UTF-8 bytes each.

    Cuts are only made on character boundaries, so every piece is valid text
    and concatenating the pieces reproduces ``text``.

    The function keeps its historical name but is implemented with a loop:
    the earlier one-frame-per-chunk recursion raised ``RecursionError`` on
    texts that produced more chunks than the interpreter's recursion limit,
    and never terminated when a single character was wider than ``max_bytes``.

    Args:
        text: String to split. Any falsy value yields an empty list.
        max_bytes: Maximum encoded size of a piece; must be at least 1.

    Returns:
        List of strings. A piece exceeds ``max_bytes`` only when it consists
        of one character whose encoding alone is wider than ``max_bytes``.

    Raises:
        ValueError: If ``max_bytes`` is smaller than 1.
    """
    if max_bytes < 1:
        raise ValueError("max_bytes must be at least 1")
    if not text:
        return []

    # Encode once and walk byte offsets instead of re-encoding the remainder.
    encoded = text.encode('utf-8')
    total = len(encoded)
    pieces = []
    start = 0
    while total - start > max_bytes:
        cut = start + max_bytes
        # Bytes of the form 10xxxxxx are continuation bytes: step back until
        # the cut sits on the first byte of a character.
        while cut > start and encoded[cut] & 0xC0 == 0x80:
            cut -= 1
        if cut == start:
            # The character at ``start`` is wider than max_bytes; emit it
            # whole so the loop always makes progress.
            cut = start + 1
            while cut < total and encoded[cut] & 0xC0 == 0x80:
                cut += 1
        pieces.append(encoded[start:cut].decode('utf-8'))
        start = cut
    if start < total:
        pieces.append(encoded[start:].decode('utf-8'))
    return pieces

def split_utf8(text, max_bytes=1024):
    """Split ``text`` into chunks of at most ``max_bytes`` UTF-8 bytes each.

    Cuts are only made on character boundaries, so every chunk decodes
    cleanly and ``"".join(chunks) == text``.

    The text is encoded once and sliced by byte offset; re-encoding the
    remaining text on every iteration made the cost grow quadratically with
    document length.

    Args:
        text: String to split.
        max_bytes: Maximum encoded size of a chunk; must be at least 1.

    Returns:
        List of strings (empty for empty ``text``). A chunk exceeds
        ``max_bytes`` only when it consists of one character whose encoding
        alone is wider than ``max_bytes``.

    Raises:
        ValueError: If ``max_bytes`` is smaller than 1 (this previously
            caused an endless loop).
    """
    if max_bytes < 1:
        raise ValueError("max_bytes must be at least 1")

    encoded = text.encode('utf-8')
    total = len(encoded)
    chunks = []
    start = 0
    while total - start > max_bytes:
        end = start + max_bytes
        # 10xxxxxx marks a continuation byte; back off to the lead byte so a
        # multi-byte character is never cut in half.
        while end > start and encoded[end] & 0xC0 == 0x80:
            end -= 1
        if end == start:
            # The character at ``start`` does not fit in max_bytes. Emit it
            # whole rather than producing an empty chunk and spinning forever.
            end = start + 1
            while end < total and encoded[end] & 0xC0 == 0x80:
                end += 1
        chunks.append(encoded[start:end].decode('utf-8'))
        start = end
    if start < total:
        chunks.append(encoded[start:].decode('utf-8'))
    return chunks

def is_enough_length(text, min_bytes=256):
    """Return True when ``text`` encodes to strictly more than ``min_bytes`` bytes.

    The comparison is strict: a text of exactly ``min_bytes`` UTF-8 bytes is
    rejected.
    """
    byte_count = len(text.encode('utf-8'))
    return byte_count > min_bytes

In [4]:
# Seed Python's RNG so the random.sample call further down is repeatable.
random.seed(42)
# Load the corpus only when it is not already present in the kernel: a fresh
# "Restart & Run All" no longer fails with NameError on `dataset`, while
# re-running this cell in a live session skips the expensive load.
if "dataset" not in globals():
    dataset = load_dataset("monology/pile-uncopyrighted")
    dataset = dataset['train']
# First 1,000,000 records; chunk_corpus reads the result by column name
# ('text' and 'meta').
corpus_1M = dataset[:1_000_000]

In [5]:
# Materialise every chunk in a list: chunk_corpus is a generator, and the
# later len() / random.sample() calls need a sized sequence. The recorded run
# below took several minutes for 1M documents.
chunked = list(chunk_corpus(corpus_1M, max_bytes=1024, min_bytes=256))

1000000it [04:41, 3557.06it/s]


In [6]:
# Number of chunks that passed the minimum-length filter.
len(chunked)

5703791

In [9]:
# Draw 40,000 chunks without replacement (this fixes how much RNG state is
# consumed) and keep only the first 10,000 of them as ds1.
ds1 = random.sample(chunked, 40_000)[:10_000]

In [10]:
# Sanity check: sample size and the first sampled record.
print(f"len(ds1): {len(ds1)}", f"ds1[0]: {ds1[0]}", sep="\n")

len(ds1): 10000
ds1[0]: {'pile_idx': 940294, 'chunk_idx': 237, 'text': "s of\nour wonderful Abbey Church.\n\n[Illustration: CHEAPSIDE CROSS.]\n\nCheapside Cross was 're-edified' in 1441, and afterwards newly gilt and\nnewly burnished. Defaced and repaired at different times, little was\nleft of the original when the cross was cleared away in 1647, at the\nsame time as Charing Cross.\n\nOnly three of the original Eleanor crosses remain: two in\nNorthamptonshire--one at Geddington, and the other at Northampton, and\nthe third at Waltham Cross. Every Englishman should be proud of these\nglorious records of a past age, which not only tell of the devoted love\nof two sovereigns, of whom we all must be proud, but also because they\nprove the high state of English art at this time. Until late years, when\ncertain documents were discovered containing the names of the artists,\nthe historians of art attempted to believe that the designs were too\ngood for Englishmen, and must have been made by 