In [None]:
#| default_exp fineweb

In [None]:
from datasets import load_dataset
import os
import numpy as np

In [7]:
# Directory where token shards are written; set DATA_DIR to override the default.
data_dir = os.environ.get("DATA_DIR") or "./edu_fineweb10B"
# Cache location handed to load_dataset; None defaults to ~/.cache/huggingface/datasets.
cache_dir = os.environ.get("CACHE_DIR")
os.makedirs(data_dir, exist_ok=True)

In [8]:
# Dataset configuration name, passed to load_dataset as `name`.
remote_name = "sample-10BT"

In [9]:
# Load the train split of the selected fineweb-edu configuration, using cache_dir as the cache.
fw = load_dataset(
    "HuggingFaceFW/fineweb-edu",
    name=remote_name,
    split="train",
    cache_dir=cache_dir,
)
fw, len(fw)

Resolving data files:   0%|          | 0/1630 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/98 [00:00<?, ?it/s]

(Dataset({
     features: ['text', 'id', 'dump', 'url', 'file_path', 'language', 'language_score', 'token_count', 'score', 'int_score'],
     num_rows: 9672101
 }),
 9672101)

In [10]:
import pprint
# Pretty-print the first record to inspect its fields.
pprint.pprint(fw[0])

{'dump': 'CC-MAIN-2013-20',
 'file_path': 's3://commoncrawl/crawl-data/CC-MAIN-2013-20/segments/1368696381249/warc/CC-MAIN-20130516092621-00000-ip-10-60-113-184.ec2.internal.warc.gz',
 'id': '<urn:uuid:0d8a309d-25c5-405d-a08a-c11239f0d717>',
 'int_score': 3,
 'language': 'en',
 'language_score': 0.9743200540542603,
 'score': 2.75,
 'text': 'The Independent Jane\n'
         'For all the love, romance and scandal in Jane Austen’s books, what '
         'they are really about is freedom and independence. Independence of '
         'thought and the freedom to choose.\n'
         'Elizabeth’s refusal of Mr. Collins offer of marriage showed an '
         'independence seldom seen in heroines of the day. Her refusal of Mr. '
         'Darcy while triggered by anger showed a level of independence that '
         'left him shocked and stunned.\n'
         'The freedom she exhibited in finally accepting him in direct '
         'defiance of Lady Catherine and knowing her father would disapprove 

In [12]:
# | export
import tiktoken
# GPT-2 BPE encoder used by to_tokens_np.
gpt2_encoder = tiktoken.get_encoding("gpt2")
# Id of the "<|endoftext|>" special token, read through the public `eot_token`
# property instead of reaching into the private `_special_tokens` mapping.
eot = gpt2_encoder.eot_token  # end of text token


def to_tokens_np(doc):
    """Tokenize a single document and return a numpy array of uint16 tokens.

    The returned sequence starts with the end-of-text token, which acts as the
    separator between documents, followed by the encoding of ``doc["text"]``.
    """
    # encode_ordinary: special tokens in the text are ignored
    text_ids = gpt2_encoder.encode_ordinary(doc["text"])
    return np.array([eot, *text_ids], dtype=np.uint16)

def load_np_tokens(input_file):
    """Load a saved token array from ``input_file`` and return it converted to int32."""
    return np.load(input_file).astype(np.int32)

In [13]:
# Sanity check: the first ten token ids of the first document, alongside the eot id.
to_tokens_np(fw[0])[:10], eot

(array([50256,   464, 13362, 12091,   198,  1890,   477,   262,  1842,
           11], dtype=uint16),
 50256)

In [14]:
# | export
import numpy as np
import os
from tqdm import tqdm


class DataDirNotEmptyError(Exception):
    """Raised by TokenShardWriter.write when the data directory already has
    entries and overwriting was not requested."""


class TokenShardWriter:
    """Buffer token arrays and save them to disk as fixed-size ``.npy`` shards.

    Tokens are accumulated in a preallocated uint16 buffer of ``shard_size``
    elements. When an incoming write does not fit, the buffer is filled,
    saved, and a new shard is started. Shard 0 is saved as ``val_000000.npy``;
    every later shard is saved as ``train_<index>.npy``.

    Use as a context manager so that the last, partially filled shard is
    flushed on exit.

    Args:
        shard_size: number of tokens per shard; must be positive.
        data_dir: directory the shard files are written to (created if missing).
        overwrite: when False, the first ``write`` raises DataDirNotEmptyError
            if ``data_dir`` already contains entries.

    Raises:
        ValueError: if ``shard_size`` is not positive.
    """

    def __init__(self, shard_size=2**20, data_dir=".", overwrite=False):
        if shard_size <= 0:
            raise ValueError("shard_size must be a positive number of tokens")
        self.shard_size = shard_size
        self.shard_index = 0
        self.tokens_buffer = np.empty((shard_size), dtype=np.uint16)
        self.shard_token_count = 0  # number of tokens in the current shard
        self.pbar = None
        self.data_dir = data_dir
        self.overwrite = overwrite
        # The emptiness check must only look at pre-existing files, not at the
        # shards this writer produces itself, so it is performed once.
        self._dir_checked = False

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Flush whatever is buffered, then release the progress bar if any.
        self.clear_buffer()
        if self.pbar is not None:
            self.pbar.close()
            self.pbar = None

    def write(self, tokens):
        """Append ``tokens`` to the current shard, saving shards as they fill.

        Raises:
            DataDirNotEmptyError: if ``overwrite`` is False and ``data_dir``
                already contained entries before this writer's first write.
        """
        if not self._dir_checked:
            os.makedirs(self.data_dir, exist_ok=True)
            if not self.overwrite and os.listdir(self.data_dir):
                raise DataDirNotEmptyError("Data directory is not empty")
            self._dir_checked = True
        return self._write_shard(tokens)

    def _shard_path(self):
        """Return the file path of the current shard."""
        # first shard is val, the rest is train
        split = "val" if self.shard_index == 0 else "train"
        return os.path.join(self.data_dir, f"{split}_{self.shard_index:06d}.npy")

    def _new_pbar(self):
        """Create a progress bar for the current shard."""
        return tqdm(
            total=self.shard_size,
            unit="tokens",
            desc=f"Shard {self.shard_index}",
        )

    def _write_shard(self, tokens):
        """Copy ``tokens`` into the buffer, saving every shard that fills up.

        Loops so that an input spanning several shard boundaries is handled,
        not just a single overflow.
        """
        total = len(tokens)
        offset = 0  # number of incoming tokens already consumed
        while True:
            if self.pbar is None:
                self.pbar = self._new_pbar()
            free = self.shard_size - self.shard_token_count
            pending = total - offset

            # current shard has enough space for the remaining tokens
            if pending <= free:
                start = self.shard_token_count
                self.tokens_buffer[start : start + pending] = tokens[offset:]
                self.shard_token_count += pending
                self.pbar.update(pending)
                return

            # not enough space: fill the current shard and save it
            self.tokens_buffer[self.shard_token_count :] = tokens[
                offset : offset + free
            ]
            self.pbar.update(free)
            self.dump_tokens_np(self._shard_path(), self.tokens_buffer)
            self.pbar.close()
            self.pbar = None
            # start a new shard; the rest is handled by the next iteration
            self.shard_index += 1
            self.shard_token_count = 0
            offset += free

    def clear_buffer(self):
        """Save the buffered tokens (if any) as a final, possibly short shard."""
        if self.shard_token_count > 0:
            self.dump_tokens_np(
                self._shard_path(), self.tokens_buffer[: self.shard_token_count]
            )
            self.shard_token_count = 0
            # Move on to the next index so that a write after a flush cannot
            # overwrite the file that was just saved.
            self.shard_index += 1
            if self.pbar is not None:
                self.pbar.close()
                self.pbar = None

    def dump_tokens_np(self, filename, tokens):
        """Save ``tokens`` to ``filename`` with ``np.save``."""
        np.save(filename, tokens)

In [15]:
import multiprocessing as mp

In [17]:
# Tokenize with half of the reported CPUs, but never fewer than one worker.
nproc = max(1, mp.cpu_count() // 2)

try:
    with TokenShardWriter(int(1e8), data_dir=data_dir) as wrt, mp.Pool(nproc) as pool:
        # imap yields results in input order
        for tokens in pool.imap(to_tokens_np, fw, chunksize=16):
            wrt.write(tokens)
except DataDirNotEmptyError:
    print("Data directory is not empty, skipping")

Data directory is not empty, skipping
