## user-end goals
- [ ] use fineweb-edu, sample-10BT dataset
- [ ] join documents with <BOS>
- [ ] add max document length (2x context length) - split at reasonable point
## model changes
- [ ] (maybe) add GQA
- [ ] remove bias in linear layers
- [ ] document masking

In [1]:
import requests
import os
import sys
import pyarrow
import pyarrow.parquet as pq
from pathlib import Path
from tqdm import tqdm

def get_project_info() -> tuple[Path, Path]:
  """Locate the project root and the directory the notebook was started in.

  Starting at the resolved current working directory and walking upward,
  the first directory that contains a ``toy_transformers`` entry is taken
  as the project root. If no such directory is found, the working
  directory itself is used as the root.

  Returns:
    ``(root, current)`` where ``current`` is the resolved working directory.
  """
  here = Path.cwd().resolve()
  candidates = (here, *here.parents)
  # Nearest match wins; fall back to the working directory when nothing matches.
  project_root = next(
    (folder for folder in candidates if (folder / "toy_transformers").exists()),
    here,
  )
  return project_root, here

# Run the path setup only once per kernel: after the chdir below, calling
# get_project_info() again would report the project root as the current
# directory and overwrite EXPERIMENT_DIR with it.
if 'ROOT_DIR' not in globals():
	ROOT_DIR, EXPERIMENT_DIR = get_project_info()
	# Put the project root on sys.path so the toy_transformers import below resolves.
	if str(ROOT_DIR) not in sys.path:
		sys.path.append(str(ROOT_DIR))
	# Work from the project root from here on.
	if Path.cwd() != ROOT_DIR:
		os.chdir(ROOT_DIR)

# Local cache directory for the downloaded parquet shards (see stream_raw_ds).
DATA_DIR = ROOT_DIR / "data/fineweb-edu/sample/10BT"
DATA_DIR.mkdir(parents=True, exist_ok=True)

from toy_transformers.tokenization import create_bpe, bulk_encode, Vocabulary, TokenizationMode

In [2]:
# Threshold (100 MiB of Arrow data) at which stream_raw_ds emits a batch.
BATCH_SIZE_BYTES = 100 * 1024 * 1024
# Endpoint whose JSON response is indexed as a list of parquet shard URLs
# for the sample-10BT / train split.
API_URL = "https://huggingface.co/api/datasets/HuggingFaceFW/fineweb-edu/parquet/sample-10BT/train"

# requests never times out by default; bound the call so a stalled
# connection fails the cell instead of hanging the kernel.
response = requests.get(API_URL, timeout=30)
response.raise_for_status()
SHARD_URLS = response.json()

def download_shard(url, dst: Path, timeout=(10, 60)):
	"""Download ``url`` to ``dst`` unless ``dst`` already exists.

	The body is streamed into a sibling ``.tmp`` file and only renamed to
	``dst`` once the transfer completed, so an existing ``dst`` can be
	trusted as a finished download.

	Args:
		url: Location of the file to fetch.
		dst: Final path of the downloaded file.
		timeout: Passed to ``requests.get``; ``(connect, read)`` seconds.
			Without it a stalled connection would block forever.

	Raises:
		requests.HTTPError: On a non-success HTTP status.
		IOError: If fewer/more bytes arrived than ``Content-Length`` announced.
	"""
	if dst.exists():
		return

	tmp = dst.with_suffix(".tmp")
	try:
		with requests.get(url, stream=True, headers={"User-Agent": "python"}, timeout=timeout) as r:
			r.raise_for_status()

			total = int(r.headers.get("Content-Length", 0))
			written = 0
			with open(tmp, "wb") as f, tqdm(total=total, unit="B", unit_scale=True, desc=dst.name) as bar:
				for chunk in r.iter_content(chunk_size=1024 * 1024):
					if chunk:
						f.write(chunk)
						written += len(chunk)
						bar.update(len(chunk))

			# A truncated file must never reach dst: the exists() shortcut above
			# would then keep serving it. Content-Length counts encoded bytes, so
			# the check is skipped when the body was transparently decompressed.
			if total and "Content-Encoding" not in r.headers and written != total:
				raise IOError(
					f"incomplete download of {url}: got {written} of {total} bytes"
				)
		tmp.rename(dst)

	except BaseException:
		# Also covers KeyboardInterrupt so an interrupted cell leaves no partial file.
		tmp.unlink(missing_ok=True)
		raise

def stream_raw_ds(columns=None, shards=None, batch_size_bytes=BATCH_SIZE_BYTES):
	"""Yield pyarrow tables of roughly ``batch_size_bytes`` from the parquet shards.

	Each requested shard is downloaded into ``DATA_DIR`` if necessary and read
	one row group at a time. Row groups accumulate until their combined
	``nbytes`` reaches the threshold, so a yielded table can exceed
	``batch_size_bytes`` and can span more than one shard. Whatever is left
	after the last shard is yielded as a final, smaller table.

	Args:
		columns: Column names to read, forwarded to ``read_row_group``.
		shards: Indices into ``SHARD_URLS``; all shards when ``None``.
		batch_size_bytes: Accumulated size at which a table is emitted.
	"""
	shard_ids = range(len(SHARD_URLS)) if shards is None else shards

	pending = []
	pending_bytes = 0

	for shard_id in shard_ids:
		shard_path = DATA_DIR / f"{shard_id:02d}.parquet"
		download_shard(SHARD_URLS[shard_id], shard_path)

		parquet = pq.ParquetFile(shard_path)

		for group_idx in range(parquet.metadata.num_row_groups):
			group = parquet.read_row_group(group_idx, columns=columns)
			pending.append(group)
			pending_bytes += group.nbytes

			if pending_bytes < batch_size_bytes:
				continue

			yield pyarrow.concat_tables(pending)
			pending, pending_bytes = [], 0

	# Flush the remainder that never reached the threshold.
	if pending:
		yield pyarrow.concat_tables(pending)

In [3]:
# Special tokens handed to the BPE trainer. BOS is placed in front of every
# document by stream_texts; PAD is only registered here.
BOS, PAD = "<BOS>", "<PAD>"
SPECIAL_TOKENS = [BOS, PAD]

# Vocabulary size requested from the BPE trainer: 2**15 = 32,768.
VOCAB_SIZE = 2 ** 15

# Artifacts of this experiment live under <experiment dir>/data:
# the encoded token shards and the trained vocabulary.
OUTPUT_DIR = EXPERIMENT_DIR / "data" / "encoded"
VOCAB_PATH = EXPERIMENT_DIR / "data" / "vocab_32k.json"

def stream_texts(shards=None):
	"""Yield UTF-8 encoded text blobs, one per batch from ``stream_raw_ds``.

	Within a blob every document of the batch is preceded by ``BOS``
	(``BOS`` is prepended once and used as the joiner between documents).

	Args:
		shards: Shard indices forwarded to ``stream_raw_ds``; all when ``None``.
	"""
	for table in stream_raw_ds(columns=["text"], shards=shards):
		documents = table["text"].to_pylist()
		blob = BOS + BOS.join(documents)
		yield blob.encode('utf-8')

In [4]:
# Train the BPE vocabulary on the BOS-joined text of the first five parquet shards.
vocab = create_bpe(
	data_iter=stream_texts(shards=[0, 1, 2, 3, 4]),
	vocab_size=VOCAB_SIZE,
	mode=TokenizationMode.BYTES,
	special_tokens=SPECIAL_TOKENS
)

# Save to the shared VOCAB_PATH constant rather than re-spelling the path, so
# training, loading and bulk_encode cannot drift apart.
# NOTE(review): it is not visible here whether Vocabulary.save creates missing
# directories, so the parent is created up front (a no-op when it exists).
VOCAB_PATH.parent.mkdir(parents=True, exist_ok=True)
vocab.save(VOCAB_PATH)

preprocessing: 162shard [05:29,  2.03s/shard]


starting merging...


BPE Training: 100%|██████████| 32510/32510 [02:36<00:00, 208.29it/s] 


In [4]:
# Reload the trained vocabulary from the shared VOCAB_PATH constant (the same
# file bulk_encode is pointed at) instead of a second copy of the path literal.
vocab = Vocabulary.load(VOCAB_PATH)

In [5]:
# Tokenize the BOS-joined text of parquet shard 0 with the trained vocabulary
# and write the token shards under OUTPUT_DIR (the recorded run below produced
# shard_0000.bin through shard_0007.bin).
# NOTE(review): split_token=BOS presumably tells bulk_encode where documents
# begin so it can cut the stream at document boundaries — confirm against
# toy_transformers.tokenization.
bulk_encode(
	doc_iter=stream_texts(shards=[0]),
	vocab=vocab,
	vocab_path=VOCAB_PATH,
	output_dir=OUTPUT_DIR,
	split_token=BOS
)

encoding: 5chunk [00:20,  2.41s/chunk]

wrote 99,988,431 tokens to shard_0000.bin


encoding: 9chunk [00:36,  3.10s/chunk]

wrote 99,997,968 tokens to shard_0001.bin


encoding: 13chunk [00:51,  4.39s/chunk]

wrote 99,996,637 tokens to shard_0002.bin


encoding: 18chunk [00:57,  2.16s/chunk]

wrote 99,990,697 tokens to shard_0003.bin


encoding: 22chunk [01:12,  2.63s/chunk]

wrote 99,997,764 tokens to shard_0004.bin


encoding: 26chunk [01:26,  3.52s/chunk]

wrote 99,995,411 tokens to shard_0005.bin


encoding: 32chunk [01:37,  2.28s/chunk]

wrote 99,988,214 tokens to shard_0006.bin


encoding: 33chunk [01:37,  2.96s/chunk]


wrote 51,902,468 tokens to shard_0007.bin


In [5]:
from toy_transformers.tokenization import _read_shard

# Sanity check of one encoded shard: shape/dtype, a decoded preview, and the
# BOS markers that delimit documents.

# Single source of truth for the preview length used by the slice and the label.
PREVIEW_TOKENS = 200

shard = _read_shard(OUTPUT_DIR / "shard_0003.bin")
print(f"Shard shape: {shard.shape}, dtype: {shard.dtype}")
print(f"First 20 token IDs: {shard[:20]}")

# decode the first PREVIEW_TOKENS tokens
decoded = vocab.decode(shard[:PREVIEW_TOKENS].tolist())
# A token boundary may fall inside a multi-byte character, hence errors="replace".
text = b"".join(decoded).decode("utf-8", errors="replace")
print(f"\n=== First {PREVIEW_TOKENS} tokens decoded ===\n{text}")

# check BOS positions; look the id up via the BOS constant used for training
# instead of a second hard-coded copy of the token text
bos_id = vocab.token_to_idx[BOS.encode("utf-8")]
bos_positions = (shard == bos_id).nonzero()[0]
print(f"\nBOS id: {bos_id}")
print(f"Number of documents in shard: {len(bos_positions)}")
print(f"First 5 BOS positions: {bos_positions[:5]}")

Shard shape: (99990484,), dtype: uint16
First 20 token IDs: [    0    87 10453 13649    59  8865   328   444  2798   415  5218   284
   362 12366  3868    33    55    47  1156   308]

=== First 200 tokens decoded ===
<BOS>Version info: Code for this page was tested in Mplus version 6.12.
Zero-inflated poisson regression is used to model count data that has an excess of zero counts. Further, theory suggests that the excess zeros are generated by a separate process from the count values and that the excess zeros can be modeled independently. Thus, the zip model has two parts, a poisson count model and the logit model for predicting excess zeros. You may want to review these Data Analysis Example pages, Poisson Regression and Logit Regression.
Please Note: The purpose of this page is to show how to use various data analysis commands. It does not cover all aspects of the research process which researchers are expected to do. In particular, it does not cover data cleaning and verification, 

In [6]:
from toy_transformers.tokenization import shuffle_shards

# Shuffle the encoded shards from OUTPUT_DIR into <experiment dir>/data/shuffled
# (the next cell reads shard_0000.bin from that directory).
# NOTE(review): the progress bar counts "doc", so the shuffle is presumably at
# document granularity — confirm against toy_transformers.tokenization.
shuffle_shards(OUTPUT_DIR, EXPERIMENT_DIR / "data/shuffled")

shuffling: 162729doc [00:01, 109999.43doc/s]


In [7]:
from toy_transformers.tokenization import _read_shard

# Sanity check of the first shuffled shard: shape/dtype, a decoded preview,
# and the BOS markers that delimit documents.

# Single source of truth for the preview length, so the slice and the printed
# header always agree (the header used to claim 200 tokens while 300 were decoded).
PREVIEW_TOKENS = 300

shard = _read_shard(EXPERIMENT_DIR / "data/shuffled/shard_0000.bin")
print(f"Shard shape: {shard.shape}, dtype: {shard.dtype}")
print(f"First 20 token IDs: {shard[:20]}")

# decode the first PREVIEW_TOKENS tokens
decoded = vocab.decode(shard[:PREVIEW_TOKENS].tolist())
# A token boundary may fall inside a multi-byte character, hence errors="replace".
text = b"".join(decoded).decode("utf-8", errors="replace")
print(f"\n=== First {PREVIEW_TOKENS} tokens decoded ===\n{text}")

# check BOS positions; look the id up via the BOS constant used for training
# instead of a second hard-coded copy of the token text
bos_id = vocab.token_to_idx[BOS.encode("utf-8")]
bos_positions = (shard == bos_id).nonzero()[0]
print(f"\nBOS id: {bos_id}")
print(f"Number of documents in shard: {len(bos_positions)}")
print(f"First 5 BOS positions: {bos_positions[:5]}")

Shard shape: (94011011,), dtype: uint16
First 20 token IDs: [    0  6100  3989   329  9155    12   458 12374   329   427 11973 25620
  4601 25303   559  6489   335   263 14354    12]

=== First 200 tokens decoded ===
<BOS>Not Just for Kids
The Hunt for Falling Leaves...
Nature's Color on the Ground
by Mary Catherine Ball
Being a reporter, I am always looking for an adventure. Last week, I found one.
I left work to go on a simple journey, but it turned out to be much more.
First, I crossed a mud-ridden stream. Then, I came face to face with flying creatures, fighting to get near me. I even endured webmakers spinning my hair into a shiny maze.
Where did I go? Into the woods, of course. Why? I wanted to gather some fallen leaves.
My luck was good that day. I was able to spy lots of different kinds of leaves lying on the ground. Some were leaves I had never seen. Some were still green, while others were changing to their autumn colors.
Have you ever hunted for leaves? I wonder if you know 