# FineWeb-Edu sample-10BT — Download shards, stream batches
*thanks claude!*

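Downloads one shard at a time to `../../data/fineweb-edu/sample/10BT/` (relative to
the notebook's working directory), then streams batches of roughly 100 MB from it
via PyArrow. Shards that are already downloaded are skipped. Unless specific shard
indices are requested, only the first five shards are streamed.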

In [3]:
import urllib.request
import json
import os
import pyarrow.parquet as pq
from pathlib import Path
from tqdm import tqdm

# Local cache directory for downloaded shards. The path is relative, so it is
# resolved against the current working directory; it is created (including
# parents) as soon as this cell/module runs.
DATA_DIR = Path("../../data/fineweb-edu/sample/10BT")
DATA_DIR.mkdir(parents=True, exist_ok=True)

# Default size threshold for stream_fineweb(): row groups are accumulated until
# their combined Table.nbytes reaches this value.
BATCH_SIZE_BYTES = 100 * 1024 * 1024  # 100 MB

# Hugging Face Hub endpoint queried by get_shard_urls(); stream_fineweb() indexes
# the decoded JSON response by shard number to obtain each shard's URL.
API_URL = "https://huggingface.co/api/datasets/HuggingFaceFW/fineweb-edu/parquet/sample-10BT/train"


def get_shard_urls():
    """Fetch ``API_URL`` and return its body decoded from JSON.

    The response is read completely while the connection is open and parsed
    afterwards. Callers in this file treat the result as a sequence of shard
    URLs indexed by shard number.
    """
    with urllib.request.urlopen(API_URL) as response:
        payload = response.read()
    return json.loads(payload)


def download_shard(hf_url, dest: Path):
    """Download one shard to ``dest`` unless a file is already there.

    The body is streamed into a sibling ``.tmp`` file and moved into place only
    after the transfer finished and its size matched the advertised
    Content-Length. An interrupted or truncated download therefore never ends
    up at ``dest``, where later runs would skip it as "already exists".

    Args:
        hf_url: URL of the shard. HTTP redirects are followed by urllib, so the
            body is read from wherever the URL finally resolves to.
        dest: Final path of the downloaded file.

    Raises:
        OSError: If the number of bytes received differs from Content-Length.
            Errors raised by ``urlopen`` or by file I/O propagate unchanged.
    """
    if dest.exists():
        print(f"  already exists, skipping: {dest.name}")
        return

    tmp = dest.with_suffix(".tmp")
    chunk = 8 * 1024 * 1024  # 8 MB chunks
    req = urllib.request.Request(hf_url, headers={"User-Agent": "python"})
    try:
        # One request is enough: urlopen already follows the redirect, so the
        # response body can be consumed directly instead of re-opening r.url.
        with urllib.request.urlopen(req) as src:
            expected = int(src.headers.get("Content-Length", 0))
            received = 0
            # total=None lets tqdm show a plain counter when the size is unknown.
            with open(tmp, "wb") as dst, \
                 tqdm(total=expected or None, unit="B", unit_scale=True, desc=dest.name) as bar:
                while True:
                    buf = src.read(chunk)
                    if not buf:
                        break
                    dst.write(buf)
                    received += len(buf)
                    bar.update(len(buf))

        # read(amt) reports a dropped connection as a normal end of stream, so
        # the byte count is the only way to notice a truncated body.
        if expected and received != expected:
            raise OSError(
                f"incomplete download of {dest.name}: "
                f"got {received} of {expected} bytes"
            )

        # replace() also succeeds if dest appeared in the meantime (rename()
        # raises on Windows in that case).
        tmp.replace(dest)
    finally:
        # After a successful replace() the temp file is gone; anything still
        # here is a partial download that must not be left behind.
        if tmp.exists():
            tmp.unlink()


def stream_fineweb(columns=None, shards=None, batch_size_bytes=BATCH_SIZE_BYTES,
                   max_shards=5):
    """Yield PyArrow Tables of roughly ``batch_size_bytes`` each.

    Every selected shard is downloaded on demand (files already present are
    reused) and then read one row group at a time. Row groups are accumulated
    until their combined in-memory size (``Table.nbytes``) reaches
    ``batch_size_bytes``; whatever is left at the end of a shard is yielded as
    a final, possibly smaller batch. A batch therefore never mixes rows from
    two shards.

    Args:
        columns: list of column names to load, or None for all.
        shards: iterable of zero-based shard indices into the listing returned
            by ``get_shard_urls()``, or None to take the first ``max_shards``
            shards.
        batch_size_bytes: target uncompressed batch size in bytes.
        max_shards: cap on the number of shards used when ``shards`` is None.
            Pass None to stream every shard in the listing. Ignored when
            ``shards`` is given.

    Yields:
        pyarrow.Table: the concatenated row groups of one batch.

    Raises:
        IndexError: if a requested shard index is outside the listing.
    """
    # Imported once here rather than inside the loop; the module-level import
    # visible in this file only binds ``pq`` (pyarrow.parquet).
    import pyarrow as pa

    urls = get_shard_urls()
    if shards is not None:
        indices = shards
    elif max_shards is None:
        indices = range(len(urls))
    else:
        indices = range(min(max_shards, len(urls)))

    for i in indices:
        # Reject negative indices as well: urls[-1] would succeed but be cached
        # under a misleading file name such as "-001.parquet".
        if not 0 <= i < len(urls):
            raise IndexError(
                f"shard index {i} out of range (valid: 0-{len(urls) - 1})"
            )

        dest = DATA_DIR / f"{i:04d}.parquet"
        print(f"Shard {i}/{len(urls)-1}")
        download_shard(urls[i], dest)

        pf = pq.ParquetFile(dest)
        batch_tables = []
        batch_bytes = 0

        for rg in range(pf.metadata.num_row_groups):
            table = pf.read_row_group(rg, columns=columns)
            batch_tables.append(table)
            batch_bytes += table.nbytes

            if batch_bytes >= batch_size_bytes:
                yield pa.concat_tables(batch_tables)
                batch_tables = []
                batch_bytes = 0

        # Flush the remainder of this shard before moving on to the next one.
        if batch_tables:
            yield pa.concat_tables(batch_tables)


In [5]:
# Example: stream every batch of shard 0 and preview the first row of each
for batch in stream_fineweb(columns=["text", "url", "score", "token_count"], shards=[0]):
    # Convert only the two previewed cells to Python objects instead of
    # materialising the whole ~100 MB batch with to_pydict().
    first_url = batch.column("url")[0].as_py()
    first_text = batch.column("text")[0].as_py()
    print(f"Batch: {batch.num_rows} rows, {batch.nbytes / 1024**2:.1f} MB")
    print(f"  first url : {first_url}")
    print(f"  first text: {first_text[:120].replace(chr(10), ' ')}...")
    print()


Shard 0/13
  already exists, skipping: 0000.parquet
Batch: 22000 rows, 104.4 MB
  first url : http://austenauthors.net/the-independent-jane
  first text: The Independent Jane For all the love, romance and scandal in Jane Austen’s books, what they are really about is freedom...

Batch: 22000 rows, 102.6 MB
  first url : https://www.bankofcanadamuseum.ca/2022/11/design-your-own-bank-note/
  first text: Your students will research and choose their own iconic Canadian, theme, images and symbols to go on a new bank note. Lo...

Batch: 22000 rows, 104.0 MB
  first url : https://drwho.virtadpt.net/archive/2007-02-08/an-interesting-method-of-data-visualisation/
  first text: An interesting method of data visualisation. Data visualisation is a process in which the bits of a given data field are...

Batch: 22000 rows, 102.2 MB
  first url : https://coastfunds.ca/stories/reclaiming-control-how-the-gwasala-nakwaxdaxw-are-determining-their-economic-future/
  first text: Through in-depth community c