## Sampling from HF using Streaming

Start by counting how many examples each genre has. Streaming lets you "peek" at the dataset without downloading it in full first.

In [None]:
from collections import Counter
from datasets import load_dataset
import numpy as np

In [None]:
# Open the MultiNLI training split as a stream and iterate over it directly.
train_stream = load_dataset("nyu-mll/multi_nli", split="train", streaming=True)

# A single full pass over the stream: tally the number of examples per genre.
# Keys end up in the order in which each genre is first encountered.
genre_counts = Counter(example["genre"] for example in train_stream)

genre_counts

Counter({'telephone': 83348,
         'government': 77350,
         'travel': 77350,
         'fiction': 77348,
         'slate': 77306})

As before, decide how many docs to take per genre, keeping each genre's quota proportional to its share of the training split.

In [None]:


# Total number of rows wanted across all genres.
N = 20000
total = sum(genre_counts.values())

# Proportional allocation: a genre's quota is its fraction of the full split,
# scaled to N and rounded to the nearest integer.
per_genre = {}
for genre, count in genre_counts.items():
    per_genre[genre] = round(count / total * N)

# Rounding each genre independently can leave the quotas slightly off N.
# Fold the whole discrepancy into the genre that has the largest quota
# (the first such genre in dict order when several tie).
diff = N - sum(per_genre.values())
if diff:
    biggest = max(per_genre.items(), key=lambda item: item[1])[0]
    per_genre[biggest] += diff

per_genre, sum(per_genre.values())

({'government': 3939,
  'telephone': 4246,
  'fiction': 3939,
  'travel': 3939,
  'slate': 3937},
 20000)

Collect rows from the stream until every genre's quota is met.

In [None]:
import pandas as pd
from collections import defaultdict

# Re-create the stream so this pass starts again from the first example.
train_stream = load_dataset("nyu-mll/multi_nli", split="train", streaming=True)

# Running count of accepted rows per genre, and the accepted rows themselves.
picked = dict.fromkeys(per_genre, 0)
rows = []

for ex in train_stream:
    g = ex["genre"]
    quota = per_genre.get(g)

    # Rows are accepted in stream order: an example is kept only while its
    # genre has a quota and that quota is not yet filled.
    if quota is not None and picked[g] < quota:
        rows.append(
            {field: ex[field] for field in ("premise", "hypothesis", "label", "genre")}
        )
        picked[g] += 1

    # Leave the loop as soon as no genre is still below its quota, so the
    # remainder of the stream is never read.
    if not any(picked[name] < per_genre[name] for name in per_genre):
        break

# Build the frame once from the accumulated records, then show the per-genre
# row counts alongside the total number of rows.
df = pd.DataFrame(rows)
df["genre"].value_counts(), len(df)

(genre
 telephone     4246
 government    3939
 fiction       3939
 travel        3939
 slate         3937
 Name: count, dtype: int64,
 20000)

In [None]:
# Preview the first three collected rows.
df.head(3)

Unnamed: 0,premise,hypothesis,label,genre
0,Conceptually cream skimming has two basic dime...,Product and geography are what make cream skim...,1,government
1,you know during the season and i guess at at y...,You lose the things to the following level if ...,0,telephone
2,One of our number will carry out your instruct...,A member of my team will execute your orders w...,0,fiction
