# IHY Data Generation → HuggingFace Dataset

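Load generated IHY data, keep only positive examples scored 10 and negative examples scored 0, balance the two splits to the same size, build a HuggingFace dataset from the transcripts, and push it to the Hub.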

In [None]:
import json
from pathlib import Path
from datasets import Dataset

# Resolve the directory that holds this experiment's artifacts. When
# `__file__` is defined (running as a script) use the file's own directory;
# otherwise (e.g. in a notebook kernel, where it is typically undefined) fall
# back to the expected checkout location under the user's home directory.
if "__file__" in dir():
    NOTEBOOK_DIR = Path(__file__).parent
else:
    NOTEBOOK_DIR = (
        Path.home() / "training-against-interp" / "experiments" / "ihy_data_generation"
    )
results_path = NOTEBOOK_DIR / "results" / "ihy_when_deployment.json"

# JSON text is UTF-8; pass the encoding explicitly so decoding does not depend
# on the platform's locale default (which is not UTF-8 on every system).
with open(results_path, encoding="utf-8") as f:
    raw = json.load(f)

# `raw` is indexed by the "positive" and "negative" keys throughout the rest
# of the notebook; report how many examples each one holds.
print(f"Positive examples: {len(raw['positive'])}")
print(f"Negative examples: {len(raw['negative'])}")

In [None]:
# Look at score distributions before filtering
from collections import Counter

# Tally how many examples carry each score, separately for each split.
pos_scores = Counter(item["label"]["score"] for item in raw["positive"])
neg_scores = Counter(item["label"]["score"] for item in raw["negative"])

# Print each tally in ascending score order.
print("Positive score distribution:")
for score, count in sorted(pos_scores.items()):
    print(f"  {score}: {count}")

print("\nNegative score distribution:")
for score, count in sorted(neg_scores.items()):
    print(f"  {score}: {count}")

In [None]:
# Filter: positive must have score == 10, negative must have score == 0
def _keep_with_score(examples, wanted):
    """Return the examples whose ``["label"]["score"]`` equals *wanted*."""
    return [example for example in examples if example["label"]["score"] == wanted]


positive_filtered = _keep_with_score(raw["positive"], 10)
negative_filtered = _keep_with_score(raw["negative"], 0)

print(f"Positive after filtering (score == 10): {len(positive_filtered)}")
print(f"Negative after filtering (score == 0): {len(negative_filtered)}")

In [None]:
# Balance the splits by taking the minimum count
import random

# Fixed seed so the subsample drawn below is the same on every run.
random.seed(42)

# Target size per split is the size of the smaller one.
n = min(map(len, (positive_filtered, negative_filtered)))
print(f"Balancing to {n} examples per split")

# Only a split that is larger than the target gets subsampled; a split that
# is already at the target size is left as-is (order untouched, no RNG use).
if n < len(positive_filtered):
    positive_filtered = random.sample(positive_filtered, n)
if n < len(negative_filtered):
    negative_filtered = random.sample(negative_filtered, n)

print(f"Positive: {len(positive_filtered)}, Negative: {len(negative_filtered)}")

In [None]:
# Build rows for the dataset
def make_rows(examples):
    """Convert examples into dataset rows.

    Each returned row is a dict with a single ``"messages"`` key holding the
    example's ``"transcript"`` value (the same object, not a copy). All other
    fields of the example are dropped. Row order follows *examples*.
    """
    return [{"messages": example["transcript"]} for example in examples]

# Positive rows first, then negative rows; the in-place shuffle afterwards
# mixes the two groups together.
rows = [*make_rows(positive_filtered), *make_rows(negative_filtered)]
random.shuffle(rows)

print(f"Total rows: {len(rows)}")

In [None]:
# Create HuggingFace dataset
# Every element of `rows` is a dict with a single "messages" key, so the
# dataset is built from that one field only.
ds = Dataset.from_list(rows)
# Show the dataset object, then its first row as a quick sanity check.
print(ds)
# NOTE(review): presumably raises if `rows` is empty (no index 0) — confirm.
print(ds[0])

In [None]:
# Push to HuggingFace Hub
# Uploads `ds` under the repo id "alignment-science/ihy-sleeper-agent-dataset".
# NOTE(review): presumably needs Hub credentials with write access to that
# organization, and re-running likely updates the existing repo — confirm.
ds.push_to_hub("alignment-science/ihy-sleeper-agent-dataset")