In [None]:
import pandas as pd
import numpy as np

FILE_PATH = "HIGGS.csv"
# Total number of rows wanted in the output; lower it to build a smaller subset.
TARGET_ROWS = 11_000_000

# 1. FIRST PASS — preview the head of the file to learn the header and class mix
sample = pd.read_csv(FILE_PATH, header=0, nrows=100000)

# Whatever the first column is called in the file, expose it as "label".
sample = sample.rename(columns={sample.columns[0]: "label"})

# Coerce labels to numbers; unparseable values become NaN and their rows are dropped.
sample = sample.assign(label=pd.to_numeric(sample["label"], errors="coerce"))
sample = sample.dropna(subset=["label"])

# Remember the header so the final frame can be given the same column names.
correct_columns = sample.columns

# Relative frequency of each label value within the preview.
label_counts = sample["label"].value_counts(normalize=True)
print("Estimated class distribution:", label_counts.to_dict())

# Per-class quotas: class 0 gets its estimated share of TARGET_ROWS
# (0.5 if the preview contains no class-0 rows), class 1 gets the remainder.
rows_class0 = int(label_counts.get(0.0, 0.5) * TARGET_ROWS)
rows_class1 = TARGET_ROWS - rows_class0

print("Sampling:", rows_class0, "from class 0")
print("Sampling:", rows_class1, "from class 1")

# Containers for samples: one list of per-chunk DataFrames for each class
class0, class1 = [], []

# 2. SECOND PASS — CHUNKED SAMPLING
# NOTE: every chunk contributes as many rows as are still needed, so the
# quotas are filled greedily from the start of the file. Rows are only drawn
# at random within the chunk that completes a quota.
chunk_size = 500_000

# Running row counts per class, so the totals are not re-summed over every
# stored chunk on each iteration.
have0 = have1 = 0

# The context manager closes the underlying file handle even when the loop
# stops early.
with pd.read_csv(FILE_PATH, chunksize=chunk_size, header=0) as reader:
    for chunk in reader:

        # Fix: ensure correct column name inside chunk
        chunk = chunk.rename(columns={chunk.columns[0]: "label"})

        # Clean label: non-numeric values become NaN and their rows are dropped
        chunk["label"] = pd.to_numeric(chunk["label"], errors="coerce")
        chunk = chunk.dropna(subset=["label"])

        # Split by class
        c0 = chunk[chunk["label"] == 0.0]
        c1 = chunk[chunk["label"] == 1.0]

        need0 = rows_class0 - have0
        need1 = rows_class1 - have1

        if need0 > 0:
            picked0 = c0.sample(min(len(c0), need0))
            class0.append(picked0)
            have0 += len(picked0)

        if need1 > 0:
            picked1 = c1.sample(min(len(c1), need1))
            class1.append(picked1)
            have1 += len(picked1)

        # Stop early if we have enough
        if have0 >= rows_class0 and have1 >= rows_class1:
            break

# The quotas are derived from an estimate, so the file may contain fewer rows
# of a class than requested; report the shortfall instead of hiding it.
if have0 < rows_class0 or have1 < rows_class1:
    print(
        "Warning: collected", have0, "of", rows_class0, "class 0 rows and",
        have1, "of", rows_class1, "class 1 rows",
    )

# 3. COMBINE + RESTORE ORIGINAL HEADER
# pd.concat raises ValueError on an empty list, which happens when a class
# has a quota of zero and therefore never received a chunk. In that case fall
# back to a zero-row frame that has the preview's columns and dtypes.
df0 = pd.concat(class0, ignore_index=True) if class0 else sample.iloc[:0].copy()
df1 = pd.concat(class1, ignore_index=True) if class1 else sample.iloc[:0].copy()

df = pd.concat([df0, df1], ignore_index=True)
# Shuffle the rows so the two classes are interleaved rather than stacked.
df = df.sample(frac=1).reset_index(drop=True)

# FIX: restore physics column names
df.columns = correct_columns

# 4. SAVE CLEAN SUBSET
df.to_csv("HIGGS_short.csv", index=False)

Estimated class distribution: {1.0: 0.52834, 0.0: 0.47166}
Sampling: 5188260 from class 0
Sampling: 5811740 from class 1
