This script builds a training parquet that keeps all positive (burned) rows and randomly samples 10x as many negative rows.

In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Create a class-rebalanced training parquet from:
    parquet_cems_with_fraction_dataset

- burned = 1 if fraction > 0.5 else 0 (fraction is clipped to [0, 1] first)
- Keep ALL positives
- Sample NEG_MULT (10x) as many negatives, capped at the number available
- Save to a single parquet file for use in the weighted-logloss script.

Note: a missing (NaN) fraction compares False against the threshold, so such
rows are labelled burned = 0 and can end up among the sampled negatives.
"""

import os

import numpy as np
import pandas as pd

# ---------- INPUT / OUTPUT ----------
PARQUET_IN  = "/explore/nobackup/people/spotter5/clelland_fire_ml/parquet_cems_with_fraction_dataset"
OUT_PARQUET = "/explore/nobackup/people/spotter5/clelland_fire_ml/ml_training/cems_with_fraction_balanced_10x.parquet"

RANDOM_STATE = 42
NEG_MULT     = 10   # 10x as many negatives as positives


def add_burned_label(df: pd.DataFrame, threshold: float = 0.5) -> pd.DataFrame:
    """Clip ``fraction`` to [0, 1] and derive the binary ``burned`` column.

    The frame is modified IN PLACE (no copy is made, which matters for very
    large inputs) and also returned for convenience.

    Parameters
    ----------
    df : DataFrame with a ``fraction`` column.
    threshold : rows with ``fraction`` strictly greater than this get burned = 1.

    Raises
    ------
    ValueError
        If ``fraction`` is not a column of ``df``.
    """
    if "fraction" not in df.columns:
        raise ValueError("Expected column 'fraction' in dataset.")

    df["fraction"] = df["fraction"].astype("float32").clip(0, 1)
    # NaN > threshold evaluates to False, so missing fractions become 0 here.
    df["burned"] = (df["fraction"] > threshold).astype(np.uint8)
    return df


def class_distribution(labels: pd.Series):
    """Return ``(counts, percents)`` per class using a single ``value_counts`` pass."""
    counts = labels.value_counts()
    percents = counts.div(counts.sum()).mul(100)
    return counts, percents


def negative_sample_size(n_pos: int, n_neg: int, neg_mult: int = NEG_MULT) -> int:
    """Number of negatives to draw: ``neg_mult * n_pos``, capped at ``n_neg``.

    Raises
    ------
    ValueError
        If there are no positives; the result would be an empty training set.
    """
    if n_pos <= 0:
        raise ValueError(
            "No positive (burned == 1) rows found; refusing to build an empty training set."
        )
    return min(n_neg, neg_mult * n_pos)


def build_balanced_frame(
    df: pd.DataFrame, neg_target: int, random_state: int = RANDOM_STATE
) -> pd.DataFrame:
    """Keep every positive row, sample ``neg_target`` negatives, then shuffle.

    Parameters
    ----------
    df : DataFrame with a 0/1 ``burned`` column.
    neg_target : number of negative rows to draw without replacement; must not
        exceed the number of negative rows (``DataFrame.sample`` raises otherwise).
    random_state : seed used for both the negative draw and the final shuffle.

    Returns
    -------
    DataFrame with a fresh 0..n-1 index.
    """
    pos = df[df["burned"] == 1]
    neg = df[df["burned"] == 0]

    neg_sample = neg.sample(n=neg_target, random_state=random_state)

    return (
        pd.concat([pos, neg_sample], axis=0)
        .sample(frac=1.0, random_state=random_state)
        .reset_index(drop=True)
    )


# The guard is true when this runs as a notebook cell or as a script, so the
# pipeline still executes there; it only stays idle when the code is imported.
if __name__ == "__main__":
    # Create the output directory up front so a bad path fails before the
    # expensive load instead of after all the work is done.
    out_dir = os.path.dirname(OUT_PARQUET)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    print(f"Loading full parquet dataset from:\n  {PARQUET_IN}")
    df = pd.read_parquet(PARQUET_IN)

    # ---------- LABEL DEFINITION ----------
    df = add_burned_label(df)

    print("\n===== Original Burned Class Distribution =====")
    orig_counts, orig_props = class_distribution(df["burned"])
    print(orig_counts)
    print(orig_props.map(lambda x: f"{x:.4f}%"))

    # ---------- POS / NEG COUNTS ----------
    n_pos = int(orig_counts.get(1, 0))
    n_neg = int(orig_counts.get(0, 0))

    print("\nOriginal counts:")
    print(f"  Positives (1): {n_pos:,}")
    print(f"  Negatives (0): {n_neg:,}")

    neg_target = negative_sample_size(n_pos, n_neg, NEG_MULT)
    if neg_target < NEG_MULT * n_pos:
        # Fewer negatives exist than requested; say so rather than claim NEG_MULT x.
        print(
            f"\nSampling all {neg_target:,} available negatives "
            f"(fewer than {NEG_MULT}x positives)..."
        )
    else:
        print(f"\nSampling {neg_target:,} negatives ({NEG_MULT}x positives)...")

    # ---------- CONCAT & SHUFFLE ----------
    df_bal = build_balanced_frame(df, neg_target, RANDOM_STATE)

    print(f"\n===== Balanced Burned Class Distribution ({NEG_MULT}x negatives) =====")
    bal_counts, bal_props = class_distribution(df_bal["burned"])
    print(bal_counts)
    print(bal_props.map(lambda x: f"{x:.4f}%"))

    print(f"\nTotal rows in balanced dataset: {len(df_bal):,}")

    # ---------- SAVE ----------
    print(f"\nSaving balanced parquet to:\n  {OUT_PARQUET}")
    df_bal.to_parquet(OUT_PARQUET, index=False)
    print("✅ Done.")


Loading full parquet dataset from:
  /explore/nobackup/people/spotter5/clelland_fire_ml/parquet_cems_with_fraction_dataset

===== Original Burned Class Distribution =====
0    1649237766
1        395586
Name: burned, dtype: int64
0    99.9760%
1     0.0240%
Name: burned, dtype: object

Original counts:
  Positives (1): 395,586
  Negatives (0): 1,649,237,766

Sampling 3,955,860 negatives (10x positives)...

===== Balanced Burned Class Distribution (10x negatives) =====
0    3955860
1     395586
Name: burned, dtype: int64
0    90.9091%
1     9.0909%
Name: burned, dtype: object

Total rows in balanced dataset: 4,351,446

Saving balanced parquet to:
  /explore/nobackup/people/spotter5/clelland_fire_ml/ml_training/cems_with_fraction_balanced_10x.parquet
✅ Done.


In [1]:
# NOTE(review): leftover scratch cell — it only evaluates the string literal 't'.
# It shares execution count 1 with the first cell; consider deleting it.
't'

't'