**1st Data Split (Full Dataset → Bin-Balanced Labelling/Fine-tuning Set + Remaining Probability Set)**

In [None]:
import json
import pandas as pd

# === CONFIG ===
# NOTE(review): these are absolute, machine-specific paths; consider deriving them
# from one configurable base directory so the notebook can run on other machines.
INPUT_PATH = "/Users/sheillaschool/Documents/final/Thesis_PredictingNewsOutdatedness_LogisticDecay/data/main_data/main_dataset.json"
OUTPUT_PATH = "/Users/sheillaschool/Documents/final/Thesis_PredictingNewsOutdatedness_LogisticDecay/data/main_data/finetuning/finetuning.json"
REMAINING_OUTPUT = "/Users/sheillaschool/Documents/final/Thesis_PredictingNewsOutdatedness_LogisticDecay/data/main_data/probability/probability.json"

# Seed shared by every sampling call below so the split is reproducible.
RANDOM_STATE = 42

# === LOAD ===
# The input is a JSON object keyed by article id; each value is one article record.
with open(INPUT_PATH, "r", encoding="utf-8") as f:
    data = json.load(f)

# One row per article; the dict keys become the "article_id" column.
df = pd.DataFrame.from_dict(data, orient="index").reset_index().rename(columns={"index": "article_id"})

bin_counts = df["t_bin"].value_counts()
print("✅ Original t_bin counts:")
print(bin_counts)

# Every bin is downsampled to the size of the smallest bin.
target_count = bin_counts.min()
print(f"\n🎯 Target sample size per bin: {target_count}")

# === STRATIFIED SAMPLING ===
selected_rows = []
# Bins are processed in this order. Labels that do not occur in the data are
# skipped; t_bin values not listed here are never sampled and therefore end up
# in the remaining set.
bin_order = ["fresh", "recent", "middle_age", "old", "very_old"]
present_bins = set(bin_counts.index)  # computed once instead of once per candidate label
bins_to_use = [b for b in bin_order if b in present_bins]

for bin_label in bins_to_use:
    bin_df = df[df["t_bin"] == bin_label].copy()
    # Prefer rows with distinct `t` values within the bin.
    unique_t_df = bin_df.drop_duplicates(subset="t")

    if len(unique_t_df) >= target_count:
        selected_bin = unique_t_df.sample(n=target_count, random_state=RANDOM_STATE)
    else:
        # Not enough distinct `t` values: keep all of them and top up with rows
        # whose `t` value is already represented.
        extra_needed = target_count - len(unique_t_df)
        remaining = bin_df[~bin_df.index.isin(unique_t_df.index)]
        fill = remaining.sample(n=min(extra_needed, len(remaining)), random_state=RANDOM_STATE)
        selected_bin = pd.concat([unique_t_df, fill], ignore_index=True)

    selected_rows.append(selected_bin)

final_df = pd.concat(selected_rows).drop_duplicates(subset="article_id").reset_index(drop=True)

selected_ids = set(final_df["article_id"])
remaining_df = df[~df["article_id"].isin(selected_ids)].copy().reset_index(drop=True)

# The selected and remaining sets must partition the input exactly.
assert len(final_df) + len(remaining_df) == len(df), "selected + remaining != total articles"

print("\n📦 Final stratified bin counts:")
print(final_df["t_bin"].value_counts())
print(f"\n✅ Selected articles: {len(final_df)}")
print(f"🗂️ Remaining articles: {len(remaining_df)}")

# === SAVE ===
# Write the ORIGINAL records, looked up by article id, instead of rebuilding them
# from DataFrame rows. The DataFrame round-trip pads keys that a record did not
# have with NaN (which json.dump writes as the non-standard token `NaN`) and can
# upcast ints to floats; looking the record up in `data` keeps it unchanged.
final_dict = {article_id: data[article_id] for article_id in final_df["article_id"]}
with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
    json.dump(final_dict, f, indent=2, ensure_ascii=False)

remaining_dict = {article_id: data[article_id] for article_id in remaining_df["article_id"]}
with open(REMAINING_OUTPUT, "w", encoding="utf-8") as f:
    json.dump(remaining_dict, f, indent=2, ensure_ascii=False)

print(f"\n✅ Saved selected to: {OUTPUT_PATH}")
print(f"✅ Saved remaining to: {REMAINING_OUTPUT}")


✅ Original t_bin counts:
t_bin
very_old      7900
old           1555
middle_age    1501
recent         941
fresh          580
Name: count, dtype: int64

🎯 Target sample size per bin: 580

📦 Final stratified bin counts:
t_bin
fresh         580
recent        580
middle_age    580
old           580
very_old      580
Name: count, dtype: int64

✅ Selected articles: 2900
🗂️ Remaining articles: 9577

✅ Saved selected to: /Users/sheillaschool/Documents/final/Thesis_PredictingNewsOutdatedness_LogisticDecay/data/main_data/finetuning/finetuning.json
✅ Saved remaining to: /Users/sheillaschool/Documents/final/Thesis_PredictingNewsOutdatedness_LogisticDecay/data/main_data/probability/probability.json


**2nd Data Split (Labelled Set → Class-Balanced Train/Val/Test)**

In [8]:
import json
from pathlib import Path
import random
from sklearn.model_selection import train_test_split
from collections import Counter

# === CONFIG ===
# Labelled input file; all outputs go to a "balanced_split" folder next to it.
# NOTE(review): this path contains "..._Logistic_Decay" whereas the paths in the
# first split cell contain "..._LogisticDecay" — confirm both directories are intended.
INPUT_PATH = Path("/Users/sheillaschool/Documents/final/Thesis_PredictingNewsOutdatedness_Logistic_Decay/model/labelling/data/manual/rulebased_manual.json")
OUTPUT_DIR = INPUT_PATH.with_name("balanced_split")
OUTPUT_DIR.mkdir(exist_ok=True)

# One JSON file per split, plus the majority-class entries left out by undersampling.
TRAIN_OUTPUT = OUTPUT_DIR.joinpath("train_balanced.json")
VAL_OUTPUT = OUTPUT_DIR.joinpath("val_balanced.json")
TEST_OUTPUT = OUTPUT_DIR.joinpath("test_balanced.json")
EXTRA_OUTPUT = OUTPUT_DIR.joinpath("remaining_majority_not_in_seed.json")

# Seed for Python's `random` module; also passed to the sklearn splits in main().
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# === HELPERS ===
def load_json(path: Path) -> dict:
    """Read the UTF-8 encoded JSON file at ``path`` and return its parsed content."""
    with open(path, encoding="utf-8") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

def save_json(data: dict, path: Path) -> None:
    """Write ``data`` to ``path`` as indented UTF-8 JSON and print a confirmation.

    Non-ASCII characters are written as-is rather than as ``\\u`` escapes. The
    confirmation line shows the file name and the number of top-level entries.
    """
    with open(path, mode="w", encoding="utf-8") as sink:
        json.dump(data, sink, ensure_ascii=False, indent=2)
    print(f"✅ Saved: {path.name} ({len(data)} samples)")

def print_label_distribution(name: str, items: list) -> None:
    """Print how many entries of ``items`` carry each label.

    ``items`` is a list of ``(key, record)`` pairs; each record is indexed with
    ``"label"``. Labels are reported in the order they are first encountered.
    """
    tally = Counter()
    for _key, record in items:
        tally[record["label"]] += 1
    distribution = dict(tally)
    print(f"📊 {name} label distribution: {distribution}")

# === MAIN ===
def main():
    """Build a class-balanced train/validation/test split from the labelled data.

    Steps:
      1. Load ``INPUT_PATH`` and group its entries by their ``"label"`` value.
      2. Randomly undersample the larger class down to the size of the smaller one.
      3. Split the balanced set, stratified by label, into 60% train,
         20% validation and 20% test.
      4. Save the three splits, plus the majority-class entries that the
         undersampling step left out.

    Raises:
        ValueError: if any entry has no label, or if the data does not contain
            exactly two distinct labels. The balancing below only handles the
            binary case; with more classes it would silently drop the larger
            ones, so this fails loudly instead.
    """
    data = load_json(INPUT_PATH)

    # === Separate by class ===
    label_groups = {}
    unlabelled_keys = []
    for k, v in data.items():
        label = v.get("label")
        if label is None:
            unlabelled_keys.append(k)
            continue
        label_groups.setdefault(label, []).append((k, v))

    if unlabelled_keys:
        raise ValueError(
            f"{len(unlabelled_keys)} entries have no 'label' (e.g. {unlabelled_keys[:5]}); "
            "label or remove them before splitting."
        )

    class_sizes = [(l, len(v)) for l, v in label_groups.items()]
    if len(label_groups) != 2:
        raise ValueError(
            f"Expected exactly 2 classes for binary balancing, found {len(label_groups)}: {class_sizes}"
        )

    # === Identify minority and majority classes ===
    # Ascending by size, so index 0 is the minority class and index 1 the majority.
    sorted_labels = sorted(label_groups.items(), key=lambda x: len(x[1]))
    minority_label, minority_items = sorted_labels[0]
    majority_label, majority_items = sorted_labels[1]
    target_size = len(minority_items)

    print(f"Class distribution: {class_sizes}")
    print(f"🔍 Undersampling majority class '{majority_label}' to match minority class '{minority_label}' ({target_size} samples)")

    # === Undersample majority ===
    majority_sampled = random.sample(majority_items, target_size)
    balanced_items = minority_items + majority_sampled
    random.shuffle(balanced_items)

    # === Train/Test Split ===
    labels = [v["label"] for _, v in balanced_items]
    train_items, test_items, train_labels, _ = train_test_split(
        balanced_items, labels, test_size=0.20, stratify=labels, random_state=RANDOM_SEED
    )

    # === Train → Train/Val ===
    # 25% of the remaining 80% is 20% of the balanced set.
    train_items_final, val_items, _, _ = train_test_split(
        train_items, train_labels, test_size=0.25, stratify=train_labels, random_state=RANDOM_SEED
    )
    # Final: 60% train, 20% val, 20% test

    # === Save leftover majority class samples not used ===
    used_keys = {k for k, _ in balanced_items}
    remaining_majority = [(k, v) for k, v in majority_items if k not in used_keys]

    # === Report ===
    print_label_distribution("Train", train_items_final)
    print_label_distribution("Validation", val_items)
    print_label_distribution("Test", test_items)
    print(f"🧾 Remaining majority samples (not in balanced seed): {len(remaining_majority)}")

    # === Save ===
    # Each split is a list of (key, record) pairs; turn it back into a keyed dict.
    save_json(dict(train_items_final), TRAIN_OUTPUT)
    save_json(dict(val_items), VAL_OUTPUT)
    save_json(dict(test_items), TEST_OUTPUT)
    save_json(dict(remaining_majority), EXTRA_OUTPUT)

# Run the split when this cell (or the file as a script) is executed directly.
if __name__ == "__main__":
    main()


Class distribution: [(1, 830), (0, 2070)]
🔍 Undersampling majority class '0' to match minority class '1' (830 samples)
📊 Train label distribution: {1: 498, 0: 498}
📊 Validation label distribution: {1: 166, 0: 166}
📊 Test label distribution: {0: 166, 1: 166}
🧾 Remaining majority samples (not in balanced seed): 1240
✅ Saved: train_balanced.json (996 samples)
✅ Saved: val_balanced.json (332 samples)
✅ Saved: test_balanced.json (332 samples)
✅ Saved: remaining_majority_not_in_seed.json (1240 samples)
