In [None]:
import json
import pandas as pd

# Source dataset: a JSON object keyed by article id.
INPUT_PATH = "/Users/sheillaschool/Documents/final/Thesis_PredictingNewsOutdatedness_LogisticDecay/data/main_data/main_dataset.json"
# Destination for the balanced (per-bin) selection.
OUTPUT_PATH = "/Users/sheillaschool/Documents/final/Thesis_PredictingNewsOutdatedness_LogisticDecay/data/main_data/finetuning/finetuning.json"
# Destination for every article that was not selected.
REMAINING_OUTPUT = "/Users/sheillaschool/Documents/final/Thesis_PredictingNewsOutdatedness_LogisticDecay/data/main_data/probability/probability.json"

# Read the whole dataset into memory.
with open(INPUT_PATH, encoding="utf-8") as source:
    data = json.load(source)

# One row per top-level JSON key; the key itself is moved out of the index
# into a regular "article_id" column.
df = pd.DataFrame.from_dict(data, orient="index")
df = df.reset_index()
df = df.rename(columns={"index": "article_id"})

# Number of rows per t_bin label.
bin_counts = df["t_bin"].value_counts()
print("✅ Original t_bin counts:")
print(bin_counts)

# Every bin is sampled down to the size of the smallest one.
target_count = bin_counts.min()
print(f"\n🎯 Target sample size per bin: {target_count}")

def _pick_from_bin(bin_rows, n, seed=42):
    """Choose rows from one bin, preferring rows with distinct ``t`` values.

    If the bin has at least *n* rows with distinct ``t`` (first occurrence
    kept), *n* of them are sampled. Otherwise all distinct-``t`` rows are kept
    and the shortfall is topped up with a sample of the bin's other rows.
    """
    distinct_t = bin_rows.drop_duplicates(subset="t")
    shortfall = n - len(distinct_t)
    if shortfall <= 0:
        return distinct_t.sample(n=n, random_state=seed)

    # Rows of this bin that were dropped as duplicate-t entries.
    leftovers = bin_rows[~bin_rows.index.isin(distinct_t.index)]
    top_up = leftovers.sample(n=min(shortfall, len(leftovers)), random_state=seed)
    return pd.concat([distinct_t, top_up], ignore_index=True)


# Fixed processing order; labels absent from the data are skipped.
bin_order = ["fresh", "recent", "middle_age", "old", "very_old"]
labels_present = df["t_bin"].unique()
bins_to_use = [label for label in bin_order if label in labels_present]

# One sampled frame per bin, in bin_order.
selected_rows = [
    _pick_from_bin(df.loc[df["t_bin"] == label].copy(), target_count)
    for label in bins_to_use
]

# Stack the per-bin samples and make sure no article appears twice.
final_df = pd.concat(selected_rows)
final_df = final_df.drop_duplicates(subset="article_id")
final_df = final_df.reset_index(drop=True)

# Everything that was not selected goes to the "remaining" frame.
selected_ids = set(final_df["article_id"])
was_selected = df["article_id"].isin(selected_ids)
remaining_df = df.loc[~was_selected].copy().reset_index(drop=True)

print("\n📦 Final stratified bin counts:")
print(final_df["t_bin"].value_counts())
print(f"\n✅ Selected articles: {len(final_df)}")
print(f"🗂️ Remaining articles: {len(remaining_df)}")

def _frame_to_records(frame):
    """Return ``{str(article_id): {column: value, ...}}`` for every row of *frame*.

    The ``article_id`` column becomes the key and is removed from the
    per-article value dict; all other columns are kept.
    """
    return {
        str(row["article_id"]): row.drop(labels=["article_id"]).to_dict()
        for _, row in frame.iterrows()
    }


def _write_json(payload, path):
    """Write *payload* to *path* as indented UTF-8 JSON.

    The parent directory is created first, so a missing output folder does
    not abort the run at the write step.
    """
    import os  # local import keeps this block self-contained

    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        json.dump(payload, f, indent=2, ensure_ascii=False)


# Balanced selection.
final_dict = _frame_to_records(final_df)
_write_json(final_dict, OUTPUT_PATH)

# All remaining (non-selected) articles.
remaining_dict = _frame_to_records(remaining_df)
_write_json(remaining_dict, REMAINING_OUTPUT)

print(f"\n✅ Saved selected to: {OUTPUT_PATH}")
print(f"✅ Saved remaining to: {REMAINING_OUTPUT}")


✅ Original t_bin counts:
t_bin
very_old      7900
old           1555
middle_age    1501
recent         941
fresh          580
Name: count, dtype: int64

🎯 Target sample size per bin: 580

📦 Final stratified bin counts:
t_bin
fresh         580
recent        580
middle_age    580
old           580
very_old      580
Name: count, dtype: int64

✅ Selected articles: 2900
🗂️ Remaining articles: 9577

✅ Saved selected to: /Users/sheillaschool/Documents/final/Thesis_PredictingNewsOutdatedness_LogisticDecay/data/main_data/finetuning/finetuning.json
✅ Saved remaining to: /Users/sheillaschool/Documents/final/Thesis_PredictingNewsOutdatedness_LogisticDecay/data/main_data/probability/probability.json
