In [None]:

import json
from pathlib import Path
from transformers import pipeline
from tqdm import tqdm

INPUT_PATH = Path("/content/drive/MyDrive/final/Thesis/data/augment/train_balanced.json")
OUTPUT_PATH = Path("/content/drive/MyDrive/final/Thesis/data/augment/train_balanced_plus_bt.json")

with open(INPUT_PATH, "r", encoding="utf-8") as f:
    data = json.load(f)

print(f"📁 Loaded {len(data)} articles")

print("🔁 Loading translation models...")
en_to_de = pipeline("translation", model="Helsinki-NLP/opus-mt-en-de", device=0)
de_to_en = pipeline("translation", model="Helsinki-NLP/opus-mt-de-en", device=0)


for article_id, article in tqdm(data.items(), desc="🔁 Back-translating label==1 only"):
    if article.get("label") != 1:
        continue 

    title = article.get("title_normalized", "")
    summary = article.get("summary_normalized", "")

    try:
        title_de = en_to_de(title, max_length=512)[0]["translation_text"]
        summary_de = en_to_de(summary, max_length=512)[0]["translation_text"]

        title_bt = de_to_en(title_de, max_length=512)[0]["translation_text"]
        summary_bt = de_to_en(summary_de, max_length=512)[0]["translation_text"]
    except Exception as e:
        print(f"⚠️ Error for article {article_id}: {e}")
        title_bt = title
        summary_bt = summary

    article["bt"] = f"{title_bt}. {summary_bt}"

with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
    json.dump(data, f, indent=2, ensure_ascii=False)

print(f"\n✅ Back-translated dataset saved to: {OUTPUT_PATH}")


  from .autonotebook import tqdm as notebook_tqdm


🔁 Loading translation pipelines...


Device set to use mps:0
Device set to use mps:0
🔁 Back-translating dummy data: 100%|██████████| 3/3 [00:19<00:00,  6.39s/it]


✅ Final back-translated dummy dataset:

📰 Article 1:
BT: Climate change is accelerating. New reports show that global temperatures are rising faster than expected.

📰 Article 2:
BT: Electric cars are gaining popularity in Europe. In 2024, the sale of electric vehicles reached a new record.

📰 Article 3:
BT: AI tools change our way of working. Companies take over generative AI to improve productivity and automate tasks.





In [4]:
import json
from pathlib import Path

# === CONFIG ===
INPUT_PATH = Path("/Users/sheillaschool/Documents/final/Thesis_PredictingNewsOutdatedness_Logistic_Decay/model/finetune/data/train_balanced_plus_bt.json")
OUTPUT_PATH = Path("/Users/sheillaschool/Documents/final/Thesis_PredictingNewsOutdatedness_Logistic_Decay/model/finetune/data/train_balanced_plus_bt_separated.json")

# === LOAD DATA ===
with open(INPUT_PATH, "r", encoding="utf-8") as f:
    data = json.load(f)

original_articles = {}
synthetic_articles = {}
id_counter = max(int(k) for k in data.keys()) + 1  # To generate new unique keys

for key, article in data.items():
    # Make a copy of the article for original and synthetic
    original = article.copy()
    synthetic = article.copy()

    # If article has a back-translated version
    if "bt" in article and article["bt"]:
        # 1. Add the original version without the "bt" field
        original.pop("bt", None)
        original_articles[key] = original

        # 2. Create a synthetic version (same fields + "bt")
        synthetic_articles[str(id_counter)] = synthetic
        id_counter += 1
    else:
        # No "bt" → keep the original as is
        original_articles[key] = article

# === COMBINE ORIGINAL + SYNTHETIC ===
final_data = {**original_articles, **synthetic_articles}
print(f"🟢 Original articles: {len(original_articles)}")
print(f"🟠 Synthetic articles: {len(synthetic_articles)}")
print(f"📦 Final merged total: {len(final_data)}")

# === SAVE TO OUTPUT FILE ===
with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
    json.dump(final_data, f, indent=2, ensure_ascii=False)

print(f"✅ Output saved to: {OUTPUT_PATH}")


🟢 Original articles: 996
🟠 Synthetic articles: 498
📦 Final merged total: 1494
✅ Output saved to: /Users/sheillaschool/Documents/final/Thesis_PredictingNewsOutdatedness_Logistic_Decay/model/finetune/data/train_balanced_plus_bt_separated.json


In [6]:
import json
import random
from pathlib import Path
from collections import Counter

# === CONFIG ===
BT_PATH = Path("/Users/sheillaschool/Documents/final/Thesis_PredictingNewsOutdatedness_Logistic_Decay/model/finetune/data/train_balanced_plus_bt_separated.json")
REMAINING_PATH = Path("/Users/sheillaschool/Documents/final/Thesis_PredictingNewsOutdatedness_Logistic_Decay/model/finetune/data/remaining_majority_not_in_seed.json")
BALANCED_OUTPUT = Path("/Users/sheillaschool/Documents/final/Thesis_PredictingNewsOutdatedness_Logistic_Decay/model/finetune/data/train_ready_balanced.json")
REMAINDER_OUTPUT = Path("/Users/sheillaschool/Documents/final/Thesis_PredictingNewsOutdatedness_Logistic_Decay/model/finetune/data/train_remainder.json")
RANDOM_SEED = 42

random.seed(RANDOM_SEED)

# === LOAD FILES ===
with open(BT_PATH, "r", encoding="utf-8") as f:
    bt_data = json.load(f)

with open(REMAINING_PATH, "r", encoding="utf-8") as f:
    remaining_data = json.load(f)

# === MERGE DATA ===
merged_data = {**bt_data, **remaining_data}
print(f"✅ Total merged articles: {len(merged_data)}")

# === GROUP BY LABEL ===
label_groups = {0: [], 1: []}
for k, v in merged_data.items():
    label = v.get("label")
    if label in [0, 1]:
        label_groups[label].append((k, v))

# === BALANCE DATA ===
min_count = min(len(label_groups[0]), len(label_groups[1]))
print(f"⚖️  Balancing to {min_count} samples per class")

balanced_data = {}
remainder_data = {}

for label in [0, 1]:
    selected = random.sample(label_groups[label], min_count)
    not_selected = [item for item in label_groups[label] if item not in selected]

    # Add to output
    for k, v in selected:
        balanced_data[k] = v
    for k, v in not_selected:
        remainder_data[k] = v

# === SAVE BALANCED TRAINING DATA ===
with open(BALANCED_OUTPUT, "w", encoding="utf-8") as f:
    json.dump(balanced_data, f, indent=2, ensure_ascii=False)
print(f"📁 Saved balanced training set: {BALANCED_OUTPUT} ({len(balanced_data)} samples)")

# === SAVE REMAINDER ===
with open(REMAINDER_OUTPUT, "w", encoding="utf-8") as f:
    json.dump(remainder_data, f, indent=2, ensure_ascii=False)
print(f"📁 Saved remainder data: {REMAINDER_OUTPUT} ({len(remainder_data)} samples)")

# === STATS ===
balanced_labels = [v["label"] for v in balanced_data.values()]
remainder_labels = [v["label"] for v in remainder_data.values()]
print(f"📊 Final label distribution in training: {Counter(balanced_labels)}")
print(f"📊 Remaining data label distribution: {Counter(remainder_labels)}")


✅ Total merged articles: 2729
⚖️  Balancing to 991 samples per class
📁 Saved balanced training set: /Users/sheillaschool/Documents/final/Thesis_PredictingNewsOutdatedness_Logistic_Decay/model/finetune/data/train_ready_balanced.json (1982 samples)
📁 Saved remainder data: /Users/sheillaschool/Documents/final/Thesis_PredictingNewsOutdatedness_Logistic_Decay/model/finetune/data/train_remainder.json (747 samples)
📊 Final label distribution in training: Counter({0: 991, 1: 991})
📊 Remaining data label distribution: Counter({0: 747})
