Summary Count of each Label

In [19]:
import json
from pathlib import Path

# JSON file whose per-article labels are counted by this cell.
# NOTE(review): absolute, machine-specific path — the cell only runs where this exact location exists.
INPUT_PATH = Path("/Users/sheillaschool/Documents/final/Thesis_PredictingNewsOutdatedness_LogisticDecay/data/main_data/probability/probability.json")

def load_json(file_path: Path) -> dict:
    """Read the UTF-8 encoded JSON document at ``file_path`` and return its parsed content."""
    with open(file_path, encoding="utf-8") as handle:
        parsed = json.load(handle)
    return parsed

def summarize_labels(data: dict) -> dict:
    """Tally how many records carry label 0, label 1, or neither.

    Args:
        data: Mapping whose values are dict-like records; each record's
            ``"label"`` entry is read with ``.get`` (missing -> ``None``).

    Returns:
        Dict with the keys ``"total"``, ``"label_0"``, ``"label_1"`` and
        ``"label_none"``, in that order.

    Note:
        Labels are compared with ``==`` against the integers 0 and 1, so a
        label stored as the string ``"0"`` or ``"1"`` lands in ``"label_none"``.
    """
    tally = {"label_0": 0, "label_1": 0, "label_none": 0}
    for record in data.values():
        label = record.get("label")
        if label == 0:
            bucket = "label_0"
        elif label == 1:
            bucket = "label_1"
        else:
            # Anything that equals neither 0 nor 1 (None, strings, other numbers).
            bucket = "label_none"
        tally[bucket] += 1
    return {"total": len(data), **tally}

def print_summary(summary: dict) -> None:
    """Print the label counts, one line per entry, below a header line.

    Args:
        summary: Dict providing the keys ``"total"``, ``"label_0"``,
            ``"label_1"`` and ``"label_none"``.
    """
    print("📊 === Label Summary ===")
    rows = (
        ("Total articles", "total"),
        ("Label 0", "label_0"),
        ("Label 1", "label_1"),
        ("No label", "label_none"),
    )
    for caption, key in rows:
        print(f"{caption}: {summary[key]}")

# Cell driver: load the JSON file, count the labels, print the counts.
# `data` and `summary` are assigned at module level, so they stay available after the cell runs.
if __name__ == "__main__":
    data = load_json(INPUT_PATH)
    summary = summarize_labels(data)
    print_summary(summary)


📊 === Label Summary ===
Total articles: 9577
Label 0: 0
Label 1: 0
No label: 9577


JSON to XLS

In [16]:
import json
import pandas as pd
from pathlib import Path

# Source JSON dataset for the Excel export.
# NOTE(review): absolute, machine-specific path.
JSON_PATH = Path("/Users/sheillaschool/Documents/final/Thesis_PredictingNewsOutdatedness_LogisticDecay/data/main_data/main_dataset.json")
# The workbook is written next to the JSON file: same name, ".xlsx" suffix.
EXCEL_PATH = JSON_PATH.with_suffix(".xlsx")

def load_json(file_path: Path) -> dict | list:
    """Load JSON file and return as dict or list."""
    with open(file_path, "r", encoding="utf-8") as f:
        return json.load(f)

def json_to_dataframe(data: dict | list) -> pd.DataFrame:
    """Convert JSON data to DataFrame, auto-detecting format."""
    if isinstance(data, list):
        print("🔍 Detected format: records (list of dicts)")
        return pd.DataFrame(data)
    elif isinstance(data, dict):
        print("🔍 Detected format: index (dict of dicts)")
        df = pd.DataFrame.from_dict(data, orient="index").reset_index()
        df = df.rename(columns={"index": "article_id"})
        return df
    else:
        raise ValueError("Unsupported JSON structure. Must be list or dict.")

def save_dataframe_to_excel(df: pd.DataFrame, file_path: Path) -> None:
    """Write ``df`` to an Excel workbook at ``file_path`` and report the destination.

    The DataFrame's row index is not written (``index=False``).
    """
    df.to_excel(file_path, index=False)
    confirmation = f"✅ Excel saved to: {file_path}"
    print(confirmation)

def save_dataframe_to_json(df: pd.DataFrame, file_path: Path) -> None:
    """Dump ``df`` to ``file_path`` as a JSON array with one object per row.

    The file is written as UTF-8 with two-space indentation and non-ASCII
    characters kept as-is (``ensure_ascii=False``).
    """
    records = df.to_dict(orient="records")
    with open(file_path, mode="w", encoding="utf-8") as sink:
        json.dump(records, sink, indent=2, ensure_ascii=False)
    print(f"✅ JSON saved to: {file_path}")

# Cell driver: read the JSON dataset, convert it to a DataFrame and export it as a workbook.
# `data` and `df` are assigned at module level, so they stay available after the cell runs.
if __name__ == "__main__":
    data = load_json(JSON_PATH)
    df = json_to_dataframe(data)
    save_dataframe_to_excel(df, EXCEL_PATH)
    # Optional: save the same DataFrame back to JSON (records format)
    # save_dataframe_to_json(df, JSON_PATH.with_stem(JSON_PATH.stem + "_records"))


🔍 Detected format: index (dict of dicts)
✅ Excel saved to: /Users/sheillaschool/Documents/final/Thesis_PredictingNewsOutdatedness_LogisticDecay/data/main_data/main_dataset.xlsx


XLS to JSON

In [17]:
import pandas as pd
import json
from pathlib import Path

# === CONFIG ===
# Workbook to convert back to JSON.
# NOTE(review): absolute, machine-specific path.
EXCEL_PATH = Path("/Users/sheillaschool/Documents/final/Thesis_PredictingNewsOutdatedness_LogisticDecay/data/main_data/main_dataset.xlsx")
# Same directory and name with a ".json" suffix, i.e. ".../main_data/main_dataset.json".
# That is the file the "JSON to XLS" cell reads as JSON_PATH, so running this cell overwrites it.
JSON_OUTPUT_PATH = EXCEL_PATH.with_suffix(".json")

def load_excel(file_path: Path) -> pd.DataFrame:
    """Read the workbook at ``file_path`` into a DataFrame.

    No extra arguments are passed to ``pd.read_excel``, so pandas' defaults apply.
    """
    frame = pd.read_excel(file_path)
    return frame

def dataframe_to_indexed_dict(df: pd.DataFrame, index_col: str = "article_id") -> dict:
    """Turn ``df`` into a dict of row dicts keyed by the values of ``index_col``.

    Args:
        df: Frame to convert; it is not modified.
        index_col: Column whose values become the outer keys. The column is
            moved into the index, so it does not appear inside the row dicts.

    Returns:
        ``{key: {column: value, ...}, ...}`` as produced by
        ``DataFrame.to_dict(orient="index")``.

    Raises:
        ValueError: If ``index_col`` is not one of ``df``'s columns.
    """
    if index_col in df.columns:
        keyed = df.set_index(index_col)
        return keyed.to_dict(orient="index")
    raise ValueError(f"Column '{index_col}' not found in DataFrame columns: {df.columns.tolist()}")

def save_json(data: dict, file_path: Path) -> None:
    """Serialise ``data`` to ``file_path`` as UTF-8 JSON and report the destination.

    Output uses two-space indentation and keeps non-ASCII characters unescaped.
    """
    with open(file_path, mode="w", encoding="utf-8") as sink:
        json.dump(data, sink, indent=2, ensure_ascii=False)
    notice = f"✅ JSON restored and saved to: {file_path}"
    print(notice)

def main():
    """Rebuild the article-keyed JSON file from the Excel workbook.

    Reads ``EXCEL_PATH``, keys the rows by ``article_id``, writes the result to
    ``JSON_OUTPUT_PATH`` and prints the article count and the column names.
    The printed column list comes from the loaded frame, so it still
    includes ``article_id``.
    """
    frame = load_excel(EXCEL_PATH)
    articles = dataframe_to_indexed_dict(frame, index_col="article_id")
    save_json(articles, JSON_OUTPUT_PATH)

    article_count = len(articles)
    field_names = frame.columns.tolist()
    print(f"🔑 Total articles: {article_count}")
    print(f"📄 Fields in each article: {field_names}")

# Cell driver: run the Excel -> JSON restoration defined in main().
if __name__ == "__main__":
    main()


✅ JSON restored and saved to: /Users/sheillaschool/Documents/final/Thesis_PredictingNewsOutdatedness_LogisticDecay/data/main_data/main_dataset.json
🔑 Total articles: 12477
📄 Fields in each article: ['article_id', 'source', 'url', 'date', 'title', 'body', 'summary', 'category', 'title_normalized', 'summary_normalized', 't', 't_bin']


Adding article id

In [None]:
import json


# Source and destination are the same file, so the dataset is rewritten in place.
input_path = "/Users/sheillaschool/Documents/final/Thesis_PredictingNewsOutdatedness_LogisticDecay/data/main_data/main_data.json"
output_path = "/Users/sheillaschool/Documents/final/Thesis_PredictingNewsOutdatedness_LogisticDecay/data/main_data/main_data.json"

# Load the existing mapping; only its values are kept below.
with open(input_path, encoding="utf-8") as src:
    data = json.load(src)

# Re-key every article by its position in the loaded mapping, as a string ("0", "1", ...).
# The original keys are discarded.
new_data = {str(position): article for position, article in enumerate(data.values())}

with open(output_path, mode="w", encoding="utf-8") as dst:
    json.dump(new_data, dst, indent=2, ensure_ascii=False)

print(f"✅ Done. Saved {len(new_data)} articles to '{output_path}'.")


✅ Done. Saved 29734 articles to '/Users/sheillaschool/Documents/final/Thesis_PredictingNewsOutdatedness_LogisticDecay/data/main_data/main_data.json'.
