In [None]:
import json
from collections import Counter, defaultdict
from pathlib import Path

import numpy as np
from tqdm import tqdm

# Directory that is scanned for *.json files
json_dir = Path("../data")

# Accumulators for the summary statistics
field_counts, titles = Counter(), Counter()  # tallies keyed by field name / by title
text_lengths = list()                        # collected text lengths
errors = file_count = 0                      # failed files / visited files

In [None]:
# Analyze JSON files and their headers
json_dir = Path("../data")

# Reset every accumulator in this cell so that re-running it starts from a
# clean slate instead of adding to the totals left behind by a previous run.
field_counts = Counter()   # top-level key -> number of files containing it
text_lengths = []          # len() of "text_by_page_url" for files where it is a str
titles = Counter()         # stripped title -> number of files carrying it
error_log = []             # (file name, "ExcType: message") for every failed file
errors = 0
file_count = 0

# Sorted so the processing order (and therefore the tie order reported by
# Counter.most_common) is the same on every run. To analyze only a sample,
# slice this list, e.g. json_files[:5000].
json_files = sorted(json_dir.glob("*.json"))

for file_path in tqdm(json_files):
    file_count += 1
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        # Count top-level fields
        field_counts.update(data.keys())

        # Length of the text payload. A file without this key raises KeyError
        # and is counted as an error below, after its keys were already
        # tallied above.
        # NOTE(review): the key name suggests a mapping of page URL -> text;
        # only plain-string values contribute a length here -- confirm schema.
        html = data["text_by_page_url"]
        if isinstance(html, str):
            text_lengths.append(len(html))

        # Title field (if exists)
        title = data.get("title")
        if isinstance(title, str):
            titles[title.strip()] += 1

    except Exception as e:
        # Best-effort scan: one unreadable or malformed file must not abort
        # the run, but keep the reason so failures can be inspected afterwards.
        errors += 1
        error_log.append((file_path.name, f"{type(e).__name__}: {e}"))

In [None]:
# Print out json files summary
# Reads the accumulators filled by the analysis cell (file_count, errors,
# field_counts, text_lengths, titles); numpy (np) comes from the import cell.
print(f"Total files processed: {file_count}")
print(f"Files with errors: {errors}")

print("\nTop-level field occurrence across files:")
if field_counts:
    for key, count in field_counts.most_common():
        print(f"  {key}: {count}")
else:
    print("  No fields found.")

print("\nHTML text length stats:")
if text_lengths:
    print(f"  Avg length: {np.mean(text_lengths):.2f}")
    print(f"  Min: {min(text_lengths)}, Max: {max(text_lengths)}")
    print(f"  Median: {np.median(text_lengths):.2f}")
else:
    print("  No HTML/text found.")

print("\nMost common titles (if available):")
if titles:
    for title, count in titles.most_common(5):
        # Show at most 80 characters of a title and mark any truncation.
        print(f"  {count}x {title[:80]}{'...' if len(title) > 80 else ''}")
else:
    print("  No titles found.")

In [None]:
# Write out main keys to a text file
# For every *.json file in json_dir, json_keys.txt receives a "File: <name>"
# line followed by either "Keys: k1, k2, ..." or "Error: <message>" and a
# blank line.
output_keys_file = Path("json_keys.txt")
json_dir = Path("../data")

# Sorted so the report lists the files in the same order on every run;
# the order produced by glob itself is arbitrary.
json_files = sorted(json_dir.glob("*.json"))

with open(output_keys_file, "w", encoding="utf-8") as out_file:
    for file_path in tqdm(json_files):
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                data = json.load(f)
            detail = f"Keys: {', '.join(data.keys())}"
        except Exception as e:
            # Best-effort report: an unreadable or malformed file (including a
            # JSON document whose top level has no .keys()) is recorded with
            # its error message instead of stopping the run.
            detail = f"Error: {str(e)}"
        # The writes sit outside the try so a failure of the report file
        # itself surfaces instead of being retried on the same broken handle.
        out_file.write(f"File: {file_path.name}\n")
        out_file.write(f"{detail}\n\n")