In [None]:
import os
import json
from tqdm import tqdm

# Directory holding the hackathon JSON documents. The DATATHON_DATA_DIR
# environment variable overrides the default, so the notebook can run on
# another machine without editing this cell.
folder_path = os.environ.get(
    "DATATHON_DATA_DIR", "/home/cerrion/DATATHON/data/hackathon_data"
)

# os.listdir returns entries in arbitrary order; sort them so every run visits
# the files (and picks the same "first" sample document) in the same order.
files_in_folder = sorted(os.listdir(folder_path))

# A bare expression in the middle of a cell is not displayed, so print the count.
print(f"Found {len(files_in_folder)} entries in {folder_path}")
def load_documents(json_file):
    """Load and return the parsed contents of a JSON file.

    Args:
        json_file: Path of the file to read.

    Returns:
        The decoded JSON value. When the file cannot be decoded, a message is
        printed and an empty list is returned instead, so callers that expect
        a mapping must check the result before calling dict methods on it.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Read as UTF-8 explicitly rather than relying on the platform's locale
    # encoding; this matches how the notebook writes its own JSON outputs.
    with open(json_file, 'r', encoding='utf-8') as f:
        try:
            return json.load(f)
        except (json.JSONDecodeError, UnicodeDecodeError):
            # Best-effort loader: report the bad file and let the caller
            # carry on instead of aborting a scan over the whole folder.
            print(f"Error reading {json_file}, it may not be a valid JSON file.")
    return []

# Peek at one document to learn its schema: use the first .json file listed.
first_json = next((name for name in files_in_folder if name.endswith('.json')), None)

if first_json is None:
    # Without this guard `doc` would be undefined below and raise NameError.
    print(f"No .json files found in {folder_path}")
    doc = {}
else:
    file_path = os.path.join(folder_path, first_json)
    doc = load_documents(file_path)

# load_documents returns a list when the file is unreadable, and a JSON file
# may hold a non-object at its top level; only mappings have keys to print.
if isinstance(doc, dict):
    print(doc.keys())
else:
    print(f"Top-level value is a {type(doc).__name__}, not a JSON object; no keys to show.")
doc

In [None]:


# Corpus-wide statistics: pages per document and characters per page, plus a
# capped sample of extreme pages saved to disk for manual inspection.
num_items_per_doc = []
char_counts = []

long_texts = []   # Pages longer than 1M characters
short_texts = []  # Pages with 0 characters
skipped_files = []  # .json files whose content was not a JSON object

LONG_TEXT_THRESHOLD = 1_000_000  # 1 million characters
SHORT_TEXT_THRESHOLD = 0         # Zero-length pages
MAX_SAVED = 30                   # Cap long/short saves for inspection

for filename in tqdm(files_in_folder):
    if not filename.endswith('.json'):
        continue

    file_path = os.path.join(folder_path, filename)
    doc = load_documents(file_path)

    # load_documents returns a list for files it cannot decode; calling
    # .get() on that would raise AttributeError and abort the whole scan.
    if not isinstance(doc, dict):
        skipped_files.append(filename)
        continue

    # `or {}` also covers a key that is present but null.
    text_by_page = doc.get('text_by_page_url') or {}
    num_items_per_doc.append(len(text_by_page))

    for page_url, text in text_by_page.items():
        # A null page text counts as empty instead of crashing on len(None).
        length = len(text or "")
        char_counts.append(length)

        # Save long texts
        if length > LONG_TEXT_THRESHOLD and len(long_texts) < MAX_SAVED:
            long_texts.append({
                "source_file": filename,
                "page_url": page_url,
                "char_length": length,
                "text": text
            })

        # Save short texts. `<=` makes SHORT_TEXT_THRESHOLD an actual
        # threshold; with the default of 0 it selects the same pages as `==`.
        if length <= SHORT_TEXT_THRESHOLD and len(short_texts) < MAX_SAVED:
            short_texts.append({
                "source_file": filename,
                "page_url": page_url,
                "char_length": length,
                "text": text
            })

# Output paths (one dir up from input folder)
base_output_path = os.path.abspath(os.path.join(folder_path, ".."))
long_output_path = os.path.join(base_output_path, "long_texts_over_1M.json")
short_output_path = os.path.join(base_output_path, "short_texts_empty.json")

# Save long texts
with open(long_output_path, "w", encoding="utf-8") as f:
    json.dump(long_texts, f, ensure_ascii=False, indent=2)

# Save short texts
with open(short_output_path, "w", encoding="utf-8") as f:
    json.dump(short_texts, f, ensure_ascii=False, indent=2)

# Final output
print("\n📊 Summary Statistics:")
print(f"Number of items (pages) per document: {num_items_per_doc}")
print(f"Total number of text blocks processed: {len(char_counts)}")
print(f"Example character counts per text block: {char_counts[:10]}")
if skipped_files:
    print(f"Skipped {len(skipped_files)} file(s) that did not contain a JSON object: {skipped_files[:10]}")
print(f"\n📝 Saved {len(long_texts)} long text blocks to: {long_output_path}")
print(f"📝 Saved {len(short_texts)} short (empty) text blocks to: {short_output_path}")

In [None]:
import matplotlib.pyplot as plt
import numpy as np

def _plot_log_histogram(log_values, color, xlabel, title):
    """Draw one 50-bin histogram of already log-transformed values."""
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.hist(log_values, bins=50, color=color, edgecolor='black')
    ax.set_xlabel(xlabel)
    ax.set_ylabel("Frequency")
    ax.set_title(title)
    ax.grid(True)
    fig.tight_layout()
    plt.show()


# The +1 offset keeps zero-length pages finite under log10 (they land at 0);
# the axis labels state the offset so a bar at 0 is not read as "1 character".
lengths_array = np.array(char_counts)
log_lengths = np.log10(lengths_array + 1)

_plot_log_histogram(
    log_lengths,
    color='skyblue',
    xlabel="log10(Text length in characters + 1)",
    title="Distribution of Text Lengths (Character Count, log-scale)",
)


# Same transform for the page counts, so documents with no pages stay finite.
items_array = np.array(num_items_per_doc)
log_items = np.log10(items_array + 1)

_plot_log_histogram(
    log_items,
    color='salmon',
    xlabel="log10(Number of pages per document + 1)",
    title="Distribution of Pages per Document (log-scale)",
)