In [1]:
import os
import re

# Directory that is scanned for the raw ".txt" files to clean.
INPUT_DIR = "data/emsa_texts"
# Directory that receives the cleaned copies.
OUTPUT_DIR = "data/emsa_cleaned"
# Create the output directory up front; exist_ok=True keeps this cell safe to re-run.
os.makedirs(OUTPUT_DIR, exist_ok=True)

def clean_text(text):
    """Normalize raw text into a single whitespace-clean line.

    Non-whitespace control characters (C0, DEL and C1 ranges) are deleted,
    every run of whitespace is collapsed to one space, and leading/trailing
    whitespace is stripped.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string; empty input yields an empty string.
    """
    # Delete control characters *before* collapsing whitespace. Doing it the
    # other way round leaves a double space behind whenever a control
    # character sits between two spaces (e.g. "a \x00 b").
    # The class deliberately skips \t \n \v \f \r (\x09-\x0d), \x1c-\x1f and
    # \x85: Python's \s treats those as whitespace, so they must survive to be
    # turned into a single space by the next substitution.
    text = re.sub(r"[\x00-\x08\x0e-\x1b\x7f-\x84\x86-\x9f]", "", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

def process_file(path):
    """Read the UTF-8 text file at ``path`` and return its cleaned contents.

    Args:
        path: Location of the text file to read.

    Returns:
        The file's text after being passed through ``clean_text``.
    """
    with open(path, "r", encoding="utf-8") as handle:
        return clean_text(handle.read())

def process_all_files(input_dir=None, output_dir=None):
    """Clean every ".txt" file in a directory and write the results out.

    Each input ``<name>.txt`` is read via ``process_file`` and written to the
    output directory as ``<name>_cleaned.txt``. Progress is printed per file.

    Args:
        input_dir: Directory to scan for ".txt" files. Defaults to the
            module-level ``INPUT_DIR``.
        output_dir: Directory the cleaned files are written to. Defaults to
            the module-level ``OUTPUT_DIR``. Created if it does not exist.

    Returns:
        None.
    """
    suffix = ".txt"

    # Resolve defaults at call time so that later edits to the module-level
    # constants are honoured without redefining this function.
    if input_dir is None:
        input_dir = INPUT_DIR
    if output_dir is None:
        output_dir = OUTPUT_DIR
    os.makedirs(output_dir, exist_ok=True)

    # os.listdir returns entries in arbitrary order; sort for reproducible runs.
    names = sorted(name for name in os.listdir(input_dir) if name.endswith(suffix))
    print(f"🧹 Cleaning {len(names)} files...")

    for name in names:
        raw_path = os.path.join(input_dir, name)
        # Only swap the trailing extension. str.replace would rewrite every
        # ".txt" occurrence inside the name, not just the final one.
        clean_name = name[: -len(suffix)] + "_cleaned" + suffix
        clean_path = os.path.join(output_dir, clean_name)

        cleaned = process_file(raw_path)

        with open(clean_path, "w", encoding="utf-8") as out:
            out.write(cleaned)

        print(f"✅ Cleaned: {name}")

# Run the cleaning pass now: reads from INPUT_DIR, writes to OUTPUT_DIR, prints progress.
process_all_files()


🧹 Cleaning 33 files...
✅ Cleaned: 000_EMSA Catalogue 2025 v26.06.pdf.txt
✅ Cleaned: 001_EMSA CAAR2024.pdf.txt
✅ Cleaned: 002_EMSA_FACTS_FIGURES_2024.pdf.txt
✅ Cleaned: 003_Seafarers Statistics in the EU 2023 data report.pdf.txt
✅ Cleaned: 004_AFVs Guidance 1.2 2025.pdf.txt
✅ Cleaned: 005_ADER 2024.pdf.txt
✅ Cleaned: 006_emsa ipa-enp newsletter issue 2.pdf.txt
✅ Cleaned: 007_EMTER_F&F_2025_EN.pdf.txt
✅ Cleaned: 008_EMTER_F&F_2025_BG.pdf.txt
✅ Cleaned: 009_EMTER_F&F_2025_CS.pdf.txt
✅ Cleaned: 010_EMTER_F&F_2025_DA.pdf.txt
✅ Cleaned: 011_EMTER_F&F_2025_DE.pdf.txt
✅ Cleaned: 012_EMTER_F&F_2025_EL.pdf.txt
✅ Cleaned: 013_EMTER_F&F_2025_ES.pdf.txt
✅ Cleaned: 014_EMTER_F&F_2025_ET.pdf.txt
✅ Cleaned: 015_EMTER_F&F_2025_FI.pdf.txt
✅ Cleaned: 016_EMTER_F&F_2025_FR.pdf.txt
✅ Cleaned: 017_EMTER_F&F_2025_GA.pdf.txt
✅ Cleaned: 018_EMTER_F&F_2025_HR.pdf.txt
✅ Cleaned: 019_EMTER_F&F_2025_HU.pdf.txt
✅ Cleaned: 020_EMTER_F&F_2025_IT.pdf.txt
✅ Cleaned: 021_EMTER_F&F_2025_LT.pdf.txt
✅ Cleaned: 022_EMTER_F&