# In [23]:
import os
import re
from pathlib import Path

### UTIL FUNCTIONS #################################################################

def save_text_file(text, output_file):
    """Write ``text`` to ``output_file`` as UTF-8, replacing any existing file.

    The text is written unconditionally: an empty string produces an empty
    file.

    Args:
        text: String to write.
        output_file: Destination path, passed straight to ``open``.
    """
    with open(output_file, mode='w', encoding='utf-8') as handle:
        handle.write(text)

def postprocess(text):
    """Clean a page of text from library stamps and stray leading symbols.

    The steps run in this order:

    1. Remove "Biblioteca Nacional [de España]" mentions (case-insensitive)
       together with the whitespace in front of them.
    2. Remove every ``©`` and ``¶`` character.
    3. Remove all digit runs from the first two lines.
    4. Remove stray symbols at the start of each line (quote marks, ``,,``,
       ``,``, ``>>``, ``<"``) and one whitespace character in front of a
       comma.
    5. Strip every line and drop the first line if it ends up empty.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned text, with lines joined by newline characters.
    """
    # A stamp at the very end of the text is removed along with any trailing
    # whitespace.
    text = re.sub(r'\s*biblioteca\s+nacional\s+de\s+españa\s*$', '', text,
                  flags=re.IGNORECASE)
    # Anywhere else, take the optional "de España" tail as well so that no
    # orphaned " de España" fragment is left fused onto the previous line.
    text = re.sub(r'\s*biblioteca\s+nacional(?:\s+de\s+españa)?', '', text,
                  flags=re.IGNORECASE)
    text = re.sub(r'[©¶]', '', text)

    # Digits are stripped from the first two lines only (presumably page or
    # folio numbers in the header -- confirm against the input corpus).
    lines = text.splitlines()
    lines[:2] = [re.sub(r'\d+', '', line) for line in lines[:2]]
    text = '\n'.join(lines)

    # Stray symbols at the start of a line. Order matters: the two-comma
    # pattern must run before the single-comma one.
    for pattern in (r'^["„©¶»«]', r'^,,', r'^,', r'^>>', r'^<"'):
        text = re.sub(pattern, '', text, flags=re.MULTILINE)

    # Drop one whitespace character sitting directly in front of a comma.
    text = re.sub(r'\s,', ',', text)

    stripped_lines = [line.strip() for line in text.splitlines()]
    if stripped_lines and stripped_lines[0] == '':
        stripped_lines = stripped_lines[1:]
    return '\n'.join(stripped_lines)

def process_directory(directory_in, directory_out):
    """Clean every ``*.txt`` file in ``directory_in`` into ``directory_out``.

    The output directory is created if missing. Files are visited in sorted
    order. A file whose output already exists is skipped, so re-running only
    handles new inputs. Input is decoded as UTF-8 with undecodable bytes
    ignored. Any exception raised while handling a file is printed and that
    file is skipped.

    Args:
        directory_in: Directory containing the source ``.txt`` files.
        directory_out: Directory that receives the cleaned files.

    Returns:
        Number of files written during this call.
    """
    target_dir = Path(directory_out)
    target_dir.mkdir(parents=True, exist_ok=True)

    processed = 0
    for src in sorted(Path(directory_in).glob("*.txt")):
        try:
            destination = target_dir / (src.stem + ".txt")
            if destination.exists():
                # Already cleaned on a previous run.
                continue
            raw = src.read_text(encoding='utf-8', errors='ignore')
            save_text_file(postprocess(raw), destination)
            processed += 1
        except Exception as e:
            print(f"⚠️ Error procesando {src.name}: {e}")
    return processed

### CONFIGURATION / ENTRY POINT #####################################################

if __name__ == "__main__":
    # Source .txt files are read from directory_in; cleaned copies are
    # written to directory_out.
    directory_in, directory_out = "./test/txt/", "./test/txt_revised/"
    count = process_directory(directory_in, directory_out)

    print(f"Process finished. {count} files processed.")

# Output: Process finished. 3000 files processed.
