In [None]:
import os
import re
from collections import Counter

def load_text(file_path):
    """Read the UTF-8 text file at ``file_path`` and return its lines as a list.

    Each entry keeps its line ending, exactly as produced by iterating the
    open file; an empty file yields an empty list.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        rows = list(handle)
    return rows

def save_text(file_path, content):
    """Write the strings in ``content`` to ``file_path`` as UTF-8 text.

    The file is opened in write mode, so any existing contents are replaced.
    Items are written back-to-back with no separator added, so each item
    must carry its own line ending.
    """
    with open(file_path, mode='w', encoding='utf-8') as handle:
        for chunk in content:
            handle.write(chunk)

def clean_text(lines):
    """Apply a series of cleaning steps to the input text.

    Steps, in order:

    1. Strip a trailing run of digits (treated as a page number) together
       with the whitespace around it, then strip the line. Note that this
       also removes a genuine number that happens to end a line.
    2. Skip a non-empty line that equals the previously kept line, or whose
       text occurs more than twice among the stripped input lines (treated
       as a repeated header/footer).
    3. Join a line that starts with a lowercase letter onto the previous
       kept line when that line does not end with ``. ! ? : ;``. A word
       hyphenated across the break (letter followed by ``-`` at the end of
       the previous line) is rejoined without the hyphen and without a
       space; every other continuation is joined with a single space.
    4. Pass the result through ``merge_hyphenated_words`` and then
       ``standardize_punctuation``.

    Args:
        lines: Iterable of raw text lines (for example the list returned by
            ``load_text``).

    Returns:
        The list of cleaned lines. Every entry built by this function's own
        loop ends with a newline, including joined paragraph lines.
    """
    # The input is walked twice (counting, then cleaning), so materialize it
    # once; this also makes one-shot iterables safe to pass in.
    lines = list(lines)

    # A line consisting only of digits is fully covered by this pattern too,
    # so one substitution handles both "bare page number" and "trailing
    # page number".
    trailing_number = re.compile(r'\s*\d+\s*$')
    sentence_endings = ('.', '!', '?', ':', ';')

    # Count every stripped raw line once up front instead of rescanning the
    # whole input for each line. The counts are over the raw stripped lines,
    # the same comparison is_repeated_phrase performs.
    occurrences = Counter(raw.strip() for raw in lines)

    cleaned_lines = []
    last_line = ""

    for raw in lines:
        # Remove page numbers (digits at the end of the line, or a line that
        # is nothing but digits).
        line = trailing_number.sub('', raw).strip()

        # Remove headers and footers (consecutive duplicates or lines that
        # occur more than twice in the input).
        if line and (line == last_line or occurrences[line] > 2):
            continue

        # Fix broken paragraphs: a lowercase start after a line without
        # closing punctuation is treated as a continuation.
        continues_paragraph = (
            last_line
            and not last_line.endswith(sentence_endings)
            and line
            and line[0].islower()
        )
        if continues_paragraph:
            head = cleaned_lines[-1].strip()
            # Only a hyphen directly after a letter is a split word; a
            # standalone dash or "--" keeps the normal space-separated join.
            if len(head) > 1 and head.endswith('-') and head[-2].isalpha():
                joined = head[:-1] + line
            else:
                joined = head + " " + line
            # Re-append the newline so the next entry is not glued onto this
            # one when the lines are written out.
            cleaned_lines[-1] = joined + "\n"
        else:
            cleaned_lines.append(line + "\n")

        last_line = line

    # Handle hyphenated words at line endings
    cleaned_lines = merge_hyphenated_words(cleaned_lines)

    # Standardize punctuation (quotation marks)
    cleaned_lines = standardize_punctuation(cleaned_lines)

    return cleaned_lines

def is_repeated_phrase(line, lines):
    """Report whether ``line`` looks like a repeated header/footer.

    Both ``line`` and every entry of ``lines`` are compared with surrounding
    whitespace stripped. Returns True when more than two entries match.
    """
    target = line.strip()
    occurrences = [candidate.strip() for candidate in lines].count(target)
    return occurrences > 2

def merge_hyphenated_words(lines):
    """Merge hyphenated words split across lines.

    When an entry ends with ``-`` (ignoring trailing whitespace such as the
    entry's own newline) and the next entry contains visible text, the hyphen
    and that trailing whitespace are dropped and the next entry, minus its
    leading whitespace, is appended directly to it.

    A blank following entry is never merged, so a dash that ends a paragraph
    keeps both the dash and the line break after it.

    Args:
        lines: Iterable of text lines, with or without trailing newlines.

    Returns:
        A new list with split words rejoined; all other entries are passed
        through unchanged.
    """
    merged_lines = []
    for line in lines:
        if merged_lines and line.strip():
            # Look past the newline when testing for the hyphen; checking the
            # raw entry would never match a line that ends in "-\n".
            head = merged_lines[-1].rstrip()
            if head.endswith('-'):
                merged_lines[-1] = head[:-1] + line.lstrip()
                continue
        merged_lines.append(line)
    return merged_lines

def standardize_punctuation(lines):
    """Replace curly quotation marks with their straight ASCII equivalents.

    Curly double quotes become ``"`` and curly single quotes become ``'``.
    Returns a new list; every other character is left untouched.
    """
    straight_quotes = str.maketrans({
        '“': '"',
        '”': '"',
        '‘': "'",
        '’': "'",
    })
    return [text.translate(straight_quotes) for text in lines]

def clean_chapter_files(directory):
    """Clean the text of every file located directly inside ``directory``.

    Entries that are not files (for example sub-directories) are skipped.
    Each remaining file is loaded, cleaned, and saved back to the same path,
    replacing its previous contents.
    """
    entry_paths = (
        os.path.join(directory, entry) for entry in os.listdir(directory)
    )
    for chapter_path in filter(os.path.isfile, entry_paths):
        print(f"Cleaning: {chapter_path}")
        # Load, clean, and write back in place
        raw_lines = load_text(chapter_path)
        save_text(chapter_path, clean_text(raw_lines))

# Run the cleaning process on a given directory.
# NOTE: this executes at module level, and clean_chapter_files saves each
# cleaned file back to the path it was read from, overwriting the original.
# NOTE(review): "./books/bookName" looks like a placeholder path; confirm it
# points at the intended book folder before running.
input_directory = "./books/bookName"
clean_chapter_files(input_directory)
