In [None]:
import os
import re
from pathlib import Path

def _page_sort_key(filename):
    """Return a sort key that orders numbers embedded in a file name numerically.

    Splitting on digit runs yields alternating text / number chunks, so
    ``page_2.txt`` sorts before ``page_10.txt``. The raw name is appended as a
    tie-breaker so names that differ only in zero-padding (``page_1.txt`` vs
    ``page_01.txt``) still sort deterministically.
    """
    chunks = re.split(r'([0-9]+)', filename)
    # With a capturing group, odd positions are always the digit runs and even
    # positions are always text, so keys compare int-to-int and str-to-str.
    natural = [int(chunk) if index % 2 else chunk for index, chunk in enumerate(chunks)]
    return natural, filename


def merge_text_files(input_folder="raw_txt_pages", output_file="raw_book_merged_text.txt"):
    """Merge every ``.txt`` file in ``input_folder`` into a single text file.

    Files are merged in natural (numeric-aware) name order. Each file's
    stripped content is preceded by a ``######PAGE <n> OCR'D TEXT#######``
    header and followed by a blank line. ``<n>`` is the file name without its
    ``.txt`` extension and without a leading ``page_`` prefix.

    Files that cannot be read are reported and skipped; the remaining files are
    still merged. Progress and errors are reported with ``print``.

    Args:
        input_folder: Directory that is scanned (non-recursively) for names
            ending in ``.txt``.
        output_file: Path of the merged file; it is overwritten if it exists.
            If it is located inside ``input_folder`` it is excluded from the
            merge so that re-running does not fold old output into itself.

    Returns:
        None. Nothing is written when no text files are found.

    Raises:
        OSError: If ``input_folder`` cannot be listed (for example, it does not
            exist or is not a directory).
    """
    suffix = '.txt'
    prefix = 'page_'

    # Normalised absolute path of the output, used to recognise it in the listing.
    output_id = os.path.normcase(os.path.abspath(output_file))

    # Get all text files from input folder, in page order, excluding the output itself
    txt_files = sorted(
        (
            name for name in os.listdir(input_folder)
            if name.endswith(suffix)
            and os.path.normcase(os.path.abspath(os.path.join(input_folder, name))) != output_id
        ),
        key=_page_sort_key,
    )

    if not txt_files:
        print(f"No text files found in '{input_folder}' folder")
        return

    merged_content = []
    processed_files = 0

    for txt_file in txt_files:
        txt_path = os.path.join(input_folder, txt_file)

        try:
            # Extract page label from filename (e.g., page_0001.txt -> 0001).
            # Only the trailing extension and a leading prefix are removed, so
            # names such as 'frontpage_3.txt' are not mangled.
            page_num = txt_file[:-len(suffix)]
            if page_num.startswith(prefix):
                page_num = page_num[len(prefix):]

            # Read the text content
            with open(txt_path, 'r', encoding='utf-8') as file:
                content = file.read().strip()

            # Add header, content and a blank separator line for this page
            merged_content.append(f"######PAGE {page_num} OCR'D TEXT#######")
            merged_content.append(content)
            merged_content.append("")

            print(f"Added: {txt_file}")
            processed_files += 1

        except (OSError, UnicodeError) as e:
            # Best effort: report the unreadable / undecodable file and keep going.
            print(f"Error processing {txt_file}: {e}")

    # Write merged content to output file
    try:
        with open(output_file, 'w', encoding='utf-8') as output:
            output.write('\n'.join(merged_content))

        print(f"\nMerge complete! Processed {processed_files} files")
        print(f"Merged text saved as '{output_file}'")

    except (OSError, UnicodeError) as e:
        print(f"Error writing output file: {e}")


In [None]:
# Run the merge with the function's default arguments:
# reads from "raw_txt_pages" and writes "raw_book_merged_text.txt".
merge_text_files()
