In [None]:
import PyPDF2
import os
from pathlib import Path

def extract_text_from_pdfs(input_folder="pdf_pages", output_folder="raw_txt_pages") -> None:
    """Extract the first page's text from each PDF in a folder into .txt files.

    Every regular file in ``input_folder`` whose name ends in ``.pdf``
    (matched case-insensitively) is opened with ``PyPDF2.PdfReader``. The text
    of its first page is written, UTF-8 encoded, to a file of the same name in
    ``output_folder`` with the trailing ``.pdf`` replaced by ``.txt``. Pages
    after the first are not read.

    Progress, warnings and errors are reported with ``print``; a failure on
    one file does not stop the remaining files from being processed.

    Args:
        input_folder: Directory to scan for PDF files (not recursive).
        output_folder: Directory that receives the ``.txt`` files. It is
            created, including missing parent directories, if needed.

    Returns:
        None.
    """
    pdf_suffix = ".pdf"
    input_dir = Path(input_folder)

    # Report a missing input folder the same way as every other problem here,
    # instead of leaking a FileNotFoundError from the directory listing.
    if not input_dir.is_dir():
        print(f"Input folder '{input_folder}' does not exist")
        return

    # parents=True so a nested output path such as "out/raw/txt" also works.
    Path(output_folder).mkdir(parents=True, exist_ok=True)

    # Case-insensitive match so "SCAN.PDF" is picked up too; is_file() skips
    # directories that merely happen to be named like a PDF.
    pdf_files = sorted(
        entry.name
        for entry in input_dir.iterdir()
        if entry.is_file() and entry.name.lower().endswith(pdf_suffix)
    )

    if not pdf_files:
        print(f"No PDF files found in '{input_folder}' folder")
        return

    successful_extractions = 0

    for pdf_file in pdf_files:
        pdf_path = os.path.join(input_folder, pdf_file)

        # Per-file boundary: any failure is reported and the batch continues.
        try:
            with open(pdf_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)

                if len(pdf_reader.pages) == 0:
                    print(f"Warning: {pdf_file} appears to be empty")
                    continue

                # Only the first page is extracted. Fall back to "" so that a
                # page yielding no text still produces an (empty) output file.
                text = pdf_reader.pages[0].extract_text() or ""

            # Strip only the trailing extension; str.replace would also
            # rewrite any ".pdf" that occurs earlier in the file name.
            txt_filename = pdf_file[:-len(pdf_suffix)] + '.txt'
            txt_path = os.path.join(output_folder, txt_filename)

            with open(txt_path, 'w', encoding='utf-8') as txt_file:
                txt_file.write(text)

            print(f"Extracted: {pdf_file} -> {txt_filename}")
            successful_extractions += 1

        except Exception as e:
            print(f"Error processing {pdf_file}: {str(e)}")

    print(f"\nExtraction complete! Successfully processed {successful_extractions} files")
    print(f"Text files saved in '{output_folder}' folder")


In [None]:
# Run the extraction over the default folders, named explicitly for clarity.
extract_text_from_pdfs(input_folder="pdf_pages", output_folder="raw_txt_pages")