In [8]:
import os
import fitz  # PyMuPDF

def extract_text_from_pdf(pdf_path):
    """
    Extracts text from a PDF file using PyMuPDF and returns it as a string.

    The document is opened in a ``with`` block so that it is closed as soon
    as extraction finishes, including when a page fails part-way through.

    Parameters:
    - pdf_path (str): The path to the PDF file.

    Returns:
    - str: The text of all pages concatenated in page order. If an error
      occurs, it is printed and whatever text was collected before the
      failure is returned (an empty string if nothing was read).
    """
    page_texts = []
    try:
        # The context manager closes the document on exit. Without it every
        # processed PDF stays open for the lifetime of the kernel.
        with fitz.open(pdf_path) as doc:
            # Collect each page's text; joined once at the end instead of
            # growing a string with repeated "+=".
            for page in doc:
                page_texts.append(page.get_text())
    except Exception as e:
        # Best-effort by design: report the problem and let the caller move
        # on to the next file instead of aborting a whole batch.
        print(f"Error extracting text from {pdf_path}: {e}")
    return "".join(page_texts)

def save_text_to_file(text, txt_path):
    """
    Writes ``text`` to ``txt_path`` as UTF-8, replacing any existing file.

    Parameters:
    - text (str): The content to write.
    - txt_path (str): Destination path of the .txt file.
    """
    # Mode "w" truncates an existing file, so each call overwrites the
    # previous content at this path.
    with open(file=txt_path, mode="w", encoding="utf-8") as out_file:
        out_file.write(text)

def convert_all_pdfs_to_txt(pdf_folder, output_folder):
    """
    Batch-converts the PDFs located directly inside ``pdf_folder`` to text.

    Each entry whose name ends in ".pdf" (case-insensitive) is passed to
    ``extract_text_from_pdf``. When text comes back, it is written via
    ``save_text_to_file`` to ``<output_folder>/<base name>.txt``. Entries
    that yield no text are reported and skipped. Progress is printed for
    every processed file.

    Parameters:
    - pdf_folder (str): Folder that is listed for PDF files (not recursive).
    - output_folder (str): Folder receiving the .txt files; created if it
      does not exist yet.
    """
    os.makedirs(output_folder, exist_ok=True)

    for entry in os.listdir(pdf_folder):
        # Anything that is not named *.pdf is ignored.
        if not entry.lower().endswith(".pdf"):
            continue

        source_path = os.path.join(pdf_folder, entry)
        print(f"Processing {source_path} ...")

        extracted = extract_text_from_pdf(source_path)
        if not extracted:
            # Empty result: nothing is written for this PDF.
            print(f"No text extracted from {source_path}")
            continue

        # Keep the PDF's base name and swap the extension for ".txt".
        stem, _ext = os.path.splitext(entry)
        target_path = os.path.join(output_folder, stem + ".txt")
        save_text_to_file(extracted, target_path)
        print(f"Extracted text saved to {target_path}")

# Entry point: convert the PDFs in the relative folder "pdf" into text files
# in the relative folder "txt" (both resolved against the working directory).
if __name__ == "__main__":
    convert_all_pdfs_to_txt(pdf_folder="pdf", output_folder="txt")


Processing pdf\KPMG_2025-02-06_KPMG global tech report 2024.pdf ...
Extracted text saved to txt\KPMG_2025-02-06_KPMG global tech report 2024.txt
Processing pdf\KPMG_2025-02-07_KPMG global tech report energy insights.pdf ...
Extracted text saved to txt\KPMG_2025-02-07_KPMG global tech report energy insights.txt
Processing pdf\KPMG_2025-02-07_KPMG global tech report Technology insights.pdf ...
Extracted text saved to txt\KPMG_2025-02-07_KPMG global tech report Technology insights.txt
Processing pdf\KPMG_2025-02-07_KPMG global tech report – industrial manufacturing insights.pdf ...
Extracted text saved to txt\KPMG_2025-02-07_KPMG global tech report – industrial manufacturing insights.txt
Processing pdf\KPMG_2025-02-20_Food and Nutritional Security in India.pdf ...
Extracted text saved to txt\KPMG_2025-02-20_Food and Nutritional Security in India.txt
Processing pdf\KPMG_2025-02-28_Issue no. 103  February 2025.pdf ...
Extracted text saved to txt\KPMG_2025-02-28_Issue no. 103  February 2025.txt

In [5]:
# Use the %pip magic so the package is installed into the environment of the
# running kernel; pinned to the version this notebook was executed with.
%pip install pymupdf==1.25.3

Collecting pymupdf
  Downloading pymupdf-1.25.3-cp39-abi3-win_amd64.whl.metadata (3.4 kB)
Downloading pymupdf-1.25.3-cp39-abi3-win_amd64.whl (16.5 MB)
   ---------------------------------------- 0.0/16.5 MB ? eta -:--:--
   ----- ---------------------------------- 2.1/16.5 MB 11.8 MB/s eta 0:00:02
   ----------- ---------------------------- 4.7/16.5 MB 11.4 MB/s eta 0:00:02
   ----------------- ---------------------- 7.1/16.5 MB 11.2 MB/s eta 0:00:01
   ---------------------- ----------------- 9.2/16.5 MB 11.0 MB/s eta 0:00:01
   --------------------------- ------------ 11.5/16.5 MB 11.1 MB/s eta 0:00:01
   --------------------------------- ------ 13.9/16.5 MB 11.0 MB/s eta 0:00:01
   ---------------------------------------  16.3/16.5 MB 11.0 MB/s eta 0:00:01
   ---------------------------------------- 16.5/16.5 MB 10.7 MB/s eta 0:00:00
Installing collected packages: pymupdf
Successfully installed pymupdf-1.25.3
