In [None]:
import os
import re
from pathlib import Path
from pypdf import PdfReader

def convert_pdfs_to_text(pdf_dir=None, output_base_dir=None):
    """Convert every PDF found under ``pdf_dir`` into a UTF-8 text file using pypdf.

    PDFs are discovered recursively. Each output file is written to
    ``output_base_dir/<year>/<pdf stem>.txt`` where ``<year>`` is the name of the
    PDF's parent directory when it looks like ``20xx``, otherwise the first
    ``20xx`` found in the file name, otherwise ``"unknown"``.

    Args:
        pdf_dir: Directory to search for PDFs. Defaults to ``../aptnote_download``.
        output_base_dir: Directory that receives the per-year sub-directories.
            Defaults to ``./pypdf``. It is created on demand.

    Returns:
        None. Progress and a final summary are printed. A failure on one file is
        reported and counted, and does not stop the remaining conversions.
    """
    # Fall back to the original hard-coded locations when no override is given.
    pdf_dir = Path("../aptnote_download" if pdf_dir is None else pdf_dir).absolute()
    output_base_dir = Path("./pypdf" if output_base_dir is None else output_base_dir).absolute()

    print(f"Looking for PDF files in: {pdf_dir}")

    # Check if directory exists
    if not pdf_dir.exists():
        print(f"Error: Directory {pdf_dir} does not exist!")
        return

    # List all top-level entries in the directory to help debug path problems
    all_files = list(pdf_dir.glob("*"))
    print(f"Found {len(all_files)} files in directory")
    for file in all_files:
        print(f"  - {file}")

    # Filter PDF files - recursively search in all subdirectories
    pdf_files = list(pdf_dir.glob("**/*.pdf"))
    print(f"Found {len(pdf_files)} PDF files")

    if not pdf_files:
        print("No PDF files found. Check the directory path and file extensions.")
        return

    # Process each PDF file
    total_files = len(pdf_files)
    print(f"Starting conversion of {total_files} PDF files...")

    successful_conversions = 0
    failed_conversions = 0

    for i, pdf_file in enumerate(pdf_files, 1):
        try:
            # Get the parent directory name (which should be the year)
            parent_dir = pdf_file.parent.name

            # If parent directory is a year (like 2023), use it
            # Otherwise try to extract year from filename using regex
            if re.match(r'^20\d{2}$', parent_dir):
                year = parent_dir
            else:
                year_match = re.search(r'(20\d{2})', pdf_file.name)
                year = year_match.group(1) if year_match else "unknown"

            # parents=True is required: without it the very first mkdir fails
            # whenever output_base_dir itself has not been created yet.
            year_dir = output_base_dir / year
            year_dir.mkdir(parents=True, exist_ok=True)

            # Output text file path
            output_file = year_dir / f"{pdf_file.stem}.txt"

            print(f"[{i}/{total_files}] Converting {pdf_file.name} to {output_file}")

            # Extract text using PyPDF. `or ""` keeps a page that yields no
            # text from aborting the whole file; each page ends with a newline.
            reader = PdfReader(pdf_file)
            text_content = "".join(
                (page.extract_text() or "") + "\n" for page in reader.pages
            )

            # Write text to output file
            with open(output_file, 'w', encoding='utf-8') as f:
                f.write(text_content)

            print(f"[{i}/{total_files}] Successfully converted {pdf_file.name}")
            successful_conversions += 1

        except Exception as e:
            # Best-effort batch job: report the failure and move on to the next PDF.
            print(f"[{i}/{total_files}] Error processing {pdf_file.name}: {str(e)}")
            failed_conversions += 1

    # Print summary
    print("\nConversion Summary:")
    print(f"Total PDF files: {total_files}")
    print(f"Successfully converted: {successful_conversions}")
    print(f"Failed conversions: {failed_conversions}")

    if successful_conversions > 0:
        print(f"\nText files have been saved to: {output_base_dir}")

# Entry point: run the conversion only when this code is executed directly
# (``__name__`` is "__main__"), not when it is imported as a module.
if __name__ == "__main__":
    convert_pdfs_to_text()

In [8]:
import os
import re
from pathlib import Path
import fitz  # PyMuPDF

def convert_pdfs_to_text(pdf_dir=None, output_base_dir=None):
    """Convert every PDF found under ``pdf_dir`` into a UTF-8 text file using PyMuPDF.

    PDFs are discovered recursively. Each output file is written to
    ``output_base_dir/<year>/<pdf stem>.txt`` where ``<year>`` is the name of the
    PDF's parent directory when it looks like ``20xx``, otherwise the first
    ``20xx`` found in the file name, otherwise ``"unknown"``.

    Args:
        pdf_dir: Directory to search for PDFs. Defaults to ``../aptnote_download``.
        output_base_dir: Directory that receives the per-year sub-directories.
            Defaults to ``./pymupdf``. It is created on demand.

    Returns:
        None. Progress and a final summary are printed. A failure on one file is
        reported and counted, and does not stop the remaining conversions.
    """
    # Fall back to the original hard-coded locations when no override is given.
    pdf_dir = Path("../aptnote_download" if pdf_dir is None else pdf_dir).absolute()
    output_base_dir = Path("./pymupdf" if output_base_dir is None else output_base_dir).absolute()

    print(f"Looking for PDF files in: {pdf_dir}")

    # Check if directory exists
    if not pdf_dir.exists():
        print(f"Error: Directory {pdf_dir} does not exist!")
        return

    # List all top-level entries in the directory to help debug path problems
    all_files = list(pdf_dir.glob("*"))
    print(f"Found {len(all_files)} files in directory")
    for file in all_files:
        print(f"  - {file}")

    # Filter PDF files - recursively search in all subdirectories
    pdf_files = list(pdf_dir.glob("**/*.pdf"))
    print(f"Found {len(pdf_files)} PDF files")

    if not pdf_files:
        print("No PDF files found. Check the directory path and file extensions.")
        return

    # Process each PDF file
    total_files = len(pdf_files)
    print(f"Starting conversion of {total_files} PDF files...")

    successful_conversions = 0
    failed_conversions = 0

    for i, pdf_file in enumerate(pdf_files, 1):
        try:
            # Get the parent directory name (which should be the year)
            parent_dir = pdf_file.parent.name

            # If parent directory is a year (like 2023), use it
            # Otherwise try to extract year from filename using regex
            if re.match(r'^20\d{2}$', parent_dir):
                year = parent_dir
            else:
                year_match = re.search(r'(20\d{2})', pdf_file.name)
                year = year_match.group(1) if year_match else "unknown"

            # parents=True is required: without it the very first mkdir fails
            # whenever output_base_dir itself has not been created yet.
            year_dir = output_base_dir / year
            year_dir.mkdir(parents=True, exist_ok=True)

            # Output text file path
            output_file = year_dir / f"{pdf_file.stem}.txt"

            print(f"[{i}/{total_files}] Converting {pdf_file.name} to {output_file}")

            # Extract text using PyMuPDF. The context manager closes the
            # document even when a page fails to extract, so the handle is
            # not leaked on error. Each page's text ends with a newline.
            with fitz.open(pdf_file) as doc:
                text_content = "".join(page.get_text() + "\n" for page in doc)

            # Write text to output file
            with open(output_file, 'w', encoding='utf-8') as f:
                f.write(text_content)

            print(f"[{i}/{total_files}] Successfully converted {pdf_file.name}")
            successful_conversions += 1

        except Exception as e:
            # Best-effort batch job: report the failure and move on to the next PDF.
            print(f"[{i}/{total_files}] Error processing {pdf_file.name}: {str(e)}")
            failed_conversions += 1

    # Print summary
    print("\nConversion Summary:")
    print(f"Total PDF files: {total_files}")
    print(f"Successfully converted: {successful_conversions}")
    print(f"Failed conversions: {failed_conversions}")

    if successful_conversions > 0:
        print(f"\nText files have been saved to: {output_base_dir}")

# Entry point: run the conversion only when this code is executed directly
# (``__name__`` is "__main__"), not when it is imported as a module.
if __name__ == "__main__":
    convert_pdfs_to_text()

Looking for PDF files in: e:\CExperiment\ThreatRAG\kg\data_spider\aptnote_text\..\aptnote_download
Found 28 files in directory
  - e:\CExperiment\ThreatRAG\kg\data_spider\aptnote_text\..\aptnote_download\.gitignore
  - e:\CExperiment\ThreatRAG\kg\data_spider\aptnote_text\..\aptnote_download\2006
  - e:\CExperiment\ThreatRAG\kg\data_spider\aptnote_text\..\aptnote_download\2008
  - e:\CExperiment\ThreatRAG\kg\data_spider\aptnote_text\..\aptnote_download\2009
  - e:\CExperiment\ThreatRAG\kg\data_spider\aptnote_text\..\aptnote_download\2010
  - e:\CExperiment\ThreatRAG\kg\data_spider\aptnote_text\..\aptnote_download\2011
  - e:\CExperiment\ThreatRAG\kg\data_spider\aptnote_text\..\aptnote_download\2012
  - e:\CExperiment\ThreatRAG\kg\data_spider\aptnote_text\..\aptnote_download\2013
  - e:\CExperiment\ThreatRAG\kg\data_spider\aptnote_text\..\aptnote_download\2014
  - e:\CExperiment\ThreatRAG\kg\data_spider\aptnote_text\..\aptnote_download\2015
  - e:\CExperiment\ThreatRAG\kg\data_spider\apt