In [1]:
import pdfplumber
import os
from pathlib import Path

def extract_text_from_pdfs(pdf_files, output_file):
    """Extract the text of several PDFs and dump it into one text file.

    Every path in ``pdf_files`` is opened with pdfplumber; the text of its
    pages is concatenated (one trailing newline per non-empty page). Files
    that do not exist, yield no text, or raise while being processed are
    reported with ``print`` and skipped. The collected text is then written
    to ``output_file`` as ``File: <path>`` + text + a line of 80 ``=``
    characters per document.

    Args:
        pdf_files: Iterable of PDF paths (anything ``Path`` accepts).
        output_file: Destination text file; missing parent directories are
            created before any PDF is read.

    Returns:
        None. Progress and errors are reported on stdout. A failure while
        writing the output file is printed rather than raised; a failure
        creating the output directory propagates to the caller.
    """
    destination = Path(output_file)

    # The target folder must exist before we try to write into it
    destination.parent.mkdir(parents=True, exist_ok=True)

    texts_by_file = {}
    for pdf_path in pdf_files:
        try:
            # Rebind to a Path so the messages below (including the error
            # message) show the normalised path once conversion succeeded
            pdf_path = Path(pdf_path)

            if not pdf_path.exists():
                print(f"File not found: {pdf_path}")
                continue

            print(f"Processing: {pdf_path}")

            with pdfplumber.open(str(pdf_path)) as document:
                page_texts = [
                    page.extract_text(x_tolerance=3, y_tolerance=3)
                    for page in document.pages
                ]
                # Pages without text come back falsy and are left out
                combined = "".join(chunk + "\n" for chunk in page_texts if chunk)

                if combined.strip():
                    texts_by_file[str(pdf_path)] = combined
                else:
                    print(f"No text extracted from: {pdf_path}")

        except Exception as e:
            print(f"Error processing {pdf_path}: {str(e)}")

    # Persist everything that was collected into a single text file
    try:
        divider = "\n" + "=" * 80 + "\n"  # Separator between documents
        with open(destination, "w", encoding="utf-8") as out_file:
            for source_name, body in texts_by_file.items():
                out_file.writelines((f"File: {source_name}\n", body, divider))

        if not texts_by_file:
            print("No text was extracted from any of the PDF files")
        else:
            print(f"Extraction complete. Text saved in {destination}")
            print(f"Successfully processed {len(texts_by_file)} files")

    except Exception as e:
        print(f"Error saving output file: {str(e)}")

# Example usage with proper Windows paths
if __name__ == "__main__":
    # Only this entry point needs these, so they are imported locally
    import importlib
    import subprocess
    import sys
    import uuid

    # Raw strings keep the backslashes of the Windows paths literal
    pdf_files = [
        r"E:\insurance1.pdf",
        r"E:\insurance2.pdf",
        r"E:\insurance3.pdf",
        r"E:\insurance4.pdf",
        r"E:\insurance5.pdf"
    ]
    output_file = r"E:\extracted_text.txt"

    # Make sure pdfplumber is importable, installing it on demand.
    # NOTE(review): the unguarded `import pdfplumber` at the top of this cell
    # fails first when the package is missing, so this fallback only takes
    # effect if that import is removed or guarded.
    try:
        import pdfplumber
    except ImportError:
        print("pdfplumber is not installed. Installing it now...")
        # Invoke pip through the running interpreter so the package is
        # installed into this environment, not whichever `pip` is on PATH
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'pdfplumber'])
        # Let the import system see the new package, then bind the module
        # name that extract_text_from_pdfs looks up at call time
        importlib.invalidate_caches()
        import pdfplumber
        print("pdfplumber has been installed successfully!")

    output_dir = Path(output_file).parent
    try:
        # Create the directory first so the probe below checks write
        # permission instead of failing because the folder is missing
        output_dir.mkdir(parents=True, exist_ok=True)

        # Probe with a unique name (exist_ok=False) so a pre-existing file
        # is never touched or deleted by this check
        test_file = output_dir / f"test_write_permission_{uuid.uuid4().hex}.txt"
        test_file.touch(exist_ok=False)
        test_file.unlink()  # Delete the probe file

        # If we get here, we have write permissions
        extract_text_from_pdfs(pdf_files, output_file)

    except PermissionError:
        print(f"Error: No permission to write to directory {output_dir}")
        print("Please run the script with appropriate permissions or choose a different output location")
    except Exception as e:
        print(f"An unexpected error occurred: {str(e)}")

Processing: E:\insurance1.pdf
Processing: E:\insurance2.pdf
Processing: E:\insurance3.pdf
Processing: E:\insurance4.pdf
Processing: E:\insurance5.pdf
Extraction complete. Text saved in E:\extracted_text.txt
Successfully processed 5 files


In [2]:
import re
import csv
from datetime import datetime

def extract_details_from_text(text):
    """Pull the policy-holder fields out of one document's extracted text.

    Each field is located with its own regular expression. A field whose
    pattern does not match is returned as an empty string, so the result
    always contains every key.

    Args:
        text: Extracted text of a single document (str).

    Returns:
        dict mapping 'Name', 'Policy Name', 'Policy Period', 'Gender',
        'Email Address', 'DOB', 'Enrollment Confirmed on' and
        'Family Cover' to the captured string ('' when not found).
    """
    def first_group(pattern, haystack):
        # First capture group of the leftmost match, or '' when no match
        found = re.search(pattern, haystack)
        return found.group(1) if found else ''

    # Dictionary to store extracted details
    details = {}

    # Name: the single word following "Greetings"
    details['Name'] = first_group(r'Greetings\s+(\w+)', text)

    # Policy Name: text ending in "Health Insurance" that precedes a date.
    # NOTE(review): [\w\s]+ also matches newlines, so the capture can reach
    # back over earlier lines made only of word characters — confirm this
    # is acceptable for the real documents.
    details['Policy Name'] = first_group(
        r'([\w\s]+Health Insurance)\s+\d{2}-\d{2}-\d{4}', text
    ).strip()

    # Policy Period: "<date> to <date>"; the end date is expected with
    # whitespace between the day and the first dash (presumably an artefact
    # of the PDF text extraction — verify against the source documents)
    details['Policy Period'] = first_group(
        r'(\d{2}-\d{2}-\d{4}\s+to\s+\d{2}\s+-\d{2}-\d{4})', text
    )

    # Gender: the word immediately before "Self"
    details['Gender'] = first_group(r'(\w+)\s+Self', text)

    # Email: spaces are stripped first so an address broken up by spaces is
    # re-joined. The address may be terminated by whitespace OR by the end
    # of the text, so a trailing address without a final newline is found.
    details['Email Address'] = first_group(
        r'(\S+@\S+\.\S+)(?:\s|$)', text.replace(' ', '')
    )

    # DOB: first date that is followed by whitespace and a dash
    details['DOB'] = first_group(r'(\d{2}-\d{2}-\d{4})\s+-', text)

    # Enrollment timestamp following "confirmed on"
    details['Enrollment Confirmed on'] = first_group(
        r'confirmed on (\d{2}\s+-\d{2}-\d{4}\s+\d{2}:\d{2}:\d{2})', text
    )

    # Family Cover: amount with comma grouping of the form d[d],dd,ddd
    details['Family Cover'] = first_group(
        r'Family\s+Cover\s+(\d{1,2},\d{2},\d{3})', text
    )

    return details

def process_text_file_to_csv(input_file, output_csv):
    """Convert the combined extracted-text file into a CSV of policy details.

    The input is split on runs of 80 ``=`` characters; every non-blank
    section is parsed with ``extract_details_from_text`` and kept only when
    a name was found. The CSV is written only if at least one record was
    kept; otherwise a message is printed and no file is created.

    Args:
        input_file: Path of the UTF-8 text file to read.
        output_csv: Path of the CSV file to write.

    Returns:
        None. The outcome is reported on stdout.
    """
    columns = ['Name', 'Policy Name', 'Policy Period', 'Gender',
               'Email Address', 'DOB', 'Enrollment Confirmed on', 'Family Cover']

    # Load the whole dump and cut it into per-document sections
    with open(input_file, 'r', encoding='utf-8') as source:
        sections = source.read().split('=' * 80)

    # Parse the non-blank sections; a record without a name is not a valid entry
    parsed = (extract_details_from_text(section) for section in sections if section.strip())
    rows = [record for record in parsed if record['Name']]

    if not rows:
        print("No records found to write to CSV")
        return

    with open(output_csv, 'w', newline='', encoding='utf-8') as sink:
        table = csv.DictWriter(sink, fieldnames=columns)
        table.writeheader()
        table.writerows(rows)
    print(f"Data successfully written to {output_csv}")

# Example usage: turn the combined extracted-text file into a CSV of policy details
input_file, output_csv = r"E:\extracted_text.txt", r"E:\insurance_details.csv"
process_text_file_to_csv(input_file=input_file, output_csv=output_csv)

Data successfully written to E:\insurance_details.csv
