In [7]:
import os
import glob
import pandas as pd
import pdfplumber
import re

In [8]:
# --- 1. SETUP: Define your data and output directories ---
# Please ensure these paths are correct for your system.
# Everything is resolved relative to the directory the kernel is running in:
# PDFs are read from <cwd>/data and results are written to <cwd>/output.
BASE_DIR = os.getcwd()
DATA_DIR, OUT_DIR = (
    os.path.join(BASE_DIR, "data"),
    os.path.join(BASE_DIR, "output"),
)

# --- Helper function for cleaning numeric values ---
def clean_value(text):
    """
    Convert a table-cell string into an integer amount.

    Thousands separators, whitespace, newlines and any other non-numeric
    characters are removed before parsing. A value wrapped in parentheses,
    e.g. "(1,234)", is treated as negative (accounting notation). Decimals
    are truncated toward zero.

    Args:
        text: Cell content. Any object is accepted; it is passed through str().

    Returns:
        int: The parsed amount, or 0 when the cell is None, empty, a lone
        dash, or cannot be interpreted as a finite number.
    """
    if text is None:
        return 0
    raw = str(text).strip()
    # Accounting notation: "(1,234)" means -1234. The regex below would
    # otherwise drop the parentheses and silently flip the sign.
    is_parenthesised = len(raw) >= 2 and raw[0] == '(' and raw[-1] == ')'
    # Remove commas, newlines, and other non-numeric characters
    cleaned_text = re.sub(r'[^\d.-]', '', raw)
    if not cleaned_text or cleaned_text == '-':
        return 0
    try:
        # Convert to float first to handle decimals, then to int.
        # OverflowError covers digit strings so long that float() yields inf.
        value = int(float(cleaned_text))
    except (ValueError, TypeError, OverflowError):
        return 0
    return -abs(value) if is_parenthesised else value

In [9]:
# --- 2. THE PARSER: Function to handle the new table format ---
def parse_nl4_new_format(pdf_path):
    """
    Extract Health, Personal Accident, and Travel premium data from a FORM NL-4 PDF
    where headers are at the bottom and span multiple columns.

    The provider name is read from the first page. Each page whose text
    contains both "FORM NL-4" and "PREMIUM SCHEDULE" is then tried in order;
    the first page that yields a reporting period, a table, a header row and a
    'Gross Direct Premium' row produces the result.

    Args:
        pdf_path: Path of the PDF to open with pdfplumber.

    Returns:
        dict | None: A dict with keys provider, year, quarter, health,
        personal_accident, travel, total and source_file, or None when nothing
        could be extracted. For a March (Q4) report the year is the report
        year minus one. A column that could not be mapped in the header row is
        reported as 0. Exceptions raised while reading the PDF are printed and
        turned into a None result rather than propagated.
    """
    provider = "Unknown"
    # Map report month to financial quarter
    month_map = {
        'june': 'Q1', 'jun': 'Q1',
        'september': 'Q2', 'sep': 'Q2',
        'december': 'Q3', 'dec': 'Q3',
        'march': 'Q4', 'mar': 'Q4'
    }
    # Exact header text -> key used in the result dictionary
    header_labels = {
        "Health": 'health',
        "Personal Accident": 'personal_accident',
        "Travel Insurance": 'travel',
        "Total Health": 'total',
    }

    def cell_number(row, col_idx):
        """Cleaned number at col_idx; falls back to clean_value(None) if the column is unmapped or out of range."""
        if col_idx is None or not 0 <= col_idx < len(row):
            return clean_value(None)
        return clean_value(row[col_idx])

    try:
        with pdfplumber.open(pdf_path) as pdf:
            # 1. Extract Provider Name from the first page
            if pdf.pages:
                first_page_text = pdf.pages[0].extract_text(x_tolerance=2) or ""
                # Regex to find company names like 'Aditya Birla Health Insurance Co. Limited'
                m = re.search(r"([A-Za-z\s&.()]+(?:Insurance|Assurance)[\s]+(?:Company\s)?(?:Ltd|Limited)\.?)", first_page_text, re.IGNORECASE)
                if m:
                    provider = " ".join(m.group(1).strip().split())  # Clean up whitespace
                else:  # Fallback: first line that names an insurance company
                    for line in first_page_text.splitlines():
                        if "Insurance" in line and ("Ltd" in line or "Limited" in line):
                            m = re.search(r"([A-Za-z &]+Insurance[^,\n]*)", line)
                            if m:
                                provider = m.group(1).strip()
                                break

            # 2. Find and process the NL-4 page
            for page in pdf.pages:
                page_text = page.extract_text(x_tolerance=2) or ""
                if "FORM NL-4" not in page_text or "PREMIUM SCHEDULE" not in page_text:
                    continue

                # 3. Extract Year and Quarter from the report's title
                year, quarter = None, None
                date_match = re.search(r"For the\s+Quarter\s+([A-Za-z]+),\s*(\d{4})", page_text, re.IGNORECASE)
                if date_match:
                    month_str, year_str = date_match.group(1).lower(), date_match.group(2)
                    year = int(year_str)
                    for key, q_val in month_map.items():
                        if month_str.startswith(key):
                            quarter = q_val
                            break
                    # For financial year, March quarter belongs to the previous calendar year
                    if quarter == 'Q4':
                        year -= 1

                if not year or not quarter:
                    continue  # Skip page if date can't be determined

                # 4. Extract table data from the page
                table = page.extract_table({
                    "vertical_strategy": "lines",
                    "horizontal_strategy": "text",
                    "text_x_tolerance": 2,
                    "text_y_tolerance": 2,
                })
                if not table:
                    continue

                # 5. Find header row and map header text to column indices
                header_indices = {}
                header_row_found = False
                # Search from the bottom of the table upwards
                for r in reversed(table):
                    row_str = " ".join(str(s).replace('\n', ' ') for s in r if s)
                    if "Personal Accident" in row_str and "Total Health" in row_str and "Travel Insurance" in row_str:
                        for i, cell in enumerate(r):
                            if cell:
                                result_key = header_labels.get(cell.replace('\n', ' ').strip())
                                if result_key is not None:
                                    header_indices[result_key] = i

                        # Three of the four columns are enough to accept the row;
                        # the fourth is then reported as 0 by cell_number().
                        if len(header_indices) >= 3:
                            header_row_found = True
                            break

                if not header_row_found:
                    continue  # Headers not found on this page

                # 6. Find the 'Gross Direct Premium' row and extract values
                for r in table:
                    if r and r[0] and "Gross Direct Premium" in str(r[0]):
                        # The mapped index points to the 'For the Quarter' column,
                        # which is the first of each pair.
                        return {
                            "provider": provider,
                            "year": year,
                            "quarter": quarter,
                            "health": cell_number(r, header_indices.get('health')),
                            "personal_accident": cell_number(r, header_indices.get('personal_accident')),
                            "travel": cell_number(r, header_indices.get('travel')),
                            "total": cell_number(r, header_indices.get('total')),  # 'Total Health' column
                            "source_file": os.path.basename(pdf_path)
                        }
                # If loop finishes, the target row was not found on this page
    except Exception as e:
        print(f"  -> An error occurred while processing {os.path.basename(pdf_path)}: {e}")
    return None


In [10]:
# --- 3. PROCESSING LOOP: Extract data from all PDFs ---
# Accumulators: one dict per parsed PDF, plus the names of PDFs that yielded nothing.
records, failed_files = [], []

# Resolve the input folder. When <cwd>/data is missing, fall back to the
# current directory only if the sample attachment is present there.
if not os.path.isdir(DATA_DIR):
    print(f"❌ Error: Data directory not found at '{DATA_DIR}'")
    if not os.path.exists("Q1-2025-5.pdf"):
        print("Please create a 'data' directory and place your PDF files inside it.")
    else:
        DATA_DIR = "."

if os.path.isdir(DATA_DIR):
    pdf_files = sorted(glob.glob(os.path.join(DATA_DIR, "*.pdf")))
    if len(pdf_files) == 0:
        print(f"No PDF files found in '{DATA_DIR}'.")

    for pdf_file in pdf_files:
        print(f"📄 Processing: {os.path.basename(pdf_file)}")
        result = parse_nl4_new_format(pdf_file)
        if not result:
            # Parser returned None: remember the file name for the summary below.
            print(f"❌ Failed to extract data from: {os.path.basename(pdf_file)}")
            failed_files.append(os.path.basename(pdf_file))
            continue
        print(f"✅ Extracted: {result['provider']}, {result['year']} {result['quarter']}")
        records.append(result)

    # Run summary
    print(f"\nTotal successfully extracted: {len(records)}")
    print(f"Total failed: {len(failed_files)}")
    if failed_files:
        print("Failed files:", failed_files)

📄 Processing: Q1 2024.pdf
❌ Failed to extract data from: Q1 2024.pdf
📄 Processing: Q1 2025.pdf
❌ Failed to extract data from: Q1 2025.pdf
📄 Processing: Q1 FY 23.pdf
❌ Failed to extract data from: Q1 FY 23.pdf
📄 Processing: Q2 2024.pdf
❌ Failed to extract data from: Q2 2024.pdf
📄 Processing: Q2 2025.pdf
❌ Failed to extract data from: Q2 2025.pdf
📄 Processing: Q2 FY 23.pdf
❌ Failed to extract data from: Q2 FY 23.pdf
📄 Processing: Q3 2024.pdf
❌ Failed to extract data from: Q3 2024.pdf
📄 Processing: Q3 2025.pdf
❌ Failed to extract data from: Q3 2025.pdf
📄 Processing: Q3 FY 23.pdf
❌ Failed to extract data from: Q3 FY 23.pdf
📄 Processing: Q4 2024.pdf
❌ Failed to extract data from: Q4 2024.pdf
📄 Processing: Q4 2025.pdf


KeyboardInterrupt: 

In [11]:
import os
import pandas as pd
import pdfplumber
import re

# --- 1. SETUP: Define your data and output directories ---
# Directory the kernel is running in; all other paths are derived from it.
BASE_DIR = os.getcwd() 
# Folder the PDF to debug is looked up in (<cwd>/data).
DATA_DIR = os.path.join(BASE_DIR, "data")

# --- Helper function for cleaning numeric values ---
def clean_value(text):
    """
    Convert a table-cell string into an integer amount.

    Thousands separators, whitespace, newlines and any other non-numeric
    characters are removed before parsing. A value wrapped in parentheses,
    e.g. "(1,234)", is treated as negative (accounting notation). Decimals
    are truncated toward zero.

    Args:
        text: Cell content. Any object is accepted; it is passed through str().

    Returns:
        int: The parsed amount, or 0 when the cell is None, empty, a lone
        dash, or cannot be interpreted as a finite number.
    """
    if text is None:
        return 0
    raw = str(text).strip()
    # Accounting notation: "(1,234)" means -1234. The regex below would
    # otherwise drop the parentheses and silently flip the sign.
    is_parenthesised = len(raw) >= 2 and raw[0] == '(' and raw[-1] == ')'
    cleaned_text = re.sub(r'[^\d.-]', '', raw)
    if not cleaned_text or cleaned_text == '-':
        return 0
    try:
        # float first so decimals parse; OverflowError covers digit strings
        # so long that float() yields inf.
        value = int(float(cleaned_text))
    except (ValueError, TypeError, OverflowError):
        return 0
    return -abs(value) if is_parenthesised else value

# --- 2. THE DEBUGGING FUNCTION ---
def debug_pdf_extraction(pdf_path):
    """
    Run the NL-4 extraction step by step and print diagnostics for each stage.

    Stages: provider name (first page), locating the first page whose text
    contains both "FORM NL-4" and "PREMIUM SCHEDULE", reporting period, table
    extraction, header-row mapping, 'Gross Direct Premium' row lookup and the
    final cell values. Only the first matching page is examined.

    Args:
        pdf_path: Path of the PDF to inspect.

    Returns:
        None. All findings are printed; exceptions raised while reading the
        PDF are caught and reported rather than propagated.
    """
    print("="*50)
    print(f"🕵️  STARTING DEBUG FOR: {os.path.basename(pdf_path)}")
    print("="*50)

    if not os.path.exists(pdf_path):
        print(f"❌ ERROR: File not found at '{pdf_path}'. Please check the path and filename.")
        return

    # Month -> financial quarter. Abbreviations are included so that titles
    # such as "Jun" or "Sept" resolve as well as the full month names.
    month_map = {
        'june': 'Q1', 'jun': 'Q1',
        'september': 'Q2', 'sep': 'Q2',
        'december': 'Q3', 'dec': 'Q3',
        'march': 'Q4', 'mar': 'Q4'
    }
    # (label printed in Step 7, key in header_indices)
    value_columns = (
        ("Health", 'health'),
        ("Personal Accident", 'personal_accident'),
        ("Travel", 'travel'),
        ("Total", 'total'),
    )

    try:
        with pdfplumber.open(pdf_path) as pdf:
            # --- Step 1: Provider Name Extraction ---
            print("\n--- [Step 1: Extracting Provider Name] ---")
            provider = "Unknown"
            if pdf.pages:
                first_page_text = pdf.pages[0].extract_text(x_tolerance=2) or ""
                print(f"  - Text from first page (first 300 chars):\n---\n{first_page_text[:300]}\n---")
                m = re.search(r"([A-Za-z\s&.()]+(?:Insurance|Assurance)[\s]+(?:Company\s)?(?:Ltd|Limited)\.?)", first_page_text, re.IGNORECASE)
                if m:
                    provider = " ".join(m.group(1).strip().split())
                    print(f"  ✅ Provider found via regex: '{provider}'")
                else:
                    print("  - Regex failed. Falling back to line-by-line search.")
                    for line in first_page_text.splitlines():
                        if "Insurance" in line and ("Ltd" in line or "Limited" in line):
                            provider = line.strip()
                            print(f"  ✅ Provider found via fallback: '{provider}'")
                            break
            if provider == "Unknown":
                print("  ❌ WARNING: Could not determine provider name.")

            # --- Step 2: Finding the Correct Page ---
            print("\n--- [Step 2: Finding 'FORM NL-4 PREMIUM SCHEDULE' Page] ---")
            nl4_page_found = False
            for i, page in enumerate(pdf.pages):
                print(f"  - Scanning Page {i+1}...")
                page_text = page.extract_text(x_tolerance=2, layout=True) or ""
                if "FORM NL-4" in page_text and "PREMIUM SCHEDULE" in page_text:
                    nl4_page_found = True
                    print(f"  ✅ Found 'FORM NL-4' on Page {i+1}.")

                    # --- Step 3: Date Extraction ---
                    print("\n--- [Step 3: Extracting Quarter and Year] ---")
                    year, quarter = None, None
                    date_match = re.search(r"For\s+the\s+Quarter\s+(?:Ended\s*)?(?:on\s|at\s)?(?:the\s)?\d{0,2}(?:st|nd|rd|th)?\s*([A-Za-z]+)[, ]+\s*(\d{4})", page_text, re.IGNORECASE)
                    if date_match:
                        month_str, year_str = date_match.group(1).lower(), date_match.group(2)
                        print(f"  - Date regex match found: Month='{month_str}', Year='{year_str}'")
                        year = int(year_str)
                        for key, q_val in month_map.items():
                            if month_str.startswith(key):
                                quarter = q_val
                                break
                        print(f"  ✅ Parsed as: Year={year}, Quarter={quarter}")
                    else:
                        print("  ❌ ERROR: Could not find the date string on the page.")

                    # --- Step 4: Table Extraction ---
                    print("\n--- [Step 4: Extracting Table from Page] ---")
                    table = page.extract_table({
                        "vertical_strategy": "lines", "horizontal_strategy": "text",
                        "text_x_tolerance": 2, "text_y_tolerance": 2,
                    })
                    if not table:
                        print("  ❌ ERROR: pdfplumber.extract_table() returned None. No table found with current settings.")
                    else:
                        print(f"  ✅ Table found with {len(table)} rows.")
                        print("  - Printing full extracted table for review:")
                        # Use pandas to display the table cleanly
                        df_display = pd.DataFrame(table)
                        print(df_display.to_string())

                        # --- Step 5: Header Row Detection ---
                        print("\n--- [Step 5: Finding Header Row] ---")
                        header_indices = {}
                        header_row_found = False
                        # Once an exact "Total Health" cell is seen, looser
                        # "...Total..." cells must not overwrite it.
                        total_is_exact = False
                        for r_idx, r in reversed(list(enumerate(table))):
                            row_str = " ".join(filter(None, [str(s).replace('\n', ' ') for s in r]))
                            if "Personal Accident" in row_str and "Travel" in row_str and "Health" in row_str:
                                print(f"  - Potential header found in table row {r_idx}: '{row_str}'")
                                for c_idx, cell in enumerate(r):
                                    if cell:
                                        clean_cell = cell.replace('\n', ' ').strip()
                                        if "Health" == clean_cell:
                                            header_indices['health'] = c_idx
                                        elif "Personal Accident" == clean_cell:
                                            header_indices['personal_accident'] = c_idx
                                        elif "Travel Insurance" == clean_cell:
                                            header_indices['travel'] = c_idx
                                        elif clean_cell == "Total Health":
                                            header_indices['total'] = c_idx
                                            total_is_exact = True
                                        elif "Total" in clean_cell and not total_is_exact:
                                            header_indices['total'] = c_idx
                                if 'health' in header_indices and 'personal_accident' in header_indices:
                                    header_row_found = True
                                    print(f"  ✅ Header indices mapped: {header_indices}")
                                    break
                        if not header_row_found:
                            print("  ❌ ERROR: Could not find the header row containing 'Health', 'Personal Accident', etc.")
                            if header_indices:
                                print(f"  - Partial mapping only: {header_indices}")

                        # --- Step 6: Data Row Detection ---
                        print("\n--- [Step 6: Finding 'Gross Direct Premium' Data Row] ---")
                        data_row_found = False
                        if header_indices:
                            for r_idx, r in enumerate(table):
                                if r and any(cell and "Gross Direct Premium" in str(cell) for cell in r):
                                    data_row_found = True
                                    print(f"  ✅ Found 'Gross Direct Premium' in table row {r_idx}.")
                                    print(f"  - Full data row content: {r}")

                                    # --- Step 7: Final Value Extraction ---
                                    print("\n--- [Step 7: Extracting Final Values] ---")
                                    for label, key in value_columns:
                                        col_idx = header_indices.get(key)
                                        # An unmapped column must not abort the whole trace.
                                        if col_idx is None or not 0 <= col_idx < len(r):
                                            print(f"    - {label}: ❌ column not mapped in header row (index={col_idx})")
                                            continue
                                        raw_cell = r[col_idx]
                                        print(f"    - {label}: '{raw_cell}' -> {clean_value(raw_cell)}")
                                    break
                            if not data_row_found:
                                print("  ❌ ERROR: Could not find the 'Gross Direct Premium' data row in the table.")
                    break # Stop after finding the first NL-4 page
            if not nl4_page_found:
                print("  ❌ ERROR: No page containing 'FORM NL-4' and 'PREMIUM SCHEDULE' was found in the entire document.")

    except Exception as e:
        print(f"\n🚨 A FATAL SCRIPT ERROR OCCURRED: {e}")

    print("\n" + "="*50)
    print("🕵️  DEBUGGING COMPLETE")
    print("="*50)


# --- 3. EXECUTION: Set the PDF file you want to debug here ---
# Driver for the diagnostic trace: runs debug_pdf_extraction() on a single PDF.
if __name__ == "__main__":
    # ▼▼▼ CHANGE THIS FILENAME TO THE PDF YOU WANT TO TEST ▼▼▼
    # File name only; it is looked up inside DATA_DIR below.
    PDF_TO_DEBUG = "Q1 2024.pdf"
    
    # Construct the full path to the PDF inside the 'data' directory
    full_pdf_path = os.path.join(DATA_DIR, PDF_TO_DEBUG)
    
    # Prints the step-by-step trace; nothing is returned or stored.
    debug_pdf_extraction(full_pdf_path)


🕵️  STARTING DEBUG FOR: Q1 2024.pdf

--- [Step 1: Extracting Provider Name] ---
  - Text from first page (first 300 chars):
---
Applicability
S.No. Form No Description General Indian Branches of
& Health Reinsurer Foreign
Insurers Reinsurer in
India
1 NL-1-B-RA Revenue Account YES YES YES
2 NL-2-B-PL Profit and Loss Account YES YES YES
3 NL-3-B-BS Balance Sheet YES YES NO
4 NL-4-PREMIUM SCHEDULE Premium YES YES YES
5 NL-5-CL
---
  - Regex failed. Falling back to line-by-line search.

--- [Step 2: Finding 'FORM NL-4 PREMIUM SCHEDULE' Page] ---
  - Scanning Page 1...
  - Scanning Page 2...
  - Scanning Page 3...
  - Scanning Page 4...
  - Scanning Page 5...
  ✅ Found 'FORM NL-4' on Page 5.

--- [Step 3: Extracting Quarter and Year] ---
  ❌ ERROR: Could not find the date string on the page.

--- [Step 4: Extracting Table from Page] ---
  ✅ Table found with 1 rows.
  - Printing full extracted table for review:
                                                                                

In [12]:
import os
import glob
import pandas as pd
import pdfplumber
import re

# --- 1. SETUP: Define your data and output directories ---
# Everything is resolved relative to the directory the kernel is running in:
# PDFs are read from <cwd>/data and the CSV is written to <cwd>/output.
BASE_DIR = os.getcwd()
DATA_DIR, OUT_DIR = (
    os.path.join(BASE_DIR, "data"),
    os.path.join(BASE_DIR, "output"),
)

# --- Helper function for cleaning numeric values ---
def clean_value(text):
    """
    Convert a table-cell string into an integer amount.

    Thousands separators, whitespace, newlines and any other non-numeric
    characters are removed before parsing. A value wrapped in parentheses,
    e.g. "(1,234)", is treated as negative (accounting notation). Decimals
    are truncated toward zero.

    Args:
        text: Cell content. Any object is accepted; it is passed through str().

    Returns:
        int: The parsed amount, or 0 when the cell is None, empty, a lone
        dash, or cannot be interpreted as a finite number.
    """
    if text is None:
        return 0
    raw = str(text).strip()
    # Accounting notation: "(1,234)" means -1234. The regex below would
    # otherwise drop the parentheses and silently flip the sign.
    is_parenthesised = len(raw) >= 2 and raw[0] == '(' and raw[-1] == ')'
    cleaned_text = re.sub(r'[^\d.-]', '', raw)
    if not cleaned_text or cleaned_text == '-':
        return 0
    try:
        # float first so decimals parse; OverflowError covers digit strings
        # so long that float() yields inf.
        value = int(float(cleaned_text))
    except (ValueError, TypeError, OverflowError):
        return 0
    return -abs(value) if is_parenthesised else value

# --- 2. THE FINAL PARSER: With robust provider and data extraction ---
def parse_nl4_final_robust(pdf_path):
    """
    Extract premium data from a FORM NL-4 PDF.

    Pages are scanned in order for one whose text contains both "FORM NL-4"
    and "PREMIUM SCHEDULE". The provider name, reporting period, header
    mapping and the 'Gross Direct Premium' row are all read from that page.
    A matching page without a date, table or header row is skipped and the
    next page is tried; a page with a header row but no data row ends the
    search.

    Args:
        pdf_path: Path of the PDF to open with pdfplumber.

    Returns:
        dict | None: A dict with keys provider, year, quarter, health,
        personal_accident, travel, total and source_file, or None when nothing
        could be extracted. The 'total' column is optional: when no such
        header is mapped the value is reported as 0. Exceptions raised while
        reading the PDF are printed and turned into a None result.
    """
    provider = "Unknown"
    month_map = {
        'june': 'Q1', 'jun': 'Q1',
        'september': 'Q2', 'sep': 'Q2',
        'december': 'Q3', 'dec': 'Q3',
        'march': 'Q4', 'mar': 'Q4'
    }

    def cell_number(row, col_idx):
        """Cleaned number at col_idx; falls back to clean_value(None) if the column is unmapped or out of range."""
        if col_idx is None or not 0 <= col_idx < len(row):
            return clean_value(None)
        return clean_value(row[col_idx])

    try:
        with pdfplumber.open(pdf_path) as pdf:
            # Find and process the NL-4 page
            for page in pdf.pages:
                page_text = page.extract_text(x_tolerance=2, layout=True) or ""
                # Check if this is the correct page
                if "FORM NL-4" not in page_text or "PREMIUM SCHEDULE" not in page_text:
                    continue

                # --- STEP 1: Extract Provider Name from the NL-4 page itself ---
                # First, try to find a line like "Name of Insurer : [Name]"
                provider_match = re.search(r"Name\s+of\s+(?:the\s+)?Insurer\s*[:\s]+\s*(.+)", page_text, re.IGNORECASE)
                if provider_match:
                    provider = provider_match.group(1).strip()
                else:
                    # Fallback: Search the first few lines of the page for a company name
                    for line in page_text.splitlines()[:5]:  # Check top 5 lines
                        if "Insurance" in line and ("Ltd" in line or "Limited" in line):
                            provider = line.strip()
                            break

                # --- STEP 2: Extract Year and Quarter ---
                year, quarter = None, None
                date_match = re.search(r"For\s+the\s+Quarter\s+(?:Ended\s*)?(?:on\s|at\s)?(?:the\s)?\d{0,2}(?:st|nd|rd|th)?\s*([A-Za-z]+)[, ]+\s*(\d{4})", page_text, re.IGNORECASE)
                if date_match:
                    month_str, year_str = date_match.group(1).lower(), date_match.group(2)
                    year = int(year_str)
                    for key, q_val in month_map.items():
                        if month_str.startswith(key):
                            quarter = q_val
                            break
                    # NOTE(review): the year is kept as printed in the report title,
                    # also for March (Q4) quarters. parse_nl4_new_format subtracts one
                    # for Q4 — confirm which convention the CSV consumers expect.

                if not year or not quarter:
                    # If date not found on this page, something is wrong, skip page
                    continue

                # --- STEP 3: Extract Table Data ---
                table = page.extract_table({
                    "vertical_strategy": "lines", "horizontal_strategy": "text",
                    "text_x_tolerance": 2, "text_y_tolerance": 2,
                })
                if not table:
                    continue

                # --- STEP 4: Find Header Row and Map Indices ---
                header_indices = {}
                header_row_found = False
                # Once an exact "Total Health" cell is seen, looser "...Total..."
                # cells (e.g. a grand total) must not overwrite it.
                total_is_exact = False
                for r in reversed(table):
                    # Combine all cells in a row to a single string for easy searching
                    row_str = " ".join(str(s).replace('\n', ' ') for s in r if s)
                    if "Personal Accident" in row_str and "Travel" in row_str and "Health" in row_str:
                        for i, cell in enumerate(r):
                            if cell:
                                clean_cell = cell.replace('\n', ' ').strip()
                                # Map the exact header text to its column index
                                if clean_cell == "Health":
                                    header_indices['health'] = i
                                elif clean_cell == "Personal Accident":
                                    header_indices['personal_accident'] = i
                                elif clean_cell == "Travel Insurance":
                                    header_indices['travel'] = i
                                elif clean_cell == "Total Health":
                                    header_indices['total'] = i
                                    total_is_exact = True
                                elif "Total" in clean_cell and not total_is_exact:
                                    header_indices['total'] = i

                        if 'health' in header_indices and 'personal_accident' in header_indices and 'travel' in header_indices:
                            header_row_found = True
                            break

                if not header_row_found:
                    continue

                # --- STEP 5: Find Data Row and Extract Values ---
                for r in table:
                    if r and any(cell and "Gross Direct Premium" in str(cell) for cell in r):
                        # Once found, return the result and stop processing this PDF
                        return {
                            "provider": provider,
                            "year": year,
                            "quarter": quarter,
                            "health": cell_number(r, header_indices.get('health')),
                            "personal_accident": cell_number(r, header_indices.get('personal_accident')),
                            "travel": cell_number(r, header_indices.get('travel')),
                            "total": cell_number(r, header_indices.get('total')),
                            "source_file": os.path.basename(pdf_path)
                        }

                # If we found the NL-4 page but not the data, we can stop searching
                break

    except Exception as e:
        print(f"  -> An error occurred while processing {os.path.basename(pdf_path)}: {e}")

    # Return None if no data was successfully extracted from the PDF
    return None

# --- 3. PROCESSING LOOP: Extract data from all PDFs ---
# Accumulators: one dict per parsed PDF, plus the names of PDFs that yielded nothing.
records, failed_files = [], []

if os.path.isdir(DATA_DIR):
    pdf_files = sorted(glob.glob(os.path.join(DATA_DIR, "*.pdf")))
    if len(pdf_files) == 0:
        print(f"No PDF files found in '{DATA_DIR}'.")

    for pdf_file in pdf_files:
        print(f"📄 Processing: {os.path.basename(pdf_file)}")
        result = parse_nl4_final_robust(pdf_file)
        if not result:
            # Parser returned None: remember the file name for the summary below.
            print(f"❌ Failed to extract data from: {os.path.basename(pdf_file)}")
            failed_files.append(os.path.basename(pdf_file))
            continue
        print(f"✅ Extracted: {result['provider']}, {result['year']} {result['quarter']}")
        records.append(result)

    # Run summary; the failure lines appear only when something failed.
    print(f"\nTotal successfully extracted: {len(records)}")
    if failed_files:
        print(f"Total failed: {len(failed_files)}")
        print("Failed files:", failed_files)
else:
    print(f"❌ Error: Data directory not found at '{DATA_DIR}'")
    print("Please create a 'data' directory and place your PDF files inside it.")

# --- 4. DATAFRAME CREATION AND SAVING ---
if not records:
    print("\nNo data was extracted, so no file was saved.")
else:
    # Fixed column layout of the exported CSV.
    column_order = [
        'provider', 'year', 'quarter',
        'health', 'personal_accident', 'travel', 'total', 'source_file'
    ]
    df = pd.DataFrame(records)[column_order]

    # Create the output folder on first use, then write the summary without the index column.
    os.makedirs(OUT_DIR, exist_ok=True)
    out_csv = os.path.join(OUT_DIR, "premium_summary_final.csv")
    df.to_csv(out_csv, index=False)

    print(f"\n✅ Data successfully saved to CSV: {out_csv}")
    print("\n--- DataFrame Preview ---")
    print(df.head())


📄 Processing: Q1 2024.pdf
❌ Failed to extract data from: Q1 2024.pdf
📄 Processing: Q1 2025.pdf
❌ Failed to extract data from: Q1 2025.pdf
📄 Processing: Q1 FY 23.pdf
❌ Failed to extract data from: Q1 FY 23.pdf
📄 Processing: Q2 2024.pdf
❌ Failed to extract data from: Q2 2024.pdf
📄 Processing: Q2 2025.pdf
❌ Failed to extract data from: Q2 2025.pdf
📄 Processing: Q2 FY 23.pdf
❌ Failed to extract data from: Q2 FY 23.pdf
📄 Processing: Q3 2024.pdf
❌ Failed to extract data from: Q3 2024.pdf
📄 Processing: Q3 2025.pdf
❌ Failed to extract data from: Q3 2025.pdf
📄 Processing: Q3 FY 23.pdf
❌ Failed to extract data from: Q3 FY 23.pdf
📄 Processing: Q4 2024.pdf
❌ Failed to extract data from: Q4 2024.pdf
📄 Processing: Q4 2025.pdf
❌ Failed to extract data from: Q4 2025.pdf
📄 Processing: Q4 FY 23.pdf
❌ Failed to extract data from: Q4 FY 23.pdf

Total successfully extracted: 0
Total failed: 12
Failed files: ['Q1 2024.pdf', 'Q1 2025.pdf', 'Q1 FY 23.pdf', 'Q2 2024.pdf', 'Q2 2025.pdf', 'Q2 FY 23.pdf', 'Q3 202