<a href="https://colab.research.google.com/github/silvia-j-escobar/ExternDataScience/blob/main/Extract_Key_Fields_from_the_Loan_Worksheet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [58]:
# Silvia J Escobar Zetino
# Ensure PyMuPDF is installed and available for this cell
!pip install --quiet pymupdf
import fitz # Import fitz here to ensure it's defined in this cell's scope
from google.colab import files # Import files for uploading

print("Attempting to extract Loan Program from general text...")

# Re-upload the file to ensure it's present in the environment
print("Please upload 'LenderFeesWorksheetNew (1).pdf' again if prompted:")
uploaded = files.upload()

# Get the filename from the uploaded dictionary
if uploaded:
    pdf_filename = list(uploaded.keys())[0]
else:
    print("No file was uploaded. Please upload the PDF to proceed.")
    pdf_filename = None # Or raise an error, or handle appropriately

if pdf_filename:
    doc = fitz.open(pdf_filename)
    loan_programs_found = []

    keywords = [
        "Loan Program:",
        "Loan Type:",
        "Program:",
        "FHA",
        "VA",
        "USDA",
        "Conventional",
        "Jumbo",
        "Refinance", # Could be a program type often explicitly stated
        "Purchase"   # Could be a program type often explicitly stated
    ]

    # Case-insensitive search
    keywords_lower = [k.lower() for k in keywords]

    for page_num in range(len(doc)):
        page = doc[page_num]
        text = page.get_text("text")
        text_lower = text.lower()

        for kw_idx, keyword_search in enumerate(keywords_lower):
            start = 0
            while True:
                idx = text_lower.find(keyword_search, start)
                if idx == -1:
                    break

                # Extract surrounding text for context or value
                original_keyword = keywords[kw_idx] # Get original casing for printing

                if original_keyword.endswith(":"):
                    # Try to extract the value after the colon
                    value_start = idx + len(keyword_search)

                    # Find the end of the line or next significant text for the value
                    # Look for end of line, or another keyword, or just take a limited length
                    end_of_value = text.find("\n", value_start)
                    if end_of_value == -1: # No newline, take till end of text
                        end_of_value = len(text)

                    # Limit the length of extracted value to avoid capturing too much irrelevant text
                    max_value_length = 100
                    extracted_value = text[value_start:min(end_of_value, value_start + max_value_length)].strip()

                    # Clean up extracted value (remove extra newlines, multiple spaces)
                    extracted_value = ' '.join(extracted_value.split())

                    if extracted_value:
                        loan_programs_found.append(f"Page {page_num + 1}: Found '{original_keyword}' with value: '{extracted_value}'")
                else:
                    # For standalone keywords like FHA, VA, etc., just report its presence with context
                    context_window = 100 # characters before and after
                    context_start = max(0, idx - context_window)
                    context_end = min(len(text), idx + len(keyword_search) + context_window)
                    context = text[context_start:context_end]

                    # Clean up context for better readability
                    context = ' '.join(context.split())

                    # Fixed syntax error here: changed "...{context}..." to '...{context}...'
                    loan_programs_found.append(f"Page {page_num + 1}: Found '{original_keyword}' in context: '...{context}...'")

                start = idx + len(keyword_search) # Continue search after the current match

    doc.close()

    if loan_programs_found:
        print("\nPotential Loan Programs / Loan Types identified:")
        for lp in loan_programs_found:
            print(lp)
    else:
        print("\nNo explicit 'Loan Program' or common loan type keywords found in the document's general text.")
        print("This information might be present in a less structured format or implied, or the keywords need refinement.")


Attempting to extract Loan Program from general text...
Please upload 'LenderFeesWorksheetNew (1).pdf' again if prompted:


Saving LenderFeesWorksheetNew (1).pdf to LenderFeesWorksheetNew (1) (4).pdf

Potential Loan Programs / Loan Types identified:
Page 1: Found 'FHA' in context: '...l Estate Taxes Mortgage Insurance Homeowner Assn. Dues Other * PFC F POC = Prepaid Finance Charge = FHA Allowable Closing Cost = Paid Outside of Closing ** B S Br L TP C = Borrower = Seller = Broker = Le...'
Page 1: Found 'Purchase' in context: '...FUNDS NEEDED TO CLOSE: TOTAL ESTIMATED MONTHLY PAYMENT: Total Estimated Funds Total Monthly Payment Purchase Price (+) Alterations (+) Land (+) Refi (incl. debts to be paid off) (+) Est. Prepaid Items/Reserve...'
Page 1: Found 'Purchase' in context: '...ws.frm (09/2015) FEES WORKSHEET John Q. Smith / Mary A. Smith samplesmith 10/05/2015 30 YEAR FIXED -Purchase XYZ Lender $ 380,000 4.250 % 360 / 360 mths 475,000.00 1,121.53 4,520.00 380,000.00 Cash Deposit 5,...'


In [59]:
# Ensure PyMuPDF is installed and available
!pip install --quiet pymupdf
import fitz
import re # For regular expressions to find patterns
from google.colab import files # Import files for uploading (if needed)

print("Attempting to extract Interest Rate...")

# Get the PDF filename from the 'uploaded' dictionary, which should be available
# from the previous successful file upload in cell BR9wkiVN-SsR.
# If 'uploaded' is not in scope (e.g., if this cell is run independently and kernel reset),
# it will prompt for re-upload for robustness.
if 'uploaded' not in globals() or not uploaded:
    print("Please upload 'LenderFeesWorksheetNew (1).pdf' to proceed:")
    uploaded = files.upload()

pdf_filename = None
if uploaded:
    pdf_filename = list(uploaded.keys())[0]
else:
    print("No file was uploaded. Cannot proceed with interest rate extraction.")

if pdf_filename:
    doc = fitz.open(pdf_filename)
    interest_rates_found = []

    # Regex patterns to look for numbers followed by a percentage sign
    # This captures the numerical part, allowing for 1 to 3 decimal places or integers.
    percentage_patterns = [
        r'(\d+\.\d{1,3})\s*%',  # Captures 1-3 decimal places: e.g., "4.250 %" -> captures "4.250"
        r'(\d+)\s*%'        # Captures integers: e.g., "5 %" -> captures "5"
    ]

    # Keywords that might precede or be near an interest rate, case-insensitive
    rate_keywords_context = ['rate', 'APR', 'interest', 'fixed', 'annual percentage rate']

    for page_num in range(len(doc)):
        page = doc[page_num]
        text = page.get_text("text")
        text_lower = text.lower()

        # Search for direct percentage values
        for pattern in percentage_patterns:
            matches = re.finditer(pattern, text)
            for match in matches:
                rate_value_str = match.group(1) # The captured number (e.g., "4.250")
                start_idx = match.start()
                end_idx = match.end()

                # Get context around the percentage to determine if it's an interest rate
                context_window = 60 # characters before and after the match
                context_start = max(0, start_idx - context_window)
                context_end = min(len(text), end_idx + context_window)
                context_raw = text[context_start:context_end]
                context_cleaned = ' '.join(context_raw.split()) # Clean up whitespace for readability

                is_interest_rate_context = False
                for kw in rate_keywords_context:
                    if kw.lower() in context_cleaned.lower():
                        is_interest_rate_context = True
                        break

                try:
                    numeric_rate = float(rate_value_str)
                    # Heuristic: Filter out rates that are unusually high or low for a typical loan interest rate.
                    # This can be adjusted based on expected rate ranges.
                    if 0.1 <= numeric_rate <= 20.0:
                        interest_rates_found.append({
                            'page': page_num + 1,
                            'rate_value': numeric_rate,
                            'context': context_cleaned,
                            'likely_interest': is_interest_rate_context
                        })
                except ValueError:
                    pass # Not a clean float, ignore this match

    doc.close()

    if interest_rates_found:
        print("\nPotential Interest Rates identified:")
        # Sort by likelihood (likely rates first) and then by rate value for consistent output
        interest_rates_found.sort(key=lambda x: (not x['likely_interest'], x['rate_value']))

        for item in interest_rates_found:
            likelihood_label = "Likely" if item['likely_interest'] else "Possible"
            print(f"Page {item['page']}: {likelihood_label} interest rate: {item['rate_value']:.3f}% (Context: '...{item['context']}...')")

        # Optionally, print the single most likely rate if a strong candidate is found
        most_likely_rate = next((item for item in interest_rates_found if item['likely_interest']), None)
        if most_likely_rate:
            print(f"\nMost likely interest rate identified: {most_likely_rate['rate_value']:.3f}% on Page {most_likely_rate['page']}")
        elif interest_rates_found:
            # If no 'likely' rates but some 'possible' ones, suggest the first one found.
            print(f"\nNo rates found with strong context, but a possible rate identified: {interest_rates_found[0]['rate_value']:.3f}% on Page {interest_rates_found[0]['page']}")

    else:
        print("\nNo explicit interest rates found using the defined patterns and heuristics.")
        print("The information might be present in a less structured format or the patterns need refinement.")


Attempting to extract Interest Rate...

Potential Interest Rates identified:
Page 1: Likely interest rate: 4.250% (Context: '...ith 10/05/2015 30 YEAR FIXED -Purchase XYZ Lender $ 380,000 4.250 % 360 / 360 mths 475,000.00 1,121.53 4,520.00 380,000.00 Cash...')

Most likely interest rate identified: 4.250% on Page 1


In [60]:
import fitz
import re
from google.colab import files

print("Attempting to extract Loan Program with bounding boxes...")

# Ensure pdf_filename is available
if 'uploaded' not in globals() or not uploaded:
    print("Please upload 'LenderFeesWorksheetNew (1).pdf' to proceed:")
    uploaded = files.upload()

pdf_filename = None
if uploaded:
    pdf_filename = list(uploaded.keys())[0]
else:
    print("No file was uploaded. Cannot proceed with loan program extraction.")

loan_programs_found_with_bbox = []

if pdf_filename:
    doc = fitz.open(pdf_filename)

    # Keywords to search for, including variations and followed by colon
    # Prioritize exact phrases or patterns that indicate a program name
    loan_program_patterns = [
        r'Loan Program:\s*(.+)', # Captures text after "Loan Program:"
        r'Loan Type:\s*(.+)',    # Captures text after "Loan Type:"
        r'Program:\s*(.+)',       # Captures text after "Program:"
        r'FHA', r'VA', r'USDA', r'Conventional', r'Jumbo',
        r'Refinance', r'Purchase'
    ]

    for page_num in range(len(doc)):
        page = doc[page_num]
        text = page.get_text("text")

        for pattern in loan_program_patterns:
            # Use re.IGNORECASE for case-insensitive matching
            # For patterns with capture groups, we want the captured text
            # For standalone keywords, we want the keyword itself
            for match in re.finditer(pattern, text, re.IGNORECASE):
                if match.groups():
                    # If there's a capture group, extract the content
                    # and trim it to a reasonable length or until a newline/significant separator
                    extracted_value = match.group(1).split('\n')[0].strip()
                    matched_text = f"{pattern.split('(')[0].strip()} {extracted_value}" # Reconstruct for search_for
                    # For search_for, we need the exact text as it appears in the PDF for precise bbox.
                    # Let's refine this to search for the *value* if the pattern was a 'key: value' type.
                    # Or, if it's a standalone keyword, search for the keyword itself.
                    search_term = extracted_value if match.group(1) else match.group(0).strip()
                else:
                    # For direct keyword matches without a capture group
                    search_term = match.group(0).strip()

                # Use page.search_for to get bounding boxes of the matched text
                text_instances = page.search_for(search_term)

                bboxes = []
                if text_instances:
                    for inst in text_instances:
                        # Convert fitz.Rect to a list [x0, y0, x1, y1]
                        bboxes.append([inst.x0, inst.y0, inst.x1, inst.y1])

                if bboxes:
                    loan_programs_found_with_bbox.append({
                        'page': page_num,
                        'text': search_term, # Store the actual search term that yielded the bbox
                        'bbox': bboxes
                    })

    doc.close()

    if loan_programs_found_with_bbox:
        print("\nExtracted Loan Programs with Bounding Boxes:")
        for program in loan_programs_found_with_bbox:
            print(f"Page {program['page']}: '{program['text']}' BBox: {program['bbox']}")
    else:
        print("\nNo explicit loan programs found with the defined patterns.")


Attempting to extract Loan Program with bounding boxes...

Extracted Loan Programs with Bounding Boxes:
Page 0: 'Prepared By:' BBox: [[39.599998474121094, 88.7218246459961, 84.8465576171875, 97.40873718261719]]
Page 0: 'Prepared By:' BBox: [[39.599998474121094, 88.7218246459961, 84.8465576171875, 97.40873718261719]]
Page 0: 'FHA' BBox: [[178.7764892578125, 946.1212768554688, 192.698486328125, 953.920166015625]]
Page 0: 'Purchase' BBox: [[39.599998474121094, 810.0318603515625, 69.66100311279297, 817.990478515625], [535.542236328125, 98.60455322265625, 571.6966552734375, 107.53496551513672]]
Page 0: 'Purchase' BBox: [[39.599998474121094, 810.0318603515625, 69.66100311279297, 817.990478515625], [535.542236328125, 98.60455322265625, 571.6966552734375, 107.53496551513672]]


In [61]:
import fitz
import re
from google.colab import files

print("Attempting to extract Loan Program with bounding boxes...")

# Ensure pdf_filename is available
if 'uploaded' not in globals() or not uploaded:
    print("Please upload 'LenderFeesWorksheetNew (1).pdf' to proceed:")
    uploaded = files.upload()

pdf_filename = None
if uploaded:
    pdf_filename = list(uploaded.keys())[0]
else:
    print("No file was uploaded. Cannot proceed with loan program extraction.")

loan_programs_found_with_bbox = []
seen_entries = set() # To store unique (page, text, bbox_tuple) for deduplication

if pdf_filename:
    doc = fitz.open(pdf_filename)

    # Patterns for "Key: Value" structures (e.g., "Loan Program: Conventional")
    key_value_patterns = [
        r'Loan Program:\s*(.*?)(?:\n|$)', # Capture value after "Loan Program:", non-greedy, till newline or end of string
        r'Loan Type:\s*(.*?)(?:\n|$)'     # Capture value after "Loan Type:"
    ]

    # Standalone keywords (e.g., "FHA", "Purchase")
    standalone_keywords = [
        r'FHA', r'VA', r'USDA', r'Conventional', r'Jumbo',
        r'Refinance', r'Purchase'
    ]

    for page_num in range(len(doc)):
        page = doc[page_num]
        text = page.get_text("text")

        # Process Key:Value patterns
        for pattern in key_value_patterns:
            # Use re.DOTALL to allow '.' to match newlines within the capture group if a value spans multiple lines
            for match in re.finditer(pattern, text, re.IGNORECASE | re.DOTALL):
                full_matched_phrase = match.group(0).strip() # e.g., "Loan Program: 30 YEAR FIXED"
                extracted_value = match.group(1).strip()     # e.g., "30 YEAR FIXED"

                if extracted_value: # Only proceed if a meaningful value is extracted
                    # Search for the *entire* matched phrase to get its bounding box
                    search_term = full_matched_phrase
                    text_instances = page.search_for(search_term)

                    bboxes = []
                    if text_instances:
                        for inst in text_instances:
                            bbox_coords = [inst.x0, inst.y0, inst.x1, inst.y1]
                            # Create a hashable tuple for deduplication
                            entry = (page_num, search_term, tuple(bbox_coords))
                            if entry not in seen_entries:
                                bboxes.append(bbox_coords)
                                seen_entries.add(entry)

                    if bboxes:
                        loan_programs_found_with_bbox.append({
                            'page': page_num,
                            'text': search_term,
                            'bbox': bboxes
                        })

        # Process Standalone Keywords
        for keyword_pattern in standalone_keywords:
            for match in re.finditer(keyword_pattern, text, re.IGNORECASE):
                search_term = match.group(0).strip() # The keyword itself (e.g., "FHA", "Purchase")

                text_instances = page.search_for(search_term)
                bboxes = []
                if text_instances:
                    for inst in text_instances:
                        bbox_coords = [inst.x0, inst.y0, inst.x1, inst.y1]
                        # Create a hashable tuple for deduplication
                        entry = (page_num, search_term, tuple(bbox_coords))
                        if entry not in seen_entries:
                            bboxes.append(bbox_coords)
                            seen_entries.add(entry)

                if bboxes:
                    loan_programs_found_with_bbox.append({
                        'page': page_num,
                        'text': search_term,
                        'bbox': bboxes
                    })

    doc.close()

    if loan_programs_found_with_bbox:
        print("\nExtracted Loan Programs with Bounding Boxes:")
        for program in loan_programs_found_with_bbox:
            print(f"Page {program['page']}: '{program['text']}' BBox: {program['bbox']}")
    else:
        print("\nNo explicit loan programs found with the defined patterns.")

Attempting to extract Loan Program with bounding boxes...

Extracted Loan Programs with Bounding Boxes:
Page 0: 'Loan Program:
Prepared By:' BBox: [[410.2799987792969, 98.80184173583984, 461.87005615234375, 107.48875427246094], [39.599998474121094, 88.7218246459961, 84.8465576171875, 97.40873718261719]]
Page 0: 'FHA' BBox: [[178.7764892578125, 946.1212768554688, 192.698486328125, 953.920166015625]]
Page 0: 'Purchase' BBox: [[39.599998474121094, 810.0318603515625, 69.66100311279297, 817.990478515625], [535.542236328125, 98.60455322265625, 571.6966552734375, 107.53496551513672]]


**Reasoning**:
The previous attempt to extract loan programs produced false positives due to the regex including newlines and non-programmatic text, and improper `search_term` selection. This revision refines the `key_value_patterns` to explicitly avoid matching newlines for the value, filters out non-programmatic extracted values (like 'Prepared By:'), and ensures that `page.search_for()` is used with the actual program text for accurate bounding box retrieval. Deduplication is also maintained to prevent redundant entries.



In [62]:
import fitz
import re
from google.colab import files

print("Attempting to extract Loan Program with bounding boxes (refined)...")

# Ensure pdf_filename is available
if 'uploaded' not in globals() or not uploaded:
    print("Please upload 'LenderFeesWorksheetNew (1).pdf' to proceed:")
    uploaded = files.upload()

pdf_filename = None
if uploaded:
    pdf_filename = list(uploaded.keys())[0]
else:
    print("No file was uploaded. Cannot proceed with loan program extraction.")

loan_programs_found_with_bbox = []
seen_entries = set() # To store unique (page, text, bbox_tuple) for deduplication

if pdf_filename:
    doc = fitz.open(pdf_filename)

    # Patterns for "Key: Value" structures (e.g., "Loan Program: Conventional")
    # Using [^\n]* to capture value only on the same line, no re.DOTALL needed.
    key_value_patterns = [
        r'Loan Program:\s*([^\n]*?)(?:\n|$)', # Capture value after "Loan Program:", non-greedy, till newline or end of string
        r'Loan Type:\s*([^\n]*?)(?:\n|$)'     # Capture value after "Loan Type:"
    ]

    # Standalone keywords (e.g., "FHA", "Purchase")
    standalone_keywords = [
        r'FHA', r'VA', r'USDA', r'Conventional', r'Jumbo',
        r'Refinance', r'Purchase', r'Fixed Rate', r'Adjustable Rate'
    ]

    for page_num in range(len(doc)):
        page = doc[page_num]
        text = page.get_text("text")

        # Process Key:Value patterns
        for pattern in key_value_patterns:
            # Removed re.DOTALL as `[^\n]*?` explicitly handles newlines
            for match in re.finditer(pattern, text, re.IGNORECASE):
                extracted_value = match.group(1).strip()

                # Filter out non-programmatic values and empty strings
                if extracted_value and extracted_value.lower() != 'prepared by:':
                    # Check if the extracted_value itself is a recognized program keyword or phrase
                    is_program_like = False
                    for skw_pattern in standalone_keywords:
                        if re.search(skw_pattern, extracted_value, re.IGNORECASE):
                            is_program_like = True
                            break

                    # Also consider if the full match (key + value) is relevant if the value is generic
                    # For this, it's safer to search for the extracted_value if it's program-like
                    if is_program_like:
                        search_term = extracted_value

                        text_instances = page.search_for(search_term)
                        bboxes = []
                        if text_instances:
                            for inst in text_instances:
                                bbox_coords = [inst.x0, inst.y0, inst.x1, inst.y1]
                                # Create a hashable tuple for deduplication
                                entry = (page_num, search_term, tuple(bbox_coords))
                                if entry not in seen_entries:
                                    bboxes.append(bbox_coords)
                                    seen_entries.add(entry)

                        if bboxes:
                            loan_programs_found_with_bbox.append({
                                'page': page_num,
                                'text': search_term,
                                'bbox': bboxes
                            })

        # Process Standalone Keywords
        for keyword_pattern in standalone_keywords:
            for match in re.finditer(keyword_pattern, text, re.IGNORECASE):
                search_term = match.group(0).strip() # The keyword itself (e.g., "FHA", "Purchase")

                text_instances = page.search_for(search_term)
                bboxes = []
                if text_instances:
                    for inst in text_instances:
                        bbox_coords = [inst.x0, inst.y0, inst.x1, inst.y1]
                        # Create a hashable tuple for deduplication
                        entry = (page_num, search_term, tuple(bbox_coords))
                        if entry not in seen_entries:
                            bboxes.append(bbox_coords)
                            seen_entries.add(entry)

                if bboxes:
                    loan_programs_found_with_bbox.append({
                        'page': page_num,
                        'text': search_term,
                        'bbox': bboxes
                    })

    doc.close()

    if loan_programs_found_with_bbox:
        print("\nExtracted Loan Programs with Bounding Boxes:")
        for program in loan_programs_found_with_bbox:
            print(f"Page {program['page']}: '{program['text']}' BBox: {program['bbox']}")
    else:
        print("\nNo explicit loan programs found with the defined patterns.")


Attempting to extract Loan Program with bounding boxes (refined)...

Extracted Loan Programs with Bounding Boxes:
Page 0: 'FHA' BBox: [[178.7764892578125, 946.1212768554688, 192.698486328125, 953.920166015625]]
Page 0: 'Purchase' BBox: [[39.599998474121094, 810.0318603515625, 69.66100311279297, 817.990478515625], [535.542236328125, 98.60455322265625, 571.6966552734375, 107.53496551513672]]


In [63]:
import fitz
import re
from google.colab import files

print("Attempting to extract Interest Rate with bounding boxes...")

# Ensure pdf_filename is available
if 'uploaded' not in globals() or not uploaded:
    print("Please upload 'LenderFeesWorksheetNew (1).pdf' to proceed:")
    uploaded = files.upload()

pdf_filename = None
if uploaded:
    pdf_filename = list(uploaded.keys())[0]
else:
    print("No file was uploaded. Cannot proceed with interest rate extraction.")

interest_rates_found_with_bbox = []
seen_entries = set() # To store unique (page, text, bbox_tuple) for deduplication

if pdf_filename:
    doc = fitz.open(pdf_filename)

    # Regex patterns to look for numbers followed by a percentage sign
    # This captures the numerical part, allowing for 1 to 3 decimal places or integers.
    percentage_patterns = [
        r'(\d+\.\d{1,3})\s*%',  # Captures 1-3 decimal places: e.g., "4.250 %" -> captures "4.250"
        r'(\d+)\s*%'        # Captures integers: e.g., "5 %" -> captures "5"
    ]

    # Keywords that might precede or be near an interest rate, case-insensitive
    rate_keywords_context = ['rate', 'APR', 'interest', 'fixed', 'annual percentage rate']

    for page_num in range(len(doc)):
        page = doc[page_num]
        text = page.get_text("text")
        text_lower = text.lower()

        # Search for direct percentage values
        for pattern in percentage_patterns:
            for match in re.finditer(pattern, text, re.IGNORECASE):
                rate_value_str = match.group(0).strip() # The entire matched string (e.g., "4.250 %")
                numeric_rate_part = match.group(1) # The captured number (e.g., "4.250")
                start_idx = match.start()
                end_idx = match.end()

                # Get context around the percentage to determine if it's an interest rate
                context_window = 60 # characters before and after the match
                context_start = max(0, start_idx - context_window)
                context_end = min(len(text), end_idx + context_window)
                context_raw = text[context_start:context_end]
                context_cleaned = ' '.join(context_raw.split()) # Clean up whitespace for readability

                is_interest_rate_context = False
                for kw in rate_keywords_context:
                    if kw.lower() in context_cleaned.lower():
                        is_interest_rate_context = True
                        break

                try:
                    numeric_rate = float(numeric_rate_part)
                    # Heuristic: Filter out rates that are unusually high or low for a typical loan interest rate.
                    # This can be adjusted based on expected rate ranges.
                    if 0.1 <= numeric_rate <= 20.0 and is_interest_rate_context: # Only consider if context is strong
                        search_term = rate_value_str
                        text_instances = page.search_for(search_term)

                        bboxes = []
                        if text_instances:
                            for inst in text_instances:
                                bbox_coords = [inst.x0, inst.y0, inst.x1, inst.y1]
                                # Create a hashable tuple for deduplication
                                entry = (page_num, search_term, tuple(bbox_coords))
                                if entry not in seen_entries:
                                    bboxes.append(bbox_coords)
                                    seen_entries.add(entry)

                        if bboxes:
                            interest_rates_found_with_bbox.append({
                                'page': page_num,
                                'text': search_term,
                                'bbox': bboxes
                            })
                except ValueError:
                    pass # Not a clean float, ignore this match

    doc.close()

    if interest_rates_found_with_bbox:
        print("\nExtracted Interest Rates with Bounding Boxes:")
        for rate_info in interest_rates_found_with_bbox:
            print(f"Page {rate_info['page']}: '{rate_info['text']}' BBox: {rate_info['bbox']}")
    else:
        print("\nNo explicit interest rates found with the defined patterns and strong context.")

Attempting to extract Interest Rate with bounding boxes...

Extracted Interest Rates with Bounding Boxes:
Page 0: '4.250 %' BBox: [[298.79998779296875, 161.96453857421875, 328.2326354980469, 170.89495849609375]]


In [64]:
def extract_tables_with_bbox(pdf_path):
    doc = fitz.open(pdf_path)
    results = []

    for page_num in range(len(doc)):
        page = doc[page_num]
        tables = page.find_tables()

        for i, table in enumerate(tables.tables):
            table_bbox = table.bbox
            table_data = []
            cell_bboxes = []

            for row_num in range(table.row_count):
                row_data = []
                row_bboxes = []

                for col_num in range(table.col_count):
                    cell_idx = row_num * table.col_count + col_num

                    if cell_idx < len(table.cells) and table.cells[cell_idx] is not None:
                        cell_bbox = table.cells[cell_idx]
                        cell_text = page.get_text("text", clip=cell_bbox).strip()
                        row_data.append(cell_text)
                        row_bboxes.append(cell_bbox)
                    else:
                        row_data.append("")
                        row_bboxes.append(None)

                table_data.append(row_data)
                cell_bboxes.append(row_bboxes)

            results.append({
                'page': page_num,
                'table_index': i,
                'bbox': table_bbox,
                'data': table_data,
                'cell_bboxes': cell_bboxes
            })

    doc.close()
    return results

**Reasoning**:
Now that the `extract_tables_with_bbox` function is defined, I need to call it with the `pdf_filename` to extract the table data from the uploaded PDF. This will populate the `results` variable, which is necessary for the final consolidated output.



In [65]:
print("Calling extract_tables_with_bbox to re-extract table data...")
# Ensure pdf_filename is available from previous steps
# If not, it means the notebook state is reset, and the file needs re-uploading
if 'uploaded' not in globals() or not uploaded:
    print("Please upload 'LenderFeesWorksheetNew (1).pdf' to proceed:")
    uploaded = files.upload()

if uploaded:
    pdf_filename = list(uploaded.keys())[0]
    results = extract_tables_with_bbox(pdf_filename)
    print("Table extraction complete.")
else:
    print("No file uploaded, cannot re-extract tables.")

Calling extract_tables_with_bbox to re-extract table data...
Table extraction complete.


In [67]:
def extract_tables_with_bbox(pdf_path):
    doc = fitz.open(pdf_path)
    results = []

    for page_num in range(len(doc)):
        page = doc[page_num]
        tables = page.find_tables()

        for i, table in enumerate(tables.tables):
            # Convert table.bbox (fitz.Rect) to a list of floats
            table_bbox_list = [table.bbox.x0, table.bbox.y0, table.bbox.x1, table.bbox.y1]
            table_data = []
            cell_bboxes_list = [] # Will store lists of [x0, y0, x1, y1]

            for row_num in range(table.row_count):
                row_data = []
                row_bboxes = []

                for col_num in range(table.col_count):
                    cell_idx = row_num * table.col_count + col_num

                    if cell_idx < len(table.cells) and table.cells[cell_idx] is not None:
                        cell_bbox_rect = table.cells[cell_idx] # This is a fitz.Rect object
                        cell_text = page.get_text("text", clip=cell_bbox_rect).strip()
                        row_data.append(cell_text)
                        # Convert fitz.Rect to a list of floats for consistency
                        row_bboxes.append([cell_bbox_rect.x0, cell_bbox_rect.y0, cell_bbox_rect.x1, cell_bbox_rect.y1])
                    else:
                        row_data.append("")
                        row_bboxes.append(None)

                table_data.append(row_data)
                cell_bboxes_list.append(row_bboxes)

            results.append({
                'page': page_num,
                'table_index': i,
                'bbox': table_bbox_list, # Store as list of floats
                'data': table_data,
                'cell_bboxes': cell_bboxes_list # Store as lists of floats
            })

    doc.close()
    return results


In [70]:
def extract_tables_with_bbox(pdf_path):
    doc = fitz.open(pdf_path)
    results = []

    for page_num in range(len(doc)):
        page = doc[page_num]
        tables = page.find_tables()

        for i, table in enumerate(tables.tables):
            # Convert table.bbox (which might be fitz.Rect or tuple) to a list of floats
            table_bbox_list = list(table.bbox)
            table_data = []
            cell_bboxes_list = [] # Will store lists of [x0, y0, x1, y1]

            for row_num in range(table.row_count):
                row_data = []
                row_bboxes = []

                for col_num in range(table.col_count):
                    cell_idx = row_num * table.col_count + col_num

                    if cell_idx < len(table.cells) and table.cells[cell_idx] is not None:
                        cell_bbox_rect = table.cells[cell_idx] # This is a fitz.Rect object
                        cell_text = page.get_text("text", clip=cell_bbox_rect).strip()
                        row_data.append(cell_text)
                        # Convert fitz.Rect to a list of floats for consistency
                        row_bboxes.append([cell_bbox_rect.x0, cell_bbox_rect.y0, cell_bbox_rect.x1, cell_bbox_rect.y1])
                    else:
                        row_data.append("")
                        row_bboxes.append(None)

                table_data.append(row_data)
                cell_bboxes_list.append(row_bboxes)

            results.append({
                'page': page_num,
                'table_index': i,
                'bbox': table_bbox_list, # Store as list of floats
                'data': table_data,
                'cell_bboxes': cell_bboxes_list # Store as lists of floats
            })

    doc.close()
    return results


In [71]:
def extract_tables_with_bbox(pdf_path):
    doc = fitz.open(pdf_path)
    results = []

    for page_num in range(len(doc)):
        page = doc[page_num]
        tables = page.find_tables()

        for i, table in enumerate(tables.tables):
            # Convert table.bbox (which might be fitz.Rect or tuple) to a list of floats
            table_bbox_list = list(table.bbox)
            table_data = []
            cell_bboxes_list = [] # Will store lists of [x0, y0, x1, y1]

            for row_num in range(table.row_count):
                row_data = []
                row_bboxes = []

                for col_num in range(table.col_count):
                    cell_idx = row_num * table.col_count + col_num

                    if cell_idx < len(table.cells) and table.cells[cell_idx] is not None:
                        cell_bbox_rect = table.cells[cell_idx] # This is a fitz.Rect object
                        cell_text = page.get_text("text", clip=cell_bbox_rect).strip()
                        row_data.append(cell_text)
                        # Convert fitz.Rect to a list of floats for consistency
                        row_bboxes.append([cell_bbox_rect.x0, cell_bbox_rect.y0, cell_bbox_rect.x1, cell_bbox_rect.y1])
                    else:
                        row_data.append("")
                        row_bboxes.append(None)

                table_data.append(row_data)
                cell_bboxes_list.append(row_bboxes)

            results.append({
                'page': page_num,
                'table_index': i,
                'bbox': table_bbox_list, # Store as list of floats
                'data': table_data,
                'cell_bboxes': cell_bboxes_list # Store as lists of floats
            })

    doc.close()
    return results


In [73]:
def extract_tables_with_bbox(pdf_path):
    doc = fitz.open(pdf_path)
    results = []

    for page_num in range(len(doc)):
        page = doc[page_num]
        tables = page.find_tables()

        for i, table in enumerate(tables.tables):
            # Convert table.bbox (which might be fitz.Rect or tuple) to a list of floats
            table_bbox_list = list(table.bbox)
            table_data = []
            cell_bboxes_list = [] # Will store lists of [x0, y0, x1, y1]

            for row_num in range(table.row_count):
                row_data = []
                row_bboxes = []

                for col_num in range(table.col_count):
                    cell_idx = row_num * table.col_count + col_num

                    if cell_idx < len(table.cells) and table.cells[cell_idx] is not None:
                        cell_bbox_rect = table.cells[cell_idx] # This can be a fitz.Rect object or a tuple
                        cell_text = page.get_text("text", clip=cell_bbox_rect).strip()
                        row_data.append(cell_text)
                        # Convert cell_bbox_rect (whether fitz.Rect or tuple) to a list of floats for consistency
                        row_bboxes.append(list(cell_bbox_rect))
                    else:
                        row_data.append("")
                        row_bboxes.append(None)

                table_data.append(row_data)
                cell_bboxes_list.append(row_bboxes)

            results.append({
                'page': page_num,
                'table_index': i,
                'bbox': table_bbox_list, # Store as list of floats
                'data': table_data,
                'cell_bboxes': cell_bboxes_list # Store as lists of floats
            })

    doc.close()
    return results

print("The `extract_tables_with_bbox` function has been redefined to robustly handle bounding box types.")

The `extract_tables_with_bbox` function has been redefined to robustly handle bounding box types.


In [74]:
print("Calling the redefined extract_tables_with_bbox to re-extract table data...")
# Ensure pdf_filename is available from previous steps
# If not, it means the notebook state is reset, and the file needs re-uploading
if 'uploaded' not in globals() or not uploaded:
    print("Please upload 'LenderFeesWorksheetNew (1).pdf' to proceed:")
    uploaded = files.upload()

if uploaded:
    pdf_filename = list(uploaded.keys())[0]
    results = extract_tables_with_bbox(pdf_filename)
    print("Table extraction complete with redefined function.")
else:
    print("No file uploaded, cannot re-extract tables.")

Calling the redefined extract_tables_with_bbox to re-extract table data...
Table extraction complete with redefined function.


**Reasoning**:
Now that the `results` variable has been correctly populated by the redefined table extraction function, I need to re-run the consolidation code to display all extracted information, ensuring table data, loan terms, loan programs, and interest rates are presented with their correct bounding boxes.



In [75]:
print("\n--- Consolidated Extracted Information ---")

# 1. Display Extracted Tables
print("\nExtracted Tables (Fees and Amounts):")
if 'results' in globals() and results:
    for table_idx, table in enumerate(results):
        print(f"  ðŸ“„ Page {table['page']}, Table {table_idx}")
        for row_idx, row in enumerate(table['data']):
            for col_idx, cell_text in enumerate(row):
                if cell_text.strip(): # Only print non-empty cells
                    # Ensure cell_bboxes exist and are correctly structured
                    if 'cell_bboxes' in table and len(table['cell_bboxes']) > row_idx and len(table['cell_bboxes'][row_idx]) > col_idx:
                        cell_bbox = table['cell_bboxes'][row_idx][col_idx]
                        if cell_bbox: # Check if bbox is not None
                            print(f"    - Text: '{cell_text.replace('\n', ' ')}' | BBox: {cell_bbox}")
                        else:
                            print(f"    - Text: '{cell_text.replace('\n', ' ')}' | BBox: No BBox (empty cell)")
                    else:
                        print(f"    - Text: '{cell_text.replace('\n', ' ')}' | BBox: Not available")
else:
    print("  No table data found. Please ensure previous steps for table extraction were executed successfully.")

# 2. Display Extracted Loan Terms
print("\nExtracted Loan Terms:")
if 'loan_terms_found_with_bbox' in globals() and loan_terms_found_with_bbox:
    for term in loan_terms_found_with_bbox:
        print(f"  ðŸ“„ Page {term['page']}: '{term['text']}' | BBox: {term['bbox']}")
else:
    print("  No loan terms found. Please ensure previous steps for loan term extraction were executed successfully.")

# 3. Display Extracted Loan Programs
print("\nExtracted Loan Programs:")
if 'loan_programs_found_with_bbox' in globals() and loan_programs_found_with_bbox:
    for program in loan_programs_found_with_bbox:
        print(f"  ðŸ“„ Page {program['page']}: '{program['text']}' | BBox: {program['bbox']}")
else:
    print("  No loan programs found. Please ensure previous steps for loan program extraction were executed successfully.")

# 4. Display Extracted Interest Rates
print("\nExtracted Interest Rates:")
if 'interest_rates_found_with_bbox' in globals() and interest_rates_found_with_bbox:
    for rate_info in interest_rates_found_with_bbox:
        print(f"  ðŸ“„ Page {rate_info['page']}: '{rate_info['text']}' | BBox: {rate_info['bbox']}")
else:
    print("  No interest rates found. Please ensure previous steps for interest rate extraction were executed successfully.")

print("\n--- Consolidated Extraction Complete ---")


--- Consolidated Extracted Information ---

Extracted Tables (Fees and Amounts):
  ðŸ“„ Page 0, Table 0
    - Text: 'Underwriting Fee XYZ Lender Borrower $ 550.00' | BBox: [39.599998474121094, 193.67999267578125, 537.3600158691406, 203.6400146484375]
    - Text: 'Wire Transfer Fee XYZ Lender Borrower $ 75.00' | BBox: [39.599998474121094, 203.6400146484375, 537.3600158691406, 213.84002685546875]
    - Text: 'Administration Fee XYZ Lender Borrower $ 445.00' | BBox: [39.599998474121094, 213.84002685546875, 537.3600158691406, 224.52001953125]
  ðŸ“„ Page 0, Table 1
    - Text: 'Appraisal Fee XYZ Lender Borrower $ 525.00' | BBox: [39.599998474121094, 234.0, 537.3600158691406, 244.08001708984375]
    - Text: 'Credit Report Fee XYZ Lender Borrower $ 25.00' | BBox: [39.599998474121094, 244.08001708984375, 537.3600158691406, 254.03997802734375]
    - Text: 'Tax Service Fee XYZ Lender Borrower $ 80.00' | BBox: [39.599998474121094, 254.03997802734375, 537.3600158691406, 264.239990234375]
    - T

**Reasoning**:
Now that the `results` variable has been correctly populated by the redefined table extraction function, I need to re-run the consolidation code to display all extracted information, ensuring table data, loan terms, loan programs, and interest rates are presented with their correct bounding boxes.



In [76]:
print("\n--- Consolidated Extracted Information ---")

# 1. Display Extracted Tables
print("\nExtracted Tables (Fees and Amounts):")
if 'results' in globals() and results:
    for table_idx, table in enumerate(results):
        print(f"  ðŸ“„ Page {table['page']}, Table {table_idx}")
        for row_idx, row in enumerate(table['data']):
            for col_idx, cell_text in enumerate(row):
                if cell_text.strip(): # Only print non-empty cells
                    # Ensure cell_bboxes exist and are correctly structured
                    if 'cell_bboxes' in table and len(table['cell_bboxes']) > row_idx and len(table['cell_bboxes'][row_idx]) > col_idx:
                        cell_bbox = table['cell_bboxes'][row_idx][col_idx]
                        if cell_bbox: # Check if bbox is not None
                            print(f"    - Text: '{cell_text.replace('\n', ' ')}' | BBox: {cell_bbox}")
                        else:
                            print(f"    - Text: '{cell_text.replace('\n', ' ')}' | BBox: No BBox (empty cell)")
                    else:
                        print(f"    - Text: '{cell_text.replace('\n', ' ')}' | BBox: Not available")
else:
    print("  No table data found. Please ensure previous steps for table extraction were executed successfully.")

# 2. Display Extracted Loan Terms
print("\nExtracted Loan Terms:")
if 'loan_terms_found_with_bbox' in globals() and loan_terms_found_with_bbox:
    for term in loan_terms_found_with_bbox:
        print(f"  ðŸ“„ Page {term['page']}: '{term['text']}' | BBox: {term['bbox']}")
else:
    print("  No loan terms found. Please ensure previous steps for loan term extraction were executed successfully.")

# 3. Display Extracted Loan Programs
print("\nExtracted Loan Programs:")
if 'loan_programs_found_with_bbox' in globals() and loan_programs_found_with_bbox:
    for program in loan_programs_found_with_bbox:
        print(f"  ðŸ“„ Page {program['page']}: '{program['text']}' | BBox: {program['bbox']}")
else:
    print("  No loan programs found. Please ensure previous steps for loan program extraction were executed successfully.")

# 4. Display Extracted Interest Rates
print("\nExtracted Interest Rates:")
if 'interest_rates_found_with_bbox' in globals() and interest_rates_found_with_bbox:
    for rate_info in interest_rates_found_with_bbox:
        print(f"  ðŸ“„ Page {rate_info['page']}: '{rate_info['text']}' | BBox: {rate_info['bbox']}")
else:
    print("  No interest rates found. Please ensure previous steps for interest rate extraction were executed successfully.")

print("\n--- Consolidated Extraction Complete ---")


--- Consolidated Extracted Information ---

Extracted Tables (Fees and Amounts):
  ðŸ“„ Page 0, Table 0
    - Text: 'Underwriting Fee XYZ Lender Borrower $ 550.00' | BBox: [39.599998474121094, 193.67999267578125, 537.3600158691406, 203.6400146484375]
    - Text: 'Wire Transfer Fee XYZ Lender Borrower $ 75.00' | BBox: [39.599998474121094, 203.6400146484375, 537.3600158691406, 213.84002685546875]
    - Text: 'Administration Fee XYZ Lender Borrower $ 445.00' | BBox: [39.599998474121094, 213.84002685546875, 537.3600158691406, 224.52001953125]
  ðŸ“„ Page 0, Table 1
    - Text: 'Appraisal Fee XYZ Lender Borrower $ 525.00' | BBox: [39.599998474121094, 234.0, 537.3600158691406, 244.08001708984375]
    - Text: 'Credit Report Fee XYZ Lender Borrower $ 25.00' | BBox: [39.599998474121094, 244.08001708984375, 537.3600158691406, 254.03997802734375]
    - Text: 'Tax Service Fee XYZ Lender Borrower $ 80.00' | BBox: [39.599998474121094, 254.03997802734375, 537.3600158691406, 264.239990234375]
    - T

In [77]:
import fitz
import re
from google.colab import files

print("Attempting to re-extract Loan Terms with bounding boxes...")

# Ensure pdf_filename is available
if 'uploaded' not in globals() or not uploaded:
    print("Please upload 'LenderFeesWorksheetNew (1).pdf' to proceed:")
    uploaded = files.upload()

pdf_filename = None
if uploaded:
    pdf_filename = list(uploaded.keys())[0]
else:
    print("No file was uploaded. Cannot proceed with loan term extraction.")

loan_terms_found_with_bbox = []
seen_entries = set() # To store unique (page, text, bbox_tuple) for deduplication

if pdf_filename:
    doc = fitz.open(pdf_filename)

    # Regular expression patterns for loan terms
    loan_term_patterns = [
        r'(\d+\s*YEAR(?:S)?\s*FIXED)', # e.g., "30 YEAR FIXED"
        r'(\d+\s*MONTH(?:S)?)',       # e.g., "360 MONTHS", "360 MONTH"
        r'(\d+\s*mths)',              # e.g., "360 mths"
        r'(\d+\s*/\s*\d+\s*mths)'    # e.g., "360 / 360 mths"
    ]

    for page_num in range(len(doc)):
        page = doc[page_num]
        text = page.get_text("text")

        for pattern in loan_term_patterns:
            for match in re.finditer(pattern, text, re.IGNORECASE):
                search_term = match.group(0).strip()

                # Filter out generic numbers that might not be loan terms
                # For example, if a pattern catches '100' but it's not near 'year', 'month', 'fixed' etc.
                # A simple heuristic: ensure it's not just a number, but has context from the pattern
                if re.search(r'year|month|mths|fixed', search_term, re.IGNORECASE):
                    text_instances = page.search_for(search_term)

                    bboxes = []
                    if text_instances:
                        for inst in text_instances:
                            bbox_coords = [inst.x0, inst.y0, inst.x1, inst.y1]
                            # Create a hashable tuple for deduplication
                            entry = (page_num, search_term, tuple(bbox_coords))
                            if entry not in seen_entries:
                                bboxes.append(bbox_coords)
                                seen_entries.add(entry)

                    if bboxes:
                        loan_terms_found_with_bbox.append({
                            'page': page_num,
                            'text': search_term,
                            'bbox': bboxes
                        })
    doc.close()

    if loan_terms_found_with_bbox:
        print("\nExtracted Loan Terms with Bounding Boxes:")
        for term in loan_terms_found_with_bbox:
            print(f"Page {term['page']}: '{term['text']}' | BBox: {term['bbox']}")
    else:
        print("\nNo explicit loan terms found with the defined patterns.")

Attempting to re-extract Loan Terms with bounding boxes...

Extracted Loan Terms with Bounding Boxes:
Page 0: '30 YEAR FIXED' | BBox: [[471.6000061035156, 98.60455322265625, 530.5532836914062, 107.53496551513672]]
Page 0: '360 mths' | BBox: [[491.77752685546875, 161.96453857421875, 526.578857421875, 170.89495849609375]]
Page 0: '360 / 360 mths' | BBox: [[471.6000061035156, 161.96453857421875, 526.578857421875, 170.89495849609375]]


**Reasoning**:
Now that the `loan_terms_found_with_bbox` variable has been correctly populated by the re-execution of the loan term extraction, I need to re-run the consolidation code to display all extracted information, ensuring table data, loan terms, loan programs, and interest rates are presented with their correct bounding boxes.



In [78]:
print("\n--- Consolidated Extracted Information ---")

# 1. Display Extracted Tables
print("\nExtracted Tables (Fees and Amounts):")
if 'results' in globals() and results:
    for table_idx, table in enumerate(results):
        print(f"  ðŸ“„ Page {table['page']}, Table {table_idx}")
        for row_idx, row in enumerate(table['data']):
            for col_idx, cell_text in enumerate(row):
                if cell_text.strip(): # Only print non-empty cells
                    # Ensure cell_bboxes exist and are correctly structured
                    if 'cell_bboxes' in table and len(table['cell_bboxes']) > row_idx and len(table['cell_bboxes'][row_idx]) > col_idx:
                        cell_bbox = table['cell_bboxes'][row_idx][col_idx]
                        if cell_bbox: # Check if bbox is not None
                            print(f"    - Text: '{cell_text.replace('\n', ' ')}' | BBox: {cell_bbox}")
                        else:
                            print(f"    - Text: '{cell_text.replace('\n', ' ')}' | BBox: No BBox (empty cell)")
                    else:
                        print(f"    - Text: '{cell_text.replace('\n', ' ')}' | BBox: Not available")
else:
    print("  No table data found. Please ensure previous steps for table extraction were executed successfully.")

# 2. Display Extracted Loan Terms
print("\nExtracted Loan Terms:")
if 'loan_terms_found_with_bbox' in globals() and loan_terms_found_with_bbox:
    for term in loan_terms_found_with_bbox:
        print(f"  ðŸ“„ Page {term['page']}: '{term['text']}' | BBox: {term['bbox']}")
else:
    print("  No loan terms found. Please ensure previous steps for loan term extraction were executed successfully.")

# 3. Display Extracted Loan Programs
print("\nExtracted Loan Programs:")
if 'loan_programs_found_with_bbox' in globals() and loan_programs_found_with_bbox:
    for program in loan_programs_found_with_bbox:
        print(f"  ðŸ“„ Page {program['page']}: '{program['text']}' | BBox: {program['bbox']}")
else:
    print("  No loan programs found. Please ensure previous steps for loan program extraction were executed successfully.")

# 4. Display Extracted Interest Rates
print("\nExtracted Interest Rates:")
if 'interest_rates_found_with_bbox' in globals() and interest_rates_found_with_bbox:
    for rate_info in interest_rates_found_with_bbox:
        print(f"  ðŸ“„ Page {rate_info['page']}: '{rate_info['text']}' | BBox: {rate_info['bbox']}")
else:
    print("  No interest rates found. Please ensure previous steps for interest rate extraction were executed successfully.")

print("\n--- Consolidated Extraction Complete ---")


--- Consolidated Extracted Information ---

Extracted Tables (Fees and Amounts):
  ðŸ“„ Page 0, Table 0
    - Text: 'Underwriting Fee XYZ Lender Borrower $ 550.00' | BBox: [39.599998474121094, 193.67999267578125, 537.3600158691406, 203.6400146484375]
    - Text: 'Wire Transfer Fee XYZ Lender Borrower $ 75.00' | BBox: [39.599998474121094, 203.6400146484375, 537.3600158691406, 213.84002685546875]
    - Text: 'Administration Fee XYZ Lender Borrower $ 445.00' | BBox: [39.599998474121094, 213.84002685546875, 537.3600158691406, 224.52001953125]
  ðŸ“„ Page 0, Table 1
    - Text: 'Appraisal Fee XYZ Lender Borrower $ 525.00' | BBox: [39.599998474121094, 234.0, 537.3600158691406, 244.08001708984375]
    - Text: 'Credit Report Fee XYZ Lender Borrower $ 25.00' | BBox: [39.599998474121094, 244.08001708984375, 537.3600158691406, 254.03997802734375]
    - Text: 'Tax Service Fee XYZ Lender Borrower $ 80.00' | BBox: [39.599998474121094, 254.03997802734375, 537.3600158691406, 264.239990234375]
    - T

In [82]:
print("Attempting to extract Appraisal Fee from extracted tables...")

appraisal_fees_found = []

# Ensure 'results' (extracted table data) is available
if 'results' in globals() and results:
    for table_idx, table in enumerate(results):
        for row_idx, row_data in enumerate(table['data']):
            for col_idx, cell_text in enumerate(row_data):
                if cell_text and 'Appraisal Fee' in cell_text:
                    # Assuming the appraisal fee and amount are in the same cell or closely associated
                    if 'cell_bboxes' in table and len(table['cell_bboxes']) > row_idx and len(table['cell_bboxes'][row_idx]) > col_idx:
                        cell_bbox = table['cell_bboxes'][row_idx][col_idx]
                        if cell_bbox:
                            appraisal_fees_found.append({
                                'page': table['page'],
                                'table_index': table_idx,
                                'text': cell_text.replace('\n', ' '), # Clean up newlines for display
                                'bbox': cell_bbox
                            })

if appraisal_fees_found:
    print("\nExtracted Appraisal Fees:")
    for fee in appraisal_fees_found:
        print(f"Page {fee['page']}, Table {fee['table_index']}: '{fee['text']}' | BBox: {fee['bbox']}")
else:
    print("\nNo 'Appraisal Fee' found in the extracted tables.")


Attempting to extract Appraisal Fee from extracted tables...

Extracted Appraisal Fees:
Page 0, Table 1: 'Appraisal Fee XYZ Lender Borrower $ 525.00' | BBox: [39.599998474121094, 234.0, 537.3600158691406, 244.08001708984375]


In [83]:
import fitz
import re
from google.colab import files

print("Attempting to extract Borrower's Name with bounding boxes...")

# Ensure pdf_filename is available
if 'uploaded' not in globals() or not uploaded:
    print("Please upload 'LenderFeesWorksheetNew (1).pdf' to proceed:")
    uploaded = files.upload()

pdf_filename = None
if uploaded:
    pdf_filename = list(uploaded.keys())[0]
else:
    print("No file was uploaded. Cannot proceed with borrower name extraction.")

borrower_names_found_with_bbox = []
seen_entries = set() # To store unique (page, text, bbox_tuple) for deduplication

if pdf_filename:
    doc = fitz.open(pdf_filename)

    # Patterns for borrower's name
    # 1. Look for 'Applicant(s):' or 'Borrower:' followed by a name
    #    (assuming names are usually capitalized words, potentially with a middle initial)
    name_patterns = [
        r'(?:Applicant(?:s)?:|Borrower:|Name:)\s*([A-Z][a-z]+(?:\s+[A-Z]\.?)?\s+[A-Z][a-z]+(?:\s+[A-Z][a-z]+)?)',
        r'([A-Z][a-z]+\s+[A-Z][a-z]+)', # General pattern for Two Capitalized Words (e.g., John Smith)
        r'([A-Z][a-z]+\s+[A-Z]\.?\s+[A-Z][a-z]+)' # Three Capitalized Words (e.g., John D Smith)
    ]

    # List of known non-name phrases to filter out, case-insensitive for robustness
    known_non_names = set([
        'Application No', 'Loan Estimate', 'Fee Details', 'Summary Applicants',
        'Date Prepared', 'Fees Worksheet', 'Total Loan', 'Interest Rate', 'Due In',
        'Fee Paid', 'To Paid', 'Fee Split', 'Total Estimated', 'Funds Total',
        'Monthly Payment', 'Purchase Price', 'Prepaid Items', 'Closing Costs',
        'Loan Amount', 'Interest Other', 'Hazard Insurance', 'Real Estate',
        'Taxes Mortgage', 'Insurance Homeowner', 'Dues Other', 'Prepaid Finance',
        'Allowable Closing', 'Paid Outside', 'Third Party', 'Correspondent Calyx',
        'Cash Deposit', 'Underwriting Fee', 'Wire Transfer', 'Administration Fee',
        'Appraisal Fee', 'Credit Report', 'Tax Service', 'Flood Certification',
        'Escrow Fee', 'Settlement Agent', 'Document Preparation', 'Notary Fee',
        'Title Insurance', 'Courier Fee', 'Electronic Document', 'Delivery Fee',
        'Pest Inspection', 'Home Inspection', 'Mortgage Recording', 'Charge Borrower',
        'Daily Interest', 'Premium Borrower', 'Loan Program', 'Loan Type', 'Program',
        'FHA', 'VA', 'USDA', 'Conventional', 'Jumbo', 'Refinance', 'Purchase',
        '30 YEAR FIXED', '360 mths', '360 / 360 mths', 'Prepared By'
    ])
    known_non_names_lower = {name.lower() for name in known_non_names}

    for page_num in range(len(doc)):
        page = doc[page_num]
        text = page.get_text("text")

        for pattern in name_patterns:
            for match in re.finditer(pattern, text):
                # If the pattern has a capture group, use it for the name
                if match.groups():
                    extracted_name = match.group(1).strip()
                else:
                    # Otherwise, use the full match
                    extracted_name = match.group(0).strip()

                # Filter out common non-name phrases that might match patterns
                if extracted_name.lower() not in known_non_names_lower:
                    search_term = extracted_name
                    text_instances = page.search_for(search_term)

                    bboxes = []
                    if text_instances:
                        for inst in text_instances:
                            bbox_coords = [inst.x0, inst.y0, inst.x1, inst.y1]
                            # Create a hashable tuple for deduplication
                            entry = (page_num, search_term, tuple(bbox_coords))
                            if entry not in seen_entries:
                                bboxes.append(bbox_coords)
                                seen_entries.add(entry)

                    if bboxes:
                        borrower_names_found_with_bbox.append({
                            'page': page_num,
                            'text': search_term,
                            'bbox': bboxes
                        })

    doc.close()

    if borrower_names_found_with_bbox:
        print("\nExtracted Borrower Names with Bounding Boxes:")
        for name_info in borrower_names_found_with_bbox:
            print(f"Page {name_info['page']}: '{name_info['text']}' | BBox: {name_info['bbox']}")
    else:
        print("\nNo borrower names found using the defined patterns.")


Attempting to extract Borrower's Name with bounding boxes...

Extracted Borrower Names with Bounding Boxes:
Page 0: 'Summary
Applicants' | BBox: [[322.13043212890625, 58.18072509765625, 376.87939453125, 71.53446197509766], [39.599998474121094, 78.64180755615234, 75.47529602050781, 87.32872009277344]]
Page 0: 'Fee
Paid' | BBox: [[39.599998474121094, 174.15185546875, 51.838375091552734, 182.1104736328125], [162.0, 174.15185546875, 176.85301208496094, 182.1104736328125]]
Page 0: 'To
Paid' | BBox: [[178.70877075195312, 174.15185546875, 187.35052490234375, 182.1104736328125], [291.6000061035156, 174.15185546875, 306.55499267578125, 182.1104736328125]]
Page 0: 'Funds
Total' | BBox: [[94.0674819946289, 930.9918823242188, 115.3144760131836, 938.9505004882812], [396.0, 930.9918823242188, 412.8256530761719, 938.9505004882812]]
Page 0: 'Interest
Other' | BBox: [[431.7011413574219, 810.0318603515625, 455.31597900390625, 817.990478515625], [396.0, 819.9918823242188, 413.7236328125, 827.950500488281