In [1]:
import sys
from utils.azure_doc_intel import parse_azureDocIntell
from rade import RADE, RetrievedPage, DocumentPage

In [2]:
#test the class

import time
import torch
from tqdm import tqdm

# Start timing execution
start_time = time.time()

pdf_path = "../data/trustdocs/living-trust-forms-04_repopulated.pdf"

# Clear GPU cache before loading model
torch.cuda.empty_cache()

# Instantiate RADE
colpali_model = "../colpali_model_v1.3"
rade = RADE(retrieval_model_name=colpali_model, use_flash_attention=False, max_pages=3)

Using device map: {'retrieval': 'cuda:0', 'qa': 'cuda:1'}
Initializing retrieval model...


`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:1


Initializing QA model and entity extraction models...
Models initialized successfully!


In [3]:
# ✅ Define additional QA Queries
pdf_path = "../data/trustdocs/living-trust-forms-04_repopulated.pdf"

rade.add_document(pdf_path, 1)
rade.build_index()

Loading document: 1
Added 21 pages from 1
Building index for 21 pages...


Building index: 100%|██████████| 6/6 [00:01<00:00,  3.00it/s]

Embeddings shape: (21, 128)
Building exact index...
Index built successfully!





In [4]:
# Define entity queries and corresponding labels
entity_queries = [
    "GRANTOR: what are the Grantors names? (Grantor)",
    "TRUSTEE: what are the Trustees names? (Trustee)",
    "Successor TRUSTEE: what are the Successor Trustee names (SUCCESSOR TRUSTEE)?",
    "BENEFCIARIES: what are the beneficiaries names (BENEFICIARIES)?",
    "SUCCESSOR BENEFCIARIES: what are the Successor beneficiaries names (SUCCESSOR Beneficiary)? "
]
labels_list = [
    ["Grantor names"],
    ["Trustee Names"],
    ["Successor Trustee Names"],
    ["Beneficiary names"],
    ["Successor Beneficiary Names"],
    ["Trust Name"],
    ["Trust Date"],
    ["Revocable", "Irevocable"],
    
]
qa_queries = [
    "What is the name of this trust?",
    "What is the date of this trust?",
    "Is this trust revocable or irrevocable?",
]
all_queries_dict = {
    "entity_queries": entity_queries,
    "qa_queries": qa_queries
}


In [5]:
all_queries = all_queries_dict["entity_queries"] + all_queries_dict["qa_queries"]


In [6]:
search_result = rade.retrieve(all_queries)
assert len(search_result) == len(all_queries)

In [7]:
# Store results and parsed page contexts
qa_results = []
contexts_map = {}  # Dictionary to cache parsed pages

for query_idx in tqdm(range(len(all_queries)), desc="Processing QA Queries"):
    query = all_queries[query_idx]
    pages = search_result[query_idx]

    all_pages = []
    retrieved_page_nums = []
    for page in pages:
        page_num = page.page.page_num
        retrieved_page_nums.append(str(page_num))

        # Check if page has been parsed before
        if page_num in contexts_map:
            parsed = contexts_map[page_num]
            print(f"Using cached parsed result for page: {page_num}")
        else:
            print(f"Parsing new page: {page_num}")
            parsed = parse_azureDocIntell(page.page.image)
            contexts_map[page_num] = parsed

        all_pages.append(parsed)

    # Combine parsed pages into a single string
    combined_pages = " ".join(all_pages) if all_pages else ""
    page_nums = " ".join(retrieved_page_nums) if retrieved_page_nums else ""

    # print(f"retrived pages {page_nums} ")
    # Run QA model to extract answer
    qa_answer = rade.run_qa_pipeline(query, combined_pages)

    # Store result
    qa_results.append({
        "query": query,
        "retrieved_pages": page_nums,
        "context": combined_pages,
        "RoBerta Answer": qa_answer
    })


Processing QA Queries:   0%|          | 0/8 [00:00<?, ?it/s]

Parsing new page: 18
Parsing new page: 1
Parsing new page: 4


Processing QA Queries:  12%|█▎        | 1/8 [00:10<01:12, 10.33s/it]

Using cached parsed result for page: 1
Parsing new page: 0
Parsing new page: 5


Processing QA Queries:  25%|██▌       | 2/8 [00:16<00:46,  7.74s/it]

Parsing new page: 11


Processing QA Queries:  38%|███▊      | 3/8 [00:19<00:29,  5.82s/it]

Using cached parsed result for page: 1
Using cached parsed result for page: 0
Parsing new page: 14
Using cached parsed result for page: 4
Parsing new page: 13


Processing QA Queries:  50%|█████     | 4/8 [00:24<00:21,  5.43s/it]

Using cached parsed result for page: 11
Using cached parsed result for page: 14
Using cached parsed result for page: 4
Using cached parsed result for page: 1
Using cached parsed result for page: 0
Parsing new page: 3


Processing QA Queries:  75%|███████▌  | 6/8 [00:27<00:06,  3.22s/it]

Using cached parsed result for page: 18
Using cached parsed result for page: 0
Using cached parsed result for page: 1
Using cached parsed result for page: 0
Parsing new page: 2


Processing QA Queries: 100%|██████████| 8/8 [00:30<00:00,  3.84s/it]

Using cached parsed result for page: 5





In [11]:
entity_results = []
for query_idx in tqdm(range(len(all_queries)), desc="Processing Entity Queries"):
    query = all_queries[query_idx]
    labels = labels_list[query_idx]
    pages = search_result[query_idx]

    all_pages = []
    retrieved_page_nums = []
    for page in pages:
        page_num = page.page.page_num
        retrieved_page_nums.append(str(page_num))
        # Check if page has been parsed before
        if page_num in contexts_map:
            parsed = contexts_map[page_num]
            print(f"Using cached parsed result for page: {page_num}")
        else:
            print(f"Parsing new page: {page_num}")
            parsed = parse_azureDocIntell(page.page.image)
            contexts_map[page_num] = parsed

        all_pages.append(parsed)

    # Combine parsed pages into a single string
    combined_pages = " ".join(all_pages) if all_pages else ""
    
    page_nums = " ".join(retrieved_page_nums) if retrieved_page_nums else ""


    # Extract entities from parsed text if only single gpu
    entities = rade.extract_entities_with_gliner(combined_pages, labels)

    # Store result
    entity_results.append({
        "query": query,
        "retrieved_pages": page_nums,
        "context": combined_pages,
        "GLiNER Answer": entities
    })



Using cached parsed result for page: 18
Using cached parsed result for page: 1
Using cached parsed result for page: 4




Using cached parsed result for page: 1
Using cached parsed result for page: 0
Using cached parsed result for page: 5




Using cached parsed result for page: 11
Using cached parsed result for page: 1
Using cached parsed result for page: 0




Using cached parsed result for page: 14
Using cached parsed result for page: 4
Using cached parsed result for page: 13




Using cached parsed result for page: 11
Using cached parsed result for page: 14
Using cached parsed result for page: 4




Using cached parsed result for page: 1
Using cached parsed result for page: 0
Using cached parsed result for page: 3




Using cached parsed result for page: 18
Using cached parsed result for page: 0
Using cached parsed result for page: 1




Using cached parsed result for page: 0
Using cached parsed result for page: 2
Using cached parsed result for page: 5


Processing Entity Queries: 100%|██████████| 8/8 [00:10<00:00,  1.37s/it]


In [19]:
import pandas as pd

# Convert lists of dictionaries to pandas DataFrames
df_entities = pd.DataFrame(entity_results)
df_qa = pd.DataFrame(qa_results)

# Perform the merge and prioritize 'context' from entity_results
combined_df = pd.merge(
    df_entities.drop(columns='context', errors='ignore'),  # Remove 'context' to avoid conflict
    df_qa,
    on='query',
    suffixes=('_gliner', '_qa'),
    how='outer'
)



combined_df.head()

Unnamed: 0,query,retrieved_pages_gliner,GLiNER Answer,retrieved_pages_qa,context,answer
0,BENEFCIARIES: what are the beneficiaries names...,14 4 13,"[{'text': 'Lisa Smith', 'label': 'Beneficiary ...",14 4 13,"NONPUBLIC//FDIC INTERNAL ONLY. beneficiary, sh...","{'score': 0.06967747956514359, 'start': 3179, ..."
1,GRANTOR: what are the Grantors names? (Grantor),18 1 4,"[{'text': 'Mark Mori', 'label': 'Grantor names...",18 1 4,NONPUBLIC//FDIC INTERNAL ONLY. Mark Mori. CO-T...,"{'score': 0.04415975883603096, 'start': 2533, ..."
2,Is this trust revocable or irrevocable?,0 2 5,"[{'text': 'Revocable Trust', 'label': 'Revocab...",0 2 5,NONPUBLIC//FDIC INTERNAL ONLY. REVOCABLE LIVIN...,"{'score': 0.3610323667526245, 'start': 4044, '..."
3,SUCCESSOR BENEFCIARIES: what are the Successor...,11 14 4,"[{'text': 'beneficiary', 'label': 'Successor B...",11 14 4,NONPUBLIC//FDIC INTERNAL ONLY. (F) SUCCESSOR T...,"{'score': 0.0017009678995236754, 'start': 35, ..."
4,Successor TRUSTEE: what are the Successor Trus...,11 1 0,"[{'text': 'Laurie Mori-Smith', 'label': 'Succe...",11 1 0,NONPUBLIC//FDIC INTERNAL ONLY. (F) SUCCESSOR T...,"{'score': 0.7432011365890503, 'start': 4361, '..."


In [22]:
# Define the output directory
import os
output_dir = "output"
os.makedirs(output_dir, exist_ok=True)  # Ensure the output directory exists

# Generate the filename based on the PDF file basename
output_filename = os.path.splitext(os.path.basename(pdf_path))[0] + "_results.csv"

# Save the combined DataFrame to the output directory
output_file_path = os.path.join(output_dir, output_filename)
combined_df.to_csv(output_file_path, index=False)

print(f"Combined DataFrame saved to: {output_file_path}")

Combined DataFrame saved to: output/living-trust-forms-04_repopulated_results.csv
