In [0]:
%sh
# Refresh the apt package index, then install the poppler-utils system package
# non-interactively (-y answers every prompt with "yes").
# NOTE(review): presumably required by the PDF handling in later cells — confirm.
sudo apt-get update -y
sudo apt-get install -y poppler-utils



In [0]:
!pip install --requirement requirements.txt
dbutils.library.restartPython() 

In [0]:
# Restart the notebook's Python process.
# NOTE(review): the previous cell already ends with this same call, so this
# cell looks redundant — confirm before removing.
dbutils.library.restartPython() 

In [0]:
from rade import RADE, RetrievedPage, DocumentPage
from utils.azure_doc_intel import parse_azureDocIntell

In [0]:
# Smoke test of the RADE class: list the input PDFs and load the pipeline.

import os
import time

import torch
from tqdm import tqdm

# Directory containing PDF files
pdf_dir = "/Volumes/dopdatabricks/trustdocs/doc_analyzer_uploads/in/"
pdf_files = [os.path.join(pdf_dir, f) for f in os.listdir(pdf_dir) if f.endswith(".pdf")]

if not pdf_files:
    print(f"No PDF files found in directory: {pdf_dir}")

# NOTE: this is a (possibly empty) one-element list, not a single path.
pdf_path = pdf_files[:1]

# Clear the CUDA caching allocator before loading the models.
torch.cuda.empty_cache()

# Model identifiers, each defined once so that changing a variable here
# actually changes what RADE loads.
gliner = "knowledgator/gliner-multitask-large-v0.5"
colpali_model = "vidore/colpali-v1.3"
qa_model = "deepset/roberta-base-squad2"

# Instantiate RADE
rade = RADE(
    retrieval_model_name=colpali_model,
    qa_model_name=qa_model,
    entity_extraction_model=gliner,
    use_flash_attention=False,
    max_pages=3,
)

In [0]:
# Pick one PDF from the upload directory, register it with RADE and build the
# retrieval index.

# Directory containing PDF files
pdf_dir = "/Volumes/dopdatabricks/trustdocs/doc_analyzer_uploads/in/"

# os.listdir() returns entries in arbitrary order, so sort the listing to make
# the positional selection below refer to the same file on every run.
pdf_files = sorted(
    os.path.join(pdf_dir, f) for f in os.listdir(pdf_dir) if f.endswith(".pdf")
)

# 0-based position, in the sorted listing, of the PDF to analyse.
pdf_index = 2

# Fail with an explicit message instead of printing and then hitting a bare
# IndexError on the lookup.
if not pdf_files:
    raise FileNotFoundError(f"No PDF files found in directory: {pdf_dir}")
if pdf_index >= len(pdf_files):
    raise IndexError(
        f"PDF index {pdf_index} is out of range: only {len(pdf_files)} "
        f"PDF file(s) found in directory: {pdf_dir}"
    )

pdf_path = pdf_files[pdf_index]

# NOTE(review): the second argument is presumably a document id — confirm
# against RADE.add_document.
rade.add_document(pdf_path, 1)
rade.build_index()


In [0]:
# Retrieval prompts asking for the parties named in the trust document.
entity_queries = [
    "GRANTOR: what are the Grantors names? (Grantor)",
    "TRUSTEE: what are the Trustees names? (Trustee)",
    "Successor TRUSTEE: what are the Successor Trustee names (SUCCESSOR TRUSTEE)?",
    "BENEFICIARIES: what are the beneficiaries names (BENEFICIARIES)?",
    "SUCCESSOR BENEFICIARIES: what are the Successor beneficiaries names (SUCCESSOR Beneficiary)? "
]

# Free-form questions about the trust itself.
qa_queries = [
    "What is the name of this trust?",
    "What is the date of this trust?",
    "Is this trust revocable or irrevocable?",
]

# Entity labels, one group per query. Groups are matched to queries by
# position in the order entity_queries + qa_queries, so the first five groups
# belong to entity_queries and the last three to qa_queries.
labels_list = [
    ["Grantor names"],
    ["Trustee Names"],
    ["Successor Trustee Names"],
    ["Beneficiary names"],
    ["Successor Beneficiary Names"],
    ["Trust Name"],
    ["Trust Date"],
    ["Revocable", "Irrevocable"],
]

# Catch a query added without its label group (or vice versa) here, rather
# than as an IndexError in the middle of the extraction loop.
assert len(labels_list) == len(entity_queries) + len(qa_queries), (
    "labels_list must contain exactly one label group per query"
)

all_queries_dict = {
    "entity_queries": entity_queries,
    "qa_queries": qa_queries
}


In [0]:
# Flatten both query groups into one list: entity queries first, then QA queries.
all_queries = [
    *all_queries_dict["entity_queries"],
    *all_queries_dict["qa_queries"],
]


In [0]:
# Retrieve pages for every query in one call; the result is indexed by query
# position further down, so it must hold one entry per query.
search_result = rade.retrieve(all_queries)
assert len(all_queries) == len(search_result)

In [0]:
# One record per query with the QA model's answer.
qa_results = []
# Parsed page content keyed by page number, so a page retrieved for several
# queries is sent to the parser only once.
contexts_map = {}

for query_idx, query in enumerate(tqdm(all_queries, desc="Processing QA Queries")):
    page_texts = []
    page_labels = []

    for hit in search_result[query_idx]:
        page_num = hit.page.page_num
        page_labels.append(str(page_num))

        # Parse the page image only the first time this page number is seen.
        if page_num not in contexts_map:
            print(f"Parsing new page: {page_num}")
            contexts_map[page_num] = parse_azureDocIntell(hit.page.image)
        else:
            print(f"Using cached parsed result for page: {page_num}")

        page_texts.append(contexts_map[page_num])

    # Space-joined context and page numbers (both empty strings when no page
    # was retrieved).
    combined_pages = " ".join(page_texts)
    page_nums = " ".join(page_labels)

    # Ask the QA model for an answer over the combined context.
    qa_answer = rade.run_qa_pipeline(query, combined_pages)

    qa_results.append({
        "query": query,
        "retrieved_pages": page_nums,
        "context": combined_pages,
        "RoBerta Answer": qa_answer
    })


In [0]:
# One record per query with the entities GLiNER extracted for its label group.
entity_results = []

for query_idx, query in enumerate(tqdm(all_queries, desc="Processing Entity Queries")):
    # Label group paired with this query by position.
    labels = labels_list[query_idx]

    page_texts = []
    page_labels = []

    for hit in search_result[query_idx]:
        page_num = hit.page.page_num
        page_labels.append(str(page_num))

        # Reuse the parse cached in contexts_map when this page was seen before.
        if page_num not in contexts_map:
            print(f"Parsing new page: {page_num}")
            contexts_map[page_num] = parse_azureDocIntell(hit.page.image)
        else:
            print(f"Using cached parsed result for page: {page_num}")

        page_texts.append(contexts_map[page_num])

    # Space-joined context and page numbers (both empty strings when no page
    # was retrieved).
    combined_pages = " ".join(page_texts)
    page_nums = " ".join(page_labels)

    # Run entity extraction over the combined context for this label group.
    entities = rade.extract_entities_with_gliner(combined_pages, labels)

    entity_results.append({
        "query": query,
        "retrieved_pages": page_nums,
        "context": combined_pages,
        "GLiNER Answer": entities
    })

In [0]:
import pandas as pd

# Turn each list of result records into its own DataFrame.
df_entities = pd.DataFrame(entity_results)
df_qa = pd.DataFrame(qa_results)

# Outer-join the two frames on 'query'. The GLiNER copy of 'context' is dropped
# first, so the single 'context' column in the result comes from the QA frame;
# other columns present in both frames get the '_gliner' / '_qa' suffixes.
entities_without_context = df_entities.drop(columns='context', errors='ignore')
combined_df = entities_without_context.merge(
    df_qa,
    how='outer',
    on='query',
    suffixes=('_gliner', '_qa'),
)

combined_df.head(10)

In [0]:
import os

# Results go into a local "output" folder, created on demand.
output_dir = "output"
os.makedirs(output_dir, exist_ok=True)

# Name the CSV after the analysed PDF: "<pdf basename without extension>_results.csv".
pdf_stem = os.path.splitext(os.path.basename(pdf_path))[0]
output_filename = pdf_stem + "_results.csv"
output_file_path = os.path.join(output_dir, output_filename)

# Write the merged results without the DataFrame index column.
combined_df.to_csv(output_file_path, index=False)

print(f"Combined DataFrame saved to: {output_file_path}")