In [1]:
! pip install -q litellm tqdm PyMuPDF Pillow instructor openai python-dotenv weaviate-client pandas

In [2]:
import os 
from dotenv import load_dotenv

load_dotenv()

In [3]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import fitz
import io
import base64
import time
from litellm import completion
from PIL import Image

def pdf_to_base64_images(pdf_path):
    base64_images = []

    pdf_document = fitz.open(pdf_path)
    
    def process_page(page):
        pix = page.get_pixmap()
        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
        buffered = io.BytesIO()
        img.save(buffered, format="PNG")
        return base64.b64encode(buffered.getvalue()).decode()
        
    with ThreadPoolExecutor() as executor:
        pages = [pdf_document[i] for i in range(len(pdf_document))]
        base64_images = list(executor.map(process_page, pages))
    pdf_document.close()
    return base64_images

* 'fields' has been removed


In [46]:
def process_summary_from_image(base64_str, max_retries=3, retry_delay=5):
    message = [{
    "role": "system",
    "content": """
    You will be given an image containing text. Your task is to accurately transcribe all the text from this image. Pay special attention to dates, facts, events, names, locations, and numbers.

    Follow these steps to complete the task:

    1. Carefully examine the entire image.
    2. Transcribe all visible text exactly as it appears in the image.
    3. If any text is unclear or illegible, do not attempt to guess or fill in information. Instead, indicate unclear text with [unclear] in your transcription.
    4. Pay particular attention to dates, facts, events, names, locations, and numbers. Ensure these are transcribed accurately.
    5. If the order of information in the image is not clear, think step by step about the logical flow of the content. Arrange the transcribed information in a relevant and coherent order.
    6. Do not add any information that is not present in the image.
    7. Do not include any preamble or explanation about the transcription process in your response.
    8. For diagrams or charts:
        Describe the type of visual element (e.g., flowchart, bar graph, pie chart)
        Explain the layout and structure
        Detail any labels, legends, or keys
        Describe the relationships or data shown
        If applicable, explain the sequence or flow of information

    Your response should only contain the transcribed content from the image, organized in a logical manner if necessary.

    If you encounter any issues or if the image is not clear enough to transcribe, explain the problem instead of providing a transcription.
    """
}, {
    "role": "user",
    "content": [{
        "type": "image_url", "image_url": {
            "url": f"data:image/jpeg;base64,{base64_str}"
        }},
        {"type": "text", "text": "Please transcribe all the text from this image, emphasizing any dates found."}
    ]
}]

    for attempt in range(max_retries):
        try:
            response = completion(
                model="openrouter/google/gemini-flash-1.5",
                messages=message,
                temperature=0.2,
            )
            return response.choices[0].message.content
        except Exception as e:
            if attempt < max_retries - 1:
                print(f"APIError occurred: {str(e)}. Retrying in {retry_delay} seconds...")
                time.sleep(retry_delay)
            else:
                print(f"Max retries reached. Unable to process image.")
                return f"Error processing image: {str(e)}"

In [47]:
def process_images_to_summary(base64_images):
    def process_image_with_index(args):
        index, base64_str = args
        processed_summary = process_summary_from_image(base64_str)
        return index, processed_summary

    summary = [None] * len(base64_images)
    with ThreadPoolExecutor(max_workers=30) as executor:
        futures = [executor.submit(process_image_with_index, (i, img)) for i, img in enumerate(base64_images)]
        
        for future in tqdm(as_completed(futures), total=len(base64_images), desc="Processing images"):
            index, processed_summary = future.result()
            summary[index] = processed_summary
    
    return summary

In [48]:
def prepare_data_for_ingestion(summary, file_path):
    data_to_ingest = []
    contents = []

    for page_num, result in enumerate(summary, start=1):
        # print(f"Page {page_num}:")
        # print(result)
        # print("\n+++\n")
        data_to_ingest.append({
            "page_number": page_num,
            "text": result,
            "source": file_path
        })

    for item in data_to_ingest:
        page_number = f"### Page Number: [PG:{item['page_number']}]\n ### Source Document: {item['source']}\n\n"
        contents.append(page_number + item['text'] + "\n\n\n")
    
    # Save the contents to a file
    

    return data_to_ingest, contents

In [49]:
import instructor
from pydantic import BaseModel, Field
from openai import OpenAI


class PageRange(BaseModel):
    start: int = Field(..., description="Starting page number")
    end: int = Field(..., description="Ending page number")

class Segment(BaseModel):
    heading: str = Field(..., description="The heading or title of the section")
    description: str = Field(..., description="A brief description or summary of the section")
    page_range: PageRange = Field(..., description="Page range for this section")
    
client = instructor.from_openai(OpenAI())

In [50]:
def process_chunk(chunk_data,client):
    i, text_chunk, step, start_page, end_page = chunk_data

    SYSTEM_PROMPT = """
You are a highly skilled legal analysis AI specializing in case document segmentation. 
Your task is to analyze and segment a legal document, providing a comprehensive breakdown of its contents with accurate page citations.

Your objective is to divide this document into logical sections based on content and structure, ensuring complete coverage of all pages.

Instructions:
1. Read through the entire document carefully.
2. Identify logical sections based on the content and structure of the document.
3. For each section:
a. Determine the section's content and create an appropriate title.
b. Identify the page number range where the section is located.
c. Format the section with its title and page citation.
4. Ensure comprehensive coverage of the entire document, from page 1 through the final page.
5. The page number in the document is provided at the top of each page in the format "### Page Number: [PG:X]". Always refer to these page numbers in your response to accurately reference the source of information.
6. Do not rely on any index section or table of contents for page numbers. Always use the  "### Page Number: [PG:X]" format provided at the top of each page for accurate page references.
7. Output only factual, documentable information with accurate page references.
"""
    
    USER_PROMPT = f"""
    Please provide a detailed segmentation of the legal document for pages {start_page} to {end_page}.

    Important:
    - The page number in the document is provided at the top of each page in the format "### Page Number: [PG:X]". Always refer to these page numbers in your response to accurately reference the source of information.
    - Do not rely on any index section or table of contents for page numbers. Always use the  "### Page Number: [PG:X]" format provided at the top of each page for accurate page references.

    <Document>
        {text_chunk}
    </Document>

    Avoid any preamble or introduction.
    """
    
    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": USER_PROMPT}
            ],
            response_model=list[Segment]
        )
        response_segments = response

        chunk_segments = []
        for segment in response_segments:
            page_range = segment.page_range
            chunk_segments.append({
                "heading": segment.heading,
                "description": segment.description,
                "page_range": {
                    "start": page_range.start,
                    "end": page_range.end
                }
            })
        return chunk_segments
    
    except Exception as e:
        print(f"Error processing chunk {i//step + 1}: {str(e)}")
        return []

def process_content_instructor(content_array, client, chunk_size=200, overlap=20, max_workers=10):
    all_segmentations = []
    step = chunk_size - overlap

    chunks = []
    total_pages = len(content_array)
    print(f"Total pages: {total_pages}")

    for i in range(0, len(content_array), step):
        chunk_end = min(i + chunk_size, len(content_array))
        text_chunk = content_array[i:chunk_end]
        start_page = (i // step) * (chunk_size - overlap) + 1
        end_page = min(start_page + chunk_size - 1, total_pages)
        
        chunks.append((i, text_chunk, step, start_page, end_page))

    ordered_results = {}

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(process_chunk, chunk_data,client): chunk_data[0] for chunk_data in chunks}
        
        with tqdm(total=len(chunks), desc="Processing chunks") as pbar:
            for future in as_completed(futures):
                chunk_index = futures[future]
                chunk_segments = future.result()
                ordered_results[chunk_index] = chunk_segments
                pbar.update(1)

    for i in range(0, len(content_array), step):
        if i in ordered_results:
            all_segmentations.extend(ordered_results[i])
    
    return all_segmentations

In [51]:
def process_segmentation(contents, segmentation, file_path):
    pages = "\n".join(contents).split("### Page Number:")
    pages = ["### Page Number:" + page for page in pages[1:]] 
    pages = [page.strip() for page in pages if page.strip()]

    data = []
    for section in segmentation:

        page_content = ""
        for page in pages[section['page_range']['start']-1:section['page_range']['end']]:
            page_content += page + "\n\n"
        
        data.append({
            "heading": section['heading'],
            "page_range": section['page_range'],
            "content": page_content,
            "source": file_path
        })
        
        # print(data)
        # print("\n\n")
    
    return data

In [52]:
def create_or_get_collection(client, name, generative_model):
    """
    Create a Weaviate collection with specified configurations or return an existing one.

    Args:
    client: Weaviate client instance
    name (str): Name of the collection
    vectorizer (str): Name of the vectorizer to use
    generative_model (str): Name of the generative model to use

    Returns:
    Collection: The created or existing Weaviate collection
    
    """
    try:
        print("Creating a new collection")
        # Try to create a new collection
        return client.collections.create(
            name=name,
            vectorizer_config=Configure.Vectorizer.text2vec_voyageai(),
            generative_config=Configure.Generative.openai(model=generative_model)
        )
    except weaviate.exceptions.UnexpectedStatusCodeException:
        print("Collection already exists")
        # If creation fails (likely because it already exists), get the existing collection
        return client.collections.get(name)


def add_data_to_collection(collection, data):
    """
    Add data to the specified Weaviate collection.

    Args:
    collection: Weaviate collection object
    data: List of documents to be added

    Returns:
    None
    """
    for document in data:
        try:
            # Insert the document into the collection
            uuid = collection.data.insert(document)
            # print(f"Successfully inserted document with UUID: {uuid}")
        except Exception as e:
            print(f"Error inserting document: {str(e)}")

    # Ensure all data is indexed
    # collection.data.flush()
    print(f"All documents have been added to the {collection.name} collection.")

In [53]:
def get_weaviate_client():
    # Best practice: store your credentials in environment variables
    wcd_url = os.environ["WCD_URL"]
    wcd_api_key = os.environ["WCD_API_KEY"]
    openai_api_key = os.environ["OPENAI_API_KEY"]
    voyageai_api_key = os.environ["VOYAGEAI_API_KEY"]

    headers = {
        "X-VoyageAI-Api-Key": voyageai_api_key,
        "X-OpenAI-Api-Key": openai_api_key
    }

    client = weaviate.connect_to_weaviate_cloud(
        cluster_url=wcd_url,
        auth_credentials=Auth.api_key(wcd_api_key),
        headers=headers,
    )

    if client.is_ready():
        print("Weaviate client is ready")
        return client
    else:
        print("Weaviate client is not ready")
        return None



In [58]:
import weaviate
from weaviate.classes.init import Auth
from weaviate.classes.config import Configure

workspace_name = "Finance_Bench_2"
pdf_files = [f for f in os.listdir(os.getcwd()) if "ACTIVSIONBLIZZARD_2023Q2_10Q" in f.upper()]

# Add TQDM progress bar

for pdf_file in pdf_files:
    print(f"Processing {pdf_file}")
    pdf_path = os.path.join(os.getcwd(), pdf_file) 
    base64_images = pdf_to_base64_images(pdf_path)
    summary = process_images_to_summary(base64_images)
    data_to_ingest, contents = prepare_data_for_ingestion(summary, pdf_path)
    segments = process_content_instructor(contents, client,chunk_size=75, overlap=10, max_workers=10)
    final_data = process_segmentation(contents, segments, pdf_path)
    client_weaviate = get_weaviate_client()
    collection = create_or_get_collection(client_weaviate, workspace_name, "gpt-4o-mini")
    add_data_to_collection(collection, final_data)
    client_weaviate.close()  

Processing ACTIVSIONBLIZZARD_2023Q2_10Q.pdf


Processing images: 100%|██████████| 75/75 [00:25<00:00,  2.92it/s]


Total pages: 75


Processing chunks: 100%|██████████| 2/2 [00:26<00:00, 13.07s/it]


Weaviate client is ready
Creating a new collection
Collection already exists
All documents have been added to the Finance_Bench_2 collection.


In [92]:
def get_answer_for_question(Question,client):
    
    # Get the collection
    questions = client.collections.get(WorkSpace_name)
    
    # Query the collection
    response = questions.query.hybrid(
        query=Question,
        limit=5,
        alpha=0.5
    )
    # client.close()
    # Convert Weaviate objects to text
    def weaviate_objects_to_text(objects):
        text = ""
        for obj in objects:
            text += f"<Content>\n{obj.properties['content']}\n</Content>\n\n"
        return text

    llm_input = weaviate_objects_to_text(response.objects)

    # print(llm_input)

    SYSTEM_PROMPT = """
You are a specialized Financial Analysis AI focused on extracting concise, data-driven insights from financial documents. Provide short, precise answers with a brief explanation of your reasoning (thinking process). Follow these guidelines:

1. **Accuracy & Notation**  
   - Present exact numeric values with proper units (e.g., $1.5B, 15%).  
   - Maintain precision; use standard financial terminology.  

2. **Context & Citations**  
   - Cite specific pages/sources (e.g., [PG:X, Source: DocumentName]) and relevant fiscal periods.  
   - Clearly distinguish between Income Statements, Balance Sheets, and Cash Flow statements.  

3. **Key Metrics & Trends**  
   - Prioritize critical KPIs and ratios (e.g., revenue, profit margins).  
   - Compare current figures to previous periods (YoY changes) when possible.  

4. **Brief Analytical Structure**  
   - Begin with key figures (bullet points).  
   - Provide a short thinking process (a few bullets) for how you reached conclusions.  
   - End with a concise final answer or insight.  

5. **Data Gaps & Risks**  
   - Note missing data, adjustments, unusual items, or restatements.  
   - Mention relevant risk disclosures or one-time events.  

6. **Regulatory & Compliance**  
   - Specify if figures follow GAAP/IFRS.  
   - Flag any compliance or restatement issues.  

**Remember**:  
- Stick strictly to information from the documents.  
- Keep responses focused on numerical data and short insights.  
- Avoid speculation or overly detailed elaboration.  
- Provide a brief but clear thinking process, then a concise final answer.
"""

    USER_PROMPT = f"""
Context:
<Context>
{llm_input}
</Context>

Question: 
<Question>
{Question}
</Question>

Please provide a concise, reader-friendly answer following these requirements:

1. Format:
   - Use proper markdown formatting with clear headings and subheadings
   - Start directly with the answer without any preamble or phrases like "Based on..."
   - Use bullet points or numbered lists for complex information

2. Citations:
   - Include citations for every factual statement
   - Use the format [PG:X, Source: document_name]
   - Extract page numbers and sources from the context headers:
     "### Page Number: [PG:X]"
     "### Source Document: Y"

3. Content:
   - Provide direct, concise answers suitable for FAQ format
   - Focus on accuracy and readability
   - Include only information found in the provided context
   - If information is incomplete or unclear, state this explicitly

"""

    response = completion(
        model="openrouter/google/gemini-flash-1.5",
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": USER_PROMPT}
        ]
    )
    print("Question: \n")
    print(Question)
    print("Answer: \n")
    print(response.choices[0].message.content.strip())
    return response.choices[0].message.content.strip()

# Example usage:
WorkSpace_name = workspace_name
question = "Is 3M a capital-intensive business based on FY2022 data?"
weaviate_client = get_weaviate_client()
answer = get_answer_for_question(question,weaviate_client)
print(answer)
weaviate_client.close()

Weaviate client is ready
Question: 

Is 3M a capital-intensive business based on FY2022 data?
Answer: 

## 3M Capital Intensity: FY2022

**Answer:** Yes, 3M is a capital-intensive business in FY2022.

**Reasoning:**

*   **High Capital Expenditures (CAPEX):** 3M's capital spending in FY2022 was $1,749 million [PG:39, Source: c:\Source\Finance_Bench\3M_2022_10K.pdf].  This represents a substantial investment in property, plant, and equipment (PP&E).
*   **Significant PP&E:**  The net book value of 3M's property, plant, and equipment was $9.178 billion at the end of FY2022 [PG:50, Source: c:\Source\Finance_Bench\3M_2022_10K.pdf].  This large PP&E base indicates a high level of capital investment already in place.
*   **Planned Future CAPEX:**  3M projected capital expenditures of $1.5 billion to $1.8 billion for 2023 [PG:39, Source: c:\Source\Finance_Bench\3M_2022_10K.pdf], continuing the trend of significant capital investment.

**Conclusion:** The combination of high current CAPEX, a s

In [93]:
import pandas as pd
df_questions = pd.read_json("financebench_open_source.jsonl", lines=True)

In [94]:
df_questions.head(3)

Unnamed: 0,financebench_id,company,doc_name,question_type,question_reasoning,domain_question_num,question,answer,justification,dataset_subset_label,evidence
0,financebench_id_03029,3M,3M_2018_10K,metrics-generated,Information extraction,,What is the FY2018 capital expenditure amount ...,$1577.00,The metric capital expenditures was directly e...,OPEN_SOURCE,[{'evidence_text': 'Table of Contents 3M Comp...
1,financebench_id_04672,3M,3M_2018_10K,metrics-generated,Information extraction,,Assume that you are a public equities analyst....,$8.70,"The metric ppne, net was directly extracted fr...",OPEN_SOURCE,[{'evidence_text': 'Table of Contents 3M Comp...
2,financebench_id_00499,3M,3M_2022_10K,domain-relevant,Logical reasoning (based on numerical reasoning),dg06,Is 3M a capital-intensive business based on FY...,"No, the company is managing its CAPEX and Fixe...",CAPEX/Revenue\nFixed Assets/Total Assets\nROA=...,OPEN_SOURCE,[{'evidence_text': '3M Company and Subsidiarie...


In [95]:
filtered_df = df_questions[df_questions["doc_name"].str.contains("AES|3M|ACTIVISIONBLIZZARD", case=False, na=False)]

In [96]:
len(filtered_df)

13

In [97]:
# Create a list to store answers
answers = []

# Process each question individually
for question in filtered_df['question']:
    # Create a new client for each question
    weaviate_client = get_weaviate_client()
    try:
        answer = get_answer_for_question(question, weaviate_client)
        answers.append(answer)
    finally:
        # Ensure client is closed even if there's an error
        weaviate_client.close()



Weaviate client is ready
Question: 

What is the FY2018 capital expenditure amount (in USD millions) for 3M? Give a response to the question by relying on the details shown in the cash flow statement.
Answer: 

### 3M FY2018 Capital Expenditure

$1,577 million

* This figure represents purchases of property, plant, and equipment (PP&E) during fiscal year 2018. [PG:46, Source: c:\Source\Finance_Bench\3M_2018_10K.pdf]
* The Consolidated Statement of Cash Flows shows this amount as a negative value, indicating a cash outflow related to capital investments. [PG:46, Source: c:\Source\Finance_Bench\3M_2018_10K.pdf]
Weaviate client is ready
Question: 

Assume that you are a public equities analyst. Answer the following question by primarily using information that is shown in the balance sheet: what is the year end FY2018 net PPNE for 3M? Answer in USD billions.
Answer: 

### 3M FY2018 Net PP&E

$8.7B

* This figure represents the net property, plant, and equipment for 3M at the end of fiscal 

In [78]:
answers

['# 3M Capital Expenditure (FY2018)\n\n$1,577 million\n\nThis figure is shown as "Purchases of property, plant and equipment (PP&E)" in the Consolidated Statement of Cash Flows [PG:46, Source: 3M_2018_10K.pdf].\n\nFor additional context:\n- The capital expenditure (PP&E purchases) increased from $1,373 million in 2017\n- Primary focus was on growth, productivity and manufacturing investments [PG:47, Source: 3M_2018_10K.pdf]',
 '# Net Property, Plant and Equipment (PPNE) - FY2018\n\n**Answer: $8.738 billion**\n\n## Supporting Data\nNet PPNE can be found directly on 3M\'s balance sheet as "Property, plant and equipment - net" of $8.738 billion as of December 31, 2018 [PG:58, Source: 3M_2018_10K.pdf].\n\n## Calculation Components\n- Gross property, plant and equipment: $24.873 billion\n- Less: Accumulated depreciation: ($16.135) billion\n- Net PPNE: $8.738 billion\n[PG:58, Source: 3M_2018_10K.pdf]',
 "# Analysis of 3M's Capital Intensity (FY2022)\n\n## Key Metrics\n* Capital expenditures:

In [79]:
i = 0
# Print each row's information
for idx, row in filtered_df.iterrows():
    print(f"\nQuestion {i+1}:")
    print("Q:", row['question'])
    print("Reference Answer:", row['answer'])
    print("Generated Answer:", answers[i])
    i = i + 1
    print("-" * 80)  # Print a separator line


Question 1:
Q: What is the FY2018 capital expenditure amount (in USD millions) for 3M? Give a response to the question by relying on the details shown in the cash flow statement.
Reference Answer: $1577.00
Generated Answer: # 3M Capital Expenditure (FY2018)

$1,577 million

This figure is shown as "Purchases of property, plant and equipment (PP&E)" in the Consolidated Statement of Cash Flows [PG:46, Source: 3M_2018_10K.pdf].

For additional context:
- The capital expenditure (PP&E purchases) increased from $1,373 million in 2017
- Primary focus was on growth, productivity and manufacturing investments [PG:47, Source: 3M_2018_10K.pdf]
--------------------------------------------------------------------------------

Question 2:
Q: Assume that you are a public equities analyst. Answer the following question by primarily using information that is shown in the balance sheet: what is the year end FY2018 net PPNE for 3M? Answer in USD billions.
Reference Answer: $8.70
Generated Answer: # Net