In [1]:
import os
from dotenv import load_dotenv 
import json
from unstructured_client import UnstructuredClient
from unstructured_client.models import shared, operations
from unstructured_client.models.errors import SDKError
from unstructured.staging.base import dict_to_elements, elements_to_json
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_ollama import ChatOllama
import uuid
from langchain_ollama import OllamaEmbeddings
from langchain.vectorstores import Chroma
from langchain.storage import InMemoryStore
from langchain.schema.document import Document
from langchain_openai import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings

from langchain.retrievers.multi_vector import MultiVectorRetriever
# Load settings from a local .env file so the os.getenv() lookups below can find them.
load_dotenv()

True

## Unstructured API call

In [2]:
# Credentials and the API endpoint come from the environment; each value is
# None when the corresponding variable is not set.
unstructured_api_key = os.environ.get("UNSTRUCTURED_API_KEY")
unstructured_api_url = os.environ.get("UNSTRUCTURED_API_URL")
openai_api_key = os.environ.get("OPENAI_API_KEY")

In [3]:
# Only confirm that the key was loaded. Never display the key itself: cell outputs
# are saved with the notebook and travel wherever the file is shared or committed.
# NOTE: any key that was ever printed or pasted into a notebook must be revoked
# and replaced with a freshly generated one.
openai_api_key is not None

True

## Define a client

In [3]:
# Client for the Unstructured partition API, configured from the environment values read above.
client = UnstructuredClient(
    server_url=unstructured_api_url,
    api_key_auth=unstructured_api_key,
)

## Load a PDF File 

In [4]:
pdf_path = "statement.pdf"

# Read the whole PDF into memory, then wrap the bytes in the SDK's upload payload.
with open(pdf_path, "rb") as pdf_file:
    pdf_bytes = pdf_file.read()

files = shared.Files(content=pdf_bytes, file_name=pdf_path)

## Send the partition request to the API for the PDF

In [5]:
# Build the partition request: hi_res strategy with the yolox model, table
# structure inference enabled, no table types skipped, and coordinates requested.
req = operations.PartitionRequest(
    partition_parameters=shared.PartitionParameters(
        files=files,
        strategy="hi_res",
        hi_res_model_name="yolox",
        skip_infer_table_types=[],
        pdf_infer_table_structure=True,
        coordinates=True,
    )
)
try:
    resp = client.general.partition(request=req)
except SDKError as e:
    # Report the failure and stop here: every later cell needs `elements`, so
    # carrying on would only resurface as a confusing NameError further down.
    print(e)
    raise
else:
    # Convert the raw response dictionaries into unstructured element objects.
    elements = dict_to_elements(resp.elements)

INFO: HTTP Request: GET https://api.unstructuredapp.io/general/docs "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.unstructuredapp.io/general/v0/general "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.unstructuredapp.io/general/v0/general "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.unstructuredapp.io/general/v0/general "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.unstructuredapp.io/general/v0/general "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.unstructuredapp.io/general/v0/general "HTTP/1.1 200 OK"


### Element categories found in the PDF
1. Table
2. Title
3. Footer
4. ListItem
5. NarrativeText
6. UncategorizedText
7. Image

In [6]:
# Collect the distinct category labels present among the partitioned elements.
elements_category = {item.category for item in elements}
    

In [7]:
# Display the distinct element categories collected in the previous cell.
elements_category

{'Footer',
 'Image',
 'ListItem',
 'NarrativeText',
 'Table',
 'Title',
 'UncategorizedText'}

## Extract only the table elements from the PDF's elements

In [8]:
# Keep only the elements whose category is "Table".
table_elements = [
    candidate
    for candidate in elements
    if candidate.category == "Table"
]

# Length of the table elements

This is the number of tables found in the PDF.

In [9]:
# Number of elements that were categorised as tables.
len(table_elements)

49

In [10]:
# Display the table element objects (shown as object reprs, not table contents).
table_elements

[<unstructured.documents.elements.Table at 0x11fd16cc0>,
 <unstructured.documents.elements.Table at 0x12b0f0c20>,
 <unstructured.documents.elements.Table at 0x12ba0ddf0>,
 <unstructured.documents.elements.Table at 0x12b6d3ad0>,
 <unstructured.documents.elements.Table at 0x12b6f2630>,
 <unstructured.documents.elements.Table at 0x12b6f2900>,
 <unstructured.documents.elements.Table at 0x12bc42000>,
 <unstructured.documents.elements.Table at 0x12bc43080>,
 <unstructured.documents.elements.Table at 0x12bc439b0>,
 <unstructured.documents.elements.Table at 0x12bc43b60>,
 <unstructured.documents.elements.Table at 0x12bc43fb0>,
 <unstructured.documents.elements.Table at 0x12bc58290>,
 <unstructured.documents.elements.Table at 0x12bc59040>,
 <unstructured.documents.elements.Table at 0x12bc59a30>,
 <unstructured.documents.elements.Table at 0x12bc59dc0>,
 <unstructured.documents.elements.Table at 0x12bc5a300>,
 <unstructured.documents.elements.Table at 0x12bc5a4b0>,
 <unstructured.documents.elemen

# Generate a title and headers using LLM for the table elements

To do this, first convert all the table elements into HTML format.

In [11]:
# HTML rendering of every table, in the same order as table_elements.
table_html = [
    table_element.metadata.text_as_html
    for table_element in table_elements
]

In [12]:
# Sanity check: one HTML string per table element.
len(table_html) == len(table_elements)

True

In [14]:
# Preview of the HTML for inspection; note that despite the singular name the
# slice keeps the first two tables.
first_table = table_html[:2]
first_table

['<table><thead><tr><th>ndex</th><th>Page No.</th></tr></thead><tbody><tr><td>“ondensed Consolidated Balance Sheet</td><td>1</td></tr><tr><td>“ondensed Consolidated Statement of Profit and Loss</td><td></td></tr><tr><td>“ondensed Consolidated Statement of Changes in Equity</td><td>E</td></tr><tr><td>“ondensed Consolidated Statement of Cash Flows</td><td>5</td></tr><tr><td colspan="2">)verview and Notes to the Interim Condensed Consolidated Financial Statements</td></tr><tr><td colspan="2">. Overview</td></tr><tr><td>1.1 Company overview</td><td></td></tr><tr><td>1.2 Basis of preparation of financial statements</td><td>R 7</td></tr><tr><td>1.3 Basis of consolidation</td><td>S</td></tr><tr><td>1.4 Use of estimates and judgments</td><td>7</td></tr><tr><td>1.5 Critical accounting estimates and judgments....</td><td>[ R 8</td></tr><tr><td colspan="2">. Notes to the Interim Condensed Consolidated Financial Statements</td></tr><tr><td>2.1 Business Combinations</td><td>10</td></tr><tr><td>2.2 

In [13]:
# Prompt asking the model for a summary of one table document followed by a
# title derived from that summary. The "- Summary:" / "- Title:" output format
# is what the parsing cell further down relies on.
prompt = ChatPromptTemplate.from_template(
    """
    Given the following document containing tables, perform the following tasks:
    1. Analyze the data in the tables and generate a comprehensive summary that highlights the key insights, trends, or patterns present in the data. Ensure the summary is concise, informative, and easy to understand.
    2. From the generated summary, extract and provide a meaningful and descriptive title that encapsulates the core findings.

    The output should follow this format:
    - Summary: <Provide the summary here>
    - Title: <Provide the title here>
    
    Document: \n {doc}
    """
)



In [90]:
# Name of the OpenAI chat model and the client built from it; `llm` is the
# model used by the chains defined in this notebook.
model="gpt-3.5-turbo"
llm = ChatOpenAI(model = model)

In [111]:
# Extended prompt: besides a summary and a title, it asks the model for exactly
# three hypothetical questions that the table data could answer.
prompt1 = ChatPromptTemplate.from_template(
    """
    Given the document containing financial tables and data, complete the following tasks:

    1. **Analyze**: Summarize the key trends, patterns, and significant changes in the data. Focus on comparisons, reductions, and increases over the time periods presented. Ensure that the summary is concise, clear, and highlights the most impactful insights.
    
    2. **Title**: Extract a descriptive and meaningful title that reflects the core findings of the data analysis.

    3. **Hypothetical Questions**: Generate exactly 3 thoughtful and relevant hypothetical questions that could be answered by the data. These should explore potential causal relationships, trends, or broader implications based on the document’s content.

    **Output Format:**
    - **Summary**: <Generated summary here>
    - **Title**: <Generated title here>
    - **Questions**: <List of 3 hypothetical questions>

    Document: \n {doc}
    """
)


In [61]:
# Chain: pass the table HTML in as {doc}, fill prompt1, call the model, and
# return the reply as a plain string.
summary_question_chain = (
    {"doc": lambda table_doc: table_doc}
    | prompt1
    | llm
    | StrOutputParser()
)

In [62]:
# Run the chain over every table, with at most 5 requests in flight at once.
response_with_question = summary_question_chain.batch(
    table_html,
    config={"max_concurrency": 5},
)

INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.openai.com/v1/cha

In [63]:
# Display the raw model replies (summary, title and questions) for all tables.
response_with_question

['The document you provided contains a financial statement index, but unfortunately, it lacks specific data or figures necessary to conduct a comprehensive analysis. However, I\'ll still draft a generalized output based on the typical content of such documents.\n\n- **Summary**: The financial statements suggest an expansive overview of the company\'s financial health, covering key areas such as balance sheets, profit and loss statements, and cash flows. Significant insights include potential trends in revenue changes, asset valuations, and liabilities. Any notable increase or decrease in areas like revenue from operations, provisions, and liabilities likely represents the company\'s operational scalability and financial risk management. The condensed balance sheet may reveal changes in liquidity and investment patterns, while the notes could provide valuable explanations for any observed trends or anomalies.\n\n- **Title**: "Comprehensive Financial Trends and Insights in Quarterly Fina

In [15]:
# Chain: pass the table HTML in as {doc}, fill the summary/title prompt, call
# the model, and return the reply as a plain string.
summary_chain = (
    {"doc": lambda table_doc: table_doc}
    | prompt
    | llm
    | StrOutputParser()
)


In [16]:
# Inspect the summary/title prompt template object.
prompt

ChatPromptTemplate(input_variables=['doc'], input_types={}, partial_variables={}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['doc'], input_types={}, partial_variables={}, template='\n    Given the following document containing tables, perform the following tasks:\n    1. Analyze the data in the tables and generate a comprehensive summary that highlights the key insights, trends, or patterns present in the data. Ensure the summary is concise, informative, and easy to understand.\n    2. From the generated summary, extract and provide a meaningful and descriptive title that encapsulates the core findings.\n\n    The output should follow this format:\n    - Summary: <Provide the summary here>\n    - Title: <Provide the title here>\n    \n    Document: \n {doc}\n    '), additional_kwargs={})])

In [17]:
# Inspect the composed chain (input mapping, prompt, model, output parser).
summary_chain

{
  doc: RunnableLambda(lambda x: x)
}
| ChatPromptTemplate(input_variables=['doc'], input_types={}, partial_variables={}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['doc'], input_types={}, partial_variables={}, template='\n    Given the following document containing tables, perform the following tasks:\n    1. Analyze the data in the tables and generate a comprehensive summary that highlights the key insights, trends, or patterns present in the data. Ensure the summary is concise, informative, and easy to understand.\n    2. From the generated summary, extract and provide a meaningful and descriptive title that encapsulates the core findings.\n\n    The output should follow this format:\n    - Summary: <Provide the summary here>\n    - Title: <Provide the title here>\n    \n    Document: \n {doc}\n    '), additional_kwargs={})])
| ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x12bce0350>, async_client=<openai.resources.cha

In [18]:
# Summarise and title every table, with at most 5 requests in flight at once.
response = summary_chain.batch(
    table_html,
    config={"max_concurrency": 5},
)


INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.openai.com/v1/cha

In [19]:
# Display the raw "- Summary: ... - Title: ..." replies, one per table.
response

['- Summary: The table contains a list of sections from a financial document, along with corresponding page numbers. The sections include condensed consolidated financial statements, notes to the financial statements, and specific details on different financial assets, liabilities, equity, income, and expenses.\n- Title: Financial Document Structure and Analysis',
 '- Summary: The data in the tables presents a detailed breakdown of the Condensed Consolidated Balance Sheets as at March 31, 2024, compared to the previous year. The assets are divided into non-current assets and current assets, with significant changes observed in property, plant and equipment, investments, and trade receivables. On the other hand, the equity and liabilities sections highlight variations in equity share capital, other equity, lease liabilities, and trade payables. Overall, total assets, total equity, and total liabilities have shown growth compared to the previous year.\n- Title: Analysis of Condensed Cons

In [22]:
import re

# The summary runs from its marker up to the "- Title:" marker (or the end of the
# reply), so a summary spanning several lines is captured in full.
_SUMMARY_RE = re.compile(r"- Summary:\s*(.*?)(?=\n\s*- Title:|\Z)", re.DOTALL)
# The title is the remainder of the line that follows its marker.
_TITLE_RE = re.compile(r"- Title:\s*(.*)")


def _split_summary_and_title(entry):
    """Parse one model reply into ``(summary, title)``.

    Args:
        entry: Reply text expected to contain "- Summary: ..." and "- Title: ...".

    Returns:
        A 2-tuple of stripped strings; an item is ``None`` when its marker is
        missing from the reply.
    """
    summary_match = _SUMMARY_RE.search(entry)
    title_match = _TITLE_RE.search(entry)
    summary = summary_match.group(1).strip() if summary_match else None
    title = title_match.group(1).strip() if title_match else None
    return summary, title


# Separate lists for summaries and titles. Exactly one item is appended per reply,
# so position i in both lists always describes table i; dropping unparseable
# replies would shift every later summary/title onto the wrong table.
summaries = []
titles = []

for position, entry in enumerate(response):
    summary, title = _split_summary_and_title(entry)
    if summary is None or title is None:
        print(f"Warning: response {position} is not in the expected format; using fallback values.")
    # Fallbacks: the whole reply stands in for a missing summary, and a numbered
    # placeholder stands in for a missing title.
    summaries.append(summary if summary is not None else entry.strip())
    titles.append(title if title is not None else f"Untitled table {position + 1}")

# Output the variables
print("Summaries:", summaries)
print("Titles:", titles)

Summaries: ['The table contains a list of sections from a financial document, along with corresponding page numbers. The sections include condensed consolidated financial statements, notes to the financial statements, and specific details on different financial assets, liabilities, equity, income, and expenses.', 'The data in the tables presents a detailed breakdown of the Condensed Consolidated Balance Sheets as at March 31, 2024, compared to the previous year. The assets are divided into non-current assets and current assets, with significant changes observed in property, plant and equipment, investments, and trade receivables. On the other hand, the equity and liabilities sections highlight variations in equity share capital, other equity, lease liabilities, and trade payables. Overall, total assets, total equity, and total liabilities have shown growth compared to the previous year.', "The data in the tables represents the Condensed Consolidated Statement of Profit and Loss for a c

In [23]:
# Display all parsed summaries.
summaries

['The table contains a list of sections from a financial document, along with corresponding page numbers. The sections include condensed consolidated financial statements, notes to the financial statements, and specific details on different financial assets, liabilities, equity, income, and expenses.',
 'The data in the tables presents a detailed breakdown of the Condensed Consolidated Balance Sheets as at March 31, 2024, compared to the previous year. The assets are divided into non-current assets and current assets, with significant changes observed in property, plant and equipment, investments, and trade receivables. On the other hand, the equity and liabilities sections highlight variations in equity share capital, other equity, lease liabilities, and trade payables. Overall, total assets, total equity, and total liabilities have shown growth compared to the previous year.',
 "The data in the tables represents the Condensed Consolidated Statement of Profit and Loss for a company fo

In [24]:
# Summary parsed from the first reply.
summaries[0]

'The table contains a list of sections from a financial document, along with corresponding page numbers. The sections include condensed consolidated financial statements, notes to the financial statements, and specific details on different financial assets, liabilities, equity, income, and expenses.'

In [25]:
# Display all parsed titles.
titles

['Financial Document Structure and Analysis',
 'Analysis of Condensed Consolidated Balance Sheets: March 31, 2024 vs. March 31, 2023',
 'Financial Performance Analysis of Company for the Year Ended March 31',
 'Analysis of Equity Components and Changes in Equity for a Company',
 'Comprehensive Analysis of Cash Flow Activities for Years 2023 and 2024',
 'Financial Summary and Cash Flow Analysis of Business Operations',
 'Analysis of Acquisition Components and Purchase Price Allocation',
 'Depreciation Periods for Various Assets',
 'Analysis of Company Assets and Depreciation from January to March 2024',
 'Analysis of Company Assets and Depreciation Trends from January to March 2023',
 'Analysis of Asset Values and Depreciation Trends from April 2023 to March 2024',
 'Analysis of Asset Values and Changes from 2022 to 2023',
 'Analysis of Carrying Value Trends and Factors for Fiscal Years 2023 and 2024',
 'Analysis of Non-Current and Current Investments as of March 31, 2024 and 2023',
 'A

# process the data

In [26]:
from typing import Any
from pydantic import BaseModel

class Element(BaseModel):
    """A piece of extracted document content together with its generated title."""
    # Kind of content; set to "table" where instances are built in this notebook.
    type: str
    # The content itself; this notebook stores each table's HTML as a string.
    page_content: Any
    # Title associated with the content (taken from the parsed `titles` list).
    title: str


In [27]:

# Each table needs exactly one title. A mismatch means a model reply failed to
# parse, and pairing by position would then attach titles to the wrong tables.
assert len(titles) == len(table_html), (
    f"{len(titles)} titles for {len(table_html)} tables"
)

# Pair every table's HTML with its title and wrap both in an Element.
table_list = [
    Element(type="table", title=str(table_title), page_content=str(html))
    for table_title, html in zip(titles, table_html)
]


In [28]:
# Display the Element objects built for all tables.
table_list

[Element(type='table', page_content='<table><thead><tr><th>ndex</th><th>Page No.</th></tr></thead><tbody><tr><td>“ondensed Consolidated Balance Sheet</td><td>1</td></tr><tr><td>“ondensed Consolidated Statement of Profit and Loss</td><td></td></tr><tr><td>“ondensed Consolidated Statement of Changes in Equity</td><td>E</td></tr><tr><td>“ondensed Consolidated Statement of Cash Flows</td><td>5</td></tr><tr><td colspan="2">)verview and Notes to the Interim Condensed Consolidated Financial Statements</td></tr><tr><td colspan="2">. Overview</td></tr><tr><td>1.1 Company overview</td><td></td></tr><tr><td>1.2 Basis of preparation of financial statements</td><td>R 7</td></tr><tr><td>1.3 Basis of consolidation</td><td>S</td></tr><tr><td>1.4 Use of estimates and judgments</td><td>7</td></tr><tr><td>1.5 Critical accounting estimates and judgments....</td><td>[ R 8</td></tr><tr><td colspan="2">. Notes to the Interim Condensed Consolidated Financial Statements</td></tr><tr><td>2.1 Business Combinatio

In [79]:
# HTML of the table at index 16, picked for a visual spot check.
# NOTE(review): the index is hard-coded and presumably specific to this PDF — confirm.
first = table_list[16].page_content

In [80]:
# Import here so the cell also works on a fresh kernel run from top to bottom.
from IPython.display import Markdown

# Render the table's HTML so it is displayed as a formatted table.
Markdown(first)

<table><thead><tr><th rowspan="2">Particulars</th><th>Asat</th><th></th></tr><tr><th>March 31, 2024</th><th>March 31, 2023</th></tr></thead><tbody><tr><td colspan="3">Non Current</td></tr><tr><td>Security deposits</td><td>259</td><td>287</td></tr><tr><td># Unbilled revenues</td><td>1,677</td><td>1,185</td></tr><tr><td>) Net investment in sublease of right-of-use asset a</td><td>3</td><td>305</td></tr><tr><td>Restricted deposits "</td><td>47</td><td>96</td></tr><tr><td>Others</td><td>1,119</td><td>925</td></tr><tr><td>Total non-current other financial assets</td><td>3,105</td><td>2,798</td></tr><tr><td colspan="3">Current</td></tr><tr><td>Security deposits</td><td>75</td><td>42</td></tr><tr><td>Restricted deposits V"</td><td>2,535</td><td>2,348</td></tr><tr><td># Unbilled revenues</td><td>7,923</td><td>8,317</td></tr><tr><td>Interest accrued but not due</td><td>537</td><td>488</td></tr><tr><td>Foreign currency forward and options contracts</td><td>84</td><td>101</td></tr><tr><td>Net investment in sublease of right of-use-asset</td><td>6</td><td>53</td></tr><tr><td>Others V™"</td><td>925</td><td>255</td></tr><tr><td>Total current other financial assets</td><td>12,085</td><td>11,604</td></tr><tr><td>Total other financial assets</td><td>15,190</td><td>14,402</td></tr><tr><td>O Financial assets carried at amortized cost</td><td>15,106</td><td>14,301</td></tr><tr><td>@ Financial assets carried at fair value through other comprehensive income</td><td>23</td><td>32</td></tr><tr><td>) Financial assets carried at fair value through profit or loss</td><td>61</td><td>69</td></tr></tbody></table>

In [29]:
# Number of Element objects built (one per table).
len(table_list)


49

# store the summaries in the vectorDB

Chroma is a vector store; it is used for storing and retrieving vector embeddings.

How does Chroma DB work?
1. Data structure: organises data in an optimized, structured format
2. Storage
3. Indexing
4. Querying
5. Analysis

- InMemoryStore stores the raw text, tables
- vectorstore stores the embedded summaries

In [30]:
# Inspect the configured chat model (the API key is masked in its repr).
llm

ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x12bce0350>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x12bce1760>, root_client=<openai.OpenAI object at 0x1044d8830>, root_async_client=<openai.AsyncOpenAI object at 0x12bce0380>, model_kwargs={}, openai_api_key=SecretStr('**********'))

In [32]:
pip install -U langchain-openai langchain-chroma


Collecting langchain-openai
  Downloading langchain_openai-0.3.1-py3-none-any.whl.metadata (2.7 kB)
Collecting langchain-chroma
  Using cached langchain_chroma-0.2.0-py3-none-any.whl.metadata (1.7 kB)
Collecting chromadb!=0.5.10,!=0.5.11,!=0.5.12,!=0.5.4,!=0.5.5,!=0.5.7,!=0.5.9,<0.6.0,>=0.4.0 (from langchain-chroma)
  Using cached chromadb-0.5.23-py3-none-any.whl.metadata (6.8 kB)
Collecting tokenizers<=0.20.3,>=0.13.2 (from chromadb!=0.5.10,!=0.5.11,!=0.5.12,!=0.5.4,!=0.5.5,!=0.5.7,!=0.5.9,<0.6.0,>=0.4.0->langchain-chroma)
  Using cached tokenizers-0.20.3-cp312-cp312-macosx_11_0_arm64.whl.metadata (6.7 kB)
Downloading langchain_openai-0.3.1-py3-none-any.whl (54 kB)
Using cached langchain_chroma-0.2.0-py3-none-any.whl (11 kB)
Using cached chromadb-0.5.23-py3-none-any.whl (628 kB)
Using cached tokenizers-0.20.3-cp312-cp312-macosx_11_0_arm64.whl (2.6 MB)
Installing collected packages: tokenizers, langchain-openai, chromadb, langchain-chroma
  Attempting uninstall: tokenizers
    Found ex

In [33]:
# Embedding model used to vectorise the table summaries.
summary_embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

# Vector store that indexes the embedded summaries, persisted under ./chroma_data.
vectorstore = Chroma(
    collection_name="summaries",
    persist_directory="./chroma_data",
    embedding_function=summary_embeddings,
)





In [34]:
# Metadata key under which each summary document carries the id of its table.
id_key = "doc_id"

# In-memory store for the parent documents (the full tables).
store = InMemoryStore()

In [35]:
# Retriever wiring the summary vector store to the parent-document store via
# id_key; both stores are still empty at this point.
retriever = MultiVectorRetriever(
    docstore=store,
    vectorstore=vectorstore,
    id_key=id_key,
)

In [36]:
# One fresh UUID string per table.
table_ids = [str(uuid.uuid4()) for _ in range(len(table_list))]

# One Document per summary: the summary text is the content, and the metadata
# carries the id of the matching table plus its title.
summary_tables = []
for idx, summary_text in enumerate(summaries):
    summary_metadata = {id_key: table_ids[idx], "title": titles[idx]}
    summary_tables.append(
        Document(page_content=summary_text, metadata=summary_metadata)
    )

In [37]:
# Display the summary Documents that will be embedded.
summary_tables


[Document(metadata={'doc_id': '7b2d3e8b-54d2-4887-b7d5-3030fe4a51cf', 'title': 'Financial Document Structure and Analysis'}, page_content='The table contains a list of sections from a financial document, along with corresponding page numbers. The sections include condensed consolidated financial statements, notes to the financial statements, and specific details on different financial assets, liabilities, equity, income, and expenses.'),
 Document(metadata={'doc_id': 'b4e4bb8a-a7c8-4976-83fb-68881ecf1228', 'title': 'Analysis of Condensed Consolidated Balance Sheets: March 31, 2024 vs. March 31, 2023'}, page_content='The data in the tables presents a detailed breakdown of the Condensed Consolidated Balance Sheets as at March 31, 2024, compared to the previous year. The assets are divided into non-current assets and current assets, with significant changes observed in property, plant and equipment, investments, and trade receivables. On the other hand, the equity and liabilities sections

In [38]:
# Add the summary Documents to the vector store.
# NOTE(review): the store is created with a persist_directory, so re-running this
# cell presumably appends duplicate entries to the saved collection — confirm.
retriever.vectorstore.add_documents(summary_tables)

INFO: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


['11b5d637-a6c9-4e44-9c2d-0b47484e1fee',
 '14ef4c7b-61ef-4cae-b768-71699828c255',
 '697c3abd-7224-4e8c-9bf2-5a298f762f30',
 '16c29b9b-f26b-4a2d-8db3-d7b3c8291f79',
 '39961cfb-00fb-4a62-a3b6-c61cd39879ee',
 '563785fc-a629-4339-b42d-3f0c96336060',
 '636119d1-c0d6-46b3-b4f1-b273aed7fe16',
 '7bd0e285-175c-4490-83e9-bc219372634f',
 '03541133-be86-406c-bc72-c18b238b7146',
 'd3272647-a846-4e49-9a70-06c2fdbe8b0d',
 'f839b282-0637-4180-9067-f223b0d8a46a',
 '9af02f86-5415-4e96-9cf7-9e8ed0d51508',
 '4e9d7a74-1486-48aa-9075-ef1ccbe0f0ee',
 '7bfbaada-b211-4e55-bd70-2e97f9b4810f',
 'fc05b5a7-f323-47a8-a729-da1fd3840602',
 '30e7aa08-efa8-412f-b6f6-ac7b25e08fbd',
 '9c36df5d-5425-40c3-a4ee-23cf71c94d81',
 '173e42d3-60a9-4a18-9299-dbb23fde0f49',
 'fe3ce1e2-6a29-4124-bd42-1f3b43a8adfb',
 '26ef8fae-0604-4eaf-b772-979e68f4e53f',
 '21d2002d-55cf-47ff-9284-4ba04644fd99',
 'ac9af9d0-551b-4623-a707-f8da5ef27092',
 '7962c284-0f93-4d57-b740-a40232679040',
 'e3821138-7042-4912-b209-8a7be3b734fa',
 '41787417-d258-

Traceback (most recent call last):
  File "/Users/poorna/.vscode/extensions/ms-python.python-2024.14.0-darwin-arm64/python_files/python_server.py", line 130, in exec_user_input
    retval = callable_(user_input, user_globals)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "<string>", line 2
    def table_data(self, table_html: List[str], titles: List[str]) -> List[Element]:
IndentationError: unexpected indent



In [40]:
# Store every full table under the same id that its summary carries in metadata.
id_table_pairs = list(zip(table_ids, table_list))
retriever.docstore.mset(id_table_pairs)

In [41]:
# Query the retriever on its own (no answer generation yet) to inspect what it
# returns for a sample question.
retriever_first_response = retriever.invoke("what is the total equity value in march 2023")

INFO: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


In [42]:
# Show everything the retriever returned for the sample question.
print(retriever_first_response)

# Keep and show the content of the first returned item only.
retriever_first_response_page_content = retriever_first_response[0].page_content
print(retriever_first_response_page_content)

[Element(type='table', page_content='<table><thead><tr><th rowspan="2">Particulars</th><th colspan="2">(In &lt; crore, except as otherwise stated) As at</th></tr><tr><th>March 31, 2024</th><th>March 31, 2023</th></tr></thead><tbody><tr><td colspan="3">Authorized</td></tr><tr><td colspan="3">Equity shares, Z5/- par value</td></tr><tr><td>4,80,00,00,000 (4,80,00,00,000) equity shares</td><td>2,400</td><td>2,400</td></tr><tr><td colspan="3">Issued, Subscribed and Paid-Up</td></tr><tr><td>Equity shares, Z5/- par value®</td><td>2,071</td><td>2,069</td></tr><tr><td colspan="3">4,13,99,50,635 (4,13,63,87,925) equity shares fully paid-up®</td></tr><tr><td></td><td>2,071</td><td>2,069</td></tr></tbody></table>', title='Analysis of Equity Shares Data as of March 31, 2024'), Element(type='table', page_content='<table><thead><tr><th rowspan="2">Particulars</th><th rowspan="2">Equity \n Share capital</th><th rowspan="2">Capital \n reserve</th><th colspan="5">OTHER EQUITY</th><th></th><th colspan="4

In [43]:
from IPython.display import Markdown


# Render the first retrieved table's HTML so it is displayed as a formatted table.
Markdown(retriever_first_response_page_content)

<table><thead><tr><th rowspan="2">Particulars</th><th colspan="2">(In &lt; crore, except as otherwise stated) As at</th></tr><tr><th>March 31, 2024</th><th>March 31, 2023</th></tr></thead><tbody><tr><td colspan="3">Authorized</td></tr><tr><td colspan="3">Equity shares, Z5/- par value</td></tr><tr><td>4,80,00,00,000 (4,80,00,00,000) equity shares</td><td>2,400</td><td>2,400</td></tr><tr><td colspan="3">Issued, Subscribed and Paid-Up</td></tr><tr><td>Equity shares, Z5/- par value®</td><td>2,071</td><td>2,069</td></tr><tr><td colspan="3">4,13,99,50,635 (4,13,63,87,925) equity shares fully paid-up®</td></tr><tr><td></td><td>2,071</td><td>2,069</td></tr></tbody></table>

In [81]:
# Prompt that restricts the answer to the retrieved context.
# NOTE: this rebinds `prompt`, which earlier held the summary/title prompt.
template = """Answer the question based only on the following context, which can include text and tables:
{context}
Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

# Reuse the chat model configured earlier.
# NOTE: this rebinds `model`, which earlier held the model-name string.
model = llm

# RAG pipeline: the retriever supplies {context}, the question itself is passed
# through unchanged as {question}, then prompt -> model -> plain-string reply.
rag_inputs = {"context": retriever, "question": RunnablePassthrough()}
chain = rag_inputs | prompt | model | StrOutputParser()

In [93]:
# Ask the full RAG chain a question.
# NOTE(review): "march29" looks like a typo in the question (the recorded answer
# refers to March 31, 2023) — confirm the intended date before relying on this query.
chain.invoke("what is the total equity value in march29 2023")

INFO: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


'The total equity value as of March 31, 2023, is 75,795 crore.'