In [165]:
import os
from dotenv import load_dotenv 
import json
from unstructured_client import UnstructuredClient
from unstructured_client.models import shared, operations
from unstructured_client.models.errors import SDKError
from unstructured.staging.base import dict_to_elements, elements_to_json
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_ollama import ChatOllama
import uuid
from langchain_ollama import OllamaEmbeddings
from langchain.vectorstores import Chroma
from langchain.storage import InMemoryStore
from langchain.schema.document import Document

from langchain.retrievers.multi_vector import MultiVectorRetriever
load_dotenv()

True

## Unstructed api call

In [2]:
unstructured_api_key = os.getenv("UNSTRUCTURED_API_KEY")
unstructured_api_url = os.getenv("UNSTRUCTURED_API_URL")

## Define a cilent

In [3]:
client = UnstructuredClient(
    api_key_auth=unstructured_api_key,
    server_url=unstructured_api_url,
)

## Load a PDF File 

In [4]:
pdf_path="statement.pdf"


with open(pdf_path, "rb") as f:
    files = shared.Files(
        content=f.read(),
        file_name=pdf_path
    )

## Requests to api for PDF

In [7]:
req = operations.PartitionRequest(
    partition_parameters=shared.PartitionParameters(
        files=files,
        strategy="hi_res",
        hi_res_model_name="yolox",
        skip_infer_table_types=[],
        pdf_infer_table_structure=True,
        coordinates = True,
    )
)
try:
    resp = client.general.partition(request=req)
    elements = dict_to_elements(resp.elements)
except SDKError as e:
    print(e)

INFO: HTTP Request: GET https://api.unstructuredapp.io/general/docs "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.unstructuredapp.io/general/v0/general "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.unstructuredapp.io/general/v0/general "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.unstructuredapp.io/general/v0/general "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.unstructuredapp.io/general/v0/general "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.unstructuredapp.io/general/v0/general "HTTP/1.1 200 OK"


### Print element of the PDF
1. Table
2. Text
3. Footer
4. ListItem
5. NarrativeText
6. UncategorizedText

In [11]:
elements_category=set()
for el in elements:
    elements_category.add(el.category)
    

In [13]:
elements_category

{'Footer',
 'Image',
 'ListItem',
 'NarrativeText',
 'Table',
 'Title',
 'UncategorizedText'}

## Extract only the table element from the elements of the pdf

In [16]:
table_elements = [element for element in elements if element.category == "Table"]

# Length of the table elements

Means no of the table in a pdf

In [18]:
len(table_elements)

49

# Generate a title and headers using LLM for the table elements

To do this first convert all the table elemets convert ino the HTML Format

In [151]:
table_html = [table.metadata.text_as_html for table in table_elements]

In [23]:
len(table_html) == len(table_elements)

True

In [41]:
first_table = table_html[:2]
first_table

['<table><thead><tr><th>ndex</th><th>Page No.</th></tr></thead><tbody><tr><td>“ondensed Consolidated Balance Sheet</td><td>1</td></tr><tr><td>“ondensed Consolidated Statement of Profit and Loss</td><td></td></tr><tr><td>“ondensed Consolidated Statement of Changes in Equity</td><td>E</td></tr><tr><td>“ondensed Consolidated Statement of Cash Flows</td><td>5</td></tr><tr><td colspan="2">)verview and Notes to the Interim Condensed Consolidated Financial Statements</td></tr><tr><td colspan="2">. Overview</td></tr><tr><td>1.1 Company overview</td><td></td></tr><tr><td>1.2 Basis of preparation of financial statements</td><td>R 7</td></tr><tr><td>1.3 Basis of consolidation</td><td>S</td></tr><tr><td>1.4 Use of estimates and judgments</td><td>7</td></tr><tr><td>1.5 Critical accounting estimates and judgments....</td><td>[ R 8</td></tr><tr><td colspan="2">. Notes to the Interim Condensed Consolidated Financial Statements</td></tr><tr><td>2.1 Business Combinations</td><td>10</td></tr><tr><td>2.2 

In [46]:
# summary =  ChatPromptTemplate.from_template(
#     """
#     generate summary for the following tables given : \n {doc} 
#     """
# )

# title = ChatPromptTemplate.from_template(
#     """
#     provide a title of the given {summary}
    
#     """
# )



prompt = ChatPromptTemplate.from_template(
    """
    1. Generate a summary for the following tables given: \n {doc}
    2. Provide a title for the from the summary
    """
)


In [47]:
model="llama3:latest"
llm = ChatOllama(model = model)

In [48]:
summary_chain = (
    {"doc": lambda x: x}
    |prompt
    |llm
    |StrOutputParser()  
)


In [50]:
summary_chain

{
  doc: RunnableLambda(lambda x: x)
}
| ChatPromptTemplate(input_variables=['doc'], input_types={}, partial_variables={}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['doc'], input_types={}, partial_variables={}, template='\n    1. Generate a summary for the following tables given: \n {doc}\n    2. Provide a title for the from the summary\n    '), additional_kwargs={})])
| ChatOllama(model='llama3:latest')
| StrOutputParser()

In [65]:
response = summary_chain.batch(table_html,{"max_concurrency":2})



INFO: HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
INFO: HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
INFO: HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
INFO: HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
INFO: HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
INFO: HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
INFO: HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
INFO: HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
INFO: HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
INFO: HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
INFO: HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
INFO: HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
INFO: HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
INFO: HTTP Request: POST 

In [66]:
response

['**Summary:**\n\nThe table provides an overview of the company\'s financial statements, including its condensed consolidated balance sheet, statement of profit and loss, changes in equity, and cash flows. The "Overview" section covers the basis of preparation of financial statements, consolidation, use of estimates and judgments, and critical accounting estimates and judgments.\n\nThe "Notes to the Interim Condensed Consolidated Financial Statements" section provides additional information on various topics, including:\n\n* Business combinations\n* Property, plant, and equipment\n* Goodwill and other intangible assets\n* Investments\n* Loans\n* Other financial assets and liabilities\n* Trade receivables and cash and cash equivalents\n* Other assets and equity\n* Financial instruments\n* Provisions, income taxes, revenue from operations, and expenses\n\nThe notes also cover leases, contingent liabilities and commitments, related party transactions, segment reporting, and function-wise 

## Lengof th response

In [67]:
len(response)

49

In [186]:
# Initialize lists to store summaries and titles
summaries = []
titles = []

# Extract summary and title from each entry in the response
for entry in response:
    # Check if both Summary and Title delimiters exist
    if '**Summary:**' in entry and '**Title:**' in entry:
        # Split the entry at the '**Title:**' delimiter
        parts = entry.split('**Title:**')
        
        # Extract the summary (before '**Title:**')
        summary = parts[0].replace('**Summary:**', '').strip()
        
        # Extract the title (after '**Title:**')
        title = parts[1].strip()
        
        # Append to lists
        summaries.append(summary)
        titles.append(title)
    else:
        # Handle the case where the format is unexpected (missing title)
        summary = entry.replace('**Summary:**', '').strip()
        # Take the first 200 characters of the summary if no title is available
        title = summary[:230] if summary else "No title available"
        
        summaries.append(summary)
        titles.append(title)

# Now `summaries` and `titles` contain the extracted values
print("Summaries:", summaries)
print("Titles:", titles)


Summaries: ['The table provides an overview of the company\'s financial statements, including its condensed consolidated balance sheet, statement of profit and loss, changes in equity, and cash flows. The "Overview" section covers the basis of preparation of financial statements, consolidation, use of estimates and judgments, and critical accounting estimates and judgments.\n\nThe "Notes to the Interim Condensed Consolidated Financial Statements" section provides additional information on various topics, including:\n\n* Business combinations\n* Property, plant, and equipment\n* Goodwill and other intangible assets\n* Investments\n* Loans\n* Other financial assets and liabilities\n* Trade receivables and cash and cash equivalents\n* Other assets and equity\n* Financial instruments\n* Provisions, income taxes, revenue from operations, and expenses\n\nThe notes also cover leases, contingent liabilities and commitments, related party transactions, segment reporting, and function-wise class

In [188]:
len(summaries)
len(titles)

49

# process the data

In [189]:
from typing import Any
from pydantic import BaseModel

class Element(BaseModel):
    type: str
    page_content: Any
    title: str

# # Categorize by type
# categorized_elements = []

# for element in elements:
#         if "unstructured.documents.elements.Table" in str(type(element)):
#             categorized_elements.append(Element(type="table",year="2024",title = title , page_content=str(element.metadata.text_as_html)))


In [190]:
# categorized_elements = []

# # Assuming `elements` is a list of your elements and titles should be assigned from the titles list
# for i, element in enumerate(elements):
#     # Check if the element is of type 'Table' (or other types you want to categorize)
#     if "unstructured.documents.elements.Table" in str(type(element)):
#         # Extract the title from the titles list, using `i` to index the correct title
#         title = titles[i] if i < len(titles) else "Default Title"  # Fallback if the index is out of range
        
#         # Create an instance of Element and append to the categorized_elements list
#         categorized_elements.append(
#             Element(
#                 type="table",
#                 title=str(title),
#                 page_content=str(element.metadata.text_as_html)  # Extract HTML content as a string
#             )
#         )



In [191]:

categorized_elements = []

assert len(titles) == len(table_html)

# Iterate through the table_html list
for i, element in enumerate(table_html):
    # Retrieve the corresponding title from the titles list
    title = titles[i] if i < len(titles) else "Default Title"

    # Append the Element instance to the categorized_elements list
    categorized_elements.append(
        Element(
            type="table",
            title=str(title),         
            page_content=str(element)  
        )
    )


In [192]:
len(categorized_elements)


49

# store the summaries in the vectorDB

Chroma is vector store, it is used for storing and retriveing vector embeedings. 

Hew does chroma DB work
1. Datasttucted " organises data in a structured format optimized
2. storage
3. indesing
4. querying
5. analysis

- InMemoryStore stores the raw text, tables
- vectorstore stores the embedded summaries

In [193]:
llm

ChatOllama(model='llama3:latest')

In [194]:
# The vectorstore to use to index the child chunks
vectorstore = Chroma(
    collection_name="summaries",
    embedding_function=OllamaEmbeddings(model=model),
    persist_directory="./chroma_data",
)


In [195]:
# The storage layer for the parent documents
store = InMemoryStore()
id_key = "doc_id"

In [196]:
# The retriever (empty to start)
retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    docstore=store,
    id_key=id_key,
)

In [None]:
table_ids = [str(uuid.uuid4()) for _ in categorized_elements]
summary_tables = [
    Document(page_content=s, metadata={id_key: table_ids[i], "title": titles[i]})
    for i, s in enumerate(summaries)
]

In [202]:
summary_tables


[Document(metadata={'doc_id': '158a5fe6-679c-428a-873f-3b0abb8b1535', 'title': 'Interim Condensed Consolidated Financial Statements Overview and Notes'}, page_content='The table provides an overview of the company\'s financial statements, including its condensed consolidated balance sheet, statement of profit and loss, changes in equity, and cash flows. The "Overview" section covers the basis of preparation of financial statements, consolidation, use of estimates and judgments, and critical accounting estimates and judgments.\n\nThe "Notes to the Interim Condensed Consolidated Financial Statements" section provides additional information on various topics, including:\n\n* Business combinations\n* Property, plant, and equipment\n* Goodwill and other intangible assets\n* Investments\n* Loans\n* Other financial assets and liabilities\n* Trade receivables and cash and cash equivalents\n* Other assets and equity\n* Financial instruments\n* Provisions, income taxes, revenue from operations, 

In [203]:
retriever.vectorstore.add_documents(summary_tables)

INFO: HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


['ce5a56ba-e9d5-4628-beb4-6e35e2c814ee',
 'ed61a884-5660-4b41-80c4-adb19a0ec2ce',
 '544f98ea-05e4-42a7-aabf-b5f33ac1d7b3',
 'e38d1595-3aa9-4733-8fce-f17c37833e74',
 'a649ed81-8c4f-4415-9152-26c46e5d8350',
 '7a7c5c61-f111-400b-b686-a66e360ad9a6',
 '0b349923-b6bd-4b10-bfba-430f3126db30',
 '33f59cda-aea6-4ff4-9968-99798cb4bca5',
 '638fcb58-ce26-42c2-8f0f-6e10f7f91f77',
 '94a53883-5bac-43c7-abf8-b9d172d7c426',
 'bc5b8c44-716d-4408-bbe5-1b2e1f860aa3',
 '9e55ec62-41cb-4130-bdb5-933c7cd25b2b',
 '2dc36851-7598-41ac-88bb-b11c2e8c9bf0',
 '8611d96f-e27f-40f9-9690-afee5e798863',
 '6799507a-d936-422c-a9e3-87c05e8de991',
 '88db1dec-446c-4058-a7af-eff54a084f7e',
 '0eef1171-cb9d-41b7-af97-7d89c41990a5',
 'f8ca46ec-38f2-4f81-9b3d-5c3c4bdd7c10',
 'e82bad60-3aa8-471e-8f24-0a0569d6fb24',
 '69626e5a-cc49-4749-80de-3a08be18ae69',
 '56da847a-3693-4470-96b0-3fb3de19f902',
 '23830b7b-b191-47f0-9574-712b97df6184',
 '7cda5cc3-4f97-41c5-8c14-959e1f5e289f',
 '2cec9294-ad13-4fc5-a55f-c2970dcb8d9e',
 'cad53d1e-9888-

In [205]:
retriever.docstore.mset(list(zip(table_ids, categorized_elements)))

In [206]:
retriever_first_response = retriever.invoke("what is the total equity value in march 2023")

INFO: HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


In [208]:
print(retriever_first_response)

retriever_first_response_page_content = retriever_first_response[0].page_content
print(retriever_first_response_page_content)

[Element(type='table', page_content='<table><thead><tr><th rowspan="3">Class of investment</th><th rowspan="3">Method</th><th colspan="2">(In X crore)</th></tr><tr><th colspan="2">Fair value as at</th></tr><tr><th>March 31, 2024</th><th>March 31, 2023</th></tr></thead><tbody><tr><td>Liquid mutual fund units - carried at fair value through profit or loss</td><td>Quoted price</td><td>2,615</td><td>975</td></tr><tr><td>Target maturity fund units - carried at fair value through profit or loss</td><td>Quoted price</td><td>431</td><td>402</td></tr><tr><td>Tax free bonds and government bonds - carried at amortized cost</td><td>Quoted price and market observable inputs</td><td>1,973</td><td>2,148</td></tr><tr><td>Non-convertible debentures - carried at fair value through other comprehensive income</td><td>Quoted price and market observable inputs</td><td>4,179</td><td>3,868</td></tr><tr><td>Government securities - carried at fair value through other comprehensive income</td><td>Quoted price an

In [210]:
from IPython.display import Markdown


Markdown(retriever_first_response_page_content)

<table><thead><tr><th rowspan="3">Class of investment</th><th rowspan="3">Method</th><th colspan="2">(In X crore)</th></tr><tr><th colspan="2">Fair value as at</th></tr><tr><th>March 31, 2024</th><th>March 31, 2023</th></tr></thead><tbody><tr><td>Liquid mutual fund units - carried at fair value through profit or loss</td><td>Quoted price</td><td>2,615</td><td>975</td></tr><tr><td>Target maturity fund units - carried at fair value through profit or loss</td><td>Quoted price</td><td>431</td><td>402</td></tr><tr><td>Tax free bonds and government bonds - carried at amortized cost</td><td>Quoted price and market observable inputs</td><td>1,973</td><td>2,148</td></tr><tr><td>Non-convertible debentures - carried at fair value through other comprehensive income</td><td>Quoted price and market observable inputs</td><td>4,179</td><td>3,868</td></tr><tr><td>Government securities - carried at fair value through other comprehensive income</td><td>Quoted price and market observable inputs</td><td>7,362</td><td>7,632</td></tr><tr><td>Commercial Papers - carried at fair value through other comprehensive income</td><td>Market observable inputs</td><td>4,830</td><td>742</td></tr><tr><td>Certificates of deposit - carried at fair value through other comprehensive income</td><td>Market observable inputs</td><td>3,043</td><td>3,574</td></tr><tr><td>Quoted Equity securities - carried at fair value through other comprehensive income</td><td>Quoted price</td><td>113</td><td>—</td></tr><tr><td>Unquoted equity and preference securities - carried at fair value through other comprehensive income</td><td>Discounted cash flows method, Market multiples method, Option pricing model</td><td>93</td><td>196</td></tr><tr><td>Others - carried at fair value through profit or loss</td><td>Discounted cash flows method, Market multiples method, Option pricing model</td><td>198</td><td>169</td></tr><tr><td>Total</td><td></td><td>24,837</td><td>19,706</td></tr></tbody></table>

In [215]:
# Prompt template
template = """Answer the question based only on the following context, which can include text and tables:
{context}
Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

# LLM
model = llm

# RAG pipeline
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)

In [216]:
chain.invoke("What is the Total assets in march31, 2024?")

INFO: HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO: HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"


'Based on the given context, I can answer the question as follows:\n\nThe "Share Capital and Amount Held by the Company over Two Consecutive Years" table shows that the total share capital at the end of March 31, 2024 is ₹413,99,50,635.\n\nAdditionally, the "Non-Current Other Assets" table shows that the non-current other assets as of March 31, 2024 are ₹14,929 crore. Similarly, the "Current Other Assets" table shows that the current other assets as of March 31, 2024 are ₹12,808 crore.\n\nTherefore, the total assets in march 31, 2024 would be:\n\nShare Capital: ₹413,99,50,635\nNon-Current Other Assets: ₹14,929 crore\nCurrent Other Assets: ₹12,808 crore\n\nTotal Assets: ₹441,73,87,368'