## This example extracts data from any large document by splitting it into chunks and processing them with a small model

In [1]:
import sys
import os
import warnings
import requests
# Install a blanket "ignore" filter so warnings of every category are hidden
# from cell output (this also hides library deprecation notices).
warnings.filterwarnings("ignore")
# Put "." (resolved against the current working directory) on the import path.
sys.path.append(".")

In [2]:
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_openai import AzureChatOpenAI
from bs4 import BeautifulSoup
from dotenv import load_dotenv
# Load configuration into the process environment (presumably from a local
# .env file — confirm); later cells read the Azure OpenAI and Neo4j settings
# from os.environ.
load_dotenv()

True

In [18]:
# Extra HTTP headers handed to the client as `default_headers`.
# NOTE(review): both IDs are fixed literals, so every call made through this
# client carries the same message/conversation ID — confirm that is intended.
request_headers = {
    "fds-message-id": "14485820-1e81-4f0b-a708-d386d4672a81",
    "fds-conversation-id": "9616f92c-070c-4f3b-8e51-5321b472b24c",
}

# Azure OpenAI chat model. Every connection setting comes from the
# environment, so a missing variable raises KeyError right here.
llm = AzureChatOpenAI(
    openai_api_version=os.environ["OPENAI_API_VERSION"],
    deployment_name=os.environ["AZURE_DEPLOYMENT_NAME"],
    openai_api_type=os.environ["OPENAI_API_TYPE"],
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    openai_api_key=os.environ["AZURE_OPENAI_API_KEY"],
    default_headers=request_headers,
)

# Transformer that uses the chat model to turn documents into graph documents.
llm_transformer = LLMGraphTransformer(llm=llm)

In [19]:
from langchain_community.graphs import Neo4jGraph
from langchain_community.vectorstores import Neo4jVector
from langchain_core.documents import Document
# Read the Neo4j credentials and URI from the environment (KeyError if any is
# missing), then open the graph connection used by the cells below.
username, password, url = (
    os.environ[name]
    for name in ("NEO4J_USERNAME", "NEO4J_PASSWORD", "NEO4J_URI")
)
graph = Neo4jGraph(url=url, username=username, password=password)

In [20]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [21]:
# Download the source document that the rest of the notebook processes.
print(f"Below document has {'118,735'} tokens.")
# NOTE: this rebinds `url`, which held the Neo4j URI up to this point.
url = "https://ffdocproxy.prod.factset.com/DocProxy/Fetch/EA7CA834-8E1F-EF11-ABE1-12BC89A8F273"
print(url)
# A timeout stops the cell from hanging forever on an unresponsive server.
response = requests.get(url, timeout=60)
# Fail loudly on an HTTP error instead of chunking and embedding an error page.
response.raise_for_status()
# Raw response body (bytes); it is parsed as HTML in the next cell.
text = response.content

Below document has 118,735 tokens.
https://ffdocproxy.prod.factset.com/DocProxy/Fetch/EA7CA834-8E1F-EF11-ABE1-12BC89A8F273


In [22]:
# Parse the downloaded markup using the "html.parser" backend.
soup = BeautifulSoup(text, features="html.parser")

In [23]:
# Rebind `text` from the downloaded response body to the text extracted from
# the parsed document (markup removed).
text = soup.get_text()

In [33]:
# Chunking configuration. Sizes are measured with len(), i.e. in characters:
# chunks of up to 50,000 characters with a 1,000-character overlap setting.
# Separators are treated as literal strings, not regular expressions.
text_splitter = RecursiveCharacterTextSplitter(
    length_function=len,
    chunk_size=50_000,
    chunk_overlap=1_000,
    is_separator_regex=False,
)

In [34]:
# Split the extracted text into chunk-sized documents.
documents = text_splitter.create_documents(texts=[text])

In [35]:
# Debug aid (disabled): type(documents[0])

In [36]:
# Ask the LLM transformer to convert every chunk into a graph document.
# (Alternative, left disabled: wrap the whole text as a single document with
#  documents = [Document(page_content=text)].)
graph_documents = llm_transformer.convert_to_graph_documents(documents)

# Preview the extraction for the first chunk only.
first_chunk_graph = graph_documents[0]
print(f"Nodes:{first_chunk_graph.nodes}")
print(f"Relationships:{first_chunk_graph.relationships}")

Nodes:[Node(id='Registration Statement No. 333-264388', type='Registration statement'), Node(id='Rule 424(B)(2)', type='Rule'), Node(id='Pricing Supplement', type='Document'), Node(id='May 30, 2024', type='Date'), Node(id='Prospectus', type='Document'), Node(id='May 26, 2022', type='Date'), Node(id='Us$2,000,000,000', type='Amount'), Node(id='Senior Medium-Term Notes, Series H', type='Financial instrument'), Node(id='Us$750,000,000 5.370% Senior Notes Due 2027', type='Financial instrument'), Node(id='Us$800,000,000 5.511% Senior Notes Due 2031', type='Financial instrument'), Node(id='Us$450,000,000 Floating Rate Notes Due 2027', type='Financial instrument'), Node(id='Bank Of Montreal', type='Organization'), Node(id='Compounded Sofr', type='Interest rate'), Node(id='Sofr Index', type='Index'), Node(id='Canada Deposit Insurance Corporation Act (Canada)', type='Legislation'), Node(id='Province Of Ontario', type='Location'), Node(id='Federal Laws Of Canada', type='Legislation'), Node(id='B

In [37]:
# Persist the extracted graph documents to Neo4j.
# NOTE(review): include_source=True presumably stores each source chunk
# alongside its entities — the vector index built later searches
# "Document" nodes by their "text" property; confirm against the LangChain docs.
graph.add_graph_documents(
    graph_documents,
    include_source=True,
    baseEntityLabel=True,
)

In [48]:
# Extraction prompt passed to the QA chain as its query. For each field it
# gives a definition and an example value, then asks for null on missing
# values and one markdown table (Field, Value) per transaction.
# NOTE(review): "Document to extract from:" is followed by no document text;
# presumably the retriever supplies the document content as context — confirm.
query = """Please extract the following information from the given document:
 
ISIN: ISIN is a standard security code that is used worldwide to identify specific securities such as bonds, stocks (common and preferred), futures, warrant, rights, trusts, commercial paper, and options.
 
Example Value: US606822DE19
Entity: Issuer is a legal entity that develops, registers and sells securities with the purpose of financing its operations.
 
Example Value: Mitsubishi UFJ Financial Group, Inc.
Deal Type: Deal Type is the security type grouping of the instrument being offered.
 
Example Value: Medium-Term Notes
Seniority: Seniority of a security shows the order of repayment in the event the issuer goes bankrupt or defaults.
 
Example Value: Senior
COCO: Flag for Contingent convertibles, also known as CoCos, are a type of hybrid debt security that can be converted into equity shares if a specified trigger event occurs.
 
Example Value: null
Pledge Status: Indicator whether the debt is secured by a specific asset of the issuer to give protection to the bondholders in case the issuer defaults.
 
Example Value: Unsecured
Issue Price (%): Issue Price is the price at which investors buy the bonds when they are first issued, which will typically be approximately equal to the nominal amount.
 
Example Value: 100.00000
Face Value: Face Value/Par Value is the actual currency amount that each security is worth. Any calculation of redemption, conversion or liquidation of the security is based on Face Value.
 
Example Value: 1000.00000
Min Denomination/Subscription: Min Denomination is the minimum amount an investor must subscribe to in order to purchase the securities.
 
Example Value: 200000.00
Issue Date: Date when the security is issued.
 
Example Value: 2024-04-17
Principal Amount: The aggregate nominal amount issued for the security at the time of issuance.
 
Example Value: 900000000.00000
Issue Currency: Currency of the Principal Amount that the notes are issued.
 
Example Value: U.S. Dollar
Scheduled Maturity Date: Date when the security is redeemed.
 
Example Value: 2030-04-17
First Payment Date: Date when the first coupon/divided payment will happen.
 
Example Value: 2024-10-17
Type: Type of Coupon (e.g., Fixed Rate).
 
Example Value: Fixed Rate
Rate: Fixed Annual Coupon Rate/Annual Accretion Rate Value.
 
Example Value: 5.25800
Base Index: Reference instrument the Coupon Type Variable/Conditional/Floating Rate will be using.
 
Example Value: null
Spread: Fixed value that is added/subtracted to the Base Index when calculating the coupon rate in a coupon period.
 
Example Value: null
Day Count: Day Count Conventions are used to count the appropriate number of days between two dates in order to calculate accrued interest, yields and odd coupon amounts.
 
Example Value: 30/360
First Accrual Date: First Accrual Date is the start date of the accrual interest period.
 
Example Value: 2024-04-17
Payment Frequency: Payment Frequency and Payment Frequency Units show how frequent the coupon payments are made.
 
Example Value: 2
Payment Frequency Units: Units of payment frequency (e.g., Times per Year).
 
Example Value: Times per Year
Effective Payment Date: Effective Payment Date is the first interest payment date in a coupon leg.
 
Example Value: 2024-10-17
End Payment Date: End Payment Date is the last interest payment date in a coupon leg.
 
Example Value: 2029-04-17
Call Type: Type of Early Redemption.
 
Example Value: Optional
Call Frequency Type: Early Redemption Frequency type shows how frequently the bonds may be redeemed early.
 
Example Value: Discrete on Schedule
Call Effective Date: Early redemption start date (for continuous on schedule or every coupon or discrete on schedule with periodic dates) or the effective early redemption date (for discrete on schedule – on the effective date).
 
Example Value: 2024-04-17
Call End Date: The end date for the early redemption period.
 
Example Value: 2030-04-17
Call Price: The price at which the principal will be redeemed.
 
Example Value: 100.00000
CONVERTIBILITY: Conversion details of the Bond.
 
Example Value: null
Document to extract from:
 
If any value is not available, return null for that field. Return the output in markdown table format with columns Field, Value with proper column width.
If there are multiple transactions return each in a different markdown table
"""

In [49]:
from langchain_openai import AzureOpenAIEmbeddings

# Azure OpenAI embedding client; connection settings are read from the
# environment (KeyError if a variable is missing).
embeddings = AzureOpenAIEmbeddings(
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    openai_api_key=os.environ["AZURE_OPENAI_API_KEY"],
    openai_api_version=os.environ["OPENAI_API_VERSION"],
    openai_api_type=os.environ["OPENAI_API_TYPE"],
    model="text-embedding-ada-002",
)

# Hybrid-search index over the "Document" nodes already in the graph: the
# "text" property is embedded and the vector is kept in "embedding".
# No connection arguments are passed here.
# NOTE(review): presumably the Neo4j connection falls back to the NEO4J_*
# environment variables — confirm.
vector_index = Neo4jVector.from_existing_graph(
    embedding=embeddings,
    node_label="Document",
    text_node_properties=["text"],
    embedding_node_property="embedding",
    search_type="hybrid",
)



In [50]:
from langchain.chains import RetrievalQA

# Retrieval QA chain: the vector index retrieves stored document chunks and
# the chat model answers the query from them.
qa_chain = RetrievalQA.from_chain_type(
    llm, retriever=vector_index.as_retriever()
)

# Use .invoke() rather than calling the chain object directly: the direct
# call form is deprecated in LangChain, and the warning would go unnoticed
# here because all warnings are silenced at the top of the notebook.
result = qa_chain.invoke({"query": query})
# The chain returns a dict; the generated answer is under the "result" key.
print(result["result"])

```markdown
### Transaction 1

| Field                          | Value                          |
|--------------------------------|--------------------------------|
| ISIN                           | null                           |
| Entity                         | Bank of Montreal               |
| Deal Type                      | Medium-Term Notes              |
| Seniority                      | Senior                         |
| COCO                           | null                           |
| Pledge Status                  | Unsecured                      |
| Issue Price (%)                | null                           |
| Face Value                     | 1000.00000                     |
| Min Denomination/Subscription  | 1000.00000                     |
| Issue Date                     | null                           |
| Principal Amount               | null                           |
| Issue Currency                 | U.S. Dollar                    |
| Scheduled Matur

In [42]:
# Display the chat model's repr to inspect its effective configuration.
llm

AzureChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x00000185152AC210>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x00000185151F2210>, openai_api_key=SecretStr('**********'), openai_proxy='', default_headers={'fds-message-id': '14485820-1e81-4f0b-a708-d386d4672a81', 'fds-conversation-id': '9616f92c-070c-4f3b-8e51-5321b472b24c'}, azure_endpoint='https://azure-llm.factset.com', deployment_name='gpt-4o-0513', openai_api_version='2024-02-01', openai_api_type='azure')