In [1]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_groq import ChatGroq
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.document_loaders import UnstructuredPDFLoader
import os
import sys
from dotenv import load_dotenv
# Read a local .env file (if one exists) so the API keys below can be picked
# up from the process environment.
load_dotenv()

# Chat model served by Groq; the key is taken from the environment rather
# than being hard-coded in the notebook.
groq_api_key = os.getenv("GROQ_API_KEY")
llm = ChatGroq(
    groq_api_key=groq_api_key,
    model_name="Llama3-8b-8192",
)

# Sentence-embedding model used to vectorise text.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Make sure the API keys used by the parsing step are exported in the process
# environment.  os.environ only accepts str values: assigning the None that
# os.getenv() returns for a missing variable raises
# "TypeError: str expected, not NoneType".  Missing keys are therefore
# reported and skipped instead of crashing the cell with an opaque error.
for _env_name in ("OPENAI_API_KEY", "LLAMA_CLOUD_API_KEY"):
    _env_value = os.getenv(_env_name)
    if _env_value is None:
        print(f"Warning: {_env_name} is not set; calls that need it will fail.")
    else:
        os.environ[_env_name] = _env_value

In [3]:
from llama_cloud_services import LlamaParse

# Extra guidance appended to the parser's system prompt.  The text (including
# the indentation inside the literal) is sent to the service as-is.
parsing_instructions = """This is an Homeowner insurance policy document. If any page does not contain
        headings, find from the previous page for the context. Also, document should clearly
        state what should be covered and what should not be covered in the respective
        categories. Categories can be found in the headings of the pages with largest
        font size."""

# Markdown output, produced through the vendor multimodal model named below;
# show_progress=True prints a progress bar while the job runs.
parser = LlamaParse(
    result_type="markdown",
    system_prompt_append=parsing_instructions,
    use_vendor_multimodal_model=True,
    vendor_multimodal_model_name="openai-gpt4o",
    show_progress=True,
)

# The single policy PDF to parse (path is relative to the working directory).
files = ["sample_policy_doc_AU1234.pdf"]

# Remote, slow call (~50 s in the recorded run).  The result is iterated
# per input file downstream, and each entry exposes its pages under "pages".
md_json_objs = parser.get_json_result(files)
 

Parsing files:   0%|          | 0/1 [00:00<?, ?it/s]

Started parsing the file under job_id ba33feed-4218-447a-b46a-da3f76754c68


Parsing files: 100%|██████████| 1/1 [00:50<00:00, 50.82s/it]


In [4]:
# Flatten the per-file parse results into one list of page records,
# preserving file order and page order.
md_json_list = [
    page
    for parsed_file in md_json_objs
    for page in parsed_file["pages"]
]

In [5]:
# Import Document from langchain_core (the same path the later cell in this
# notebook uses) instead of the legacy `langchain.schema` re-export.
from langchain_core.documents import Document

# Wrap every parsed page in a LangChain Document.  page_content is the page's
# markdown; page_number is the 1-based position of the page in md_json_list.
document_list = [
    Document(page_content=page["md"], metadata={"page_number": position})
    for position, page in enumerate(md_json_list, start=1)
]

In [11]:
# Spot-check: print the seventh Document (index 6) built from the parsed pages.
print(document_list[6])

page_content='# Home Insurance

2. **Sudden and unexpected water leaking from or freezing in any washing machine, dishwasher, fridge, freezer or plumbed in domestic water, drainage or heating installation**

   We will also pay up to the limit for any one claim for necessary and reasonable costs that you incur in finding the source of the damage to the home. This includes reinstating any wall, floor, ceiling, drive, fence or path removed or damaged during the search.

   **Limit** – please refer to your schedule

   **Loss or damage:**
   - to the fittings/installation/appliances themselves is only covered if the damage has happened as a result of an insured cause or cover;
   - while your home is unoccupied or unfurnished;

   The cost of repair of the source of the damage unless the cause is covered elsewhere in this policy.

3. **Theft or attempted theft**

   **Loss or damage:**
   - while your home is unoccupied or unfurnished;
   - caused by you, your family, your domestic staff,

In [11]:
# Rebuild the flat list of page records from md_json_objs (same flattening as
# the earlier cell), concatenating each parsed file's pages in order.
md_json_list = []
for parsed_file in md_json_objs:
    md_json_list += parsed_file["pages"]

In [21]:
# Inspect the raw record for the first page; per the recorded output it is a
# dict with keys such as 'page', 'md', 'images', 'charts' and 'items'.
md_json_list[0]

{'page': 1,
 'md': 'This is a sample home insurance policy document from AXA. It provides full wording for all the covers they offer. The document mentions that all available options are on their website, allowing you to choose the level and type of cover. Once a policy is purchased, specific documentation will be provided.',
 'images': [{'name': 'page_1.jpg',
   'height': 0,
   'width': 0,
   'x': 0,
   'y': 0,
   'type': 'full_page_screenshot',
   'path': 'output/data_images\\688df016-02a3-4e53-84fa-afd657620cd1-page_1.jpg',
   'job_id': '688df016-02a3-4e53-84fa-afd657620cd1',
   'original_file_path': 'sample_policy_doc_AU1234.pdf',
   'page_number': 1}],
 'charts': [],
 'items': [{'type': 'text',
   'value': 'This is a sample home insurance policy document from AXA. It provides full wording for all the covers they offer. The document mentions that all available options are on their website, allowing you to choose the level and type of cover. Once a policy is purchased, specific docu

In [27]:
# Keep only the markdown text of each page record, in page order.
docs = [page["md"] for page in md_json_list]

In [28]:
# Spot-check: print the markdown text of the sixth page (index 5).
print(docs[5])

This document is a section of a home insurance policy detailing coverage for accidental damage and causes of loss or damage. Here's a summary:

### Accidental Damage (Optional Extra)
- **Coverage Includes:**
  - Damage to cables, drains, pipes, or tanks for which you are responsible.
  - Costs for finding the source of damage, including reinstating any removed or damaged structures.
  - Accidental damage to buildings, including breakage of fixed glass, ceramic hobs, and sanitary ware.

- **Exclusions:**
  - Damage due to inherent defects, frost, faulty workmanship, insects, pets, mechanical failure, and certain external structures unless the home is also damaged.

### Causes
- **Covered:**
  - Loss or damage to buildings from storms or floods.

- **Not Covered:**
  - Damage to gates, hedges, fences, and certain external structures unless the home is also damaged.
  - Storm damage to radio or television aerials or satellite dishes.

The policy schedule will indicate if the accidental da

In [None]:
from langchain_core.documents import Document

# One Document per parsed page: the page's markdown becomes page_content and
# its 1-based position in md_json_list is stored as "page_number".
document_list = [
    Document(
        page_content=page["md"],
        metadata={"page_number": position},
    )
    for position, page in enumerate(md_json_list, start=1)
]

In [26]:
# Number of page-level markdown strings collected in docs.
len(docs)

58