In [1]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_groq import ChatGroq
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.document_loaders import UnstructuredPDFLoader
import os
import sys
from dotenv import load_dotenv
load_dotenv()

groq_api_key=os.getenv("GROQ_API_KEY")
llm=ChatGroq(groq_api_key=groq_api_key,model_name="Llama3-8b-8192")
embeddings=HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
os.environ["OPENAI_API_KEY"]= os.getenv("OPENAI_API_KEY")
os.environ["LLAMA_CLOUD_API_KEY"]= os.getenv("LLAMA_CLOUD_API_KEY")

In [3]:
from llama_cloud_services import LlamaParse

parser = LlamaParse(
    result_type="markdown",
    system_prompt_append=(
        """This is an Homeowner insurance policy document. If any page does not contain
        headings, find from the previous page for the context. Also, document should clearly
        state what should be covered and what should not be covered in the respective
        categories. Categories can be found in the headings of the pages with largest
        font size."""
    ),
    use_vendor_multimodal_model=True,
    vendor_multimodal_model_name="openai-gpt4o",
    show_progress=True,
)

files = ["sample_policy_doc_AU1234.pdf"]  # get all files from the claims/ directory
md_json_objs = parser.get_json_result(
    files
)  # extract markdown data for insurance claim document
 

Parsing files:   0%|          | 0/1 [00:00<?, ?it/s]

Started parsing the file under job_id ba33feed-4218-447a-b46a-da3f76754c68


Parsing files: 100%|██████████| 1/1 [00:50<00:00, 50.82s/it]


In [4]:
md_json_list = []
for obj in md_json_objs:
    md_json_list.extend(obj["pages"])

In [5]:
from langchain.schema import Document
document_list = [Document(page_content=doc["md"],
                          metadata={"page_number": i+1}) for i, doc in enumerate(md_json_list)]

In [17]:
policy_number = "AU1234"
document_list = [Document(page_content=doc["md"],metadata={"page_number": i+1}) for i, doc in enumerate(md_json_list)]
print("Document objects created successfully")

#adding policy number information in the document metadata
for doc in document_list:
    doc.metadata["policy_number"] = policy_number

Document objects created successfully


In [12]:
import boto3
import os
import uuid
from typing import List
import json
from urllib.parse import unquote_plus
from pinecone import Pinecone

In [13]:
pinecone_api_key="pcsk_2mM5fx_9gfowJX1vxPtpM4ModN4NubrZh6T3YyoTp4x6XUMiShMFU1pReuGTUmUyeqtHYk"


In [15]:
pc = Pinecone(api_key=pinecone_api_key)
policy_number = "AU1234"

In [20]:
dense_index_name = "insurance-virtual-agent-dense"
sparse_index_name = "insurance-virtual-agent-sparse"
dense_index_response = pc.describe_index(name=dense_index_name)
dense_dns_host = dense_index_response["host"]
dense_index = pc.Index(host=dense_dns_host)

sparse_index_response = pc.describe_index(name=sparse_index_name)
sparse_dns_host = sparse_index_response["host"]
sparse_index = pc.Index(host=sparse_dns_host)

In [19]:
if not pc.has_index(dense_index_name):
    pc.create_index_for_model(
        name=dense_index_name,
        cloud="aws",
        region="us-east-1",
        embed={
            "model":"llama-text-embed-v2",
            "field_map":{"text": "chunk_text"}
        }

    )
    print("Pinecone dense index created successfully")
else:
    print(f"Pinecone index {dense_index_name} has already created")

Pinecone dense index created successfully


In [21]:
#store embeddings in pinecone
vectors = []
print("Starting embedding generation")
for doc in document_list:
    vectors.append({
        "id":str(uuid.uuid4()),
        "chunk_text":doc.page_content,
        "policy_number":policy_number
    })

#upsert the records into hybrid index
dense_upsert_response = dense_index.upsert_records(
    "policy-documents",
    vectors
)

sparse_upsert_response = sparse_index.upsert_records(
    "policy-documents",
    vectors
)


Starting embedding generation


In [22]:
print("Dense Upsert response is : ", dense_upsert_response["status_code"])
print(f"Total number of dense vectors are loaded in database : {dense_upsert_response["upsertedCount"]}")
print("Upsert response is : ", sparse_upsert_response.get("status_code"))
print(f"Total number of sparse vectors are loaded in database : {sparse_upsert_response["upsertedCount"]}")

TypeError: 'NoneType' object is not subscriptable