In [1]:
import boto3
import os
import uuid
from typing import List
import json
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.llms import Bedrock
from urllib.parse import unquote_plus
from pinecone import Pinecone
from pinecone import ServerlessSpec

def initialization(pinecone_api_key:str):
    
    try:
        #pinecone client
        pc = Pinecone(api_key=pinecone_api_key)
        index_name = "insurance-virtual-agent-hybrid"
        #Get the index host
        index_response = pc.describe_index(name=index_name)
        dns_host = index_response["host"]
        index = pc.Index(host=dns_host)
        return pc,index
    except Exception as e:
        print(f"Error occured in Initialization : {e}")

def document_processing(file_path,policy_number):
    #load the pdf file
    loader = PyPDFLoader(file_path)
    docs = loader.load()
    #split the docs
    splitter = RecursiveCharacterTextSplitter(
        chunk_size = 1000,
        chunk_overlap = 200,
        length_function = len
    )
    docs = splitter.split_documents(docs)
    
    #adding policy number information in the document metadata
    for doc in docs:
        doc.metadata["policy_number"] = policy_number

    return docs[0:2]

pinecone_api_key = ""
pc,index = initialization(pinecone_api_key)
policy_number = "AU1234"
download_path = "sample_policy_doc_AU1234.pdf"
documents = document_processing(download_path,policy_number)
vectors = []
for doc in documents:
    sparse_embeddings = pc.inference.embed(
        model="pinecone-sparse-english-v0",
        inputs=[doc.page_content],
        parameters={"input_type": "passage", "truncate": "END"}
    )
  
   

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
sparse_embeddings

EmbeddingsList(
  model='pinecone-sparse-english-v0',
  vector_type='sparse',
  data=[
    {'vector_type': sparse, 'sparse_values': [1.8847656, 2.7597656, ..., 2.078125, 0.42382812], 'sparse_indices': [19522071, 58671053, ..., 4061671695, 4243668012]}
  ],
  usage={'total_tokens': 67}
)

In [7]:
sparse_embeddings.data[0]["sparse_values"]

[1.8847656,
 2.7597656,
 1.6796875,
 2.3417969,
 2.7519531,
 1.8505859,
 2.1074219,
 1.8261719,
 2.1269531,
 1.5224609,
 0.41357422,
 1.1777344,
 1.4804688,
 0.64941406,
 0.50927734,
 2.0722656,
 0.35083008,
 1.6455078,
 2.2558594,
 0.6201172,
 1.8076172,
 2.5839844,
 1.2373047,
 2.0546875,
 1.34375,
 2.6679688,
 0.45751953,
 0.3720703,
 1.1621094,
 1.8125,
 3.1523438,
 2.0566406,
 1.0449219,
 0.17504883,
 2.2695312,
 2.3671875,
 1.5478516,
 1.9169922,
 2.4296875,
 3.3203125,
 0.9453125,
 1.7451172,
 4.1992188,
 0.0,
 0.62402344,
 0.640625,
 2.2539062,
 2.078125,
 0.42382812]

In [8]:
sparse_embeddings.data[0]["sparse_indices"]

[19522071,
 58671053,
 62396946,
 264741300,
 273066799,
 494070171,
 613148321,
 670727360,
 766128868,
 963854120,
 1009084850,
 1026658409,
 1061511187,
 1150284091,
 1234868792,
 1265401351,
 1299665196,
 1475817810,
 1520550099,
 1732333218,
 1813251204,
 1852771076,
 1954640689,
 2219907560,
 2295025838,
 2430202379,
 2523355832,
 2621333525,
 2737747873,
 2942967631,
 2968096829,
 3098242165,
 3131381387,
 3165597058,
 3292575152,
 3319683975,
 3417768591,
 3455420776,
 3510989507,
 3597663484,
 3649586921,
 3701427380,
 3753307315,
 3789030606,
 3965692316,
 3973262016,
 3982917068,
 4061671695,
 4243668012]

### pymupdf  

In [1]:
import pymupdf

In [None]:
def extract_data(file_path):
    text_data=[]
    table_data=[]

    with pymupdf.open(file_path) as pdf_file:
        
        #loop through every page in pdf
        for page_number in range(len(pdf_file)):
            page = pdf_file[page_number]

            #get the text on page
            text = page.get_text().strip()
            text_data.append({'response':text,"name":page_number+1})

            #Get the tables on page

In [6]:
file_path = "sample_policy_doc_AU1234.pdf"
with pymupdf.open(file_path) as pdf_file:
    for page_number in range(5):
        page = pdf_file[page_number]
        
        # Extract text from the page
        text = page.find_tables()
        if text:
            text = text[0].extract()
        else:
            text = "No table found on this page."
       
        print(f"Page {page_number + 1} Text:\n{text}\n")

Page 1 Text:
[['', None, None], ['', '', ''], [None, '', None], [None, '', None], [None, '', None], [None, '', None], [None, '', None], [None, '', None], [None, 'This is a sample Policy document that provides full', None], [None, 'wording for all the covers we offer.', None], [None, '', None], [None, 'All available options are on our website which will enable you to choose the level and type of cover. Once you', None], [None, 'have bought your Policy you will be provided with the documentation specific to what you have requested.', None], [None, '', None], [None, '', None], [None, '', None], [None, '', None], [None, '', None], [None, '', None], [None, '', None], [None, '', None], [None, '', None], [None, '', None], [None, '', None], [None, '', None], [None, '', None], [None, '', None], [None, '', None], [None, '', None], [None, '', None], [None, '', None], [None, '', None], [None, '', None], [None, '', None], [None, '', None], [None, '', None], [None, '', None], [None, '', None], [None

In [None]:
def document_processing(file_path,policy_number):
    #load the pdf file
    loader = PyPDFLoader(file_path)
    docs = loader.load()
    #split the docs
    splitter = RecursiveCharacterTextSplitter(
        chunk_size = 1000,
        chunk_overlap = 200,
        length_function = len
    )
    docs = splitter.split_documents(docs)
    
    #adding policy number information in the document metadata
    for doc in docs:
        doc.metadata["policy_number"] = policy_number

    return docs[0:2]

In [None]:
policy_number = "AU1234"
download_path = "sample_policy_doc_AU1234.pdf"

### Llama parser

In [20]:
import boto3
import json

In [22]:
secret_list = ["pinecone_key","llama_api_key"]
region_name = "us-east-1"
# Create a Secrets Manager client
session = boto3.session.Session()
client = session.client(
    service_name='secretsmanager',
    region_name=region_name
)
try:
    response = client.batch_get_secret_value(
        SecretIdList=secret_list
    )
except ClientError as e:
    raise e

secrets = {item['Name']: item['SecretString'] for item in response['SecretValues']}

print(secrets)

{'llama_api_key': '{"llama_api_key":"llx-8d0QLwW88xcToIhwFUqeEan8bVlmy6zqrZFgquVmFTglyxcR"}', 'pinecone_key': '{"pinecone_key":"pcsk_6BsWwy_5JoCtkwshBx812WztFtrsCSB7Z3ReV8bB6Q39N93ygwZ8qt6EmEE6sZJkEKnZ9y"}'}


In [24]:
pinecone_key_data = json.loads(secrets["pinecone_key"])
llama_api_data = json.loads(secrets["llama_api_key"])

In [27]:
print(pinecone_key_data)

{'pinecone_key': 'pcsk_6BsWwy_5JoCtkwshBx812WztFtrsCSB7Z3ReV8bB6Q39N93ygwZ8qt6EmEE6sZJkEKnZ9y'}


In [28]:
pinecone_api_key = pinecone_key_data["pinecone_key"]
print(pinecone_api_key)

pcsk_6BsWwy_5JoCtkwshBx812WztFtrsCSB7Z3ReV8bB6Q39N93ygwZ8qt6EmEE6sZJkEKnZ9y


In [16]:
pinecone_api_key = secrets["pinecone_key"]
llama_api_key = secrets["llama_api_key"]

In [17]:
print(pinecone_api_key)
print(llama_api_key)

{"pinecone_key":"pcsk_6BsWwy_5JoCtkwshBx812WztFtrsCSB7Z3ReV8bB6Q39N93ygwZ8qt6EmEE6sZJkEKnZ9y"}
{"llama_api_key":"llx-8d0QLwW88xcToIhwFUqeEan8bVlmy6zqrZFgquVmFTglyxcR"}


In [18]:
type(pinecone_api_key)

str