In [1]:
import boto3
import os
import uuid
from typing import List
import json
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.llms import Bedrock
from urllib.parse import unquote_plus
from pinecone import Pinecone
from pinecone import ServerlessSpec

def initialization(pinecone_api_key:str, index_name:str = "insurance-virtual-agent-hybrid"):
    """Create a Pinecone client and open a handle to an existing index.

    Args:
        pinecone_api_key: API key handed to the ``Pinecone`` client.
        index_name: Name of the index to resolve. Defaults to the index
            name this notebook was written against.

    Returns:
        A ``(pc, index)`` tuple: the Pinecone client and the index handle
        opened against the host reported by ``describe_index``.

    Raises:
        Exception: Any error raised while building the client or resolving
            the index is printed and then re-raised unchanged.
    """
    try:
        # Pinecone client
        pc = Pinecone(api_key=pinecone_api_key)
        # Look up the index host, then target the index by that host.
        index_response = pc.describe_index(name=index_name)
        dns_host = index_response["host"]
        index = pc.Index(host=dns_host)
        return pc, index
    except Exception as e:
        print(f"Error occured in Initialization : {e}")
        # Re-raise instead of falling through: an implicit ``None`` return
        # makes the caller's ``pc, index = initialization(...)`` fail with an
        # unrelated "cannot unpack NoneType" error that hides the real cause.
        raise

def document_processing(file_path, policy_number, chunk_size=1000, chunk_overlap=200, max_chunks=2):
    """Load a PDF, split it into chunks, and tag each chunk with a policy number.

    Args:
        file_path: Path passed to ``PyPDFLoader``.
        policy_number: Value stored under ``metadata["policy_number"]`` on
            every chunk.
        chunk_size: ``chunk_size`` given to the text splitter (default 1000).
        chunk_overlap: ``chunk_overlap`` given to the text splitter
            (default 200).
        max_chunks: Maximum number of leading chunks to return. The default
            of 2 keeps the original behaviour of returning only the first
            two chunks; pass ``None`` to return every chunk.

    Returns:
        A list of the split documents, truncated to ``max_chunks`` entries
        unless ``max_chunks`` is ``None``.

    Raises:
        ValueError: If ``max_chunks`` is negative.
    """
    if max_chunks is not None and max_chunks < 0:
        # A negative bound would silently slice from the end instead.
        raise ValueError("max_chunks must be None or a non-negative integer")

    # Load the PDF file
    loader = PyPDFLoader(file_path)
    pages = loader.load()

    # Split the loaded documents; length is measured with len()
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    chunks = splitter.split_documents(pages)

    # Add the policy number to each chunk's metadata
    for chunk in chunks:
        chunk.metadata["policy_number"] = policy_number

    if max_chunks is None:
        return chunks
    return chunks[0:max_chunks]

# Read the API key from the environment so no secret has to be pasted into
# (and saved with) the notebook. Falls back to the original empty string.
pinecone_api_key = os.environ.get("PINECONE_API_KEY", "")
pc,index = initialization(pinecone_api_key)

# Sample policy document processed in this notebook.
policy_number = "AU1234"
download_path = "sample_policy_doc_AU1234.pdf"
documents = document_processing(download_path,policy_number)

# NOTE(review): `vectors` is created but never filled in this cell, and each
# loop iteration overwrites `sparse_embeddings`, so after the loop it holds
# only the response for the last document. Later cells inspect that value.
vectors = []
for doc in documents:
    # Sparse embedding for a single chunk's text.
    sparse_embeddings = pc.inference.embed(
        model="pinecone-sparse-english-v0",
        inputs=[doc.page_content],
        parameters={"input_type": "passage", "truncate": "END"}
    )
  
   

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Show the embedding response left by the loop above (its last iteration).
sparse_embeddings

EmbeddingsList(
  model='pinecone-sparse-english-v0',
  vector_type='sparse',
  data=[
    {'vector_type': sparse, 'sparse_values': [1.8847656, 2.7597656, ..., 2.078125, 0.42382812], 'sparse_indices': [19522071, 58671053, ..., 4061671695, 4243668012]}
  ],
  usage={'total_tokens': 67}
)

In [7]:
# Sparse weights of the first entry in the response.
# Presumably paired position-by-position with "sparse_indices" — TODO confirm.
sparse_embeddings.data[0]["sparse_values"]

[1.8847656,
 2.7597656,
 1.6796875,
 2.3417969,
 2.7519531,
 1.8505859,
 2.1074219,
 1.8261719,
 2.1269531,
 1.5224609,
 0.41357422,
 1.1777344,
 1.4804688,
 0.64941406,
 0.50927734,
 2.0722656,
 0.35083008,
 1.6455078,
 2.2558594,
 0.6201172,
 1.8076172,
 2.5839844,
 1.2373047,
 2.0546875,
 1.34375,
 2.6679688,
 0.45751953,
 0.3720703,
 1.1621094,
 1.8125,
 3.1523438,
 2.0566406,
 1.0449219,
 0.17504883,
 2.2695312,
 2.3671875,
 1.5478516,
 1.9169922,
 2.4296875,
 3.3203125,
 0.9453125,
 1.7451172,
 4.1992188,
 0.0,
 0.62402344,
 0.640625,
 2.2539062,
 2.078125,
 0.42382812]

In [8]:
# Sparse indices of the first entry in the response.
# Presumably paired position-by-position with "sparse_values" — TODO confirm.
sparse_embeddings.data[0]["sparse_indices"]

[19522071,
 58671053,
 62396946,
 264741300,
 273066799,
 494070171,
 613148321,
 670727360,
 766128868,
 963854120,
 1009084850,
 1026658409,
 1061511187,
 1150284091,
 1234868792,
 1265401351,
 1299665196,
 1475817810,
 1520550099,
 1732333218,
 1813251204,
 1852771076,
 1954640689,
 2219907560,
 2295025838,
 2430202379,
 2523355832,
 2621333525,
 2737747873,
 2942967631,
 2968096829,
 3098242165,
 3131381387,
 3165597058,
 3292575152,
 3319683975,
 3417768591,
 3455420776,
 3510989507,
 3597663484,
 3649586921,
 3701427380,
 3753307315,
 3789030606,
 3965692316,
 3973262016,
 3982917068,
 4061671695,
 4243668012]