# Installing required modules

%pip install --quiet --no-build-isolation --force-reinstall \
    "boto3" \
    "awscli" \
    "botocore" \
    "faiss-cpu" \
    "langchain" \
    "pypdf" \
    "sqlalchemy" \
    "pickle5" \
    "transformers"

# Connecting to the AWS Bedrock service and getting a client

In [2]:
import json
import os
import sys

import boto3

# Put the parent directory on the import path so the local `utils` package
# (source of the `bedrock` helper module and `print_ww`) can be imported.
module_path = ".."
sys.path.append(os.path.abspath(module_path))
from utils import bedrock, print_ww


# ---- ⚠️ Edit the values below as needed for your AWS setup ⚠️ ----
# These assignments always overwrite whatever the process environment held.
os.environ.update(
    {
        "AWS_DEFAULT_REGION": "us-west-2",  # E.g. "us-east-1"
        # "AWS_PROFILE": "<YOUR_PROFILE>",
        "BEDROCK_ASSUME_ROLE": "arn:aws:iam::195364414018:role/Crossaccountbedrock",  # E.g. "arn:aws:..."
    }
)

# Build the Bedrock client from the region and role configured just above.
boto3_bedrock = bedrock.get_bedrock_client(
    region=os.environ.get("AWS_DEFAULT_REGION"),
    assumed_role=os.environ.get("BEDROCK_ASSUME_ROLE"),
)

Create new client
  Using region: us-west-2
  Using role: arn:aws:iam::195364414018:role/Crossaccountbedrock ... successful!
boto3 Bedrock client successfully created!
bedrock-runtime(https://bedrock-runtime.us-west-2.amazonaws.com)


# Creating the embedding and LLM objects

In [3]:
from langchain.embeddings import BedrockEmbeddings
from langchain.llms.bedrock import Bedrock

# Both wrappers share the Bedrock client created earlier in the notebook.
# Embedding model: used to vectorise document chunks and queries.
br_embeddings = BedrockEmbeddings(
    client=boto3_bedrock,
    model_id="amazon.titan-embed-text-v1",
)
# Text-generation model: used to answer questions.
br_llm = Bedrock(
    client=boto3_bedrock,
    model_id="anthropic.claude-v2",
)

# Load PDF files from dir and store in vectorstore

In [4]:
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS

# Load the PDFs found under `single_v_docs`; each loaded document carries its
# text in `page_content` and its origin file in `metadata["source"]`.
loader = PyPDFDirectoryLoader("single_v_docs")
pages = loader.load()

# Split every loaded document into overlapping character-based chunks.
chunk_size = 1000
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=100,
    length_function=len,
)

# Parallel lists: docs[n] is a chunk of text, metadata[n] describes its origin.
docs, metadata = [], []

for page in pages:
    print("Spliting the content with length", len(page.page_content))
    splits = text_splitter.split_text(page.page_content)
    docs.extend(splits)
    # Build a fresh dict per chunk. `[{...}] * len(splits)` would put the very
    # same dict object in every slot, so editing one chunk's metadata would
    # silently change all chunks that came from the same document.
    metadata.extend({"source": page.metadata["source"]} for _ in splits)

# Fail early with a readable message instead of an opaque error from inside the
# index construction when nothing was loaded.
if not docs:
    raise ValueError(
        "No text chunks were produced from 'single_v_docs'; "
        "check that the directory exists and contains readable PDF files."
    )

# Embed all chunks and index them in an in-memory FAISS store.
vectorstore_faiss_aws = FAISS.from_texts(
    docs,
    br_embeddings,
    metadatas=metadata,
)

print(f"vectorstore_faiss_aws: number of elements in the index={vectorstore_faiss_aws.index.ntotal}")
    


Spliting the content with length 5757
Spliting the content with length 4317
Spliting the content with length 280
Spliting the content with length 4045
Spliting the content with length 4043
Spliting the content with length 6742
Spliting the content with length 3221
Spliting the content with length 5830
Spliting the content with length 5840
Spliting the content with length 6663
Spliting the content with length 3329
Spliting the content with length 3385
Spliting the content with length 4846
Spliting the content with length 6515
Spliting the content with length 6493
Spliting the content with length 6994
Spliting the content with length 7020
Spliting the content with length 332
Spliting the content with length 2421
Spliting the content with length 85
Spliting the content with length 1182
Spliting the content with length 3463
Spliting the content with length 85
Spliting the content with length 1295
Spliting the content with length 85
Spliting the content with length 2214
Spliting the content

# Save vector store for later use 

In [5]:
# Optional persistence step, intentionally left disabled: un-commenting it
# pickles `vectorstore_faiss_aws` to "vectorstore_faiss_aws.pkl" in the
# working directory.
# NOTE(review): the trailing `exit()` looks like it would end the session right
# after saving, so later cells would not run — confirm before enabling.
# NOTE(review): only un-pickle files you created yourself; loading a pickle
# from an untrusted source can execute arbitrary code.
# import pickle
# with open("vectorstore_faiss_aws.pkl", "wb") as f:
#     pickle.dump(vectorstore_faiss_aws, f)
# exit()

# Adding an index wrapper to the vector store for easier querying

In [6]:
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
# Wrap the FAISS store so a question can be asked in a single call, then print
# the LLM's answer through the project's `print_ww` helper.
wrapper_store_faiss = VectorStoreIndexWrapper(vectorstore=vectorstore_faiss_aws)
account_name_answer = wrapper_store_faiss.query(
    "Account name of the policy?",
    llm=br_llm,
)
print_ww(account_name_answer)

 Based on the provided context, I do not have enough information to determine the account name or
policyholder for this insurance policy. The excerpts mention contacting to make a payment and
renewing a policy, but do not specify an account name. Without more context about the policy
details, I cannot confidently provide the account name.


# Let's check what is actually returned from the PDFs

In [7]:
# Embed the question ourselves and ask the store for the 4 nearest chunks, so
# the raw retrieved text behind the previous answer can be inspected.
query_vector = br_embeddings.embed_query("Account name of the policy?")
nearest_chunks = vectorstore_faiss_aws.similarity_search_by_vector(query_vector, k=4)

for chunk in nearest_chunks:
    source = chunk.metadata['source']
    print(f"From {source} ss return : \n")
    print(f"Content \n{chunk.page_content} \n")

From single_v_docs/insurance-motor-important-information-document-NMDMG10248.pdf ss return : 

Content 
ask you to contact us to make payment before we can renew 
your policy.
AV803557_NMDMG10248_1222.indd   3AV803557_NMDMG10248_1222.indd   3 21/10/22   12:21 PM21/10/22   12:21 PM 

From single_v_docs/insurance_motor_car_motor_policy_booklet_241017_NMDMG10249_v3.pdf ss return : 

Content 
take out or renew your policy;
AV888149_NMDMG10249_0623.indd   3AV888149_NMDMG10249_0623.indd   3 10/05/23   12:14 PM10/05/23   12:14 PM 

From single_v_docs/insurance_motor_car_motor_policy_booklet_241017_NMDMG10249_v3.pdf ss return : 

Content 
take out or renew your policy;Contents
AV888149_NMDMG10249_0623.indd   2AV888149_NMDMG10249_0623.indd   2 10/05/23   12:14 PM10/05/23   12:14 PM 

From single_v_docs/insurance_motor_car_motor_policy_booklet_241017_NMDMG10249_v3.pdf ss return : 

Content 
policyholder or partner and the Finance Company agrees.
AV888149_NMDMG10249_0623.indd   8AV888149_NMDMG102

#### Parameters used for ConversationalRetrievalChain
* **retriever**: We used `VectorStoreRetriever`, which is backed by a `VectorStore`. To retrieve text, there are two search types you can choose: `"similarity"` or `"mmr"`. `search_type="similarity"` uses similarity search in the retriever object where it selects text chunk vectors that are most similar to the question vector.

* **memory**: Memory Chain to store the history 

* **condense_question_prompt**: Given a question from the user, we use the previous conversation and that question to make up a standalone question

* **chain_type**: If the chat history and retrieved documents are too long to fit in the context, use this parameter to choose how they are combined; the options are `stuff`, `refine`, `map_reduce`, and `map_rerank`

If the question asked is outside the scope of context, then the model will reply it doesn't know the answer

**Note**: if you are curious how the chain works, set `verbose=True` when building the chain.

# Prompt template for chain

In [8]:
from langchain.prompts.prompt import PromptTemplate

# Prompt that rewrites a follow-up question into a standalone one, using the
# prior conversation. `{chat_history}` and `{question}` are the placeholders
# filled in when the prompt is rendered.
_template = (
    "Given the following conversation and a follow up question, "
    "rephrase the follow up question to be a standalone question, "
    "in its original language.\n"
    "\n"
    "Chat History:\n"
    "{chat_history}\n"
    "Follow Up Input: {question}\n"
    "Standalone question:"
)
CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)

In [33]:
# turn verbose to true to see the full logs and documents
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferWindowMemory

# Conversation memory: a ConversationBufferWindowMemory configured with k=3 and
# exposed to the chain under "chat_history". `output_key="answer"` names the
# chain output to record, since the chain below also returns source documents.
memory = ConversationBufferWindowMemory(
    k=3,
    memory_key="chat_history",
    output_key="answer",
    return_messages=True,
)

# Retriever over the FAISS store, using plain similarity search.
faiss_retriever = vectorstore_faiss_aws.as_retriever(search_type="similarity")

# The same Bedrock LLM both condenses follow-up questions and answers them.
qa = ConversationalRetrievalChain.from_llm(
    llm=br_llm,
    condense_question_llm=br_llm,
    condense_question_prompt=CONDENSE_QUESTION_PROMPT,
    retriever=faiss_retriever,
    memory=memory,
    chain_type="map_reduce",
    return_source_documents=True,
    verbose=False,  # set to True to see the full logs and documents
)

In [34]:
# First question of the conversation; `result` holds the chain's output dict.
question = "How much is the third party damage limit?"
result = qa({"question": question})

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.
Token indices sequence length is longer than the specified maximum sequence length for this model (1789 > 1024). Running this sequence through the model will result in indexing errors


In [35]:
# Show the model's answer, a blank line, then each source chunk the chain
# returned alongside it (numbered from 1).
print("AI Answer : ", result["answer"], "", "Referance : ", sep="\n")
for rank, source_doc in enumerate(result["source_documents"], start=1):
    source = source_doc.metadata['source']
    print(f"{rank}. From {source} ss return : \n")
    print(f"Content \n{source_doc.page_content} \n")

AI Answer : 
 Unfortunately the provided text does not contain enough information to determine the specific amount of the third party damage limit. The text discusses various types of coverage but does not mention a specific limit for third party damage. Without that key detail in the text, I do not have enough context to provide a definitive answer to the question asked.

Referance : 
1. From single_v_docs/insurance_motor_car_motor_policy_booklet_241017_NMDMG10249_v3.pdf ss return : 

Content 
vehicle  after unloading it.
• 
 any claim if any 
persons insured under this section do not keep to the terms and 
conditions of this policy.
Section 3. Motor Injury Protection
There is no cover under this section if you have purchased Third Party, Fire and Theft cover.
If the vehicle policyholder, their partner or named drivers suffer accidental bodily injury as a result of:•
  a r
oad traffic accident in direct connection with the use of your vehicle  
 
and/or
•
  tr
avelling in, getting in 

In [12]:
# Follow-up question in the same conversation; the bare `result` on the last
# line displays the full output dict (question, chat history, answer, ...).
question = "What is my policy number?"
result = qa({"question": question})
result

{'question': 'What is my policy number?',
 'chat_history': [HumanMessage(content='How much is the third party damage limit?'),
  AIMessage(content=' Based on the context provided, I do not see a specific amount stated for the third party damage limit. The relevant section says:\n\n"Persons insured are covered against all amounts which may have to be paid as a result of them being legally liable for an accident, involving your vehicle, resulting in:\n- another person’s death or injury  \n- damage to another person’s property (up to a maximum amount as shown in your schedule, plus an additional amount to cover claimant’s costs and expenses)."\n\nHowever, a specific limit amount is not provided. The passage states the limit is "a maximum amount as shown in your schedule", but the schedule is not included. So unfortunately I cannot determine the specific third party damage limit based on the information given.')],
 'answer': " Unfortunately there is no policy number provided in the given c

In [13]:
# Show the model's answer, a blank line, then each source chunk the chain
# returned alongside it (numbered from 1).
print("AI Answer : ", result["answer"], "", "Referance : ", sep="\n")
for rank, source_doc in enumerate(result["source_documents"], start=1):
    source = source_doc.metadata['source']
    print(f"{rank}. From {source} ss return : \n")
    print(f"Content \n{source_doc.page_content} \n")

" Unfortunately there is no policy number provided in the given context. The text mentions contacting a helpline and providing a policy number, but does not actually specify what the policy number is. Since a specific policy number is not provided, I don't know what it is."