# Installing required modules

%pip install --quiet --no-build-isolation --force-reinstall \
    "boto3" \
    "awscli" \
    "botocore" \
    "faiss-cpu" \
    "langchain" \
    "pypdf" \
    "sqlalchemy" \
    "pickle5" \
    "transformers"

# Connecting to aws bedrock service & get a client

In [2]:
import json
import os
import sys

import boto3

module_path = ".."
sys.path.append(os.path.abspath(module_path))
from utils import bedrock, print_ww


# ---- ⚠️ Un-comment and edit the below lines as needed for your AWS setup ⚠️ ----

os.environ["AWS_DEFAULT_REGION"] = "us-west-2"  # E.g. "us-east-1"
# os.environ["AWS_PROFILE"] = "<YOUR_PROFILE>"
os.environ["BEDROCK_ASSUME_ROLE"] = "arn:aws:iam::195364414018:role/Crossaccountbedrock"  # E.g. "arn:aws:..."

boto3_bedrock = bedrock.get_bedrock_client(
    assumed_role=os.environ.get("BEDROCK_ASSUME_ROLE", None),
    region=os.environ.get("AWS_DEFAULT_REGION", None)
)

Create new client
  Using region: us-west-2
  Using role: arn:aws:iam::195364414018:role/Crossaccountbedrock ... successful!
boto3 Bedrock client successfully created!
bedrock-runtime(https://bedrock-runtime.us-west-2.amazonaws.com)


# Creating object of embedding and llm

In [3]:
from langchain.embeddings import BedrockEmbeddings
from langchain.llms.bedrock import Bedrock

br_embeddings = BedrockEmbeddings(
    model_id="amazon.titan-embed-text-v1", 
    client=boto3_bedrock
)

br_llm = Bedrock(
    model_id="anthropic.claude-v2",
    client=boto3_bedrock,
    model_kwargs={"temperature":0.1}
)

is_login=True
user="MV123456789"

# Load PDF files from dir and store in vectorstore

In [4]:
from langchain.document_loaders import PyPDFDirectoryLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain.vectorstores import FAISS
import os

chunk_size = 1000
chunk_overlap = 500
text_splitter = CharacterTextSplitter(
    chunk_size=chunk_size, 
    chunk_overlap=chunk_overlap,
    length_function = len,
)


In [5]:
pages = []
files = os.listdir("public/")
for file_name in files:
    if file_name.endswith(".pdf"):
        loader = PyPDFLoader("public/" + file_name)
        pages.extend(loader.load())

docs, metadata = [], []

for i in range(len(pages)):
    print(f'Spliting {pages[i].metadata["source"]} page number : {pages[i].metadata["page"]+1}')
    splits =  text_splitter.split_text(pages[i].page_content)
    docs.extend(splits)
    metadata.extend([{"source": pages[i].metadata["source"], "page_number": pages[i].metadata["page"]+1}] * len(splits))

pub_vs = FAISS.from_texts(
    docs,
    br_embeddings,
    metadatas=metadata,
)

print(f"pub_vs: number of elements in the index={pub_vs.index.ntotal}")

Spliting public/nmdmg14060.pdf page number : 1
Spliting public/nmdmg14060.pdf page number : 2
Spliting public/insurance-motor-car-motoring-in-europe-241017-NMDMG2918.pdf page number : 1
Spliting public/insurance-motor-car-motoring-in-europe-241017-NMDMG2918.pdf page number : 2
Spliting public/insurance-motor-car-motoring-in-europe-241017-NMDMG2918.pdf page number : 3
Spliting public/insurance-motor-car-motoring-in-europe-241017-NMDMG2918.pdf page number : 4
Spliting public/insurance-motor-car-motoring-in-europe-241017-NMDMG2918.pdf page number : 5
Spliting public/insurance-motor-car-motoring-in-europe-241017-NMDMG2918.pdf page number : 6
Spliting public/insurance-motor-car-motoring-in-europe-241017-NMDMG2918.pdf page number : 7
Spliting public/insurance-motor-car-motoring-in-europe-241017-NMDMG2918.pdf page number : 8
Spliting public/insurance-motor-car-motoring-in-europe-241017-NMDMG2918.pdf page number : 9
Spliting public/insurance-motor-car-motoring-in-europe-241017-NMDMG2918.pdf pa

In [6]:
print(len(pages))
print(pages[77].metadata)

79
{'source': 'public/insurance-motor-important-information-document-NMDMG10248.pdf', 'page': 2}


In [7]:
loader = PyPDFDirectoryLoader("private")
pages = loader.load()

docs, metadata = [], []

for i in range(len(pages)):
    print(f'Spliting {pages[i].metadata["source"]} page number : {pages[i].metadata["page"]+1}')
    splits =  text_splitter.split_text(pages[i].page_content)
    docs.extend(splits)
    metadata.extend([{"source": pages[i].metadata["source"], "page_number": pages[i].metadata["page"]+1}] * len(splits))

pvt_vs = FAISS.from_texts(
    docs,
    br_embeddings,
    metadatas=metadata,
)

print(f"pvt_vs: number of elements in the index={pvt_vs.index.ntotal}")

Spliting private/policy_certifcate_multiple_vehicle.pdf page number : 1
Spliting private/policy_certifcate_multiple_vehicle.pdf page number : 2
Spliting private/policy_certifcate_multiple_vehicle.pdf page number : 3
Spliting private/policy_certifcate_multiple_vehicle.pdf page number : 4
Spliting private/policy_certifcate_multiple_vehicle.pdf page number : 5
Spliting private/policy_certifcate_multiple_vehicle.pdf page number : 6
Spliting private/policy_certifcate_multiple_vehicle.pdf page number : 7
Spliting private/policy_certifcate_multiple_vehicle.pdf page number : 8
Spliting private/policy_certifcate_multiple_vehicle.pdf page number : 9
Spliting private/policy_certifcate_multiple_vehicle.pdf page number : 10
Spliting private/policy_certifcate_multiple_vehicle.pdf page number : 11
Spliting private/policy_certifcate_multiple_vehicle.pdf page number : 12
Spliting private/policy_certifcate_multiple_vehicle.pdf page number : 13
Spliting private/policy_certifcate_multiple_vehicle.pdf page

# Save vector store for later use 

In [8]:
# import pickle
# print(type(pub_vs))
# with open("pub_vs.pkl", "wb") as f:
#     pickle.dump(pub_vs, f)

# Adding index wapper to vector store for faster querying

In [9]:
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
pub_vs_index = VectorStoreIndexWrapper(vectorstore=pub_vs)
print("Test run query in pub_vs_index:")
print_ww(pub_vs_index.query("Account name of the policy?", llm=br_llm))

pvt_vs_index = VectorStoreIndexWrapper(vectorstore=pvt_vs)
print("Test run query in pvt_vs_index:")
print_ww(pvt_vs_index.query("Account name of the policy?", llm=br_llm))

Test run query in pub_vs_index:
 Unfortunately I do not have enough context to determine the account name or policyholder for this
insurance policy, as the excerpt provided does not specify that information. The excerpt discusses
cancellation rights, claims history, renewing insurance, etc. but does not provide the specific
account name or policyholder details. Without seeing the full policy documents or having additional
context about who this policy belongs to, I cannot confidently determine the account name or
policyholder. I apologize that I do not have enough information provided to answer who this specific
motor insurance policy is held by.
Test run query in pvt_vs_index:
 Based on the policy documents provided, the account name or principal policyholder for this
insurance policy is Mr. Test Tester.


# Creating a function to extract context from pub and pvt PDFS.

In [10]:
def get_pdf_context(query):
    # TODO: is user login 
    print("Getting PDF context from pub_vs_index")
    pub_context=""
    for k, i in enumerate(pub_vs_index.vectorstore.similarity_search(query, k=1)):
        pub_context += f"Source Document - {i.metadata['source']}\nPage Number - {i.metadata['page_number']}\nContext - {i.page_content}\n"
        # print(f"Content \n{i.page_content} \n")

    print("Getting PDF context from pvt_vs_index")
    pvt_context = ""
    for k, i in enumerate(pvt_vs_index.vectorstore.similarity_search(query, k=1)):
        pvt_context += f"Source Document - {i.metadata['source']}\nPage Number - {i.metadata['page_number']}\nContext - {i.page_content}\n"
        # print(f"Content \n{i.page_content} \n")
    return pub_context, pvt_context

In [11]:
query = "What will be my Cancellation fees and charges?"
pub_context, pvt_context = get_pdf_context(query)
print(pvt_context)

Getting PDF context from pub_vs_index
Getting PDF context from pvt_vs_index
Source Document - private/policy_certifcate_multiple_vehicle.pdf
Page Number - 4
Context - AvivaUKDigitalLimited StHelen’s,1Undershaft,LondonEC3P3DQ
Whoregulatesus?
TheFinancialConductAuthority(FCA)istheindependentwatchdogthatregulatesfinancialservices,including
insurance.WeareauthorisedandregulatedbytheFCAasaninsuranceintermediaryandsellinsuranceproducts,
actingonbehalfoftheinsurer.Wearealsopermittedtoactasacreditbrokerinrelationtothearrangingofamonthly
premiumpaymentfacilitytofinancethoseinsuranceproducts.WeareregisteredwiththeFinancialConductAuthority
as:AvivaUKDigitalLimited,StHelen’s,1Undershaft,LondonEC3P3DQ.OurFinancialServicesRegisternumberis
728985.
YoumaycheckthisinformationandobtainfurtherinformationabouthowtheFCAprotectsyoubyvisitingtheFCA’s
websiteatwww.fca.org.uk/register.
Whoseproductsdoweoffer?
WeareonlyabletoofferinformationoninsuranceproductsfromAvivaInsuranceLimitedapartfromBreakdown
(eithers

# Creating Chat History Memory

In [12]:
from langchain.memory import ConversationBufferWindowMemory, ConversationSummaryMemory # llm 

# store previous interactions using ConversationalBufferMemory and add custom prompts to the chat.
memory = ConversationBufferWindowMemory(
    # llm=br_llm,
    input_key="customer_query",
    memory_key="history", 
    return_messages=False, 
    k=2,
    ai_prefix="Assistant",
    human_prefix="Human",
)

# Prompt template for chain

In [13]:
from langchain.prompts.prompt import PromptTemplate

_template = """
Human: If customer start the conversation to greet you at any time, please greet them according to the time of the day and don't refer to any documents also don't return source document name in the responce.

Assistant: I will definetly greet the customer according to the time of the day and I won't refer to any documents also I won't return any source document name in the responce.

{history}

Human:
Consider the following private document : 
{pvt_context} 
Please identify the context most relevant to the question "{customer_query}" unless the question is a greeting and copy them out word-for-word. 
If there are no context in this document than refer to this public document : 
{pub_context} 
that seem relevant to "{customer_query}" question and copy them out word-for-word. 
If you couldn't find any relevant context, please reply to customer politely that you don't know the answer to the question could they rephrase the question.
Return the responce in exact json format given below:
```json
{{
	"result_message":"<Assistant Responce>",
	"source":"<source_document_name>",
	"page_number":"<source_page_number>",
	"search_test":"<word-for-word copied from document>",
}}
```
Assistant:
"""
PROMPT = PromptTemplate(template=_template, input_variables=["customer_query", "pub_context", "pvt_context", "history"])

In [14]:
# from langchain.prompts.prompt import PromptTemplate

# _template = """
# Human: If customer start the conversation to greet you at any time, please greet them according to the time of the day and don't refer to any documents also don't return source document name in the responce.

# Assistant: I will definetly greet the customer according to the time of the day and I won't refer to any documents also I won't return any source document name in the responce.

# {history}

# Human:
# Consider the following document: {pvt_context} Please identify the context most relevant to the question unless the question is a greeting "{customer_query}" and copy them out word-for-word. 
# If there are no context in this document than refer to this document {pub_context} that seem relevant to this question and copy them out word-for-word. 
# If you couldn't find any relevant context, please reply to customer politely that you don't know the answer to the question could they rephrase the question.
# Return the source document name of the context you choose to answer the question in below format:
# Referenced Documents : <document_name>

# Assistant:
# """
# PROMPT = PromptTemplate(template=_template, input_variables=["customer_query", "pub_context", "pvt_context", "history"])

In [15]:
# from langchain.prompts.prompt import PromptTemplate

# _template = """
# Human: If customer start the conversation to greet you at any time, please greet them according to the time of the day and don't refer to any documents also don't return source document name in the responce.

# Assistant: Ok, I will definetly greet the customer according to the time of the day and I won't refer to any documents also I won't return any source document name in the responce.

# Human: You are a Insurance Assistant for Aviva.

# Assistant: Ok I am Insurance Assistant for Aviva.

# Human: Your primary role to answer the customer question with the private context, public context and previous chat history.

# Assistant: Yes, my primary task is to answer customer query politely.

# {history}

# Human:
# Consider the following document: {pvt_context} Please identify the context most relevant to the question unless the question is a greeting "{customer_query}" and copy them out word-for-word. 
# If there are no context in this document than refer to this document {pub_context} that seem relevant to this question and copy them out word-for-word. 
# If you couldn't find any relevant context, please reply to customer politely that you don't know the answer to the question could they rephrase the question.
# Return the source document name of the context you choose to answer the question in below format:
# Referenced Documents : <document_name>

# Assistant:
# """
# PROMPT = PromptTemplate(template=_template, input_variables=["customer_query", "pub_context", "pvt_context", "history"])

In [16]:
# turn verbose to True to see the full logs and documents
from langchain.chains import LLMChain

qa = LLMChain(
    llm=br_llm, 
    verbose=True, 
    prompt=PROMPT,
    memory=memory,
)

def ask_copilot(query):
    pub_context, pvt_context = get_pdf_context(query)
    result = qa.predict(customer_query=query, pvt_context=pvt_context, pub_context=pub_context)
    print(f"\nCustomer: {query}\nCopilot : {result}")

In [17]:
ask_copilot("What are all the document we need to get motor policy?")

Getting PDF context from pub_vs_index
Getting PDF context from pvt_vs_index


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
Human: If customer start the conversation to greet you at any time, please greet them according to the time of the day and don't refer to any documents also don't return source document name in the responce.

Assistant: I will definetly greet the customer according to the time of the day and I won't refer to any documents also I won't return any source document name in the responce.



Human:
Consider the following private document : 
Source Document - private/policy_certifcate_multiple_vehicle.pdf
Page Number - 11
Context - Yourpolicyschedule-continued
A1MiniMiniCooperS(122)3h(2007) EffectiveDate27September2023EndDate26September2024
1598Petrol
Automatic3DoorHatchback
Vehiclepolicyholder:Mr.TestTester
Drivingoption: Vehiclepolicyholderonly
-Thisvehiclehascomprehensivecoverbasedon£450excessandupto8000milesayearperyearofSocial,Domes

In [18]:
ask_copilot("What will be my Cancellation fees and charges?")

Getting PDF context from pub_vs_index
Getting PDF context from pvt_vs_index


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
Human: If customer start the conversation to greet you at any time, please greet them according to the time of the day and don't refer to any documents also don't return source document name in the responce.

Assistant: I will definetly greet the customer according to the time of the day and I won't refer to any documents also I won't return any source document name in the responce.

Human: What are all the document we need to get motor policy?
Assistant:  Unfortunately I could not find any relevant context in the private document that seems relevant to answering "What are all the document we need to get motor policy?". 

As suggested, here is the relevant context from the public document:

```json
{
	"result_message":"I'm afraid I don't have enough context to directly answer the question 'What are all the document we need to get 

In [19]:
ask_copilot("How my Personal Information will be processed?")

Getting PDF context from pub_vs_index
Getting PDF context from pvt_vs_index


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
Human: If customer start the conversation to greet you at any time, please greet them according to the time of the day and don't refer to any documents also don't return source document name in the responce.

Assistant: I will definetly greet the customer according to the time of the day and I won't refer to any documents also I won't return any source document name in the responce.

Human: What are all the document we need to get motor policy?
Assistant:  Unfortunately I could not find any relevant context in the private document that seems relevant to answering "What are all the document we need to get motor policy?". 

As suggested, here is the relevant context from the public document:

```json
{
	"result_message":"I'm afraid I don't have enough context to directly answer the question 'What are all the document we need to get 