# Installing required modules

In [2]:
%pip install --quiet --no-build-isolation --force-reinstall \
    "boto3" \
    "awscli" \
    "botocore" \
    "faiss-cpu" \
    "langchain" \
    "pypdf" \
    "sqlalchemy" \
    "pickle5" \
    "transformers"

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
daal4py 2021.6.0 requires daal==2021.4.0, which is not installed.
spyder 5.3.3 requires pyqt5<5.16, which is not installed.
spyder 5.3.3 requires pyqtwebengine<5.16, which is not installed.
distributed 2022.7.0 requires tornado<6.2,>=6.0.3, but you have tornado 6.3.3 which is incompatible.
jupyterlab 3.4.4 requires jupyter-server~=1.16, but you have jupyter-server 2.7.3 which is incompatible.
jupyterlab-server 2.10.3 requires jupyter-server~=1.4, but you have jupyter-server 2.7.3 which is incompatible.
notebook 6.5.5 requires jupyter-client<8,>=5.3.4, but you have jupyter-client 8.3.1 which is incompatible.
notebook 6.5.5 requires pyzmq<25,>=17, but you have pyzmq 25.1.1 which is incompatible.
numba 0.55.1 requires numpy<1.22,>=1.18, but you have numpy 1.26.0 which is incompatible.
panel 0.13.1 requires bokeh

# Connecting to aws bedrock service & get a client

In [3]:
import json
import os
import sys

import boto3

module_path = ".."
sys.path.append(os.path.abspath(module_path))
from utils import bedrock, print_ww


# ---- ⚠️ Un-comment and edit the below lines as needed for your AWS setup ⚠️ ----

os.environ["AWS_DEFAULT_REGION"] = "us-west-2"  # E.g. "us-east-1"
# os.environ["AWS_PROFILE"] = "<YOUR_PROFILE>"
os.environ["BEDROCK_ASSUME_ROLE"] = "arn:aws:iam::195364414018:role/Crossaccountbedrock"  # E.g. "arn:aws:..."

boto3_bedrock = bedrock.get_bedrock_client(
    assumed_role=os.environ.get("BEDROCK_ASSUME_ROLE", None),
    region=os.environ.get("AWS_DEFAULT_REGION", None)
)

Create new client
  Using region: us-west-2
  Using role: arn:aws:iam::195364414018:role/Crossaccountbedrock ... successful!
boto3 Bedrock client successfully created!
bedrock-runtime(https://bedrock-runtime.us-west-2.amazonaws.com)


# Creating object of embedding and llm

In [4]:
from langchain.embeddings import BedrockEmbeddings
from langchain.llms.bedrock import Bedrock

br_embeddings = BedrockEmbeddings(
    model_id="amazon.titan-embed-text-v1", 
    client=boto3_bedrock
)

br_llm = Bedrock(
    model_id="anthropic.claude-v2",
    client=boto3_bedrock,
    model_kwargs={"temperature":0.1}
)

# Load PDF files from dir and store in vectorstore

In [5]:
from langchain.document_loaders import PyPDFDirectoryLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS

loader = PyPDFDirectoryLoader("public")
pages = loader.load()

chunk_size = 1000
chunk_overlap = 500

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size, 
    chunk_overlap=chunk_overlap,
    length_function = len,
)

docs, metadata = [], []

for i in range(len(pages)):
    print(f"Spliting the content with length", len(pages[i].page_content))
    splits = text_splitter.split_text(pages[i].page_content)
    docs.extend(splits)
    metadata.extend([{"source": pages[i].metadata["source"]}] * len(splits))

pub_vs = FAISS.from_texts(
    docs,
    br_embeddings,
    metadatas=metadata,
)

print(f"pub_vs: number of elements in the index={pub_vs.index.ntotal}")

loader = PyPDFLoader("policy_certifcate_multiple_vehicle.pdf")
pages = loader.load()

docs, metadata = [], []

for i in range(len(pages)):
    print(f"Spliting the content with length", len(pages[i].page_content))
    splits = text_splitter.split_text(pages[i].page_content)
    docs.extend(splits)
    metadata.extend([{"source": pages[i].metadata["source"]}] * len(splits))

pvt_vs = FAISS.from_texts(
    docs,
    br_embeddings,
    metadatas=metadata,
)

print(f"pvt_vs: number of elements in the index={pvt_vs.index.ntotal}")


Spliting the content with length 5757
Spliting the content with length 4317
Spliting the content with length 280
Spliting the content with length 4045
Spliting the content with length 4043
Spliting the content with length 6742
Spliting the content with length 3221
Spliting the content with length 5830
Spliting the content with length 5840
Spliting the content with length 6663
Spliting the content with length 3329
Spliting the content with length 3385
Spliting the content with length 4846
Spliting the content with length 6515
Spliting the content with length 6493
Spliting the content with length 6994
Spliting the content with length 7020
Spliting the content with length 332
Spliting the content with length 35
Spliting the content with length 391
Spliting the content with length 1473
Spliting the content with length 2889
Spliting the content with length 2218
Spliting the content with length 2405
Spliting the content with length 826
Spliting the content with length 3436
Spliting the conte

# Save vector store for later use 

In [6]:
# import pickle
# print(type(pub_vs))
# with open("pub_vs.pkl", "wb") as f:
#     pickle.dump(pub_vs, f)

# Adding index wapper to vector store for faster querying

In [7]:
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
pub_vs_index = VectorStoreIndexWrapper(vectorstore=pub_vs)
print("Test run query in pub_vs_index:")
print_ww(pub_vs_index.query("Account name of the policy?", llm=br_llm))

pvt_vs_index = VectorStoreIndexWrapper(vectorstore=pvt_vs)
print("Test run query in pvt_vs_index:")
print_ww(pvt_vs_index.query("Account name of the policy?", llm=br_llm))

Test run query in pub_vs_index:
 Based on the policy details provided, I do not see an explicit "account name" for the policy. The
policy refers to a "principal policyholder" who entered into the insurance contract, and "vehicle
policyholders" who are main users of insured vehicles. But there is no specific "account name"
mentioned for the policy itself.
Test run query in pvt_vs_index:
 Based on the context provided, the account name for the policy is TestTester. The relevant line
states:

Accountname TestTester

So the account name is TestTester.


# Creating a function to extract context from pub and pvt PDFS.

In [8]:
def get_pdf_context(query):
    # TODO: is user login 
    print("PDF context from pub_vs_index")
    pub_context=""
    for k, i in enumerate(pub_vs_index.vectorstore.similarity_search(query, k=1)):
        print(f"{k+1}. From {i.metadata['source']}\n")
        pub_context += f"{k+1}. From {i.metadata['source']} document :\n{i.page_content}\n"
        # print(f"Content \n{i.page_content} \n")

    print("PDF context from pvt_vs_index")
    pvt_context = ""
    for k, i in enumerate(pvt_vs_index.vectorstore.similarity_search(query, k=1)):
        print(f"{k+1}. From {i.metadata['source']}\n")
        pvt_context += f"{k+1}. From {i.metadata['source']} document :\n{i.page_content}\n"
        # print(f"Content \n{i.page_content} \n")
    return pub_context, pvt_context

# Prompt template for chain

In [9]:
from langchain.prompts.prompt import PromptTemplate

_template = """
Human: If customer start the conversation to greet you at any time, please greet them according to the time of the day and don't refer to any documents also don't return source document name in the responce.

Assistant: I will definetly greet the customer according to the time of the day and I won't refer to any documents also I won't return any source document name in the responce.

{history}

Human:
Consider the following document: {pvt_context} Please identify the context most relevant to the question unless the question is a greeting "{customer_query}" and copy them out word-for-word. 
If there are no context in this document than refer to this document {pub_context} that seem relevant to this question and copy them out word-for-word. 
If you couldn't find any relevant context, please reply to customer politely that you don't know the answer to the question could they rephrase the question.
Return the source document name of the context you choose to answer the question in below format:
Referenced Documents : <document_name>

Assistant:
"""
PROMPT = PromptTemplate(template=_template, input_variables=["customer_query", "pub_context", "pvt_context", "history"])

In [10]:
# from langchain.prompts.prompt import PromptTemplate

# _template = """
# Human: If customer start the conversation to greet you at any time, please greet them according to the time of the day and don't refer to any documents also don't return source document name in the responce.

# Assistant: Ok, I will definetly greet the customer according to the time of the day and I won't refer to any documents also I won't return any source document name in the responce.

# Human: You are a Insurance Assistant for Aviva.

# Assistant: Ok I am Insurance Assistant for Aviva.

# Human: Your primary role to answer the customer question with the private context, public context and previous chat history.

# Assistant: Yes, my primary task is to answer customer query politely.

# {history}

# Human:
# Consider the following document: {pvt_context} Please identify the context most relevant to the question unless the question is a greeting "{customer_query}" and copy them out word-for-word. 
# If there are no context in this document than refer to this document {pub_context} that seem relevant to this question and copy them out word-for-word. 
# If you couldn't find any relevant context, please reply to customer politely that you don't know the answer to the question could they rephrase the question.
# Return the source document name of the context you choose to answer the question in below format:
# Referenced Documents : <document_name>

# Assistant:
# """
# PROMPT = PromptTemplate(template=_template, input_variables=["customer_query", "pub_context", "pvt_context", "history"])

In [11]:
# turn verbose to true to see the full logs and documents
from langchain.chains import LLMChain
from langchain.memory import ConversationBufferWindowMemory, ConversationSummaryMemory # llm 

# store previous interactions using ConversationalBufferMemory and add custom prompts to the chat.
memory = ConversationBufferWindowMemory(
    # llm=br_llm,
    input_key="customer_query",
    memory_key="history", 
    return_messages=False, 
    k=3,
    ai_prefix="Assistant",
    human_prefix="Human",
)

qa = LLMChain(
    llm=br_llm, 
    verbose=False, 
    prompt=PROMPT,
    memory=memory,
)

In [12]:
# query="Hi, who are you ?"
# pub_context, pvt_context = get_pdf_context(query)
# result = qa.predict(customer_query=query, pvt_context=pvt_context, pub_context=pub_context)
# print(f"AI Answer : {result}")

In [13]:
query="Account name of the policy?"
pub_context, pvt_context = get_pdf_context(query)
result = qa.predict(customer_query=query, pvt_context=pvt_context, pub_context=pub_context)
print(result)

PDF context from pub_vs_index
1. From public/insurance_motor_car_motor_policy_booklet_241017_NMDMG10249_v3.pdf

PDF context from pvt_vs_index
1. From policy_certifcate_multiple_vehicle.pdf

 Referenced Documents : policy_certifcate_multiple_vehicle.pdf

Account name TestTester


In [14]:
query="What will be my Cancellation fees and charges?"
pub_context, pvt_context = get_pdf_context(query)
result = qa.predict(customer_query=query, pvt_context=pvt_context, pub_context=pub_context)
print(result)

PDF context from pub_vs_index
1. From public/insurance_motor_car_motor_policy_booklet_241017_NMDMG10249_v3.pdf

PDF context from pvt_vs_index
1. From policy_certifcate_multiple_vehicle.pdf

 Referenced Documents: public/insurance_motor_car_motor_policy_booklet_241017_NMDMG10249_v3.pdf

The table headed ‘Our cancellation fees’ which can be found in your ‘Aviva Motor Important Information’ document gives details of when a cancellation fee will be charged.


In [15]:
query="How my Personal Information will be processed?"
pub_context, pvt_context = get_pdf_context(query)
result = qa.predict(customer_query=query, pvt_context=pvt_context, pub_context=pub_context)
print(result)

PDF context from pub_vs_index
1. From public/insurance-motor-important-information-document-NMDMG10248.pdf

PDF context from pvt_vs_index
1. From policy_certifcate_multiple_vehicle.pdf

 Referenced Documents: policy_certifcate_multiple_vehicle.pdf

Following the complaints procedure does not affect your right to take legal action. Further details of our complaints procedure can be found in your insurance documents, or may be obtained from your usual Aviva UK Digital Limited contact.
