# Installing required modules

%pip install --quiet --no-build-isolation --force-reinstall \
    "boto3" \
    "awscli" \
    "botocore" \
    "faiss-cpu" \
    "langchain" \
    "pypdf" \
    "unstructured[pdf]" \
    "sqlalchemy" \
    "pickle5" \
    "transformers"

# Connecting to aws bedrock service & get a client

In [2]:
import json
import os
import sys

import boto3

module_path = ".."
sys.path.append(os.path.abspath(module_path))
from utils import bedrock, print_ww


# ---- ⚠️ Un-comment and edit the below lines as needed for your AWS setup ⚠️ ----

os.environ["AWS_DEFAULT_REGION"] = "us-west-2"  # E.g. "us-east-1"
# os.environ["AWS_PROFILE"] = "<YOUR_PROFILE>"
os.environ["BEDROCK_ASSUME_ROLE"] = "arn:aws:iam::195364414018:role/Crossaccountbedrock"  # E.g. "arn:aws:..."

boto3_bedrock = bedrock.get_bedrock_client(
    assumed_role=os.environ.get("BEDROCK_ASSUME_ROLE", None),
    region=os.environ.get("AWS_DEFAULT_REGION", None)
)

Create new client
  Using region: us-west-2
  Using role: arn:aws:iam::195364414018:role/Crossaccountbedrock ... successful!
boto3 Bedrock client successfully created!
bedrock-runtime(https://bedrock-runtime.us-west-2.amazonaws.com)


# Creating object of embedding and llm

In [3]:
from langchain.embeddings import BedrockEmbeddings
from langchain.llms.bedrock import Bedrock

br_embeddings = BedrockEmbeddings(
    model_id="amazon.titan-embed-text-v1", 
    client=boto3_bedrock
)

br_llm = Bedrock(
    model_id="anthropic.claude-v2",
    client=boto3_bedrock,
    model_kwargs={"temperature":0.5}
)

is_login=True
user="MV123456789"

# Load PDF files from dir and store in vectorstore

In [4]:
from langchain.document_loaders import PyPDFDirectoryLoader, PyPDFLoader, UnstructuredFileLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain.vectorstores import FAISS
import os

chunk_size = 2000
chunk_overlap = 500
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size, 
    chunk_overlap=chunk_overlap,
    length_function = len,
)


# Getting element by element text from PDF to get doc name & element page no.

In [5]:
elements = []
files = os.listdir("public/")
for file_name in files:
    if file_name.endswith(".pdf"):
        loader = UnstructuredFileLoader("public/" + file_name, mode="elements", strategy="fast")
        elements.extend(loader.load())
print("All the elements from PDF is loaded")

The PDF <_io.BufferedReader name='public/nmdmg14060.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case
The PDF <_io.BufferedReader name='public/avivaplus-breakdown-cover-policy-wording.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case
The PDF <_io.BufferedReader name='public/insurance_motor_car_motor_policy_booklet_241017_NMDMG10249_v3.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case


All the elements from PDF is loaded


# With source and page number as unique key, joining all document text and creating pages

In [6]:
pages = [elements[0]]
for element in elements[1:]:
    if pages[-1].metadata['source'] == element.metadata['source'] and pages[-1].metadata['page_number'] == element.metadata['page_number']:
        pages[-1].page_content += " "+element.page_content
    else:
        pages.append(element)
print("All the pages are splitted")

All the pages are splitted


# spliting pages with our custome splitter & generating vector store

In [7]:
docs, metadata = [], []

for i in range(len(pages)):
    print(f'Spliting {pages[i].metadata["source"]} page number : {pages[i].metadata["page_number"]}')
    splits =  text_splitter.split_text(pages[i].page_content)
    docs.extend(splits)
    metadata.extend([{"source": pages[i].metadata["source"], "page_number": pages[i].metadata["page_number"]}] * len(splits))

pub_vs = FAISS.from_texts(
    docs,
    br_embeddings,
    metadatas=metadata,
)

print(f"pub_vs: number of elements in the index={pub_vs.index.ntotal}")

Spliting public/nmdmg14060.pdf page number : 1
Spliting public/nmdmg14060.pdf page number : 2
Spliting public/insurance-motor-car-motoring-in-europe-241017-NMDMG2918.pdf page number : 1
Spliting public/insurance-motor-car-motoring-in-europe-241017-NMDMG2918.pdf page number : 2
Spliting public/insurance-motor-car-motoring-in-europe-241017-NMDMG2918.pdf page number : 3
Spliting public/insurance-motor-car-motoring-in-europe-241017-NMDMG2918.pdf page number : 4
Spliting public/insurance-motor-car-motoring-in-europe-241017-NMDMG2918.pdf page number : 5
Spliting public/insurance-motor-car-motoring-in-europe-241017-NMDMG2918.pdf page number : 6
Spliting public/insurance-motor-car-motoring-in-europe-241017-NMDMG2918.pdf page number : 7
Spliting public/insurance-motor-car-motoring-in-europe-241017-NMDMG2918.pdf page number : 8
Spliting public/insurance-motor-car-motoring-in-europe-241017-NMDMG2918.pdf page number : 9
Spliting public/insurance-motor-car-motoring-in-europe-241017-NMDMG2918.pdf pa

# Createing vector store for private document 

In [8]:
loader = UnstructuredFileLoader("private/policy_certifcate_multiple_vehicle_edited.pdf", mode="elements", strategy="fast")
elements = loader.load()
print("All the elements from PDF is loaded")

pages = [elements[0]]
for element in elements[1:]:
    if pages[-1].metadata['source'] == element.metadata['source'] and pages[-1].metadata['page_number'] == element.metadata['page_number']:
        pages[-1].page_content += " "+element.page_content
    else:
        pages.append(element)
print("All the pages are splitted")

        
docs, metadata = [], []

for i in range(len(pages)):
    print(f'Spliting {pages[i].metadata["source"]} page number : {pages[i].metadata["page_number"]}')
    splits =  text_splitter.split_text(pages[i].page_content)
    docs.extend(splits)
    metadata.extend([{"source": pages[i].metadata["source"], "page_number": pages[i].metadata["page_number"]}] * len(splits))

pvt_vs = FAISS.from_texts(
    docs,
    br_embeddings,
    metadatas=metadata,
)

print(f"pvt_vs: number of elements in the index={pvt_vs.index.ntotal}")

All the elements from PDF is loaded
All the pages are splitted
Spliting private/policy_certifcate_multiple_vehicle_edited.pdf page number : 1
Spliting private/policy_certifcate_multiple_vehicle_edited.pdf page number : 2
Spliting private/policy_certifcate_multiple_vehicle_edited.pdf page number : 3
Spliting private/policy_certifcate_multiple_vehicle_edited.pdf page number : 4
Spliting private/policy_certifcate_multiple_vehicle_edited.pdf page number : 5
Spliting private/policy_certifcate_multiple_vehicle_edited.pdf page number : 6
Spliting private/policy_certifcate_multiple_vehicle_edited.pdf page number : 7
Spliting private/policy_certifcate_multiple_vehicle_edited.pdf page number : 8
Spliting private/policy_certifcate_multiple_vehicle_edited.pdf page number : 9
Spliting private/policy_certifcate_multiple_vehicle_edited.pdf page number : 10
Spliting private/policy_certifcate_multiple_vehicle_edited.pdf page number : 11
Spliting private/policy_certifcate_multiple_vehicle_edited.pdf pag

# Save vector store for later use 

In [9]:
# import pickle
# print(type(pub_vs))
# with open("pub_vs.pkl", "wb") as f:
#     pickle.dump(pub_vs, f)

# Adding index wapper to vector store for faster querying

In [10]:
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
pub_vs_index = VectorStoreIndexWrapper(vectorstore=pub_vs)
# print("Test run query in pub_vs_index:")
# query = "What are the vehicles covered in the policy?"
# print_ww(pub_vs_index.query(f"'{query}' is this question answerable ? please return only 'TRUE' keyword if it is answerable else 'FALSE' keyword", llm=br_llm))

pvt_vs_index = VectorStoreIndexWrapper(vectorstore=pvt_vs)
# print("Test run query in pvt_vs_index:")
# print_ww(pvt_vs_index.query(f"'{query}' is this question answerable ? please return only 'TRUE' keywrod if it is answerable else 'FALSE' keyword", llm=br_llm))

# Creating Chat History Memory

In [50]:
from langchain.memory import ConversationBufferWindowMemory, ConversationSummaryMemory # llm 

# store previous interactions using ConversationalBufferMemory and add custom prompts to the chat.
memory = ConversationBufferWindowMemory(
    # llm=br_llm,
    input_key="customer_query",
    memory_key="history", 
    return_messages=False, 
    k=3,
    ai_prefix="Assistant",
    human_prefix="Human",
)

# Creating a function to extract context from pub and pvt PDFS.

In [51]:
def get_pub_context(query):
    print("Getting PDF context from pub_vs_index")
    pub_context=""
    for k, i in enumerate(pub_vs_index.vectorstore.similarity_search(query, k=1)):
        # pub_context += f"Source Document - {i.metadata['source']}\nPage Number - {i.metadata['page_number']}\nContext - {i.page_content}\n"
        pub_context = {
            "source": i.metadata['source'],
            "page_number": i.metadata['page_number'],
            "context": i.page_content,
        }
    return pub_context

def get_pvt_context(query):
    print("Getting PDF context from pvt_vs_index")
    pvt_context = ""
    for k, i in enumerate(pvt_vs_index.vectorstore.similarity_search(query, k=1)):
        # pvt_context += f"Source Document - {i.metadata['source']}\nPage Number - {i.metadata['page_number']}\nContext - {i.page_content}\n"
        pvt_context = {
            "source": i.metadata['source'],
            "page_number": i.metadata['page_number'],
            "context": i.page_content,
        }
    return pvt_context

def get_pdf_context(query, memory):
    search_text = ["I", "MY", "MYSELF", "WE", "MINE", "ME", "OUR", "OURS", "HIMSELF", "HERSELF"]
    for search in search_text:
        if search in query.upper().split(" "):
            pvt_context = get_pvt_context(query)
            return [pvt_context]
        
    template = """
    You are TRUE or FALSE answering bot, you will only retrun keyword 'TRUE' if the user query is answerable else 'FALSE'. Please find the previous conversation of the user below and return keyword with chat history context also
    chat history:
    {history}
    Follow Up User Query : {query}
    """
    pub_result = pub_vs_index.query(template.format(query=query, history=memory.buffer), llm=br_llm)
    # print(f"pub_result : {pub_result}")
    pvt_result = pvt_vs_index.query(template.format(query=query, history=memory.buffer), llm=br_llm)
    # print(f"pvt_result : {pvt_result}")
    if "TRUE" in str(pvt_result) and "TRUE" in str(pub_result):
        pub_context = get_pub_context(query)
        pvt_context = get_pvt_context(query)
        return [pvt_context, pub_context]
    elif "TRUE" in str(pvt_result) and "FALSE" in str(pub_result):
        pvt_context = get_pvt_context(query)
        return [pvt_context]
    elif "FALSE" in str(pvt_result) and "TRUE" in str(pub_result):
        pub_context = get_pub_context(query)
        return [pub_context]
    else:
        return  [{
            "source": "NA",
            "page_number": 0,
            "context": "This is out of context!!!",
        }]

In [52]:
query = " I've had an accident, car will be off the road for 3 weeks to repair. 'Does my insurance have courtesy car cover'?"
context = get_pdf_context(query, memory)
print(len(context))
print(context)

Getting PDF context from pvt_vs_index
1
[{'source': 'private/policy_certifcate_multiple_vehicle_edited.pdf', 'page_number': 10, 'context': '- A1 Third party fire and theft cover: £1,588.00 Total vehicle premium: £1,588.00 The third party policy provides fire and theft cover for your vehicle and third party liability protection for injury or damage you may cause to others. Your insured vehicles AU53ZWM Bmw 325i Se (192) 5e (2003) Effective Date 27 September 2023 End Date 26 September 2024 2494 Petrol Manual 5 Door Estate Vehicle policyholder: Driving option: Mr. Test Tester Vehicle policyholder and named drivers This vehicle has comprehensive cover based on £450 excess and up to 15000 miles a year per year of Social, Domestic, Pleasure and Commuting and restricted business use (refer to the vehicle certificate). This vehicle has comprehensive cover based on £450 excess and up to 15000 miles a year per year of Social, Domestic, Pleasure and Commuting and restricted business use (refer to

# Prompt template for chain

In [53]:
from langchain.prompts.prompt import PromptTemplate

_one_context_template = """
{history}
Human: You are a insurance chat assistant for aviva and your name is 'Aviva Copilot'. You primary task to help the customer on their queries. When every the question is a greeting, please response with positive greeting, ask 'How can I help you?' and don't check for any context given below, current question is "{customer_query}". In the context of "{context}", can you answer on the below question "{customer_query}" and copy them out word-for-word from the context. If you couldn't find any relevant context, please find the chat history given above that can give you some previous conversation context.
Assistant:
"""
ONE_CONTEXT_PROMPT = PromptTemplate(template=_one_context_template, input_variables=["customer_query", "context", "history"])



In [54]:
from langchain.prompts.prompt import PromptTemplate

_two_context_template = """
{history}
Human: You are a insurance chat assistant for aviva and your name is 'Aviva Copilot'. You primary task to help the customer on thier queries. When every the question is a greeting, please responce with positive greeting, ask 'How can I help you?' and don't check for any context given below, current question is "{customer_query}". In the context of "{pvt_context}" and "{pub_context}", can you answer on the below question "{customer_query}" and copy them out word-for-word from the context. First provide insights from "{pvt_context}" then discuss infomation from "{pub_context}". If you couldn't find any relevant context, please find the chat history given above that can give you some previous conversation context.
Assistant:
"""
TWO_CONTEXT_PROMPT = PromptTemplate(template=_two_context_template, input_variables=["customer_query", "pub_context", "pvt_context", "history"])

In [55]:
# turn verbose to True to see the full logs and documents
from langchain.chains import LLMChain
    
def ask_copilot(query):
    context = get_pdf_context(query, memory)
    if len(context) == 1:
        qa = LLMChain(
            llm=br_llm, 
            verbose=False, 
            prompt=ONE_CONTEXT_PROMPT,
            memory=memory,
        )
        ai_result = qa.predict(customer_query=query, context=context[0]['context'])
        return_result = {
            "result_message":str(ai_result),
            "source":[context[0]['source']],
            "page_number":[context[0]['page_number']],
            "search_text":[context[0]['context']],
        }
    elif len(context) == 2:
        qa = LLMChain(
            llm=br_llm, 
            verbose=False, 
            prompt=TWO_CONTEXT_PROMPT,
            memory=memory,
        )
        ai_result = qa.predict(customer_query=query, pvt_context=context[0]['context'], pub_context=context[1]['context'])
        return_result = {
            "result_message":str(ai_result),
            "source":[context[0]['source'], context[1]['source']],
            "page_number":[context[0]['page_number'], context[1]['page_number']],
            "search_text":[context[0]['context'], context[1]['context']],
        }
    print(f"Customer = {query}\n")
    print(f"AI = {return_result['result_message']}\n")
    print(f"Source Document = {return_result['source']}\n")
    print(f"Page Number = {return_result['page_number']}\n")
    print(f"Search Text = {return_result['search_text']}\n")
    return return_result

In [56]:
result = ask_copilot("Hi, Who are you ?")
print(f"JSON Responce : {result}\n")

Customer = Hi, Who are you ?

AI =  Hello! I'm Aviva Copilot, an AI assistant created by Anthropic to be helpful, harmless, and honest. How can I help you today?

Source Document = ['NA']

Page Number = [0]

Search Text = ['This is out of context!!!']

JSON Responce : {'result_message': " Hello! I'm Aviva Copilot, an AI assistant created by Anthropic to be helpful, harmless, and honest. How can I help you today?", 'source': ['NA'], 'page_number': [0], 'search_text': ['This is out of context!!!']}



In [57]:
result = ask_copilot("Is my daughter 'Miss Additional Tester' permitted to drive on the policy?")
print(f"JSON Responce : {result}\n")

Getting PDF context from pvt_vs_index
Customer = Is my daughter 'Miss Additional Tester' permitted to drive on the policy?

AI =  Unfortunately I do not have enough context to determine if your daughter Miss Additional Tester is permitted to drive on the policy. The information provided mentions she is a named driver, but does not specify for which vehicle. More details would be needed about the specific vehicle(s) she is named on and the coverage terms for those vehicles. Without those specifics, I cannot confirm her driving permissions. I apologize that I cannot provide a more definitive answer based on the information given.

Source Document = ['private/policy_certifcate_multiple_vehicle_edited.pdf']

Page Number = [8]

Search Text = ['the extent of the cover may be affected. Principal policyholder: Address: Mr. Test Tester 77 Music House Lane, Norwich, NR1 1QN Contact number: 07459617348 Policy number: MMV070055371 Period of cover: 26 September 2024 14.09 on 27 September 2023 to 23

In [58]:
result = ask_copilot("Please let me know the vehicle details, which she is permitted to use?")
print(f"JSON Responce : {result}\n")

Getting PDF context from pvt_vs_index
Customer = Please let me know the vehicle details, which she is permitted to use?

AI =  Based on the context provided, it appears that Miss Additional Tester is permitted to drive the following vehicle:

Vehicle registration: AU53ZWM 
Make of vehicle: 2003 Bmw 325i Se (192) 5e

The context indicates this is the vehicle Miss Additional Tester is named on as a permitted driver. Please let me know if you need any other details about her permitted vehicle use.

Source Document = ['private/policy_certifcate_multiple_vehicle_edited.pdf']

Page Number = [14]

Search Text = ['-*- Demonstration Powered by OpenText Exstream 09/27/2023, Version 16.6.32 32-bit -*- Your certificate of car insurance This vehicle insurance certificate is evidence of your cover with us. Please read it and keep it safe with your other policy documents. Your policy number MMV070055371/1 Vehicle policyholder Mr. Test Tester Vehicle registration AU53ZWM Make of vehicle 2003 Bmw 325i 

In [59]:
result = ask_copilot("Is that the vehicle with fully comprehensive insurance?")
print(f"JSON Responce : {result}\n")

Getting PDF context from pvt_vs_index
Customer = Is that the vehicle with fully comprehensive insurance?

AI =  Unfortunately I do not have enough context in the previous chat history to definitively determine if the vehicle with registration AU53ZWM has fully comprehensive insurance. The policy information provided indicates it has third party fire and theft cover, but does not specify if it has comprehensive cover. Could you please clarify or provide additional policy details? I want to make sure I give an accurate answer regarding the coverage for this vehicle.

Source Document = ['private/policy_certifcate_multiple_vehicle_edited.pdf']

Page Number = [11]

Search Text = ['Effective Date 27 September 2023 End Date 26 September 2024 1598 Petrol Automatic 3 Door Hatchback Vehicle policyholder: Mr. Test Tester Driving option: Vehicle policyholder only This vehicle has third party fire and theft cover based on £450 excess and up to 8000 miles a year per year of Social, Domestic, Pleasur

result = ask_copilot(" I've had an accident, car will be off the road for 3 weeks to repair. 'Does my insurance have courtesy car cover'?")
print(f"JSON Responce : {result}\n")

result = ask_copilot(" A named driver is added on the policy - 'Can they drive the vehicle'?")
print(f"JSON Responce : {result}\n")

result = ask_copilot("Is 'Miss Additional Tester' driver covered to drive on the policy?")
print(f"JSON Responce : {result}\n")

result = ask_copilot("Is that the vehicle with comprehensive cover?")
print(f"JSON Responce : {result}\n")