# Installing required modules

# Installs the third-party packages this notebook relies on into the kernel's environment.
# NOTE(review): no versions are pinned and --force-reinstall reinstalls the packages on every
# run, so a fresh run may resolve to different releases — consider pinning.
# NOTE(review): pickle is only referenced in commented-out code in this notebook — confirm
# that "pickle5" is still required.
%pip install --quiet --no-build-isolation --force-reinstall \
    "boto3" \
    "awscli" \
    "botocore" \
    "faiss-cpu" \
    "langchain" \
    "pypdf" \
    "unstructured[pdf]" \
    "sqlalchemy" \
    "pickle5" \
    "transformers"

# Connect to the AWS Bedrock service and get a client

In [2]:
import json
import os
import sys

import boto3

module_path = ".."
sys.path.append(os.path.abspath(module_path))
from utils import bedrock, print_ww


# ---- ⚠️ Un-comment and edit the below lines as needed for your AWS setup ⚠️ ----

# Region for the Bedrock client; this assignment overwrites any AWS_DEFAULT_REGION already
# present in the process environment.
os.environ["AWS_DEFAULT_REGION"] = "us-west-2"  # E.g. "us-east-1"
# os.environ["AWS_PROFILE"] = "<YOUR_PROFILE>"
# NOTE(review): the role ARN below embeds a specific AWS account ID and is hard-coded in the
# notebook — confirm it is meant to be committed rather than supplied from the environment.
os.environ["BEDROCK_ASSUME_ROLE"] = "arn:aws:iam::195364414018:role/Crossaccountbedrock"  # E.g. "arn:aws:..."

# Build the Bedrock client from the two environment values assigned just above.
boto3_bedrock = bedrock.get_bedrock_client(
    assumed_role=os.environ.get("BEDROCK_ASSUME_ROLE", None),
    region=os.environ.get("AWS_DEFAULT_REGION", None)
)

Create new client
  Using region: us-west-2
  Using role: arn:aws:iam::195364414018:role/Crossaccountbedrock ... successful!
boto3 Bedrock client successfully created!
bedrock-runtime(https://bedrock-runtime.us-west-2.amazonaws.com)


# Create the embedding and LLM objects

In [3]:
from langchain.embeddings import BedrockEmbeddings
from langchain.llms.bedrock import Bedrock

# Embedding model handed to FAISS.from_texts when the public and private stores are built.
br_embeddings = BedrockEmbeddings(
    model_id="amazon.titan-embed-text-v1", 
    client=boto3_bedrock
)

# Completion model used both for the TRUE/FALSE routing queries and for the final answer chain.
br_llm = Bedrock(
    model_id="anthropic.claude-v2",
    client=boto3_bedrock,
    model_kwargs={"temperature":0.1}
)

# NOTE(review): is_login and user are not read anywhere else in the visible notebook —
# confirm whether access to the private document was meant to be gated on them.
is_login=True
user="MV123456789"

# Load PDF files from dir and store in vectorstore

In [4]:
from langchain.document_loaders import PyPDFDirectoryLoader, PyPDFLoader, UnstructuredFileLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain.vectorstores import FAISS
import os

# Chunking parameters; sizes are counted in characters because length_function is len.
chunk_size = 1500
chunk_overlap = 500
# Single splitter instance reused for both the public and the private page texts.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size, 
    chunk_overlap=chunk_overlap,
    length_function = len,
)


# Extract text element by element from each PDF so every element keeps its document name and page number

In [5]:
# Gather the Unstructured elements of every PDF found directly under public/.
# Each element carries its own metadata, which the later cells read for the
# source path and page number.
elements = []
files = os.listdir("public/")
for file_name in filter(lambda name: name.endswith(".pdf"), files):
    loader = UnstructuredFileLoader("public/" + file_name, mode="elements", strategy="fast")
    elements += loader.load()
print("All the elements from PDF is loaded")

The PDF <_io.BufferedReader name='public/nmdmg14060.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case
The PDF <_io.BufferedReader name='public/avivaplus-breakdown-cover-policy-wording.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case
The PDF <_io.BufferedReader name='public/insurance_motor_car_motor_policy_booklet_241017_NMDMG10249_v3.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case


All the elements from PDF is loaded


# Using (source, page number) as the key, join the element texts into one document per page

In [6]:
# Merge consecutive elements that share (source, page_number) into one page-level document.
#
# Every page is a NEW document object rather than an alias of an entry in `elements`.
# Appending text to the loaded elements themselves made this cell non-idempotent:
# running it a second time without reloading the PDFs appended the same text again.
if not elements:
    # Fail with a readable message instead of an IndexError on elements[0].
    raise ValueError("No PDF elements were loaded; nothing to merge into pages")

pages = []
for element in elements:
    previous = pages[-1] if pages else None
    if (
        previous is not None
        and previous.metadata['source'] == element.metadata['source']
        and previous.metadata['page_number'] == element.metadata['page_number']
    ):
        # Same document and page as the page being built: extend its text.
        previous.page_content += " " + element.page_content
    else:
        # First element of a new page: start a fresh document of the same class,
        # with its own metadata dict, so `elements` is left untouched.
        pages.append(
            type(element)(
                page_content=element.page_content,
                metadata=dict(element.metadata),
            )
        )
print("All the pages are splitted")

All the pages are splitted


# Split pages with our custom splitter and build the vector store

In [7]:
# Chunk every merged page and record, per chunk, the (source, page_number) it came
# from; then embed all chunks into the public FAISS store.
docs, metadata = [], []

for page in pages:
    source = page.metadata["source"]
    page_number = page.metadata["page_number"]
    print(f'Spliting {source} page number : {page_number}')
    splits = text_splitter.split_text(page.page_content)
    docs += splits
    # One metadata entry per chunk keeps docs and metadata index-aligned.
    metadata += [{"source": source, "page_number": page_number}] * len(splits)

pub_vs = FAISS.from_texts(docs, br_embeddings, metadatas=metadata)

print(f"pub_vs: number of elements in the index={pub_vs.index.ntotal}")

Spliting public/nmdmg14060.pdf page number : 1
Spliting public/nmdmg14060.pdf page number : 2
Spliting public/insurance-motor-car-motoring-in-europe-241017-NMDMG2918.pdf page number : 1
Spliting public/insurance-motor-car-motoring-in-europe-241017-NMDMG2918.pdf page number : 2
Spliting public/insurance-motor-car-motoring-in-europe-241017-NMDMG2918.pdf page number : 3
Spliting public/insurance-motor-car-motoring-in-europe-241017-NMDMG2918.pdf page number : 4
Spliting public/insurance-motor-car-motoring-in-europe-241017-NMDMG2918.pdf page number : 5
Spliting public/insurance-motor-car-motoring-in-europe-241017-NMDMG2918.pdf page number : 6
Spliting public/insurance-motor-car-motoring-in-europe-241017-NMDMG2918.pdf page number : 7
Spliting public/insurance-motor-car-motoring-in-europe-241017-NMDMG2918.pdf page number : 8
Spliting public/insurance-motor-car-motoring-in-europe-241017-NMDMG2918.pdf page number : 9
Spliting public/insurance-motor-car-motoring-in-europe-241017-NMDMG2918.pdf pa

# Create the vector store for the private document

In [8]:
# Build the private vector store (pvt_vs) from the single private policy PDF.
#
# All intermediates use pvt_-prefixed names. The previous version reused the globals
# `elements`, `pages`, `docs` and `metadata` of the public pipeline, so re-running the
# public merge/split cells after this one rebuilt pub_vs from the private document.
pvt_loader = UnstructuredFileLoader("private/policy_certifcate_multiple_vehicle_edited.pdf", mode="elements", strategy="fast")
pvt_elements = pvt_loader.load()
if not pvt_elements:
    # Fail with a readable message instead of an IndexError on pvt_elements[0].
    raise ValueError("No elements were loaded from the private PDF")
print("All the elements from PDF is loaded")

# Merge consecutive elements that share (source, page_number) into page-level documents.
# In-place appending is safe here because pvt_elements is reloaded on every run of this cell.
pvt_pages = [pvt_elements[0]]
for pvt_element in pvt_elements[1:]:
    current = pvt_pages[-1]
    if (
        current.metadata['source'] == pvt_element.metadata['source']
        and current.metadata['page_number'] == pvt_element.metadata['page_number']
    ):
        current.page_content += " " + pvt_element.page_content
    else:
        pvt_pages.append(pvt_element)
print("All the pages are splitted")

# Chunk each page and keep one metadata entry per chunk so both lists stay index-aligned.
pvt_docs, pvt_metadata = [], []

for pvt_page in pvt_pages:
    print(f'Spliting {pvt_page.metadata["source"]} page number : {pvt_page.metadata["page_number"]}')
    pvt_splits = text_splitter.split_text(pvt_page.page_content)
    pvt_docs.extend(pvt_splits)
    pvt_metadata.extend(
        {"source": pvt_page.metadata["source"], "page_number": pvt_page.metadata["page_number"]}
        for _ in pvt_splits
    )

pvt_vs = FAISS.from_texts(
    pvt_docs,
    br_embeddings,
    metadatas=pvt_metadata,
)

print(f"pvt_vs: number of elements in the index={pvt_vs.index.ntotal}")

All the elements from PDF is loaded
All the pages are splitted
Spliting private/policy_certifcate_multiple_vehicle_edited.pdf page number : 1
Spliting private/policy_certifcate_multiple_vehicle_edited.pdf page number : 2
Spliting private/policy_certifcate_multiple_vehicle_edited.pdf page number : 3
Spliting private/policy_certifcate_multiple_vehicle_edited.pdf page number : 4
Spliting private/policy_certifcate_multiple_vehicle_edited.pdf page number : 5
Spliting private/policy_certifcate_multiple_vehicle_edited.pdf page number : 6
Spliting private/policy_certifcate_multiple_vehicle_edited.pdf page number : 7
Spliting private/policy_certifcate_multiple_vehicle_edited.pdf page number : 8
Spliting private/policy_certifcate_multiple_vehicle_edited.pdf page number : 9
Spliting private/policy_certifcate_multiple_vehicle_edited.pdf page number : 10
Spliting private/policy_certifcate_multiple_vehicle_edited.pdf page number : 11
Spliting private/policy_certifcate_multiple_vehicle_edited.pdf pag

# Save vector store for later use 

In [9]:
# import pickle
# print(type(pub_vs))
# with open("pub_vs.pkl", "wb") as f:
#     pickle.dump(pub_vs, f)

# Wrap each vector store in an index wrapper so it can be queried together with an LLM

In [10]:
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
# One wrapper per store. The routing step later calls .query(..., llm=br_llm) on these
# wrappers and reaches the raw store through their .vectorstore attribute.
pub_vs_index, pvt_vs_index = (
    VectorStoreIndexWrapper(vectorstore=store) for store in (pub_vs, pvt_vs)
)
# Optional smoke test, left disabled:
# query = "What are the vehicles covered in the policy?"
# print_ww(pub_vs_index.query(f"'{query}' is this question answerable ? please return only 'TRUE' keyword if it is answerable else 'FALSE' keyword", llm=br_llm))
# print_ww(pvt_vs_index.query(f"'{query}' is this question answerable ? please return only 'TRUE' keyword if it is answerable else 'FALSE' keyword", llm=br_llm))

# Functions to extract context from the public and private PDFs

In [11]:
def get_pub_context(query):
    """Return the best-matching public chunk for ``query`` as a context dict.

    Args:
        query: Free-text question used for the similarity search.

    Returns:
        A dict with the keys ``"source"``, ``"page_number"`` and ``"context"``.
        When the search yields no hit, the "NA" placeholder dict is returned.
        The previous version returned an empty string in that case, which made
        callers that index the result by key fail with a TypeError.
    """
    print("Getting PDF context from pub_vs_index")
    hits = pub_vs_index.vectorstore.similarity_search(query, k=1)
    if not hits:
        return {
            "source": "NA",
            "page_number": 0,
            "context": "This is out of context!!!",
        }
    best = hits[0]
    return {
        "source": best.metadata['source'],
        "page_number": best.metadata['page_number'],
        "context": best.page_content,
    }

def get_pvt_context(query):
    """Return the best-matching private chunk for ``query`` as a context dict.

    Args:
        query: Free-text question used for the similarity search.

    Returns:
        A dict with the keys ``"source"``, ``"page_number"`` and ``"context"``.
        When the search yields no hit, the "NA" placeholder dict is returned.
        The previous version returned an empty string in that case, which made
        callers that index the result by key fail with a TypeError.
    """
    print("Getting PDF context from pvt_vs_index")
    hits = pvt_vs_index.vectorstore.similarity_search(query, k=1)
    if not hits:
        return {
            "source": "NA",
            "page_number": 0,
            "context": "This is out of context!!!",
        }
    best = hits[0]
    return {
        "source": best.metadata['source'],
        "page_number": best.metadata['page_number'],
        "context": best.page_content,
    }

def get_pdf_context(query):
    """Decide which document store(s) should answer ``query`` and fetch their context.

    Routing happens in two stages:

    1. If the query contains a first-person keyword (e.g. "my", "I"), only the
       private document is consulted.
    2. Otherwise each index is asked, through the LLM, whether the query is
       answerable; every store whose reply contains ``TRUE`` contributes a context.

    Args:
        query: The customer's question.

    Returns:
        A list of one or two context dicts (``"source"``, ``"page_number"``,
        ``"context"``). With two entries the private context comes first. When
        neither store reports the query as answerable, a single "NA" placeholder
        dict is returned.
    """
    import string

    # NOTE(review): private context is returned without consulting the notebook's
    # is_login flag — confirm that is intended.
    search_text = {"I", "MY", "MYSELF", "WE", "MINE", "ME", "OUR", "OURS", "HIMSELF", "HERSELF"}

    # Tokenise on any whitespace and ignore surrounding punctuation, so that
    # "me?" or "(my)" still count. The part before an apostrophe is checked as
    # well, so contractions such as "I've" match "I".
    words = set()
    for token in query.upper().replace("\u2019", "'").split():
        token = token.strip(string.punctuation)
        words.add(token)
        words.add(token.split("'")[0])
    if words & search_text:
        return [get_pvt_context(query)]

    template = """
    You are TRUE or FALSE answering bot, you will only retrun keyword 'TRUE' if the user query is answerable else 'FALSE'.
    User Query : {query}
    """
    routing_prompt = template.format(query=query)
    pub_result = pub_vs_index.query(routing_prompt, llm=br_llm)
    pvt_result = pvt_vs_index.query(routing_prompt, llm=br_llm)

    # A store is used whenever its verdict contains TRUE. Previously a TRUE from
    # one store was thrown away unless the other verdict literally contained
    # "FALSE", so an indeterminate reply from one index hid a positive from the other.
    pub_answerable = "TRUE" in str(pub_result)
    pvt_answerable = "TRUE" in str(pvt_result)

    if pvt_answerable and pub_answerable:
        # Public lookup first, as before; the result still lists private first.
        pub_context = get_pub_context(query)
        pvt_context = get_pvt_context(query)
        return [pvt_context, pub_context]
    if pvt_answerable:
        return [get_pvt_context(query)]
    if pub_answerable:
        return [get_pub_context(query)]
    return [{
        "source": "NA",
        "page_number": 0,
        "context": "This is out of context!!!",
    }]

In [12]:
# Try the router on a sample question, then show how many contexts came back
# followed by the contexts themselves.
query = " I've had an accident, car will be off the road for 3 weeks to repair. 'Does my insurance have courtesy car cover'?"
context = get_pdf_context(query)
print(len(context), context, sep="\n")

Getting PDF context from pvt_vs_index
1
[{'source': 'private/policy_certifcate_multiple_vehicle_edited.pdf', 'page_number': 11, 'context': 'Effective Date 27 September 2023 End Date 26 September 2024 1598 Petrol Automatic 3 Door Hatchback Vehicle policyholder: Mr. Test Tester Driving option: Vehicle policyholder only This vehicle has third party fire and theft cover based on £450 excess and up to 8000 miles a year per year of Social, Domestic, Pleasure and Commuting and restricted business use (refer to the vehicle certificate). This vehicle has third party fire and theft cover based on £450 excess and up to 8000 miles a year per year of Social, Domestic, Pleasure and Commuting and restricted business use (refer to the vehicle certificate). The vehicles are or will be owned and registered by either you, your spouse/civil/domestic partner, a close relative residing at the same address, your company or is the subject of a private or personal leasing contract. The vehicles are or will be 

# Creating Chat History Memory

In [49]:
from langchain.memory import ConversationBufferWindowMemory, ConversationSummaryMemory # llm 

# Store previous interactions in a ConversationBufferWindowMemory (window size k=3).
# memory_key="history" matches the {history} slot of the prompt templates, and
# input_key="customer_query" names the prompt input that is recorded as the human turn.
# return_messages=False — presumably yields the history as plain text using the
# prefixes below; confirm against the installed LangChain version.
memory = ConversationBufferWindowMemory(
    # llm=br_llm,
    input_key="customer_query",
    memory_key="history", 
    return_messages=False, 
    k=3,
    ai_prefix="Assistant",
    human_prefix="Human",
)

# Prompt template for chain

In [50]:
from langchain.prompts.prompt import PromptTemplate

# Prompt used when exactly one context chunk was retrieved.
# Slots: {history} (chat memory), {context} (retrieved chunk text) and
# {customer_query} (the user's question).
# NOTE(review): unlike the two-context template, there is no blank line before
# "Human:" / "Assistant:" here — confirm the turn formatting is what the model expects.
_one_context_template = """
{history}
Human: In the context of "{context}", can you answer on the below question "{customer_query}" and copy them out word-for-word from the context. If you couldn't find any relevant context, please reply to customer politely that you don't know the answer to the question could they mind rephrasing the question.
Assistant:
"""
ONE_CONTEXT_PROMPT = PromptTemplate(template=_one_context_template, input_variables=["customer_query", "context", "history"])

In [51]:
from langchain.prompts.prompt import PromptTemplate

# Prompt used when both a private and a public context chunk were retrieved.
# Slots: {history} (chat memory), {pvt_context} / {pub_context} (retrieved chunk
# texts) and {customer_query} (the user's question).
#
# Each context is interpolated exactly once. The closing instruction used to embed
# {pvt_context} and {pub_context} a second time, which pasted both full chunks into
# the prompt twice; it now refers to them as the first and second context.
_two_context_template = """
{history}

H: In the context of "{pvt_context}" and "{pub_context}", can you answer on the below question "{customer_query}" and copy them out word-for-word from the context. First provide insights from the first context, then discuss information from the second context.

A:
"""
TWO_CONTEXT_PROMPT = PromptTemplate(template=_two_context_template, input_variables=["customer_query", "pub_context", "pvt_context", "history"])

In [52]:
# turn verbose to True to see the full logs and documents
from langchain.chains import LLMChain
    
def ask_copilot(query):
    """Answer ``query`` from the retrieved PDF context(s) and report the sources used.

    The context list from ``get_pdf_context`` selects the prompt: one context uses
    ``ONE_CONTEXT_PROMPT``; two contexts (private first, public second) use
    ``TWO_CONTEXT_PROMPT``. The chain shares the notebook-level ``memory``.

    Args:
        query: The customer's question.

    Returns:
        A dict with ``"result_message"`` (the model's answer as a string) and the
        parallel lists ``"source"``, ``"page_number"`` and ``"search_text"``, with
        one entry per context used.

    Raises:
        ValueError: If the router returns anything other than one or two contexts.
            Previously this case failed later with an unbound ``return_result``.
    """
    context = get_pdf_context(query)

    # Pick the prompt and the context inputs it needs.
    if len(context) == 1:
        prompt = ONE_CONTEXT_PROMPT
        context_inputs = {"context": context[0]['context']}
    elif len(context) == 2:
        prompt = TWO_CONTEXT_PROMPT
        context_inputs = {
            "pvt_context": context[0]['context'],
            "pub_context": context[1]['context'],
        }
    else:
        raise ValueError(f"Expected 1 or 2 contexts, got {len(context)}")

    qa = LLMChain(
        llm=br_llm,
        verbose=False,
        prompt=prompt,
        memory=memory,
    )
    ai_result = qa.predict(customer_query=query, **context_inputs)

    return_result = {
        "result_message": str(ai_result),
        "source": [entry['source'] for entry in context],
        "page_number": [entry['page_number'] for entry in context],
        "search_text": [entry['context'] for entry in context],
    }
    print(f"AI Result = {return_result['result_message']}\n")
    print(f"Source Document = {return_result['source']}\n")
    print(f"Page Number = {return_result['page_number']}\n")
    print(f"Search Text = {return_result['search_text']}\n")
    return return_result

In [54]:
# Greeting with no policy content; in the recorded run it fell through to the "NA"
# out-of-context placeholder.
result = ask_copilot("Hello?")
print(f"JSON Responce : {result}\n")

AI Result =  Hello, I'm afraid I don't have enough context to fully understand your question. Could you please rephrase it or provide some more details so I can try to assist you better? I want to make sure I give you the most helpful response.

Source Document = ['NA']

Page Number = [0]

Search Text = ['This is out of context!!!']

JSON Responce : {'result_message': " Hello, I'm afraid I don't have enough context to fully understand your question. Could you please rephrase it or provide some more details so I can try to assist you better? I want to make sure I give you the most helpful response.", 'source': ['NA'], 'page_number': [0], 'search_text': ['This is out of context!!!']}



In [18]:
# Contains the word "my", so the keyword check in get_pdf_context routes it straight
# to the private policy document.
result = ask_copilot("What will be my Cancellation fees and charges?")
print(f"JSON Responce : {result}\n")

Getting PDF context from pvt_vs_index
AI Result =  The context states: "Cancellation fee £38 (exc IPT)"

Source Document = ['private/policy_certifcate_multiple_vehicle_edited.pdf']

Page Number = [12]

Search Text = ["-*- Demonstration Powered by OpenText Exstream 09/27/2023, Version 16.6.32 32-bit -*- Your policy schedule - continued Young driver excess - Young driver excess - under 21's - Young driver excess - 21-24 year olds Non approved repairer excess - Non approved repairer excess £300 per claim £200 per claim £200 per claim Your fees Cancellation fee £38 (exc IPT) Clauses applicable The following are special terms and conditions which should be read in conjunction with your Policy Booklet. Clauses applicable to the Vehicles section Opt out of Courtesy Vehicle Benefit Courtesy Vehicle cover for vehicle registration numbers(s) AU53ZWM and A1 has been removed from Section 1 of the policy. Aviva Insurance Limited. Registered in Scotland, No. 2116. Registered Office: Pitheavlis, Pert

In [19]:
# Contains the word "my", so only the private policy document is consulted.
result = ask_copilot(" I've had an accident, car will be off the road for 3 weeks to repair. 'Does my insurance have courtesy car cover'?")
print(f"JSON Responce : {result}\n")

Getting PDF context from pvt_vs_index
AI Result =  Unfortunately I could not find any information in the given context that indicates whether courtesy car cover is included in this policy. The context does not mention courtesy cars or cover for a replacement vehicle while repairs are being carried out. I would need more information from the policy documents to be able to definitively answer whether courtesy car cover is included for this policy. I apologize that I cannot provide a direct answer based on the information given. Perhaps you could clarify what type of cover was purchased or rephrase the question with additional context that might allow me to assist further.

Source Document = ['private/policy_certifcate_multiple_vehicle_edited.pdf']

Page Number = [11]

Search Text = ['Effective Date 27 September 2023 End Date 26 September 2024 1598 Petrol Automatic 3 Door Hatchback Vehicle policyholder: Mr. Test Tester Driving option: Vehicle policyholder only This vehicle has third party

In [20]:
# No first-person keyword, so routing is decided by the TRUE/FALSE LLM checks; in the
# recorded run both the public and the private store were consulted.
result = ask_copilot(" A named driver is added on the policy - 'Can they drive the vehicle'?")
print(f"JSON Responce : {result}\n")

Getting PDF context from pub_vs_index
Getting PDF context from pvt_vs_index
AI Result =  Based on the context provided in "Exclusions..." section, a named driver such as Miss Additional Tester can drive the insured vehicle under certain conditions:

"Permitted drivers As below provided that the person holds a licence to drive the car or has held and is not disqualified from holding or obtaining such a licence Mr. Test Tester Named drivers: Miss Additional Tester"

This indicates that the named driver Miss Additional Tester is permitted to drive the insured vehicle, provided she holds a valid driving license and is not disqualified from driving.

Additionally, the "of other persons driving..." section states:

"Any person given permission by the vehicle policyholder to drive your vehicle provided that your certificate of motor insurance allows that person to drive your vehicle."

This further confirms that a named driver like Miss Additional Tester can drive the insured vehicle if given

In [21]:
# No first-person keyword; in the recorded run both the public and the private store
# were consulted.
result = ask_copilot("Is 'Miss Additional Tester' driver covered to drive on the policy?")
print(f"JSON Responce : {result}\n")

Getting PDF context from pub_vs_index
Getting PDF context from pvt_vs_index
AI Result =  Based on the provided context:

From "holds a licence to drive the car or has held and is not disqualified from holding or obtaining such a licence Mr. Test Tester Named drivers: Miss Additional Tester":

"Named drivers: Miss Additional Tester" indicates that Miss Additional Tester is a named driver on the policy.

From "of other persons driving or using your vehicle Cover under this section will also apply on the same basis, for the following persons:":  

"Any person given permission by the vehicle policyholder to drive your vehicle provided that your certificate of motor insurance allows that person to drive your vehicle."

This means that any named driver like Miss Additional Tester can drive the insured vehicle if the policyholder gives permission and the certificate of insurance allows it.

In summary, as a named driver on the policy, Miss Additional Tester is covered to drive the insured veh

In [22]:
# Presumably meant as a follow-up to the earlier turns ("that vehicle"); in the recorded
# run neither store reported it as answerable, so the "NA" placeholder context was used.
result = ask_copilot("Is that the vehicle with comprehensive cover?")
print(f"JSON Responce : {result}\n")

AI Result =  I'm sorry, but the context you provided ("This is out of context!!!") does not contain any information to help me determine if the vehicle has comprehensive cover. Without any policy details or other relevant context, I do not have enough information to answer your question "Is that the vehicle with comprehensive cover?". Could you please rephrase the question or provide additional context that would allow me to assist you further? I want to make sure I give you an accurate answer, but currently lack the context to do so. Please let me know if there is any other way I can try to address your question.

Source Document = ['NA']

Page Number = [0]

Search Text = ['This is out of context!!!']

JSON Responce : {'result_message': ' I\'m sorry, but the context you provided ("This is out of context!!!") does not contain any information to help me determine if the vehicle has comprehensive cover. Without any policy details or other relevant context, I do not have enough informatio