In [1]:
# Install the following packages incase they're not loaded already
!pip install pdfplumber
!pip install chromadb
!pip install tiktoken
!pip install openai



In [2]:
# Import all the required Libraries
import pdfplumber
from pathlib import Path
import pandas as pd
from operator import itemgetter
import json
import tiktoken
import chromadb
import openai

In [3]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import os
os.chdir('/content/drive/MyDrive/helpmate')
!ls

chroma	       helpmate.ipynb	   Principal-Sample-Life-Insurance-Policy.pdf
ChromaDB_Data  OpenAI_API_Key.txt


In [5]:
pdf_path = "/content/drive/MyDrive/helpmate/"

In [6]:
# Set the API key
filepath = "/content/drive/MyDrive/helpmate/"

with open(filepath + "OpenAI_API_Key.txt", "r") as f:
  openai.api_key = ' '.join(f.readlines())

In [7]:
# Function to check whether a word is present in a table or not for segregation of regular text and tables

def check_bboxes(word, table_bbox):
    # Check whether word is inside a table bbox.
    l = word['x0'], word['top'], word['x1'], word['bottom']
    r = table_bbox
    return l[0] > r[0] and l[1] > r[1] and l[2] < r[2] and l[3] < r[3]

In [8]:
# Function to extract text from a PDF file.
# 1. Declare a variable p to store the iteration of the loop that will help us store page numbers alongside the text
# 2. Declare an empty list 'full_text' to store all the text files
# 3. Use pdfplumber to open the pdf pages one by one
# 4. Find the tables and their locations in the page
# 5. Extract the text from the tables in the variable 'tables'
# 6. Extract the regular words by calling the function check_bboxes() and checking whether words are present in the table or not
# 7. Use the cluster_objects utility to cluster non-table and table words together so that they retain the same chronology as in the original PDF
# 8. Declare an empty list 'lines' to store the page text
# 9. If a text element in present in the cluster, append it to 'lines', else if a table element is present, append the table
# 10. Append the page number and all lines to full_text, and increment 'p'
# 11. When the function has iterated over all pages, return the 'full_text' list

def extract_text_from_pdf(pdf_path):
    p = 0
    full_text = []


    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_no = f"Page {p+1}"
            text = page.extract_text()

            tables = page.find_tables()
            table_bboxes = [i.bbox for i in tables]
            tables = [{'table': i.extract(), 'top': i.bbox[1]} for i in tables]
            non_table_words = [word for word in page.extract_words() if not any(
                [check_bboxes(word, table_bbox) for table_bbox in table_bboxes])]
            lines = []

            for cluster in pdfplumber.utils.cluster_objects(non_table_words + tables, itemgetter('top'), tolerance=5):

                if 'text' in cluster[0]:
                    try:
                        lines.append(' '.join([i['text'] for i in cluster]))
                    except KeyError:
                        pass

                elif 'table' in cluster[0]:
                    lines.append(json.dumps(cluster[0]['table']))


            full_text.append([page_no, " ".join(lines)])
            p +=1

    return full_text

In [9]:
# Define the directory containing the PDF files
pdf_directory = Path(pdf_path)

# Initialize an empty list to store the extracted texts and document names
data = []

# Loop through all files in the directory
for pdf_path in pdf_directory.glob("*.pdf"):

    # Process the PDF file
    print(f"...Processing {pdf_path.name}")

    # Call the function to extract the text from the PDF
    extracted_text = extract_text_from_pdf(pdf_path)

    # Convert the extracted list to a PDF, and add a column to store document names
    extracted_text_df = pd.DataFrame(extracted_text, columns=['Page No.', 'Page_Text'])
    extracted_text_df['Document Name'] = pdf_path.name

    # Append the extracted text and document name to the list
    data.append(extracted_text_df)

    # Print a message to indicate progress
    print(f"Finished processing {pdf_path.name}")

# Print a message to indicate all PDFs have been processed
print("The have been processed.")

...Processing Principal-Sample-Life-Insurance-Policy.pdf
Finished processing Principal-Sample-Life-Insurance-Policy.pdf
The have been processed.


In [10]:
insurance_pdfs_data = pd.concat(data, ignore_index=True)

In [11]:
insurance_pdfs_data.head(25)

Unnamed: 0,Page No.,Page_Text,Document Name
0,Page 1,DOROTHEA GLAUSE S655 RHODE ISLAND JOHN DOE 01/...,Principal-Sample-Life-Insurance-Policy.pdf
1,Page 2,This page left blank intentionally,Principal-Sample-Life-Insurance-Policy.pdf
2,Page 3,POLICY RIDER GROUP INSURANCE POLICY NO: S655 C...,Principal-Sample-Life-Insurance-Policy.pdf
3,Page 4,This page left blank intentionally,Principal-Sample-Life-Insurance-Policy.pdf
4,Page 5,PRINCIPAL LIFE INSURANCE COMPANY (called The P...,Principal-Sample-Life-Insurance-Policy.pdf
5,Page 6,TABLE OF CONTENTS PART I - DEFINITIONS PART II...,Principal-Sample-Life-Insurance-Policy.pdf
6,Page 7,Section A – Eligibility Member Life Insurance ...,Principal-Sample-Life-Insurance-Policy.pdf
7,Page 8,Section A - Member Life Insurance Schedule of ...,Principal-Sample-Life-Insurance-Policy.pdf
8,Page 9,P ART I - DEFINITIONS When used in this Group ...,Principal-Sample-Life-Insurance-Policy.pdf
9,Page 10,T he legally recognized union of two eligible ...,Principal-Sample-Life-Insurance-Policy.pdf


In [12]:
len(insurance_pdfs_data)

64

In [13]:
insurance_pdfs_data['Text_Length'] = insurance_pdfs_data['Page_Text'].apply(lambda x: len(x.split(' ')))

In [14]:
insurance_pdfs_data['Text_Length']

0      30
1       5
2     230
3       5
4     110
     ... 
59    285
60    418
61    322
62      5
63      8
Name: Text_Length, Length: 64, dtype: int64

In [15]:
# Retain only the rows with a text length of at least 10

insurance_pdfs_data = insurance_pdfs_data.loc[insurance_pdfs_data['Text_Length'] >= 10]
insurance_pdfs_data

Unnamed: 0,Page No.,Page_Text,Document Name,Text_Length
0,Page 1,DOROTHEA GLAUSE S655 RHODE ISLAND JOHN DOE 01/...,Principal-Sample-Life-Insurance-Policy.pdf,30
2,Page 3,POLICY RIDER GROUP INSURANCE POLICY NO: S655 C...,Principal-Sample-Life-Insurance-Policy.pdf,230
4,Page 5,PRINCIPAL LIFE INSURANCE COMPANY (called The P...,Principal-Sample-Life-Insurance-Policy.pdf,110
5,Page 6,TABLE OF CONTENTS PART I - DEFINITIONS PART II...,Principal-Sample-Life-Insurance-Policy.pdf,153
6,Page 7,Section A – Eligibility Member Life Insurance ...,Principal-Sample-Life-Insurance-Policy.pdf,176
7,Page 8,Section A - Member Life Insurance Schedule of ...,Principal-Sample-Life-Insurance-Policy.pdf,171
8,Page 9,P ART I - DEFINITIONS When used in this Group ...,Principal-Sample-Life-Insurance-Policy.pdf,387
9,Page 10,T he legally recognized union of two eligible ...,Principal-Sample-Life-Insurance-Policy.pdf,251
10,Page 11,(2) has been placed with the Member or spouse ...,Principal-Sample-Life-Insurance-Policy.pdf,299
11,Page 12,An institution that is licensed as a Hospital ...,Principal-Sample-Life-Insurance-Policy.pdf,352


In [16]:
# Store the metadata for each page in a separate column

insurance_pdfs_data['Metadata'] = insurance_pdfs_data.apply(lambda x: {'Policy_Name': x['Document Name'][:-4], 'Page_No.': x['Page No.']}, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  insurance_pdfs_data['Metadata'] = insurance_pdfs_data.apply(lambda x: {'Policy_Name': x['Document Name'][:-4], 'Page_No.': x['Page No.']}, axis=1)


In [17]:
insurance_pdfs_data

Unnamed: 0,Page No.,Page_Text,Document Name,Text_Length,Metadata
0,Page 1,DOROTHEA GLAUSE S655 RHODE ISLAND JOHN DOE 01/...,Principal-Sample-Life-Insurance-Policy.pdf,30,{'Policy_Name': 'Principal-Sample-Life-Insuran...
2,Page 3,POLICY RIDER GROUP INSURANCE POLICY NO: S655 C...,Principal-Sample-Life-Insurance-Policy.pdf,230,{'Policy_Name': 'Principal-Sample-Life-Insuran...
4,Page 5,PRINCIPAL LIFE INSURANCE COMPANY (called The P...,Principal-Sample-Life-Insurance-Policy.pdf,110,{'Policy_Name': 'Principal-Sample-Life-Insuran...
5,Page 6,TABLE OF CONTENTS PART I - DEFINITIONS PART II...,Principal-Sample-Life-Insurance-Policy.pdf,153,{'Policy_Name': 'Principal-Sample-Life-Insuran...
6,Page 7,Section A – Eligibility Member Life Insurance ...,Principal-Sample-Life-Insurance-Policy.pdf,176,{'Policy_Name': 'Principal-Sample-Life-Insuran...
7,Page 8,Section A - Member Life Insurance Schedule of ...,Principal-Sample-Life-Insurance-Policy.pdf,171,{'Policy_Name': 'Principal-Sample-Life-Insuran...
8,Page 9,P ART I - DEFINITIONS When used in this Group ...,Principal-Sample-Life-Insurance-Policy.pdf,387,{'Policy_Name': 'Principal-Sample-Life-Insuran...
9,Page 10,T he legally recognized union of two eligible ...,Principal-Sample-Life-Insurance-Policy.pdf,251,{'Policy_Name': 'Principal-Sample-Life-Insuran...
10,Page 11,(2) has been placed with the Member or spouse ...,Principal-Sample-Life-Insurance-Policy.pdf,299,{'Policy_Name': 'Principal-Sample-Life-Insuran...
11,Page 12,An institution that is licensed as a Hospital ...,Principal-Sample-Life-Insurance-Policy.pdf,352,{'Policy_Name': 'Principal-Sample-Life-Insuran...


In [18]:
# Import the OpenAI Embedding Function into chroma

from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction

In [19]:
# Define the path where chroma collections will be stored

chroma_data_path = '/content/drive/MyDrive/helpmate/ChromaDB_Data'

In [20]:
import chromadb

In [21]:
# Call PersistentClient()

client = chromadb.PersistentClient(path=chroma_data_path)

In [22]:
# Set up the embedding function using the OpenAI embedding model

model = "text-embedding-ada-002"
embedding_function = OpenAIEmbeddingFunction(api_key=openai.api_key, model_name=model)

In [23]:
# Initialise a collection in chroma and pass the embedding_function to it so that it used OpenAI embeddings to embed the documents

insurance_collection = client.get_or_create_collection(name='RAG_on_Insurance', embedding_function=embedding_function)

In [24]:
# Convert the page text and metadata from your dataframe to lists to be able to pass it to chroma

documents_list = insurance_pdfs_data["Page_Text"].tolist()
metadata_list = insurance_pdfs_data['Metadata'].tolist()

In [25]:
insurance_collection.add(
    documents= documents_list,
    ids = [str(i) for i in range(0, len(documents_list))],
    metadatas = metadata_list
)



In [26]:
insurance_collection

Collection(name=RAG_on_Insurance)

In [27]:
# Let's take a look at the first few entries in the collection

insurance_collection.get(
    ids = ['0','1','2'],
    include = ['embeddings', 'documents', 'metadatas']
)

{'ids': ['0', '1', '2'],
 'embeddings': [[-0.02239711582660675,
   0.018705571070313454,
   -0.027236562222242355,
   -0.031306393444538116,
   -0.0036654570139944553,
   0.027967045083642006,
   0.0008495120564475656,
   0.023662416264414787,
   -0.016579344868659973,
   0.004320934414863586,
   -0.005201426800340414,
   0.004086136817932129,
   -0.006196056958287954,
   0.023284129798412323,
   -0.011270301416516304,
   0.0310194194316864,
   0.002889319323003292,
   -0.02466682903468609,
   0.030549822375178337,
   0.007833120413124561,
   0.006388461217284203,
   0.018862102180719376,
   0.009476706385612488,
   0.012633434496819973,
   -0.012379069812595844,
   -0.009352785535156727,
   0.01784464530646801,
   -0.02169272117316723,
   0.023505883291363716,
   0.003093136940151453,
   0.01329217292368412,
   -0.01460965070873499,
   -0.033550020307302475,
   -0.0411418192088604,
   -0.008426637388765812,
   0.013592192903161049,
   -0.002639846410602331,
   -0.003919821232557297,
 

In [28]:
insurance_collection.peek(1)

{'ids': ['0'],
 'embeddings': [[-0.02239711582660675,
   0.018705571070313454,
   -0.027236562222242355,
   -0.031306393444538116,
   -0.0036654570139944553,
   0.027967045083642006,
   0.0008495120564475656,
   0.023662416264414787,
   -0.016579344868659973,
   0.004320934414863586,
   -0.005201426800340414,
   0.004086136817932129,
   -0.006196056958287954,
   0.023284129798412323,
   -0.011270301416516304,
   0.0310194194316864,
   0.002889319323003292,
   -0.02466682903468609,
   0.030549822375178337,
   0.007833120413124561,
   0.006388461217284203,
   0.018862102180719376,
   0.009476706385612488,
   0.012633434496819973,
   -0.012379069812595844,
   -0.009352785535156727,
   0.01784464530646801,
   -0.02169272117316723,
   0.023505883291363716,
   0.003093136940151453,
   0.01329217292368412,
   -0.01460965070873499,
   -0.033550020307302475,
   -0.0411418192088604,
   -0.008426637388765812,
   0.013592192903161049,
   -0.002639846410602331,
   -0.003919821232557297,
   0.012424

In [29]:
cache_collection = client.get_or_create_collection(name='Insurance_Cache', embedding_function=embedding_function)

In [30]:
cache_collection.peek()

{'ids': [],
 'embeddings': [],
 'metadatas': [],
 'documents': [],
 'uris': None,
 'data': None}

In [89]:
# Read the user query

query = input()

Claim Procedures


In [90]:
query

'Claim Procedures'

In [91]:
## Quickly checking the results of the query

results = insurance_collection.query(
      query_texts=query,
      n_results=10
      )

In [92]:
results

{'ids': [['59', '58', '15', '16', '51', '3', '14', '5', '13', '45']],
 'distances': [[0.36934734124149327,
   0.41786184356208483,
   0.4268137008144802,
   0.4313661706433755,
   0.4404824498519293,
   0.44063564936674066,
   0.4463725841180099,
   0.4619926011548441,
   0.46813205264614394,
   0.4688368132155892]],
 'metadatas': [[{'Page_No.': 'Page 62',
    'Policy_Name': 'Principal-Sample-Life-Insurance-Policy'},
   {'Page_No.': 'Page 61',
    'Policy_Name': 'Principal-Sample-Life-Insurance-Policy'},
   {'Page_No.': 'Page 18',
    'Policy_Name': 'Principal-Sample-Life-Insurance-Policy'},
   {'Page_No.': 'Page 19',
    'Policy_Name': 'Principal-Sample-Life-Insurance-Policy'},
   {'Page_No.': 'Page 54',
    'Policy_Name': 'Principal-Sample-Life-Insurance-Policy'},
   {'Page_No.': 'Page 6',
    'Policy_Name': 'Principal-Sample-Life-Insurance-Policy'},
   {'Page_No.': 'Page 17',
    'Policy_Name': 'Principal-Sample-Life-Insurance-Policy'},
   {'Page_No.': 'Page 8',
    'Policy_Name': '

In [93]:
for key, val in results.items():
  print(key)

ids
distances
metadatas
embeddings
documents
uris
data


In [94]:
for key, val in results.items():
  print(val)

[['59', '58', '15', '16', '51', '3', '14', '5', '13', '45']]
[[0.36934734124149327, 0.41786184356208483, 0.4268137008144802, 0.4313661706433755, 0.4404824498519293, 0.44063564936674066, 0.4463725841180099, 0.4619926011548441, 0.46813205264614394, 0.4688368132155892]]
[[{'Page_No.': 'Page 62', 'Policy_Name': 'Principal-Sample-Life-Insurance-Policy'}, {'Page_No.': 'Page 61', 'Policy_Name': 'Principal-Sample-Life-Insurance-Policy'}, {'Page_No.': 'Page 18', 'Policy_Name': 'Principal-Sample-Life-Insurance-Policy'}, {'Page_No.': 'Page 19', 'Policy_Name': 'Principal-Sample-Life-Insurance-Policy'}, {'Page_No.': 'Page 54', 'Policy_Name': 'Principal-Sample-Life-Insurance-Policy'}, {'Page_No.': 'Page 6', 'Policy_Name': 'Principal-Sample-Life-Insurance-Policy'}, {'Page_No.': 'Page 17', 'Policy_Name': 'Principal-Sample-Life-Insurance-Policy'}, {'Page_No.': 'Page 8', 'Policy_Name': 'Principal-Sample-Life-Insurance-Policy'}, {'Page_No.': 'Page 16', 'Policy_Name': 'Principal-Sample-Life-Insurance-Poli

In [95]:
# Search the Cache collection first
# Query the collection against the user query and return the top 20 results

cache_results = cache_collection.query(
    query_texts=query,
    n_results=1
)

In [96]:
cache_results

{'ids': [['POLICY NO: S655']],
 'distances': [[0.47306108121890106]],
 'metadatas': [[{'distances0': '0.3612644620020928',
    'distances1': '0.38385040650271823',
    'distances2': '0.3970016033587383',
    'distances3': '0.39735362340672037',
    'distances4': '0.3973973620112447',
    'distances5': '0.4000613299539901',
    'distances6': '0.40200901642414705',
    'distances7': '0.4044496780786471',
    'distances8': '0.40452009834168995',
    'distances9': '0.40584983973581484',
    'documents0': 'POLICY RIDER GROUP INSURANCE POLICY NO: S655 COVERAGE: Life EMPLOYER: RHODE ISLAND JOHN DOE Effective on the later of the Date of Issue of this Group Policy or March 1, 2005, the following will apply to your Policy: From time to time The Principal may offer or provide certain employer groups who apply for coverage with The Principal a Financial Services Hotline and Grief Support Services or any other value added service for the employees of that employer group. In addition, The Principal 

In [97]:
results = insurance_collection.query(
query_texts=query,
n_results=10
)
results.items()

dict_items([('ids', [['59', '58', '15', '16', '51', '3', '14', '5', '13', '45']]), ('distances', [[0.36934734124149327, 0.41786184356208483, 0.4268137008144802, 0.4313661706433755, 0.4404824498519293, 0.44063564936674066, 0.4463725841180099, 0.4619926011548441, 0.46813205264614394, 0.4688368132155892]]), ('metadatas', [[{'Page_No.': 'Page 62', 'Policy_Name': 'Principal-Sample-Life-Insurance-Policy'}, {'Page_No.': 'Page 61', 'Policy_Name': 'Principal-Sample-Life-Insurance-Policy'}, {'Page_No.': 'Page 18', 'Policy_Name': 'Principal-Sample-Life-Insurance-Policy'}, {'Page_No.': 'Page 19', 'Policy_Name': 'Principal-Sample-Life-Insurance-Policy'}, {'Page_No.': 'Page 54', 'Policy_Name': 'Principal-Sample-Life-Insurance-Policy'}, {'Page_No.': 'Page 6', 'Policy_Name': 'Principal-Sample-Life-Insurance-Policy'}, {'Page_No.': 'Page 17', 'Policy_Name': 'Principal-Sample-Life-Insurance-Policy'}, {'Page_No.': 'Page 8', 'Policy_Name': 'Principal-Sample-Life-Insurance-Policy'}, {'Page_No.': 'Page 16', 

In [98]:
# Implementing Cache in Semantic Search

# Set a threshold for cache search
threshold = 0.2

ids = []
documents = []
distances = []
metadatas = []
results_df = pd.DataFrame()


# If the distance is greater than the threshold, then return the results from the main collection.

if cache_results['distances'][0] == [] or cache_results['distances'][0][0] > threshold:
      # Query the collection against the user query and return the top 10 results
      results = insurance_collection.query(
      query_texts=query,
      n_results=10
      )

      # Store the query in cache_collection as document w.r.t to ChromaDB so that it can be embedded and searched against later
      # Store retrieved text, ids, distances and metadatas in cache_collection as metadatas, so that they can be fetched easily if a query indeed matches to a query in cache
      Keys = []
      Values = []

      for key, val in results.items():
        if val is None:
          continue
        for i in range(10):
          Keys.append(str(key)+str(i))
          Values.append(str(val[0][i]))


      cache_collection.add(
          documents= [query],
          ids = [query],  # Or if you want to assign integers as IDs 0,1,2,.., then you can use "len(cache_results['documents'])" as will return the no. of queries currently in the cache and assign the next digit to the new query."
          metadatas = dict(zip(Keys, Values))
      )

      print("Not found in cache. Found in main collection.")

      result_dict = {'Metadatas': results['metadatas'][0], 'Documents': results['documents'][0], 'Distances': results['distances'][0], "IDs":results["ids"][0]}
      results_df = pd.DataFrame.from_dict(result_dict)
      results_df


# If the distance is, however, less than the threshold, you can return the results from cache

elif cache_results['distances'][0][0] <= threshold:
      cache_result_dict = cache_results['metadatas'][0][0]

      # Loop through each inner list and then through the dictionary
      for key, value in cache_result_dict.items():
          if 'ids' in key:
              ids.append(value)
          elif 'documents' in key:
              documents.append(value)
          elif 'distances' in key:
              distances.append(value)
          elif 'metadatas' in key:
              metadatas.append(value)

      print("Found in cache!")

      # Create a DataFrame
      results_df = pd.DataFrame({
        'IDs': ids,
        'Documents': documents,
        'Distances': distances,
        'Metadatas': metadatas
      })

Not found in cache. Found in main collection.


In [99]:
results_df

Unnamed: 0,Metadatas,Documents,Distances,IDs
0,"{'Page_No.': 'Page 62', 'Policy_Name': 'Princi...",A claimant may request an appeal of a claim de...,0.369347,59
1,"{'Page_No.': 'Page 61', 'Policy_Name': 'Princi...",Section D - Claim Procedures Article 1 - Notic...,0.417862,58
2,"{'Page_No.': 'Page 18', 'Policy_Name': 'Princi...",c . a copy of the form which contains the stat...,0.426814,15
3,"{'Page_No.': 'Page 19', 'Policy_Name': 'Princi...",T he Principal has complete discretion to cons...,0.431366,16
4,"{'Page_No.': 'Page 54', 'Policy_Name': 'Princi...","f . claim requirements listed in PART IV, Sect...",0.440482,51
5,"{'Page_No.': 'Page 6', 'Policy_Name': 'Princip...",TABLE OF CONTENTS PART I - DEFINITIONS PART II...,0.440636,3
6,"{'Page_No.': 'Page 17', 'Policy_Name': 'Princi...",a. be actively engaged in business for profit ...,0.446373,14
7,"{'Page_No.': 'Page 8', 'Policy_Name': 'Princip...",Section A - Member Life Insurance Schedule of ...,0.461993,5
8,"{'Page_No.': 'Page 16', 'Policy_Name': 'Princi...",PART II - POLICY ADMINISTRATION Section A - Co...,0.468132,13
9,"{'Page_No.': 'Page 48', 'Policy_Name': 'Princi...",c . If a beneficiary dies at the same time or ...,0.468837,45


In [100]:
## Checking if the cache also contains the results
cache_results = cache_collection.query(
    query_texts=query,
    n_results=1
)

In [101]:
cache_results

{'ids': [['Claim Procedures']],
 'distances': [[0.0]],
 'metadatas': [[{'distances0': '0.36934734124149327',
    'distances1': '0.41786184356208483',
    'distances2': '0.4268137008144802',
    'distances3': '0.4313661706433755',
    'distances4': '0.4404824498519293',
    'distances5': '0.44063564936674066',
    'distances6': '0.4463725841180099',
    'distances7': '0.4619926011548441',
    'distances8': '0.46813205264614394',
    'distances9': '0.4688368132155892',
    'documents0': 'A claimant may request an appeal of a claim denial by Written request to The Principal within 180 days of receipt of notice of the denial. The Principal will make a full and fair review of the claim. The Principal may require additional information to make the review. The Principal will notify the claimant in Writing of the appeal decision within 45 days after receipt of the appeal request. If the appeal cannot be processed within the 45-day period because The Principal did not receive the requested addi

In [73]:
!pip install sentence_transformers



In [74]:
# Import the CrossEncoder library from sentence_transformers

from sentence_transformers import CrossEncoder, util

In [102]:
# Initialise the cross encoder model

cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')



In [103]:
# Test the cross encoder model

scores = cross_encoder.predict([['How much will be the premium rate(s) for each member insured for Life insurance','The premium rate(s) for each Member insured for Life Insurance will be:Member Life Insurance $0.210 for each $1,000 of insurance in force.Member Accidental Death and Dismemberment Insurance $0.025 for each $1,000 of Member Life Insurance in force.Dependent Life Insurance $1.46 for each Member insured for Dependent Life Insurance.'],
                                ['How much will be the premium rate(s) for each member insured for Life insurance','The premium rate(s) for an individual member is goog and affordable']])

In [104]:
scores

array([10.953239 ,  0.3542872], dtype=float32)

In [105]:
# Input (query, response) pairs for each of the top 20 responses received from the semantic search to the cross encoder
# Generate the cross_encoder scores for these pairs

cross_inputs = [[query, response] for response in results_df['Documents']]
cross_rerank_scores = cross_encoder.predict(cross_inputs)

In [106]:
#Store the rerank_scores in results_df

results_df['Reranked_scores'] = cross_rerank_scores

In [107]:
results_df

Unnamed: 0,Metadatas,Documents,Distances,IDs,Reranked_scores
0,"{'Page_No.': 'Page 62', 'Policy_Name': 'Princi...",A claimant may request an appeal of a claim de...,0.369347,59,1.21232
1,"{'Page_No.': 'Page 61', 'Policy_Name': 'Princi...",Section D - Claim Procedures Article 1 - Notic...,0.417862,58,4.571044
2,"{'Page_No.': 'Page 18', 'Policy_Name': 'Princi...",c . a copy of the form which contains the stat...,0.426814,15,-9.198697
3,"{'Page_No.': 'Page 19', 'Policy_Name': 'Princi...",T he Principal has complete discretion to cons...,0.431366,16,0.108492
4,"{'Page_No.': 'Page 54', 'Policy_Name': 'Princi...","f . claim requirements listed in PART IV, Sect...",0.440482,51,-4.652668
5,"{'Page_No.': 'Page 6', 'Policy_Name': 'Princip...",TABLE OF CONTENTS PART I - DEFINITIONS PART II...,0.440636,3,-11.131358
6,"{'Page_No.': 'Page 17', 'Policy_Name': 'Princi...",a. be actively engaged in business for profit ...,0.446373,14,-10.247763
7,"{'Page_No.': 'Page 8', 'Policy_Name': 'Princip...",Section A - Member Life Insurance Schedule of ...,0.461993,5,-2.886805
8,"{'Page_No.': 'Page 16', 'Policy_Name': 'Princi...",PART II - POLICY ADMINISTRATION Section A - Co...,0.468132,13,-10.241926
9,"{'Page_No.': 'Page 48', 'Policy_Name': 'Princi...",c . If a beneficiary dies at the same time or ...,0.468837,45,-7.233204


In [108]:
# Return the top 3 results from semantic search
top_3_semantic = results_df.sort_values(by='Distances')
top_3_semantic[:3]

Unnamed: 0,Metadatas,Documents,Distances,IDs,Reranked_scores
0,"{'Page_No.': 'Page 62', 'Policy_Name': 'Princi...",A claimant may request an appeal of a claim de...,0.369347,59,1.21232
1,"{'Page_No.': 'Page 61', 'Policy_Name': 'Princi...",Section D - Claim Procedures Article 1 - Notic...,0.417862,58,4.571044
2,"{'Page_No.': 'Page 18', 'Policy_Name': 'Princi...",c . a copy of the form which contains the stat...,0.426814,15,-9.198697


In [109]:
# Return the top 3 results after reranking

top_3_rerank = results_df.sort_values(by='Reranked_scores', ascending=False)
top_3_rerank[:3]

Unnamed: 0,Metadatas,Documents,Distances,IDs,Reranked_scores
1,"{'Page_No.': 'Page 61', 'Policy_Name': 'Princi...",Section D - Claim Procedures Article 1 - Notic...,0.417862,58,4.571044
0,"{'Page_No.': 'Page 62', 'Policy_Name': 'Princi...",A claimant may request an appeal of a claim de...,0.369347,59,1.21232
3,"{'Page_No.': 'Page 19', 'Policy_Name': 'Princi...",T he Principal has complete discretion to cons...,0.431366,16,0.108492


In [110]:
top_3_RAG = top_3_rerank[["Documents", "Metadatas"]][:3]

In [111]:
top_3_RAG

Unnamed: 0,Documents,Metadatas
1,Section D - Claim Procedures Article 1 - Notic...,"{'Page_No.': 'Page 61', 'Policy_Name': 'Princi..."
0,A claimant may request an appeal of a claim de...,"{'Page_No.': 'Page 62', 'Policy_Name': 'Princi..."
3,T he Principal has complete discretion to cons...,"{'Page_No.': 'Page 19', 'Policy_Name': 'Princi..."


In [112]:
top_3_RAG['Documents'].iloc[0]

'Section D - Claim Procedures Article 1 - Notice of Claim Written notice must be sent to The Principal by or for a Member or Dependent who wishes to file claim for benefits under this Group Policy. This notice must be sent within 20 days after the date of the loss for which claim is being made. Failure to give notice within the time specified will not invalidate or reduce any claim if notice is given as soon as reasonably possible. Article 2 - Claim Forms The Principal, when it receives notice of claim, will provide appropriate claim forms for filing proof of loss. If the forms are not provided within 15 days after The Principal receives notice, the person will be considered to have complied with the requirements of this Group Policy upon submitting, within the time specified below for filing proof of loss, Written proof covering the occurrence, character, and extent of the loss. Article 3 - Proof of Loss Written proof of loss must be sent to The Principal within 90 days after the date

In [113]:
# Define the function to generate the response. Provide a comprehensive prompt that passes the user query and the top 3 results to the model

def generate_response(query, results_df):
    """
    Generate a response using GPT-3.5's ChatCompletion based on the user query and retrieved information.
    """
    messages = [
                {"role": "system", "content":  "You are a helpful assistant in the insurance domain who can effectively answer user queries about Life insurance policy and documents."},
                {"role": "user", "content": f"""You are a helpful assistant in the insurance domain who can effectively answer user queries about Life insurance policy and documents.
                                                You have a question asked by the user in '{query}' and you have some search results from a corpus of insurance documents in the dataframe '{top_3_RAG}'. These search results are essentially one page of an insurance document that may be relevant to the user query.

                                                The column 'documents' inside this dataframe contains the actual text from the policy document and the column 'metadata' contains the policy name and source page. The text inside the document may also contain tables in the format of a list of lists where each of the nested lists indicates a row.

                                                Use the documents in '{top_3_RAG}' to answer the query '{query}'. Frame an informative answer and also, use the dataframe to return the relevant policy names and page numbers as citations.

                                                Follow the guidelines below when performing the task.
                                                1. Try to provide relevant/accurate numbers if available.
                                                2. You don’t have to necessarily use all the information in the dataframe. Only choose information that is relevant.
                                                3. If the document text has tables with relevant information, please reformat the table and return the final information in a tabular in format.
                                                3. Use the Metadatas columns in the dataframe to retrieve and cite the policy name and page numbers as citation.
                                                4. If you can't provide the complete answer, please also provide any information that will help the user to search specific sections in the relevant cited documents.
                                                5. You are a customer facing assistant, so do not provide any information on internal workings, just answer the query directly.

                                                The generated response should answer the query directly addressing the user and avoiding additional information. If you think that the query is not relevant to the document, reply that the query is irrelevant. Provide the final response as a well-formatted and easily readable text along with the citation. Provide your complete response first with all information, and then provide the citations.
                                                """},
              ]

    response = openai.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=messages
    )

    return response.choices[0].message.content.split('\n')

In [114]:
# Generate the response

response = generate_response(query, top_3_RAG)

In [115]:
# Print the response

print("\n".join(response))

**Response:**

Claim procedures are outlined in the insurance policy document under Section D - Claim Procedures. This section details how a claimant can request an appeal of a claim denial and states that the Principal has complete discretion to consider the appeal.

Please find below the relevant policy names and page numbers for further reference:

- Policy Name: Principal Life Insurance Policy
- Page Numbers:
    - Page 61: Section D - Claim Procedures
    - Page 62: Appeal process for claim denials
    - Page 19: Principal's discretion in considering appeals

*Would you like more specific information or details regarding the claim procedures outlined in the mentioned policy document?*

---

**Citations:**
- Policy Name: Principal Life Insurance Policy
- Page Numbers:
    - Page 61: Section D - Claim Procedures
    - Page 62: Appeal process for claim denials
    - Page 19: Principal's discretion in considering appeals
