In [1]:
# Install all the required libraries

!pip install -U -q pdfplumber tiktoken openai chromaDB sentence-transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m74.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m755.0/755.0 kB[0m [31m48.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.3/19.3 MB[0m [31m80.8 MB/s[0m eta [36m

In [2]:
# Import all the required Libraries

import pdfplumber
from pathlib import Path
import pandas as pd
from operator import itemgetter
import json
import tiktoken
import openai
import chromadb

In [3]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [8]:
pdf_path = '/content/drive/MyDrive/Colab Notebooks/AskMyPolicy'

In [54]:
from google.colab import userdata
import os
from openai import OpenAI

api_key = userdata.get("API_KEY")
os.environ["OPENAI_API_KEY"] = api_key



In [7]:
# Function to check whether a word is present in a table or not for segregation of regular text and tables

def check_bboxes(word, table_bbox):
    # Check whether word is inside a table bbox.
    l = word['x0'], word['top'], word['x1'], word['bottom']
    r = table_bbox
    return l[0] > r[0] and l[1] > r[1] and l[2] < r[2] and l[3] < r[3]

In [9]:
def extract_text_from_pdf(pdf_path):
    p = 0
    full_text = []


    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_no = f"Page {p+1}"
            text = page.extract_text()

            tables = page.find_tables()
            table_bboxes = [i.bbox for i in tables]
            tables = [{'table': i.extract(), 'top': i.bbox[1]} for i in tables]
            non_table_words = [word for word in page.extract_words() if not any(
                [check_bboxes(word, table_bbox) for table_bbox in table_bboxes])]
            lines = []

            for cluster in pdfplumber.utils.cluster_objects(non_table_words + tables, itemgetter('top'), tolerance=5):

                if 'text' in cluster[0]:
                    try:
                        lines.append(' '.join([i['text'] for i in cluster]))
                    except KeyError:
                        pass

                elif 'table' in cluster[0]:
                    lines.append(json.dumps(cluster[0]['table']))


            full_text.append([page_no, " ".join(lines)])
            p +=1

    return full_text

In [10]:
# Define the directory containing the PDF files
pdf_directory = Path(pdf_path)

# Initialize an empty list to store the extracted texts and document names
data = []

# Loop through all files in the directory
for pdf_path in pdf_directory.glob("*.pdf"):

    # Process the PDF file
    print(f"...Processing {pdf_path.name}")

    # Call the function to extract the text from the PDF
    extracted_text = extract_text_from_pdf(pdf_path)

    # Convert the extracted list to a PDF, and add a column to store document names
    extracted_text_df = pd.DataFrame(extracted_text, columns=['Page No.', 'Page_Text'])
    extracted_text_df['Document Name'] = pdf_path.name

    # Append the extracted text and document name to the list
    data.append(extracted_text_df)

    # Print a message to indicate progress
    print(f"Finished processing {pdf_path.name}")

# Print a message to indicate all PDFs have been processed
print("All PDFs have been processed.")

...Processing Principal-Sample-Life-Insurance-Policy.pdf
Finished processing Principal-Sample-Life-Insurance-Policy.pdf
All PDFs have been processed.


In [18]:
data

[   Page No.                                          Page_Text  \
 0    Page 1  DOROTHEA GLAUSE S655 RHODE ISLAND JOHN DOE 01/...   
 1    Page 2                 This page left blank intentionally   
 2    Page 3  POLICY RIDER GROUP INSURANCE POLICY NO: S655 C...   
 3    Page 4                 This page left blank intentionally   
 4    Page 5  PRINCIPAL LIFE INSURANCE COMPANY (called The P...   
 ..      ...                                                ...   
 59  Page 60  I f a Dependent who was insured dies during th...   
 60  Page 61  Section D - Claim Procedures Article 1 - Notic...   
 61  Page 62  A claimant may request an appeal of a claim de...   
 62  Page 63                 This page left blank intentionally   
 63  Page 64  Principal Life Insurance Company Des Moines, I...   
 
                                  Document Name  
 0   Principal-Sample-Life-Insurance-Policy.pdf  
 1   Principal-Sample-Life-Insurance-Policy.pdf  
 2   Principal-Sample-Life-Insurance-Policy.

In [11]:
# Concatenate all the PDFs in the list 'data' together

insurance_pdfs_data = pd.concat(data, ignore_index=True)

In [12]:
insurance_pdfs_data

Unnamed: 0,Page No.,Page_Text,Document Name
0,Page 1,DOROTHEA GLAUSE S655 RHODE ISLAND JOHN DOE 01/...,Principal-Sample-Life-Insurance-Policy.pdf
1,Page 2,This page left blank intentionally,Principal-Sample-Life-Insurance-Policy.pdf
2,Page 3,POLICY RIDER GROUP INSURANCE POLICY NO: S655 C...,Principal-Sample-Life-Insurance-Policy.pdf
3,Page 4,This page left blank intentionally,Principal-Sample-Life-Insurance-Policy.pdf
4,Page 5,PRINCIPAL LIFE INSURANCE COMPANY (called The P...,Principal-Sample-Life-Insurance-Policy.pdf
...,...,...,...
59,Page 60,I f a Dependent who was insured dies during th...,Principal-Sample-Life-Insurance-Policy.pdf
60,Page 61,Section D - Claim Procedures Article 1 - Notic...,Principal-Sample-Life-Insurance-Policy.pdf
61,Page 62,A claimant may request an appeal of a claim de...,Principal-Sample-Life-Insurance-Policy.pdf
62,Page 63,This page left blank intentionally,Principal-Sample-Life-Insurance-Policy.pdf


In [13]:
# Let's also check the length of all the texts as there might be some empty pages or pages with very few words that we can drop

insurance_pdfs_data['Text_Length'] = insurance_pdfs_data['Page_Text'].apply(lambda x: len(x.split(' ')))

In [14]:
insurance_pdfs_data['Text_Length']

Unnamed: 0,Text_Length
0,30
1,5
2,230
3,5
4,110
...,...
59,285
60,418
61,322
62,5


In [57]:
# Retain only the rows with a text length of at least 10

insurance_pdfs_data = insurance_pdfs_data.loc[insurance_pdfs_data['Text_Length'] >= 10]
insurance_pdfs_data.head()

Unnamed: 0,Page No.,Page_Text,Document Name,Text_Length,Metadata
0,Page 1,DOROTHEA GLAUSE S655 RHODE ISLAND JOHN DOE 01/...,Principal-Sample-Life-Insurance-Policy.pdf,30,{'Policy_Name': 'Principal-Sample-Life-Insuran...
2,Page 3,POLICY RIDER GROUP INSURANCE POLICY NO: S655 C...,Principal-Sample-Life-Insurance-Policy.pdf,230,{'Policy_Name': 'Principal-Sample-Life-Insuran...
4,Page 5,PRINCIPAL LIFE INSURANCE COMPANY (called The P...,Principal-Sample-Life-Insurance-Policy.pdf,110,{'Policy_Name': 'Principal-Sample-Life-Insuran...
5,Page 6,TABLE OF CONTENTS PART I - DEFINITIONS PART II...,Principal-Sample-Life-Insurance-Policy.pdf,153,{'Policy_Name': 'Principal-Sample-Life-Insuran...
6,Page 7,Section A – Eligibility Member Life Insurance ...,Principal-Sample-Life-Insurance-Policy.pdf,176,{'Policy_Name': 'Principal-Sample-Life-Insuran...


In [16]:
# Store the metadata for each page in a separate column

insurance_pdfs_data['Metadata'] = insurance_pdfs_data.apply(lambda x: {'Policy_Name': x['Document Name'][:-4], 'Page_No.': x['Page No.']}, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  insurance_pdfs_data['Metadata'] = insurance_pdfs_data.apply(lambda x: {'Policy_Name': x['Document Name'][:-4], 'Page_No.': x['Page No.']}, axis=1)


In [21]:
# Import the OpenAI Embedding Function into chroma

from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction

In [22]:
# Define the path where chroma collections will be stored

chroma_data_path = '/content/drive/MyDrive/HelpMate/ChromaDB_Data'

In [23]:
import chromadb

In [26]:
# Call PersistentClient()

client = chromadb.PersistentClient(path=chroma_data_path)

In [25]:
import logging
logging.getLogger("chromadb.telemetry.product.posthog").setLevel(logging.CRITICAL)


In [27]:
# Set up the embedding function using the OpenAI embedding model

model = "text-embedding-ada-002"
embedding_function = OpenAIEmbeddingFunction(api_key=api_key, model_name=model)

In [29]:
# Initialise a collection in chroma and pass the embedding_function to it so that it used OpenAI embeddings to embed the documents

insurance_collection = client.get_or_create_collection(name='AskMyPolicy', embedding_function=embedding_function)

In [30]:
# Convert the page text and metadata from your dataframe to lists to be able to pass it to chroma

documents_list = insurance_pdfs_data["Page_Text"].tolist()
metadata_list = insurance_pdfs_data['Metadata'].tolist()

In [31]:
# Add the documents and metadata to the collection alongwith generic integer IDs. You can also feed the metadata information as IDs by combining the policy name and page no.

insurance_collection.add(
    documents= documents_list,
    ids = [str(i) for i in range(0, len(documents_list))],
    metadatas = metadata_list
)

In [32]:
# Let's take a look at the first few entries in the collection

insurance_collection.get(
    ids = ['0','1','2'],
    include = ['embeddings', 'documents', 'metadatas']
)

{'ids': ['0', '1', '2'],
 'embeddings': array([[-2.24228799e-02,  1.87183432e-02, -2.72361692e-02, ...,
         -3.69149223e-02,  2.83710100e-03, -1.30930578e-03],
        [-1.32036684e-02,  8.89394712e-03, -4.63569537e-03, ...,
         -1.57016590e-02, -4.11756810e-05,  7.26064527e-03],
        [-1.20574497e-02,  1.41532440e-02, -3.39074316e-03, ...,
         -2.85983067e-02, -9.54382308e-03,  1.02932686e-02]]),
 'documents': ['DOROTHEA GLAUSE S655 RHODE ISLAND JOHN DOE 01/01/2014 711 HIGH STREET GEORGE RI 02903 GROUP POLICY FOR: RHODE ISLAND JOHN DOE ALL MEMBERS Group Member Life Insurance Print Date: 07/16/2014',
  'POLICY RIDER GROUP INSURANCE POLICY NO: S655 COVERAGE: Life EMPLOYER: RHODE ISLAND JOHN DOE Effective on the later of the Date of Issue of this Group Policy or March 1, 2005, the following will apply to your Policy: From time to time The Principal may offer or provide certain employer groups who apply for coverage with The Principal a Financial Services Hotline and Gri

In [33]:
cache_collection = client.get_or_create_collection(name='Insurance_Cache', embedding_function=embedding_function)

In [34]:
# Read the user query

query = input()

is heart problem surgery covered under this policy?


In [35]:
cache_results = cache_collection.query(
    query_texts= query,
    n_results= 1

)

In [36]:

cache_results

{'ids': [[]],
 'embeddings': None,
 'documents': [[]],
 'uris': None,
 'included': ['metadatas', 'documents', 'distances'],
 'data': None,
 'metadatas': [[]],
 'distances': [[]]}

In [37]:
results = insurance_collection.query(
query_texts=query,
n_results=10
)
# results.items()

In [38]:
results

{'ids': [['10', '1', '51', '54', '52', '50', '53', '29', '9', '16']],
 'embeddings': None,
 'documents': [["a . A licensed Doctor of Medicine (M.D.) or Osteopathy (D.O.); or b. any other licensed health care practitioner that state law requires be recognized as a Physician under this Group Policy. The term Physician does not include the Member, an employee of the Member, a business or professional partner or associate of the Member, any person who has a financial affiliation or business interest with the Member, anyone related to the Member by blood or marriage, or anyone living in the Member's household. Policy Anniversary November 1, 2014 and the same day of each following year. Policyholder The entity to whom this Group Policy is issued (see Title Page). Prior Policy The Group Term Life coverage of either: a. the Policyholder; or b. a business entity which has been obtained by the Policyholder through a merger or acquisition; for which this Group Policy is a replacement. Proof of Go

In [39]:
# Implementing Cache in Semantic Search

# Set a threshold for cache search
threshold = 0.2

ids = []
documents = []
distances = []
metadatas = []
results_df = pd.DataFrame()


# If the distance is greater than the threshold, then return the results from the main collection.

if cache_results['distances'][0] == [] or cache_results['distances'][0][0] > threshold:
      # Query the collection against the user query and return the top 10 results
      results = insurance_collection.query(
      query_texts=query,
      n_results=10
      )

      # Store the query in cache_collection as document w.r.t to ChromaDB so that it can be embedded and searched against later
      # Store retrieved text, ids, distances and metadatas in cache_collection as metadatas, so that they can be fetched easily if a query indeed matches to a query in cache
      Keys = []
      Values = []

      for key, val in results.items():
        if val is None:
            continue
      for i, item in enumerate(val[0][:10]):
        Keys.append(f"{key}{i}")
        Values.append(str(item))


      cache_collection.add(
          documents= [query],
          ids = [query],  # Or if you want to assign integers as IDs 0,1,2,.., then you can use "len(cache_results['documents'])" as will return the no. of queries currently in the cache and assign the next digit to the new query."
          metadatas = dict(zip(Keys, Values))
      )

      print("Not found in cache. Found in main collection.")

      result_dict = {'Metadatas': results['metadatas'][0], 'Documents': results['documents'][0], 'Distances': results['distances'][0], "IDs":results["ids"][0]}
      results_df = pd.DataFrame.from_dict(result_dict)
      results_df


# If the distance is, however, less than the threshold, you can return the results from cache

elif cache_results['distances'][0][0] <= threshold:
      cache_result_dict = cache_results['metadatas'][0][0]

      # Loop through each inner list and then through the dictionary
      for key, value in cache_result_dict.items():
          if 'ids' in key:
              ids.append(value)
          elif 'documents' in key:
              documents.append(value)
          elif 'distances' in key:
              distances.append(value)
          elif 'metadatas' in key:
              metadatas.append(value)

      print("Found in cache!")

      # Create a DataFrame
      results_df = pd.DataFrame({
        'IDs': ids,
        'Documents': documents,
        'Distances': distances,
        'Metadatas': metadatas
      })


Not found in cache. Found in main collection.


In [40]:
results_df

Unnamed: 0,Metadatas,Documents,Distances,IDs
0,"{'Page_No.': 'Page 13', 'Policy_Name': 'Princi...",a . A licensed Doctor of Medicine (M.D.) or Os...,0.418747,10
1,{'Policy_Name': 'Principal-Sample-Life-Insuran...,POLICY RIDER GROUP INSURANCE POLICY NO: S655 C...,0.447095,1
2,"{'Page_No.': 'Page 54', 'Policy_Name': 'Princi...","f . claim requirements listed in PART IV, Sect...",0.453098,51
3,{'Policy_Name': 'Principal-Sample-Life-Insuran...,% of Scheduled Covered Loss Benefit Loss of Sp...,0.457872,54
4,{'Policy_Name': 'Principal-Sample-Life-Insuran...,Exposure Exposure to the elements will be pres...,0.458687,52
5,"{'Page_No.': 'Page 53', 'Policy_Name': 'Princi...",Section B - Member Accidental Death and Dismem...,0.459929,50
6,"{'Page_No.': 'Page 56', 'Policy_Name': 'Princi...","If a Member sustains an injury, and as a resul...",0.464078,53
7,"{'Page_No.': 'Page 32', 'Policy_Name': 'Princi...",(1) marriage or establishment of a Civil Union...,0.469509,29
8,{'Policy_Name': 'Principal-Sample-Life-Insuran...,An institution that is licensed as a Hospital ...,0.470569,9
9,"{'Page_No.': 'Page 19', 'Policy_Name': 'Princi...",T he Principal has complete discretion to cons...,0.471383,16


In [41]:
# Import the CrossEncoder library from sentence_transformers

from sentence_transformers import CrossEncoder, util

In [42]:
# Initialise the cross encoder model

cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

In [43]:
# Input (query, response) pairs for each of the top 20 responses received from the semantic search to the cross encoder
# Generate the cross_encoder scores for these pairs

cross_inputs = [[query, response] for response in results_df['Documents']]
cross_rerank_scores = cross_encoder.predict(cross_inputs)

In [44]:
cross_rerank_scores

array([ -4.11397 , -10.364563,  -8.811096,  -9.519801,  -8.510972,
       -10.652288,  -8.619309, -10.424266, -10.589346,  -9.647388],
      dtype=float32)

In [45]:
# Store the rerank_scores in results_df

results_df['Reranked_scores'] = cross_rerank_scores

In [46]:
results_df

Unnamed: 0,Metadatas,Documents,Distances,IDs,Reranked_scores
0,"{'Page_No.': 'Page 13', 'Policy_Name': 'Princi...",a . A licensed Doctor of Medicine (M.D.) or Os...,0.418747,10,-4.11397
1,{'Policy_Name': 'Principal-Sample-Life-Insuran...,POLICY RIDER GROUP INSURANCE POLICY NO: S655 C...,0.447095,1,-10.364563
2,"{'Page_No.': 'Page 54', 'Policy_Name': 'Princi...","f . claim requirements listed in PART IV, Sect...",0.453098,51,-8.811096
3,{'Policy_Name': 'Principal-Sample-Life-Insuran...,% of Scheduled Covered Loss Benefit Loss of Sp...,0.457872,54,-9.519801
4,{'Policy_Name': 'Principal-Sample-Life-Insuran...,Exposure Exposure to the elements will be pres...,0.458687,52,-8.510972
5,"{'Page_No.': 'Page 53', 'Policy_Name': 'Princi...",Section B - Member Accidental Death and Dismem...,0.459929,50,-10.652288
6,"{'Page_No.': 'Page 56', 'Policy_Name': 'Princi...","If a Member sustains an injury, and as a resul...",0.464078,53,-8.619309
7,"{'Page_No.': 'Page 32', 'Policy_Name': 'Princi...",(1) marriage or establishment of a Civil Union...,0.469509,29,-10.424266
8,{'Policy_Name': 'Principal-Sample-Life-Insuran...,An institution that is licensed as a Hospital ...,0.470569,9,-10.589346
9,"{'Page_No.': 'Page 19', 'Policy_Name': 'Princi...",T he Principal has complete discretion to cons...,0.471383,16,-9.647388


In [47]:
# Return the top 3 results from semantic search

top_3_semantic = results_df.sort_values(by='Distances')
top_3_semantic[:3]

Unnamed: 0,Metadatas,Documents,Distances,IDs,Reranked_scores
0,"{'Page_No.': 'Page 13', 'Policy_Name': 'Princi...",a . A licensed Doctor of Medicine (M.D.) or Os...,0.418747,10,-4.11397
1,{'Policy_Name': 'Principal-Sample-Life-Insuran...,POLICY RIDER GROUP INSURANCE POLICY NO: S655 C...,0.447095,1,-10.364563
2,"{'Page_No.': 'Page 54', 'Policy_Name': 'Princi...","f . claim requirements listed in PART IV, Sect...",0.453098,51,-8.811096


In [48]:
# Return the top 3 results after reranking

top_3_rerank = results_df.sort_values(by='Reranked_scores', ascending=False)
top_3_rerank[:3]

Unnamed: 0,Metadatas,Documents,Distances,IDs,Reranked_scores
0,"{'Page_No.': 'Page 13', 'Policy_Name': 'Princi...",a . A licensed Doctor of Medicine (M.D.) or Os...,0.418747,10,-4.11397
4,{'Policy_Name': 'Principal-Sample-Life-Insuran...,Exposure Exposure to the elements will be pres...,0.458687,52,-8.510972
6,"{'Page_No.': 'Page 56', 'Policy_Name': 'Princi...","If a Member sustains an injury, and as a resul...",0.464078,53,-8.619309


In [49]:
top_3_RAG = top_3_rerank[["Documents", "Metadatas"]][:3]

In [50]:
top_3_RAG

Unnamed: 0,Documents,Metadatas
0,a . A licensed Doctor of Medicine (M.D.) or Os...,"{'Page_No.': 'Page 13', 'Policy_Name': 'Princi..."
4,Exposure Exposure to the elements will be pres...,{'Policy_Name': 'Principal-Sample-Life-Insuran...
6,"If a Member sustains an injury, and as a resul...","{'Page_No.': 'Page 56', 'Policy_Name': 'Princi..."


In [51]:
# Define the function to generate the response. Provide a comprehensive prompt that passes the user query and the top 3 results to the model

def generate_response(query, results_df):
    """
    Generate a response using GPT-3.5's ChatCompletion based on the user query and retrieved information.
    """
    messages = [
                {"role": "system", "content":  "You are a helpful assistant in the insurance domain who can effectively answer user queries about insurance policies and documents."},
                {"role": "user", "content": f"""You are a helpful assistant in the insurance domain who can effectively answer user queries about insurance policies and documents.
                                                You have a question asked by the user in '{query}' and you have some search results from a corpus of insurance documents in the dataframe '{top_3_RAG}'. These search results are essentially one page of an insurance document that may be relevant to the user query.

                                                The column 'documents' inside this dataframe contains the actual text from the policy document and the column 'metadata' contains the policy name and source page. The text inside the document may also contain tables in the format of a list of lists where each of the nested lists indicates a row.

                                                Use the documents in '{top_3_RAG}' to answer the query '{query}'. Frame an informative answer and also, use the dataframe to return the relevant policy names and page numbers as citations.

                                                Follow the guidelines below when performing the task.
                                                1. Try to provide relevant/accurate numbers if available.
                                                2. You don’t have to necessarily use all the information in the dataframe. Only choose information that is relevant.
                                                3. If the document text has tables with relevant information, please reformat the table and return the final information in a tabular in format.
                                                3. Use the Metadatas columns in the dataframe to retrieve and cite the policy name(s) and page numbers(s) as citation.
                                                4. If you can't provide the complete answer, please also provide any information that will help the user to search specific sections in the relevant cited documents.
                                                5. You are a customer facing assistant, so do not provide any information on internal workings, just answer the query directly.

                                                The generated response should answer the query directly addressing the user and avoiding additional information. If you think that the query is not relevant to the document, reply that the query is irrelevant. Provide the final response as a well-formatted and easily readable text along with the citation. Provide your complete response first with all information, and then provide the citations.
                                                """},
              ]

    response = openai.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=messages
    )

    return response.choices[0].message.content.split('\n')

In [52]:
# Generate the response

response = generate_response(query, top_3_RAG)

In [53]:
response

['Based on the search results from the insurance documents, the query "is heart problem surgery covered under this policy?" is not directly addressed in the extracted text snippets. However, to find information on whether heart problem surgery is covered under a specific policy, you may search through the full policy documents using the policy names and corresponding page numbers provided in the metadata.',
 '',
 '**Response:**',
 '- The query regarding coverage for heart problem surgery is not explicitly mentioned in the extracted text from the insurance documents.',
 '- To determine specific coverage details for heart problem surgery, please refer to the full policy documents listed below for more comprehensive information.',
 '',
 '**Citations:**',
 '1. Policy Name: Principal-Sample-Life-Insurance',
 '   - Page Number: Page 56',
 '',
 '2. Policy Name: Principal-Sample-Life-Insurance',
 '   - Page Number: Page 13']