In [1]:
# Install all the required libraries

!pip install pdfplumber tiktoken openai chromaDB sentence-transformers -q

In [2]:
# Import all the required Dependencies

import pdfplumber
from pathlib import Path
import pandas as pd
from operator import itemgetter
import json
import tiktoken
import openai
import chromadb

In [3]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

ModuleNotFoundError: No module named 'google.colab'

In [None]:
# Define the path of the given PDF file
single_pdf_path = 'https://drive.google.com/drive/folders/1aWmIfkBhPvaT1Z9W9CWThS0OrKh-fiw9'

In [None]:
import os

folder_path = '/content/drive/My Drive/Policy+Documents'  # Adjust the path to your folder
files = os.listdir(folder_path)

print("Files in folder:")
for file in files:
    print(file)

In [None]:
# Define the path of the given PDF file
single_pdf_path = '/content/drive/My Drive/Policy+Documents/HDFC-Life-Easy-Health-101N110V03-Policy-Bond-Single-Pay.pdf'

In [None]:
# Open the PDF file
with pdfplumber.open(single_pdf_path) as pdf:

    # Get one of the pages from the PDF and examine it
    one_page = pdf.pages[4]

    # Extract text from the first page
    text = one_page.extract_text()

    # Extract tables from the first page
    tables = one_page.extract_tables()

    # Print the extracted text
    print(text)

In [None]:
# View the table in the page, if any
if len(tables) > 0:
            # Access the first table
            first_table = tables[0]
            for row in first_table:
                print(row)
else:
            print("No tables found")


In [None]:
# Define the path where all pdf documents are present


pdf_path = "/content/drive/My Drive/Policy+Documents"

In [None]:
def check_bboxes(word, table_bbox):
    l = (word['x0'], word['top'], word['x1'], word['bottom'])
    r = table_bbox
    return (l[0] > r[0] and   # Left x of word > Left x of table
            l[1] > r[1] and   # Top y of word > Top y of table
            l[2] < r[2] and   # Right x of word < Right x of table
            l[3] < r[3])      # Bottom y of word < Bottom y of table

In [None]:
def extract_text_from_pdf(pdf_path):
    p = 0
    full_text = []


    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_no = f"Page {p+1}"
            text = page.extract_text()

            tables = page.find_tables()
            table_bboxes = [i.bbox for i in tables]
            tables = [{'table': i.extract(), 'top': i.bbox[1]} for i in tables]
            non_table_words = [word for word in page.extract_words() if not any(
                [check_bboxes(word, table_bbox) for table_bbox in table_bboxes])]
            lines = []

            for cluster in pdfplumber.utils.cluster_objects(non_table_words + tables, itemgetter('top'), tolerance=5):

                if 'text' in cluster[0]:
                    try:
                        lines.append(' '.join([i['text'] for i in cluster]))
                    except KeyError:
                        pass

                elif 'table' in cluster[0]:
                    lines.append(json.dumps(cluster[0]['table']))


            full_text.append([page_no, " ".join(lines)])
            p +=1

    return full_text


In [None]:
# Define the directory containing the PDF files
import pandas as pd
from pathlib import Path

# Define the directory containing the PDFs
pdf_directory = Path("/content/drive/My Drive/Policy+Documents")

# Initialize an empty list to store the extracted texts and document names
data = []

# Loop through the specified PDF file in the directory
for pdf_path in pdf_directory.glob("HDFC-Life-Easy-Health-101N110V03-Policy-Bond-Single-Pay.pdf"):
    # Process the PDF file
    print(f"...Processing {pdf_path.name}")

    # Call the function to extract the text from the PDF
    extracted_text = extract_text_from_pdf(pdf_path)

    # Convert the extracted list to a DataFrame, and add a column to store document names
    extracted_text_df = pd.DataFrame(extracted_text, columns=['Page No.', 'Page_Text'])
    extracted_text_df['Document Name'] = pdf_path.name

    # Append the extracted text DataFrame to the list
    data.append(extracted_text_df)

    # Print a message to indicate progress
    print(f"Finished processing {pdf_path.name}")

# Concatenate all DataFrames in the list into a single DataFrame
final_df = pd.concat(data, ignore_index=True)

# Print a message to indicate all PDFs have been processed
print("All PDFs have been processed.")

# Save the final DataFrame to a CSV file
final_df.to_csv('/content/drive/My Drive/Policy+Documents/extracted_texts.csv', index=False)
print("Extracted texts saved to extracted_texts.csv")


In [None]:
# Concatenate all the PDFs in the list 'data' together

insurance_pdfs_data = pd.concat(data, ignore_index=True)


In [None]:
insurance_pdfs_data

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns
insurance_pdfs_data.groupby('Document Name').size().plot(kind='barh', color=sns.palettes.mpl_palette('Dark2'))
plt.gca().spines[['top', 'right',]].set_visible(False)

In [None]:
# Check one of the extracted page texts to ensure that the text has been correctly read

insurance_pdfs_data.Page_Text[0]

In [None]:
# Let's also check the length of all the texts as there might be some empty pages or pages with very few words that we can drop

insurance_pdfs_data['Text_Length'] = insurance_pdfs_data['Page_Text'].apply(lambda x: len(x.split(' ')))

In [None]:
insurance_pdfs_data['Text_Length']

In [None]:
# Retain only the rows with a text length of at least 10

insurance_pdfs_data = insurance_pdfs_data.loc[insurance_pdfs_data['Text_Length'] >= 10]
insurance_pdfs_data

In [None]:
# Store the metadata for each page in a separate column

metadata = insurance_pdfs_data.apply(lambda x: {'Policy_Name': x['Document Name'][:-4], 'Page_No.': x['Page No.']}, axis=1)
insurance_pdfs_data['Metadata'] = metadata


In [None]:

insurance_pdfs_data.head()

In [None]:
# Import the OpenAI Embedding Function into chroma

from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction

In [None]:
# Define the path where chroma collections will be stored

chroma_data_path = '/content/drive/MyDrive/Colab Notebooks/HelpMate AI Codes/ChromaDB_Data'

In [None]:
import chromadb

In [None]:
# Call PersistentClient()

client = chromadb.PersistentClient()

In [None]:
# Initialise a collection in chroma and pass the embedding_function to it so that it used OpenAI embeddings to embed the documents

insurance_collection = client.get_or_create_collection(name='RAG_on_Insurance')

In [None]:
# Convert the page text and metadata from your dataframe to lists to be able to pass it to chroma

documents_list = insurance_pdfs_data["Page_Text"].tolist()
metadata_list = insurance_pdfs_data['Metadata'].tolist()


In [None]:
# Add the documents and metadata to the collection alongwith generic integer IDs. You can also feed the metadata information as IDs by combining the policy name and page no.

insurance_collection.add(
    documents= documents_list,
    ids = [str(i) for i in range(0, len(documents_list))],
    metadatas = metadata_list
)

In [None]:

# First few entries in the collection

insurance_collection.get(
    ids = ['0','1','2'],
    include = ['embeddings', 'documents', 'metadatas']
)

In [None]:
cache_collection = client.get_or_create_collection(name='Insurance_Cache')

In [None]:
cache_collection.peek()

In [None]:
# Read the user query

query = input()

In [None]:
print(query)

In [None]:
cache_results = cache_collection.query(
    query_texts=query,
    n_results=1
)

In [None]:
# Print the result

cache_results

In [None]:
results = insurance_collection.query(
query_texts=query,
n_results=10
)


In [None]:
import pandas as pd

def perform_semantic_search(query, cache_results, insurance_collection, cache_collection, threshold=0.2):
    ids, documents, distances, metadatas = [], [], [], []
    results_df = pd.DataFrame()

    # Check if results are in cache
    if not cache_results['distances'] or cache_results['distances'][0][0] > threshold:
        # Query the main collection
        print("Not found in cache. Found in main collection.")
        results = insurance_collection.query(query_texts=query, n_results=10)

        # Prepare data for caching
        Keys, Values = [], []
        for key, val in results.items():
            if val is None:
                continue
            for i in range(len(val[0])):  # Use len(val[0]) for dynamic handling
                Keys.append(f"{key}{i}")
                Values.append(str(val[0][i]))

        # Add to cache
        cache_collection.add(
            documents=[query],
            ids=[query],  # Can use incremental IDs if needed
            metadatas=dict(zip(Keys, Values))
        )

        # Create result DataFrame
        result_dict = {
            'Metadatas': results['metadatas'][0],
            'Documents': results['documents'][0],
            'Distances': results['distances'][0],
            'IDs': results["ids"][0]
        }
        results_df = pd.DataFrame.from_dict(result_dict)

    else:
        # Cache hit
        print("Found in cache!")
        cache_result_dict = cache_results['metadatas'][0][0]

        # Extract cached data
        for key, value in cache_result_dict.items():
            if 'ids' in key:
                ids.append(value)
            elif 'documents' in key:
                documents.append(value)
            elif 'distances' in key:
                distances.append(value)
            elif 'metadatas' in key:
                metadatas.append(value)

        # Create DataFrame from cache results
        results_df = pd.DataFrame({
            'IDs': ids,
            'Documents': documents,
            'Distances': distances,
            'Metadatas': metadatas
        })

    return results_df


In [None]:
results_df


In [None]:
# Import the CrossEncoder library from sentence_transformers

from sentence_transformers import CrossEncoder, util

In [None]:
# Initialise the cross encoder model

cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

In [None]:
# Test the cross encoder model

scores = cross_encoder.predict([['Does the insurance cover diabetic patients?', 'The insurance policy covers some pre-existing conditions including diabetes, heart diseases, etc. The policy does not howev'],
                                ['Does the insurance cover diabetic patients?', 'The premium rates for various age groups are given as follows. Age group (<18 years): Premium rate']])


In [None]:
scores

In [None]:
# Generate the cross_encoder scores for these pairs


# Prepare inputs for cross-encoder
cross_inputs = [[query, response] for response in results_df['Documents']]

# Predict rerank scores
cross_rerank_scores = cross_encoder.predict(cross_inputs)

# Append scores to the DataFrame
results_df['Rerank Scores'] = cross_rerank_scores

# Sort results by rerank scores in descending order
results_df = results_df.sort_values(by='Rerank Scores', ascending=False)

# Display the final results
print("Reranked Results:")
print(results_df)


In [None]:
cross_rerank_scores



In [None]:

# Store the rerank_scores in results_df

results_df['Reranked_scores'] = cross_rerank_scores

In [None]:
results_df


In [None]:


# Example results DataFrame with distances and documents
top_3_semantic = results_df.sort_values(by='Distances').head(3)

# Display the top 3 results
print("Top 3 Semantic Search Results:")
print(top_3_semantic)


In [None]:
results_df.head()

In [None]:
top_3_rerank = results_df.sort_values(by='Reranked_scores', ascending=False)
top_3_rerank[:3]

In [None]:
top_3_RAG = top_3_rerank[["Documents", "Documents"]][:3]

In [None]:
top_3_RAG

In [None]:
# Define the function to generate the response. Provide a comprehensive prompt that passes the user query and the top 3 results to the model

def generate_response(query, results_df):
    """
    Generate a response using GPT-3.5's ChatCompletion based on the user query and retrieved information.
    """
    messages = [
                {"role": "system", "content":  "You are a helpful assistant in the insurance domain who can effectively answer user queries about insurance policies and documents."},
                {"role": "user", "content": f"""You are a helpful assistant in the insurance domain who can effectively answer user queries about insurance policies and documents.
                                                You have a question asked by the user in '{query}' and you have some search results from a corpus of insurance documents in the dataframe '{top_3_RAG}'. These search results are essentially one page of an insurance document that may be relevant to the user query.

                                                The column 'documents' inside this dataframe contains the actual text from the policy document and the column 'metadata' contains the policy name and source page. The text inside the document may also contain tables in the format of a list of lists where each of the nested lists indicates a row.

                                                Use the documents in '{top_3_RAG}' to answer the query '{query}'. Frame an informative answer and also, use the dataframe to return the relevant policy names and page numbers as citations.

                                                Follow the guidelines below when performing the task.
                                                1. Try to provide relevant/accurate numbers if available.
                                                2. You don’t have to necessarily use all the information in the dataframe. Only choose information that is relevant.
                                                3. If the document text has tables with relevant information, please reformat the table and return the final information in a tabular in format.
                                                3. Use the Metadatas columns in the dataframe to retrieve and cite the policy name(s) and page numbers(s) as citation.
                                                4. If you can't provide the complete answer, please also provide any information that will help the user to search specific sections in the relevant cited documents.
                                                5. You are a customer facing assistant, so do not provide any information on internal workings, just answer the query directly.

                                                The generated response should answer the query directly addressing the user and avoiding additional information. If you think that the query is not relevant to the document, reply that the query is irrelevant. Provide the final response as a well-formatted and easily readable text along with the citation. Provide your complete response first with all information, and then provide the citations.
                                                """},
              ]

    response = openai.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=messages
    )

    return response.choices[0].message.content.split('\n')