In [13]:
from dotenv import load_dotenv
from langchain_openai import AzureChatOpenAI, ChatOpenAI
import os
from langchain.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer # type: ignore
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
# from langchain.globals import set_llm_cache
from langchain_community.cache import InMemoryCache
from langchain_core.prompts import PromptTemplate
from langchain.chains import RetrievalQA
import tqdm
# Load variables from a .env file into the process environment.
# NOTE(review): presumably this is where the OpenAI API key used by the
# clients below comes from — confirm.
load_dotenv()

True

In [14]:
# Settings for the chat model that writes the answers.
LLM_SETTINGS = {
    "model": "gpt-3.5-turbo",
    "temperature": 0.4,
    "max_tokens": 400,
}
llm = ChatOpenAI(**LLM_SETTINGS)

# Embedding model handle.
# NOTE(review): get_vectorstore() constructs its own OpenAIEmbeddings instance
# when called with only the chunks, so this object is not the one the index
# uses as the notebook stands.
embeddings = OpenAIEmbeddings(
    model="text-embedding-ada-002",
)

In [15]:
# Source PDF, given relative to the notebook's working directory.
file_path = "policy_data.pdf"
# Loader bound to that file; load() returns the list of document objects
# (each exposes its text as .page_content, which later cells read).
loader = PyPDFLoader(file_path)
pages = loader.load()

In [16]:
# Number of documents the loader returned (presumably one per PDF page — confirm).
len(pages)

44

In [17]:
# Chunking configuration. Piece length is measured with len(), i.e. in characters.
CHUNK_SIZE = 700
CHUNK_OVERLAP = 50

splitter = RecursiveCharacterTextSplitter(
    length_function=len,
    chunk_overlap=CHUNK_OVERLAP,
    chunk_size=CHUNK_SIZE,
)

In [18]:
# `pages` holds document objects, while the splitting step works on plain
# strings, so keep only the text of every page.
pages_text = [page.page_content for page in pages]

In [19]:
documents = splitter.create_documents(pages_text)
# Split the page texts into chunks with the splitter configured above:
# chunk_size=700 and chunk_overlap=50, both measured in characters.
# NOTE(review): only the raw text is passed in, so any metadata carried by the
# original page objects is not forwarded to these chunks.

In [20]:
# Sanity check: show a single chunk (index 2) produced by the split.
print(documents[2])

page_content='About the policy\nThe policy is made up of:\n >This booklet.\n >Your car insurance details.\n >Your certificate (or certificates)  \nof motor insurance.\nIf the policy includes Green Flag breakdown cover:\n >Your breakdown cover and your car \ninsurance are part of the same policy.\n >The policy also includes the Green Flag \npolicy booklet we’ve given you.\nIf you have a policy that includes DriveSure:\n >The policy also includes the DriveSure terms \nand conditions we’ve given you.\nPlease read all these documents carefully  and keep them safe in case you need them.Words in bold type\nSome of the words and phrases we use in \nthis booklet have a specific meaning – for example, your car or modifications.'


In [21]:
def get_vectorstore(text_chunks, index_path="faiss_index", embeddings=None):
    """Return a FAISS vector store, loading it from disk when one is cached.

    If ``index_path`` exists, the stored index is loaded and ``text_chunks``
    is NOT consulted; delete the directory (or pass a different path) to force
    a rebuild after the source documents change. Otherwise a new index is
    built from ``text_chunks`` and saved to ``index_path``.

    Args:
        text_chunks: Chunked documents to index when no saved index exists.
        index_path: Directory of the saved index. Defaults to "faiss_index".
        embeddings: Embedding model used for building/loading and for
            queries. Defaults to a fresh ``OpenAIEmbeddings()``.

    Returns:
        The loaded or newly created FAISS vector store.

    Raises:
        ValueError: If a new index has to be built but ``text_chunks`` is empty.
    """
    if embeddings is None:
        embeddings = OpenAIEmbeddings()

    # Check if the FAISS index already exists on disk
    if os.path.exists(index_path):
        # SECURITY: the flag below opts in to deserializing the stored index,
        # which the library treats as dangerous for untrusted files. Only load
        # index directories that were created locally by this notebook.
        vectorstore = FAISS.load_local(
            index_path, embeddings, allow_dangerous_deserialization=True
        )
        print("Loaded existing FAISS index.")
        return vectorstore

    if not text_chunks:
        raise ValueError(
            f"No saved index at {index_path!r} and no text chunks were given to build one."
        )

    # Create a new FAISS index and persist it for later runs
    vectorstore = FAISS.from_documents(documents=text_chunks, embedding=embeddings)
    vectorstore.save_local(index_path)
    print("Created and saved new FAISS index.")
    return vectorstore

In [22]:
# Build (or load) the FAISS store and wrap it as a retriever.
# Note: despite its name, `vectorstore` holds the object returned by
# as_retriever(), not the store itself; later cells use it as the retriever.
vectorstore = get_vectorstore(documents).as_retriever()

Loaded existing FAISS index.


In [23]:
# Prompt for the question-answering chain. It has two placeholders:
# {context} and {question}.
# NOTE(review): presumably {context} is filled with the retrieved chunks and
# {question} with the user query by the chain built from this prompt — confirm.
template = """Use the context below to answer the question.
Keep the answer concise and to the point.
If you are unsure about the answer, just say i do not know the answer to the question do not create your own answer and make sure the answer is concise and to the point.
Summarize the information such that main points are covered and if you think that there needs to be some more information added to the answer then you can add that information as well.
{context}

Question: {question}

Helpful Answer:"""
# Build the PromptTemplate object from the template string above.
prompt = PromptTemplate.from_template(template)

In [24]:
# Extra arguments handed to the underlying chain: only the custom prompt.
chain_type_kwargs = dict(prompt=prompt)

# Retrieval-QA chain wired to the chat model and the retriever
# (held in the variable named `vectorstore`), using the "stuff" chain type.
chain = RetrievalQA.from_chain_type(
    chain_type="stuff",
    chain_type_kwargs=chain_type_kwargs,
    retriever=vectorstore,
    llm=llm,
)

In [44]:
import pandas as pd

# Location of the evaluation spreadsheet. Set the RAG_TEST_DATASET_PATH
# environment variable to point at a different copy; when it is unset the
# original hard-coded path is used, so existing runs behave as before.
dataset_path = os.environ.get(
    "RAG_TEST_DATASET_PATH",
    "/Users/siddartha/Desktop/github/Athina_AI_project/dataset_testing/dataset_for_testing.xlsx",
)
df = pd.read_excel(dataset_path)

# The following cells read these two columns; fail here with a clear message
# rather than with a KeyError further down.
missing_columns = {"question", "ground_truth"} - set(df.columns)
if missing_columns:
    raise ValueError(
        f"{dataset_path} is missing required column(s): {sorted(missing_columns)}"
    )

In [45]:
# Pull the two evaluation columns out of the spreadsheet frame as plain lists.
question_column = df["question"]
ground_truth_column = df["ground_truth"]
questions = question_column.tolist()
ground_truth = ground_truth_column.tolist()

In [47]:
from datasets import Dataset

# Collect one evaluation record per test question: the question, the chain's
# raw output, and the text of the chunks the retriever returns for it.
data = {"question": [], "response": [], "contexts": [], "ground_truth": ground_truth}

for query in questions:
    data["question"].append(query)
    # chain.invoke returns the chain's full output mapping; the answer text is
    # extracted from it afterwards through its "result" key.
    data["response"].append(chain.invoke(query))
    # Query the retriever exactly as configured, i.e. the same retriever object
    # the chain was built with. The number of returned chunks is a property of
    # the retriever (set when it is created), not a per-call `top_k` argument,
    # so no such argument is passed here.
    data["contexts"].append([doc.page_content for doc in vectorstore.invoke(query)])

dataset = Dataset.from_dict(data)

In [48]:
# Turn the collected evaluation dataset into a pandas DataFrame.
# Note: this rebinds `df`, which until now held the spreadsheet read earlier.
df = pd.DataFrame(dataset)

In [49]:
# Keep only the answer text (the 'result' entry) of each chain output.
# Values that are not dicts are left as they are, so re-running this cell
# after the column has already been flattened does not raise.
df['response'] = df['response'].apply(
    lambda x: x['result'] if isinstance(x, dict) else x
)

In [50]:
# Write the evaluation table to a CSV file in the working directory,
# leaving out the DataFrame index column.
df.to_csv('final_dataset_testing.csv', index=False)

In [31]:
# Rebuild the `dataset` object from the DataFrame, replacing the earlier one.
# NOTE(review): this cell's execution counter is lower than that of the cells
# above it, so it was last run out of order — re-run the notebook top to
# bottom on a fresh kernel to confirm it still works in this position.
dataset = Dataset.from_pandas(df)