In [1]:
from langchain_core.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.document_loaders import PyPDFLoader ,DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [2]:
# extarct the data from PDF
def load_pdf(data_dir):
    loader =DirectoryLoader(data_dir,
                            glob = "*.pdf",
                            loader_cls =PyPDFLoader)
    documents = loader.load()
    return documents

In [3]:
extracted_data = load_pdf(r"C:\github\openai\athina-ai\data")

In [28]:
# creating textn chunks 
def text_split(extracted_data):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size = 50000,
        chunk_overlap = 0
    )
    text_chunks = text_splitter.split_documents(extracted_data)
    return text_chunks

In [29]:
docs = text_split(extracted_data)

In [36]:
docs =docs[1:24]

In [32]:
from langchain_community.embeddings import HuggingFaceEmbeddings
def download_hugging_face_embdinds():
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    return embeddings

embeddings = download_hugging_face_embdinds()



In [37]:
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context

generator = TestsetGenerator.from_langchain(
    llm,
    llm,
    embeddings
)

testset = generator.generate_with_langchain_docs(docs, test_size=4, distributions={simple: 0.5, reasoning: 0.25, multi_context: 0.25})

embedding nodes:   0%|          | 0/46 [00:00<?, ?it/s]

Filename and doc_id are the same for all nodes.


Generating:   0%|          | 0/4 [00:00<?, ?it/s]

max retries exceeded for ReasoningEvolution(generator_llm=LangchainLLMWrapper(run_config=RunConfig(timeout=60, max_retries=15, max_wait=90, max_workers=16, exception_types=(<class 'Exception'>,), log_tenacity=False)), docstore=InMemoryDocumentStore(splitter=<langchain_text_splitters.base.TokenTextSplitter object at 0x000002270F9EE230>, nodes=[Node(page_content='Page 3FAQs\nHow much will you pay if my car is damaged?\nWhere damage to your car is covered under \nyour policy, we’ll pay the cost of repairing or \nreplacing your car up to its UK market value. \nThis is the current value of your car at the time \nof the claim. It may be different to the amount \nyou paid or any amount you provided when you insured your car with us.\nWho is covered to drive other cars?\nYour certificate of motor insurance will show \nwho has cover to drive other cars. We’ll only \ncover injury to third parties, or damage caused to their property, not to the car being driven. See ‘Section 1: Liability’ on page

In [38]:
testset.to_pandas()

Unnamed: 0,question,contexts,ground_truth,evolution_type,metadata,episode_done
0,Here is a question that can be fully answered ...,[Section 6: Personal benefits\nPersonal benefi...,If you claim for an accident that isn't your f...,simple,[{'source': 'C:\github\openai\athina-ai\data\p...,True
1,Here is a question that can be fully answered ...,[Page 12If the main driver is driving \nanothe...,The main driver is covered for liability to ot...,simple,[{'source': 'C:\github\openai\athina-ai\data\p...,True


In [41]:
from langchain.vectorstores import pinecone
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser
from langchain.retrievers import ContextualCompressionRetriever
from langchain_cohere import CohereRerank
from langchain_pinecone import PineconeVectorStore

In [42]:
index_name = "policybot"
# Connect to Pinecone index 
vectorstore = PineconeVectorStore(index_name=index_name, embedding=embeddings)

In [43]:
# genration part
from langchain_groq import ChatGroq

llm = ChatGroq(
    model="llama3-70b-8192"
)

In [44]:
prompt_template="""
You are an a professional AI assistant for car insurance inquiries, \
use the provided information to accurately address the user's question. \
If the answer is not available, clearly state that you do not have the information instead of providing a speculative response.
-----------------------------
Context: {context}
-----------------------------
Question: {question}

Please provide in professional and helpful answer below , without any additional commentary.
Helpful answer:
"""

In [45]:
PROMPT=PromptTemplate(template=prompt_template, input_variables=["context", "question"])


In [46]:

parser = StrOutputParser()


In [47]:
compressor = CohereRerank(cohere_api_key="nbDqU1hTVxWmXGbLYI6OnYhp4Cx40MZ5hOmO5oKX")
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=vectorstore.as_retriever(search_kwargs={"k": 5})
    )


In [48]:
chain = (
    {
        "context":  compression_retriever,
        "question":  RunnablePassthrough(),
    }
    | PROMPT
    | llm
    | parser
)

In [49]:
questions = testset.to_pandas()["question"].to_list()
ground_truth = testset.to_pandas()["ground_truth"].to_list()

In [50]:
from datasets import Dataset

questions = testset.to_pandas()["question"].to_list()
ground_truth = testset.to_pandas()["ground_truth"].to_list()

data = {"question": [], "answer": [], "contexts": [], "ground_truth": ground_truth}

for query in questions:
    data["question"].append(query)
    data["answer"].append(chain.invoke(query))
    data["contexts"].append([doc.page_content for doc in compression_retriever.get_relevant_documents(query)])

dataset = Dataset.from_dict(data)

  warn_deprecated(


In [58]:
from ragas.metrics import (
    context_precision,
    faithfulness,
    context_recall,
)
from ragas.metrics.critique import harmfulness

# change the LLM

faithfulness.llm = llm
context_precision.llm = llm
context_recall.llm = llm
harmfulness.llm = llm

In [59]:
# evaluate
from ragas import evaluate

result = evaluate(
    dataset = dataset,
    llm=llm,
    metrics=[
        context_relevancy,
        context_precision,
        context_recall,
        faithfulness,
        answer_relevancy,
    ],
)


AttributeError: 'ChatGroq' object has no attribute 'set_run_config'

In [None]:
result.to_pandas()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap

df = result.to_pandas()

heatmap_data = df[['context_relevancy', 'context_precision', 'context_recall', 'faithfulness', 'answer_relevancy']]

cmap = LinearSegmentedColormap.from_list('green_red', ['red', 'green'])

plt.figure(figsize=(10, 8))
sns.heatmap(heatmap_data, annot=True, fmt=".2f", linewidths=.5, cmap=cmap)

plt.yticks(ticks=range(len(df['question'])), labels=df['question'], rotation=0)

plt.show()

In [9]:
prompt_template = """
You are an expert at creating questions and answers based on insurance documentation.
Your goal is to prepare a Question and Answer pair.
make answer on point dont explain the answer need clean answer.
You do this by asking questions with the given context below:

------------
{context}
------------

Create 5 Question and Answer pairs in JSON format.
Make sure not to lose any important information.
here is an example:
\"question\": \"What is AI?\", \"answer\": \"AI stands for Artificial Intelligence.\", \
\"question\": \"What is ML?\", \"answer\": \"ML stands for Machine Learning.\", \
"""

In [10]:
PROMPT_QUESTIONS = PromptTemplate(template=prompt_template, input_variables=["context"])

In [11]:
# genration part
from langchain_groq import ChatGroq

llm = ChatGroq(
    model="llama3-70b-8192"
)

In [12]:
from langchain_core.prompts import PromptTemplate

In [16]:
from langchain_core.output_parsers import BaseOutputParser
from typing import List, Dict
import re
import json

class dictOutput(BaseOutputParser):
    def parse(self, text: str) -> List[Dict[str, str]]:
        # Finding all JSON objects in the text
        matches = re.findall(r'\{[^}]+\}', text)
        
        if not matches:
            print("No JSON objects found in the input text.")
            return []
        
        result = []
        
        for json_text in matches:
            try:
                # Load the JSON data
                data = json.loads(json_text)
                
                # Extract the question and answer
                question = data.get("question")
                answer = data.get("answer")
                
                if question is not None and answer is not None:
                    result.append({
                        "question": question,
                        "answer": answer
                    })
                else:
                    print(f"Invalid JSON object found: missing 'question' or 'answer' key in {json_text}")
            except json.JSONDecodeError:
                print(f"Invalid JSON format found in text: {json_text}")
        
        return result


In [17]:
# Chain for generating question and answer
chain_for_qa = PROMPT_QUESTIONS | llm | dictOutput()


In [18]:
    print(chain_for_qa.invoke(docs[2].page_content))

[{'question': 'How much will you pay if my car is damaged?', 'answer': 'Up to its UK market value.'}, {'question': 'Who is covered to drive other cars?', 'answer': 'Check your certificate of motor insurance.'}, {'question': 'Am I covered if I leave my car unlocked or the keys in the car?', 'answer': 'No.'}, {'question': 'What’s not included in my cover?', 'answer': 'Mechanical or electrical failure, wear and tear, damage to tyres, and more.'}, {'question': 'Can I use my car abroad?', 'answer': 'Depends on the type of policy and where you’re driving.'}]


In [19]:
import csv
import time

# Assuming `docs` is a list of documents and each document has a `page_content` attribute
# Also assuming `chain_for_qa` is an object with an `invoke` method that processes the content

# List to hold all question-answer pairs
qa_pairs_list = []

# Maximum number of requests per minute
max_requests_per_minute = 13
# Time interval between requests in seconds
interval = 60 / max_requests_per_minute

# Loop through each document and process its content
for index, doc in enumerate(docs):
    # Get the page content
    page_content = doc.page_content
    
    # Invoke the chain to get question-answer pairs
    qa_pairs = chain_for_qa.invoke(page_content)
    
    # Assuming the output of `invoke` is a list of question-answer pairs
    qa_pairs_list.extend(qa_pairs)
    
    # Wait to respect the rate limit, except after the last request
    if (index + 1) % max_requests_per_minute == 0:
        print("Reached the limit of 13 requests per minute. Waiting for the next minute...")
        time.sleep(60)  # Wait for a minute before making more requests
    else:
        time.sleep(interval)  # Wait for the calculated interval before the next request

# Define the CSV file name
csv_file_name = 'qa_pairs.csv'

# Save the collected question-answer pairs to a CSV file
with open(csv_file_name, mode='w', newline='', encoding='utf-8') as csv_file:
    fieldnames = ['question', 'answer']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)

    writer.writeheader()
    for qa in qa_pairs_list:
        writer.writerow(qa)

print(f"Question-answer pairs have been saved to {csv_file_name}")


Reached the limit of 13 requests per minute. Waiting for the next minute...
Reached the limit of 13 requests per minute. Waiting for the next minute...
Reached the limit of 13 requests per minute. Waiting for the next minute...
Question-answer pairs have been saved to qa_pairs.csv


In [21]:
import pandas as pd
df = pd.read_csv("qa_pairs.csv")

In [22]:
df.head()

Unnamed: 0,question,answer
0,What is the policy period?,12 months from the effective date.
1,What is the deductible for collision coverage?,$500.
2,What is the limit for personal liability?,"$100,000."
3,Is roadside assistance included?,Yes.
4,What is the process for filing a claim?,Call 1-800-CLAIMS or report online within 24 h...
