<h1>Load the document files, split them, and build the Giskard knowledge_base object</h1>

In [1]:
import os
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
# Credentials come from the environment. `setdefault` only installs an empty
# placeholder when the variable is missing, so a key that was already exported
# in the shell is no longer overwritten with "" (the OpenAI and Pinecone
# clients used later read these variables).
os.environ.setdefault("OPENAI_API_KEY", "")
os.environ.setdefault('PINECONE_API_KEY', '')
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
MODEL = "gpt-3.5-turbo"  # OpenAI chat model name
PINECONE_ENV="gcp-starter"  # default Pinecone environment name

In [40]:
# Load every PDF in the corpus folder and split it into LangChain documents.
data = []  # one list of document chunks per successfully loaded PDF
file_path = "Algae-Chatbot/"
# os.listdir returns entries in arbitrary order; sorting makes the order of
# the loaded documents repeatable across runs and machines.
file_names = sorted(os.listdir(file_path))

for file_name in file_names:
    if not file_name.endswith('.pdf'):
        continue
    try:
        print(file_name)
        loader = PyPDFLoader(os.path.join(file_path, file_name))
        docs = loader.load_and_split()
        data.append(docs)
    except Exception as e:
        # Best effort on purpose: a PDF that cannot be read is reported and
        # skipped so that the remaining files are still loaded.
        print(f"An error occurred with {file_name}: {e}")


1-s2.0-S0196890409000764-main.pdf
1-s2.0-S0196890410002207-main.pdf
1-s2.0-S0360319916320717-main.pdf
1-s2.0-S0734975012001048-main.pdf
1-s2.0-S1319016409000462-main.pdf
1-s2.0-S1364032114005413-main.pdf
1-s2.0-S1364032114009915-main.pdf
10.1515_bot-2021-0026.pdf
10.1515_bot-2021-0066.pdf
10.1515_bot-2021-0100.pdf
10.1515_bot-2021-0105.pdf
10.1515_bot-2022-0006.pdf
10.1515_bot-2022-0016.pdf
10.1515_bot-2022-0025.pdf
10.1515_bot-2022-0037.pdf
10.1515_bot-2022-0058.pdf
10.1515_bot-2022-0061.pdf
10.1515_bot-2022-0062.pdf
10.1515_bot-2022-0081.pdf
10.1515_bot-2023-0002.pdf
10.1515_bot-2023-0009.pdf
10.1515_bot-2023-0011.pdf
10.1515_bot-2023-0015.pdf
10.1515_bot-2023-0031.pdf
10.1515_bot-2023-0032.pdf
10.1515_bot-2023-0035.pdf
10.1515_bot-2023-0065.pdf
10.1515_bot-2023-0085.pdf
10.1515_bot-2023-0088.pdf
978-1-4939-2684-8.pdf
978-3-030-57911-1.pdf
978-3-031-33144-2 (1).pdf
978-3-031-33144-2.pdf
978-3-031-42026-9.pdf
978-3-319-17031-2.pdf
978-3-319-19018-1.pdf
978-3-319-51010-1.pdf
978-3-319-

In [41]:
# Flatten the per-file lists in `data` into one list of document chunks.
documents = [x for sublist in data for x in sublist]
if not documents:
    raise ValueError("No documents were loaded; there is nothing to sample from.")

# To limit the cost of LLM API usage, only 25% of the chunks (at least one)
# are kept for question generation.
import random
# Fixed seed so the same subset (and thus the same knowledge base) is drawn on
# every run; change it to draw a different sample.
SAMPLE_SEED = 42
sample_size = max(1, len(documents) // 4)
smaller_list = random.Random(SAMPLE_SEED).sample(documents, sample_size)
len(smaller_list)

6340

In [42]:
import pandas as pd

# One row per sampled chunk; only the chunk text is kept, in a "text" column.
page_texts = [chunk.page_content for chunk in smaller_list]
df = pd.DataFrame(page_texts, columns=["text"])
df.tail(10)

Unnamed: 0,text
6330,"50. Boisson-Vidal C, Zemani F, Caligiuri G, Ga..."
6331,The same study found that when betaine was dep...
6332,"Cancers 2020 ,12, 107 3 of 17\n2. Materials an..."
6333,"As shown by molecular docking, fucoidan has an..."
6334,Jin et al. Incompleteness and Inaccuracy of ...
6335,"Algae 2020, 35(3): 213-224\nhttps://doi.org/10..."
6336,Challenges and Opportunities\nin Commercializa...
6337,"Mar. Drugs 2015 , 13 92 \n \n86.1%, B06-SP-F3..."
6338,24 S. Kag et al.\nproduction due to its chemic...
6339,"polysaccharides, which is similar to previous ..."


In [43]:
# Wrap the DataFrame of sampled text chunks (`df`) in a giskard KnowledgeBase.
# `knowledge_base` is used below for testset generation and for evaluation.
from giskard.rag import KnowledgeBase
knowledge_base = KnowledgeBase(df)



<h1>Generate testset from knowledge base</h1>

In [44]:
from giskard.rag import generate_testset
from giskard.rag.question_generators import (
    simple_questions,
    distracting_questions,
    complex_questions,
    double_questions,
    situational_questions,
)

# The five question generators the testset is built from.
question_types = [
    simple_questions,
    distracting_questions,
    complex_questions,
    double_questions,
    situational_questions,
]

# Generate 100 question/answer pairs in total from the knowledge base.
testset = generate_testset(
    knowledge_base,
    question_generators=question_types,
    num_questions=100,
    language='en',
    agent_description="An algae research assistant chatbot",
)

2024-05-15 12:00:05,502 pid:14040 MainThread giskard.rag  INFO     Finding topics in the knowledge base.
2024-05-15 12:07:36,295 pid:14040 MainThread giskard.rag  INFO     Found 177 topics in the knowledge base.


Generating questions:   0%|          | 0/100 [00:00<?, ?it/s]



In [45]:
# Persist the generated testset as JSONL; a later cell reloads this file
# instead of generating the questions again.
testset.save("testset_giskard_rag.jsonl")

In [46]:
# Save the knowledge-base DataFrame (the sampled text chunks in `df`) to CSV.
# Note: this is the knowledge base, not the testset.
df.to_csv('knowledge_base_giskard_eval.csv', index=False)

In [22]:
# Resume point: when coming back in a later session, start here and reload the
# previously saved testset instead of generating it again.
from giskard.rag import QATestset
import pandas as pd

loaded_testset = QATestset.load("testset_giskard_rag.jsonl")
# NOTE: this rebinds `df`; from here on it holds the testset, not the
# knowledge-base text chunks.
df = loaded_testset.to_pandas()
# Lift the column-width limit only for this printout so the full question and
# answer are shown, without changing the display settings of later cells.
with pd.option_context('display.max_colwidth', None):
    print(df.iloc[3])

question                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                

In [4]:
# Summary of the testset columns (count / unique / top / freq per column).
df.describe()

Unnamed: 0,question,reference_answer,reference_context,conversation_history,metadata
count,100,100,100,100,100
unique,100,100,99,1,100
top,What is the main argument against the recent n...,The main argument against the recent name chan...,Document 2490: Table :(continued)\n# Original...,[],"{'question_type': 'simple', 'seed_document_id'..."
freq,1,1,2,100,1


<h1>Use testset to evaluate</h1>

In [5]:
# Prepare the Pinecone index that backs the retriever used in the automatic
# evaluation.
from pinecone import Pinecone

# The API key is read from the environment and is never hardcoded or printed:
# a key committed in a notebook (or echoed into a cell output) is leaked to
# everyone who can read the file. Any key that was committed earlier should be
# revoked and replaced.
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
PINECONE_ENV = os.getenv('PINECONE_ENV', 'gcp-starter')
if not PINECONE_API_KEY:
    raise RuntimeError(
        "PINECONE_API_KEY is not set; define it in the environment before running this cell."
    )

# initialize connection to pinecone
api_key = PINECONE_API_KEY

# configure client (created once)
pc = Pinecone(api_key=api_key)
index_name = 'algaeopenai'
index = pc.Index(index_name)
index.describe_index_stats()


[output redacted: Pinecone API key]


{'dimension': 1536,
 'index_fullness': 0.48005,
 'namespaces': {'': {'vector_count': 48005}},
 'total_vector_count': 48005}

In [7]:
from langchain_openai.chat_models import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import ChatPromptTemplate
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain_core.runnables import RunnableParallel, RunnablePassthrough


In [8]:
from langchain.prompts import PromptTemplate

# Prompt for the answering model. It has two placeholders: {context} and
# {question}. The model is told to reply "I don't know" when it cannot answer.
template = """
You are an algae research assistant. Answer the question based on the context below. If you can't answer the question, reply "I don't know".

Context: {context}

Question: {question}
"""

# Template object piped into the chain built in a later cell.
prompt = PromptTemplate.from_template(template)


In [9]:
from langchain_openai.chat_models import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from operator import itemgetter

# Retrieval side: OpenAI embeddings over the existing Pinecone index, returning
# the 3 best-matching entries for each query.
embeddings = OpenAIEmbeddings()
vectorstore = PineconeVectorStore.from_existing_index(index_name, embeddings)
retriever = vectorstore.as_retriever(search_kwargs={'k': 3})

# Generation side: the chat model that answers from the retrieved context.
model = ChatOpenAI(openai_api_key=OPENAI_API_KEY, model=MODEL)

# The chain input is a dict with a "question" key. The question is sent to the
# retriever to fill "context" and is also passed through unchanged.
pick_question = itemgetter("question")
rag_inputs = RunnableParallel(
    context=pick_question | retriever,
    question=pick_question,
)

# inputs -> prompt -> model -> plain string answer
chain = rag_inputs | prompt | model | StrOutputParser()


In [13]:
def answer_fn(question, history=None):
    """Answer one question with the RAG chain.

    Args:
        question: The question text; passed to the chain under the
            "question" key.
        history: Accepted but not used; it is never read here.

    Returns:
        Whatever ``chain.invoke`` returns for this question.
    """
    payload = {"question": question}
    return chain.invoke(payload)

In [None]:
import pandas as pd
# `evaluate` was called without ever being imported, which raised NameError.
from giskard.rag import KnowledgeBase, evaluate
from giskard.rag.metrics.ragas_metrics import ragas_context_recall, ragas_faithfulness

# When the notebook is resumed from the saved testset, the knowledge base has
# not been built in this kernel. Rebuild it from the CSV saved earlier; an
# existing `knowledge_base` from a full run is reused as-is.
if "knowledge_base" not in globals():
    knowledge_base = KnowledgeBase(pd.read_csv('knowledge_base_giskard_eval.csv'))

# `loaded_testset` (reloaded from testset_giskard_rag.jsonl) is used instead of
# the in-memory `testset`, which only exists in the session that generated it.
report = evaluate(
    answer_fn,
    testset=loaded_testset,
    knowledge_base=knowledge_base,
    metrics=[ragas_context_recall, ragas_faithfulness],
)

In [None]:
# Write the evaluation report to an HTML file.
report.to_html("giskard_rag_eval_report.html")