##### Imports Section 

In [31]:
# Check for GPU: report the PyTorch build and, when CUDA is usable, the visible device
import torch

print(torch.__version__)
print(torch.cuda.is_available())
print(torch.cuda.device_count())

# current_device() and get_device_name() raise when no CUDA device can be
# initialised, so they are only queried when CUDA is reported as available.
if torch.cuda.is_available():
    print(torch.cuda.current_device())
    print(torch.cuda.get_device_name(0))
else:
    print("No CUDA device available")

2.5.1+cu124
True
1
0
NVIDIA GeForce RTX 3080


In [32]:
import numpy as np
import os
import sys
import matplotlib.pyplot as plt
import random
import scipy.stats as stats

from csv import DictReader
#from tqdm.autonotebook import tqdm
import csv
from datetime import datetime
import math

In [33]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

from langchain_ollama import OllamaEmbeddings
from langchain_ollama import ChatOllama
from langchain_chroma import Chroma

# Prompts
# from langchain_core.prompts import ChatPromptTemplate
from langchain.prompts import PromptTemplate

# from langchain_core.runnables import RunnablePassthrough
from langchain.schema.runnable import RunnableSequence

In [34]:
# Do imports for squad_scoring and prompts.
# Both ".." and "../src/deh" (relative to the notebook's working directory) are
# appended to sys.path before the two project-local modules are imported.
from pathlib import Path

utils_folder = Path("..")
sys.path.append(str(utils_folder))

# utils_folder is reused for the second search-path entry
utils_folder = Path("../src/deh")
sys.path.append(str(utils_folder))

# NOTE(review): presumably both modules live in one of the two folders added above - confirm
import squad_scoring
import prompts

##### Set Global variables

In [35]:
# Folders for storing data and the results
DATA_ROOT = "../../../deh_data_results/data"         # Set to your own data folder
RESULTS_ROOT = "../../../deh_data_results/results"   # Set to your own results folder

# Vector Store Parameters
ollama_embedding_model = "avr/sfr-embedding-mistral"
embeddings = OllamaEmbeddings(model=ollama_embedding_model)   # embedding function handed to the Chroma vector store
persist_directory = f"{DATA_ROOT}/chroma_deh_rag_db"
collection_name = "deh_rag"
VECTORIZE_SQUAD_DATASET = False     # Set to True to vectorize the squad dataset. If False,
                                    # then the documents and their embeddings should already
                                    # exist in the vector store.
CREATE_QUESTION_CONTEXTS = False    # Set to True to create question contexts; if False,
                                    # the question contexts are loaded from a csv file.
CREATE_HYDE_CONTEXTS = True         # Set to True to create hyde contexts; if False,
                                    # the hyde contexts are loaded from a csv file.

# LLM Parameters (passed to ChatOllama in get_llm)
CHAT_MODEL_NAME = "llama3.1"
MAX_TOKENS = 100
TEMPERATURE = 0.5
TOP_P = 0.95
FREQUENCY_PENALTY = 0.0
PRESENCE_PENALTY = 0.0

# NOTE(review): not referenced by any other cell visible in this notebook - confirm whether it is still needed
CURRENT_QUERY_PROMPT_IDX = 0

# Bootstrap Parameters
SAMPLE_SIZE = 10      # number of qas records drawn per bootstrap iteration
BOOTSTRAP_CNT = 20    # number of bootstrap iterations per experiment

# Experiment Parameters - define all the experiments to run
#   name             : label used in log output and in result file/folder names
#   rag / rag_model  : descriptive fields; not read by the code visible in this notebook
#   query_prompt_idx : index into query_prompts, read by conduct_experiment
#   conduct          : only experiments with conduct=True are executed
# NOTE: BASIC_RAG_MILVUS, BASIC_RAG_SEMANTIC_CHUNKING and FULL_RAG have no
# "query_prompt_idx"; conduct_experiment indexes that key directly, so they
# need one before "conduct" is switched to True.
experiments = [{"name": "NO_RAG", "rag": False, "rag_model": None, "query_prompt_idx": 0, "conduct": False},
               {"name": "BASIC_RAG", "rag": True, "rag_model": "basic", "query_prompt_idx": 1, "conduct": False},
               {"name": "BASIC_RAG_DONT_LIE", "rag": True, "rag_model": "basic_dont_lie", "query_prompt_idx": 2, "conduct": False},
               {"name": "BASIC_RAG_HYDE", "rag": True, "rag_model": "basic_hyde", "query_prompt_idx": 2, "conduct": True},
               {"name": "BASIC_RAG_MILVUS", "rag": True, "rag_model": "basic_milvus", "conduct": False},
               {"name": "BASIC_RAG_SEMANTIC_CHUNKING", "rag": True, "rag_model": "basic_semantic_chunking", "conduct": False},
               {"name": "BASIC_RAG_SUPPRESS_ANSWERS", "rag": True, "rag_model": "basic_suppress_answers", "query_prompt_idx": 2, "conduct": False},
               {"name": "FULL_RAG", "rag": True, "rag_model": "full", "conduct": False}]

PERSIST_ANSWER_SAMPLES = False   # Set to True to persist the llm answers for each sample, for each experiment


##### Define the prompts and a function to get the LLM

In [36]:
# Prompt templates, addressed by position:
#   0 - question only (taken from prompts.rag_text_prompts[2])
#   1 - context + question (taken from prompts.rag_text_prompts[1])
#   2 - context + question, with the instruction to answer 'DONT KNOW' when unsure
#   3 - question only, used to generate the Hyde contexts (prompts.hyde_prompts[1])
# The experiments select an entry through their "query_prompt_idx".
query_prompts = [
    PromptTemplate(
        template=prompts.rag_text_prompts[2],
        input_variables = ["question"]
    ),
    PromptTemplate(
        template=prompts.rag_text_prompts[1],
        input_variables = ["context", "question"]
    ),
    PromptTemplate(
        template=("""
                You are an assistant for question-answering tasks.
                Use the following pieces of retrieved context to answer the question.
                If you don't know the answer, just return 'DONT KNOW'. 
                If you know the answer keep it as short and concise as possible,
                i.e. to a maximum of a couple of words.

                Question: {question}
                Context: {context}

                Answer:
                """
        ),
        input_variables=["context", "question"]
    ),
    PromptTemplate(
        template = prompts.hyde_prompts[1],
        input_variables = ["question"]
    )
]

In [37]:
# Create the llm instance used together with the current query prompt
def get_llm(current_query_prompt):
    """Build the ChatOllama chat model from the notebook's LLM parameters.

    Args:
        current_query_prompt: Prompt the model is going to be chained with.
            It is kept for interface compatibility only: the prompt is applied
            by the runnable chain (prompt | llm), not by the model object.

    Returns:
        A ChatOllama instance for CHAT_MODEL_NAME.
    """
    llm = ChatOllama(
        model = CHAT_MODEL_NAME,
        # ChatOllama limits the generated length through `num_predict`; the
        # previously used `max_tokens` is not one of its fields, so MAX_TOKENS
        # never took effect.
        num_predict = MAX_TOKENS,
        temperature = TEMPERATURE,
        top_p = TOP_P,
        # NOTE(review): these two are forwarded unchanged; confirm that the
        # installed langchain_ollama version actually honours them.
        frequency_penalty = FREQUENCY_PENALTY,
        presence_penalty = PRESENCE_PENALTY,
    )

    return llm

##### Initialize the Vector Store (Chroma; Milvus to be added later)

In [38]:
# Initialize the Chroma vector store.
# The collection is opened in persist_directory and uses the Ollama embeddings
# configured in the global-variables cell.
vector_store = Chroma(
    collection_name = collection_name,
    embedding_function = embeddings,
    persist_directory = persist_directory
)

##### Load the SQuAD Dataset

In [39]:
# Load the SQuAD dev set and flatten it into article titles, paragraph contexts
# and question/answer records.
data_file = f"{DATA_ROOT}/dev-v2.0.json"
dataset = squad_scoring.load_dataset(data_file)

articles = []   # one title per article
contexts = []   # one entry per paragraph
qas = []        # one record per (question, answer) pair

for article in dataset:
    title = article["title"]
    articles.append(title)
    for p in article['paragraphs']:
        context = p["context"]
        contexts.append(context)
        for qa in p['qas']:
            question = qa["question"]
            # A record is appended per answer: a question with several answers
            # appears several times, a question with an empty answer list is
            # not added at all.
            for a in qa["answers"]:
                answer = a["text"]
                qas.append({"title": title, "context": context, "question": question, "answer": answer})

# NOTE: the last figure is len(qas), i.e. the number of question/answer records,
# not the number of distinct questions.
print(f"#articles in the dataset:     {len(articles)}")
print(f"#contexts in the dataset:   {len(contexts)}")
print(f"#questions in the dataset: {len(qas)}")

#articles in the dataset:     35
#contexts in the dataset:   1204
#questions in the dataset: 20302


##### If configured, chunk the SQuAD dataset and add to the vector store

In [40]:
# Chunk the paragraph contexts and store them in the vector store (only when configured)
if not VECTORIZE_SQUAD_DATASET:
    print("Not vectorizing the SQuAD dataset...")
else:
    print("Creating contexts for the dataset...")

    # All paragraphs are joined into one text, which is then cut into
    # overlapping chunks.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    all_contexts = "\n\n".join(contexts)
    chunks = text_splitter.create_documents([all_contexts])

    print(f"Number of chunks --> {len(chunks)}\n")
    print(chunks[0])

    # Tag every chunk with its origin (each chunk gets its own metadata dict)
    for chunk in chunks:
        chunk.metadata = {"source": "squad"}

    # The ids are the chunk positions, as strings
    ids = list(map(str, range(len(chunks))))
    vector_store.add_documents(documents=chunks, ids=ids)


Not vectorizing the SQuAD dataset...


##### Contexts

Questions:

- if configured: generate the question contexts and then persist them
- else: read the questions and their question contexts from a .csv file

Hyde:

- if configured: generate the Hyde contexts and then persist them
- else: read the Hyde contexts from a .csv file


In [41]:
def persist_question_contexts(csv_file_path):
    """Write the global ``qas`` records, including their question contexts, to a CSV file.

    Args:
        csv_file_path: Destination path; an existing file is overwritten.
    """
    columns = ["title", "context", "question", "answer", "question_context"]

    with open(csv_file_path, mode="w", newline="", encoding="utf-8") as csv_file:
        dict_writer = csv.DictWriter(csv_file, fieldnames=columns)
        dict_writer.writeheader()
        # One CSV row per qas record
        for record in qas:
            dict_writer.writerow(record)

    print(f"Data successfully written to {csv_file_path}")


In [42]:
def persist_hyde_contexts(csv_file_path, hyde_contexts):
    """Write Hyde contexts to a CSV file with the columns ``question`` and ``hyde_context``.

    Args:
        csv_file_path: Destination path; an existing file is overwritten.
        hyde_contexts: Iterable of dicts with the keys "question" and "hyde_context".
    """
    columns = ["question", "hyde_context"]

    with open(csv_file_path, mode="w", newline="", encoding="utf-8") as csv_file:
        dict_writer = csv.DictWriter(csv_file, fieldnames=columns)
        dict_writer.writeheader()
        # One CSV row per Hyde context
        for entry in hyde_contexts:
            dict_writer.writerow(entry)

    print(f"Data successfully written to {csv_file_path}")


In [43]:
# Create the question contexts (vector-store retrieval) or load them from the csv file
csv_file_path = f"{DATA_ROOT}/qas_with_question_contexts.csv"

if CREATE_QUESTION_CONTEXTS:
    print("Creating question contexts for the dataset and persisting these in a csv file...")
    for i, qa in enumerate(qas):
        if i % 100 == 0:
            print(f"Processing question {i}...")
        question = qa["question"]

        # The question context is the text of the 5 most similar chunks, joined by spaces
        top_docs = vector_store.similarity_search(
            query = question,
            k = 5
        )
        qa["question_context"] = " ".join(top_doc.page_content for top_doc in top_docs)

    persist_question_contexts(csv_file_path)

else:
    print("Reading qas and question contexts from a csv file...")

    # Read the qas (including contexts) from a CSV file
    qas = []

    # The file is written as utf-8 with newline=""; reading it the same way avoids
    # decoding errors where the platform default encoding is not utf-8 and keeps
    # line breaks embedded in quoted fields intact.
    with open(csv_file_path, mode="r", newline="", encoding="utf-8") as file:
        csv_reader = csv.reader(file)
        next(csv_reader, None)   # skip the header row

        for row in csv_reader:
            qas.append({"title": row[0], "context": row[1], "question": row[2], "answer": row[3],
                        "question_context": row[4]})
    

Reading qas and question contexts from a csv file...


In [None]:
def get_hyde_contexts_already_done(csv_file_path):
    """Load the Hyde contexts stored in a CSV file.

    Args:
        csv_file_path: Path of a CSV file with a header row followed by rows
            whose first two columns are the question and its Hyde context.

    Returns:
        A list of ``{"question": ..., "hyde_context": ...}`` dicts, or an
        empty list when the file does not exist.
    """
    if not os.path.exists(csv_file_path):
        return []

    hyde_contexts_already_done = []
    # persist_hyde_contexts writes the file as utf-8 with newline=""; reading it the
    # same way avoids decoding errors where the platform default encoding is not
    # utf-8 and keeps line breaks inside quoted fields intact.
    with open(csv_file_path, mode="r", newline="", encoding="utf-8") as file:
        csv_reader = csv.reader(file)
        next(csv_reader, None)   # skip the header row

        for row in csv_reader:
            if not row:   # csv.reader yields [] for blank lines; skip them
                continue
            hyde_contexts_already_done.append({"question": row[0], "hyde_context": row[1]})
    return hyde_contexts_already_done

In [45]:
# Create the Hyde contexts (resuming from the csv file) or load them from the csv file
csv_file_path = f"{DATA_ROOT}/hyde_contexts.csv"

# Hyde contexts persisted by earlier runs; generation below continues from these
hyde_contexts_already_done = get_hyde_contexts_already_done(csv_file_path)
hyde_questions_already_done = [hc["question"] for hc in hyde_contexts_already_done]

if CREATE_HYDE_CONTEXTS:
    print("Creating Hyde contexts for the dataset and persisting these in a csv file...")
    # NOTE: a set has no defined order, so which questions end up in the
    # slice processed below can differ between kernel sessions.
    unique_questions = list(set([qa["question"] for qa in qas]))
    print(f"Number of unique questions: {len(unique_questions)}")

    current_query_prompt = query_prompts[3]
    llm = get_llm(current_query_prompt)
    runnable_chain = RunnableSequence(current_query_prompt | llm)

    # Set copy for constant-time membership tests; the list is kept in sync
    # because later cells read hyde_questions_already_done.
    done_questions = set(hyde_questions_already_done)

    try:
        for i, question in enumerate(unique_questions[:400]):
            print("---------------------------------------------------------------")
            print(f"Processing question {i}...")
            print(f"Question: {question}")
            if question in done_questions:
                print("Question already done...")
                continue
            response = runnable_chain.invoke({"question": question})
            hyde_contexts_already_done.append({"question": question, "hyde_context": response.content})
            hyde_questions_already_done.append(question)
            done_questions.add(question)
            print("")
    finally:
        # Persist whatever has been generated so far, also when the loop is
        # interrupted or an LLM call fails, so that the next run can resume.
        persist_hyde_contexts(csv_file_path, hyde_contexts_already_done)

    hyde_contexts = hyde_contexts_already_done

else:
    print("Reading the Hyde contexts from a csv file...")

    hyde_contexts = []
    # Same encoding / newline handling as used when the file is written
    with open(csv_file_path, mode="r", newline="", encoding="utf-8") as file:
        csv_reader = csv.reader(file)
        next(csv_reader, None)   # skip the header row

        for row in csv_reader:
            hyde_contexts.append({"question": row[0], "hyde_context": row[1]})


Creating Hyde contexts for the dataset and persisting these in a csv file...
Number of unique questions: 5926
---------------------------------------------------------------
Processing question 0...
Question: What country did the Normans invade in 1169?
Question already done...
---------------------------------------------------------------
Processing question 1...
Question: The price of oil is usually a stable commodity until when?
Question already done...
---------------------------------------------------------------
Processing question 2...
Question: What area of Brookhaven is still known for its high levels of crime?
Question already done...
---------------------------------------------------------------
Processing question 3...
Question: Who led the North American Huguenot colonial expedition?
Question already done...
---------------------------------------------------------------
Processing question 4...
Question: When was he elected by Nixon?
Question already done...
----------

##### Define functions that are needed for experiments

In [16]:
# Build the runnable chain for a prompt and an llm
def get_runnable_chain(current_query_prompt, llm):
    """Return a RunnableSequence that pipes ``current_query_prompt`` into ``llm``."""
    return RunnableSequence(current_query_prompt | llm)

In [17]:
# Look up the Hyde context for a question
# (linear scan over hyde_contexts; a question -> context dictionary would be faster)
def get_hyde_context(question):
    """Return the Hyde context of the first ``hyde_contexts`` entry for ``question``, or None."""
    matching_contexts = (
        entry["hyde_context"] for entry in hyde_contexts if entry["question"] == question
    )
    return next(matching_contexts, None)

In [18]:
# generate the LLM answers, using a runnable chain and the sample of questions provided
def generate_llm_answers(runnable_chain, qas_sample, hyde=False):
    """Ask the LLM every question of a sample and collect the answers.

    Args:
        runnable_chain: Chain invoked with ``{"context": ..., "question": ...}``;
            its response must expose the answer text as ``.content``.
        qas_sample: List of qas records. Each record gets an "llm_answer"
            entry added in place.
        hyde: When True the context is looked up with get_hyde_context();
            otherwise the context stored in the record is used.

    Returns:
        Dict mapping the question id (resolved through
        squad_scoring.get_qid_from_question) to the LLM answer. A 'DONT KNOW'
        reply is recorded as an empty string.
    """
    preds = {}

    sample_size = len(qas_sample)
    print(f"sample_size: {sample_size}")

    for qa in qas_sample:

        question = qa["question"]
        if hyde:
            context = get_hyde_context(question)
        else:
            # The retrieved context is stored under "question_context" when the
            # qas records are created or loaded; "vector_store_context" is only
            # kept as a fallback for records that still use that key.
            context = qa.get("question_context", qa.get("vector_store_context"))

        response = runnable_chain.invoke({"context": context, "question": question})

        qid = squad_scoring.get_qid_from_question(question, dataset)

        # Tolerate surrounding whitespace, quotes and a trailing period, so that
        # replies such as "'DONT KNOW'" or "Dont know." count as declined too.
        normalized_reply = response.content.strip().strip("'\".").strip().upper()
        if normalized_reply == "DONT KNOW":
            llm_answer = ""
        else:
            llm_answer = response.content

        preds[qid] = llm_answer
        qa["llm_answer"] = llm_answer

    return preds


In [19]:
#%%capture

# Compute precision, recall and F1 for the predictions (preds) generated in a run
def get_squad_metrics(dataset, preds, verbose=False):
    """Return ``(precision, recall, f1)`` as computed by squad_scoring.calc_squad_metrics.

    Args:
        dataset: Dataset passed through to the scoring function.
        preds: Predictions passed through to the scoring function.
        verbose: Accepted but not used.
    """
    metrics = squad_scoring.calc_squad_metrics(dataset, preds)
    return tuple(metrics[key] for key in ("precision", "recall", "f1"))


In [20]:
# Utility function to persist a qas sample, including a copy with the LLM answers
def persist_qas_sample_results(qas_sample, run_id, results_folder_name):
    """Persist a qas sample with its LLM answers as two CSV files.

    Files written into ``results_folder_name``:
      * ``qas_sample_with_llm_answers_{run_id}.csv`` with the columns
        title, context, question, answer, llm_answer
      * ``qas_sample_with_llm_answers_squad_format_{run_id}.csv`` with the
        columns question, answer (the answer being the LLM answer)

    Args:
        qas_sample: List of qas records that already carry an "llm_answer".
        run_id: Identifier of the run, used in the file names.
        results_folder_name: Existing folder the files are written to.
    """
    csv_file_path = f"{results_folder_name}/qas_sample_with_llm_answers_{run_id}.csv"
    # The SQuAD-format copy needs its own file name; with identical names the
    # second write replaced the first file.
    csv_file_path_squad_format = f"{results_folder_name}/qas_sample_with_llm_answers_squad_format_{run_id}.csv"

    # Write to CSV
    with open(csv_file_path, mode="w", newline="", encoding="utf-8") as file:
        # The records carry more keys than are persisted here (e.g.
        # "question_context"); ignore those instead of raising ValueError.
        writer = csv.DictWriter(file, fieldnames=["title", "context", "question", "answer", "llm_answer"],
                                extrasaction="ignore")
        writer.writeheader()
        writer.writerows(qas_sample)

    qas_sample_squad_format = [{"question": qa["question"], "answer": qa["llm_answer"]} for qa in qas_sample]
    # Write to CSV in SQuAD format
    with open(csv_file_path_squad_format, mode="w", newline="", encoding="utf-8") as file:
        writer = csv.DictWriter(file, fieldnames=["question", "answer"])
        writer.writeheader()
        writer.writerows(qas_sample_squad_format)

    print(f"Data has been written to {csv_file_path}")
    print(f"Data has been written to {csv_file_path_squad_format}")

In [21]:
# Number of qas records whose question already has a Hyde context
sum(1 for qa in qas if qa["question"] in hyde_questions_already_done)

4183

In [22]:
# Conduct one single experiment (e.g. NO_RAG, BASIC_RAG, etc.)
def conduct_experiment(experiment, persist_answer_samples=False, results_folder_name=None):
    """Run the bootstrap iterations of one experiment and collect its metrics.

    Each of the BOOTSTRAP_CNT iterations draws SAMPLE_SIZE qas records, lets
    the LLM answer them and scores the answers.

    Args:
        experiment: Experiment definition; "name" is required and
            "query_prompt_idx" selects the entry of query_prompts.
        persist_answer_samples: When True, every sample is persisted together
            with its LLM answers.
        results_folder_name: Folder for the persisted samples.

    Returns:
        Three lists (precision, recall, f1) with one value per iteration.
    """
    # NOTE(review): experiments defined without "query_prompt_idx" fall back to
    # CURRENT_QUERY_PROMPT_IDX instead of raising KeyError - confirm that this
    # is the intended default.
    query_prompt_idx = experiment.get("query_prompt_idx", CURRENT_QUERY_PROMPT_IDX)
    current_query_prompt = query_prompts[query_prompt_idx]
    print(f"current_query_prompt = {current_query_prompt.template}\n")
    llm = get_llm(current_query_prompt)
    runnable_chain = get_runnable_chain(current_query_prompt, llm)

    # Lists that will contain experiment metrics
    precision_l = []
    recall_l = []
    f1_l = []

    # !! For now, we are only using those questions that already have a Hyde context
    # TODO: Use all questions in the dataset once we have Hyde contexts for all questions
    # A set gives constant-time membership tests for the filter below.
    hyde_questions = set(hyde_questions_already_done)
    temp_qas = [qa for qa in qas if qa["question"] in hyde_questions]
    print(f"Number of questions with Hyde context in the dataset: {len(temp_qas)}")

    use_hyde = experiment["name"] == "BASIC_RAG_HYDE"
    suppress_empty_answers = experiment["name"] == "BASIC_RAG_SUPPRESS_ANSWERS"

    for i in range(BOOTSTRAP_CNT):
        # Single quotes inside the f-string keep this valid before Python 3.12
        print(f"{experiment['name']} - Bootstrap: {i + 1} of {BOOTSTRAP_CNT}...")

        qas_sample = random.sample(temp_qas, SAMPLE_SIZE)
        preds = generate_llm_answers(runnable_chain, qas_sample, hyde=use_hyde)

        # For experiment BASIC_RAG_SUPPRESS_ANSWERS, we need to remove the LLM answers
        # from preds that are empty strings.
        if suppress_empty_answers:
            print("Removing empty LLM answers...")
            preds = {k: v for k, v in preds.items() if v != ""}

        precision, recall, f1 = get_squad_metrics(dataset, preds)

        precision_l.append(precision)
        recall_l.append(recall)
        f1_l.append(f1)

        if persist_answer_samples:
            persist_qas_sample_results(qas_sample, i, results_folder_name)

    return precision_l, recall_l, f1_l

In [23]:
# Mean and 95% confidence interval for a list of scores
# TODO: Check if this calculation is correct !!
def calculate_mean_confidence_interval(scores_l):
    """Return the mean of ``scores_l`` and a 95% confidence interval around it.

    The interval is ``mean +/- 1.96 * standard error``, where the standard
    error is the sample standard deviation (ddof=1) divided by sqrt(n).

    Args:
        scores_l: Sequence of numeric scores.

    Returns:
        ``(mean, (lower, upper))``.
    """
    n = len(scores_l)
    mean = np.mean(scores_l)

    # Standard error of the mean, scaled by the fixed factor 1.96
    standard_error = np.std(scores_l, ddof=1) / np.sqrt(n)
    margin_of_error = 1.96 * standard_error

    lower_bound = mean - margin_of_error
    upper_bound = mean + margin_of_error
    return mean, (lower_bound, upper_bound)

In [24]:
# Generate a histogram for a list of scores and persist it
def generate_histogram(scores_l, mean, ci, results_folder_name, experiment_name):
    """Plot the score distribution with its mean and confidence interval and save it.

    Args:
        scores_l: Scores to plot (F1 scores on a 0-100 scale, per the axis limits).
        mean: Mean of the scores, drawn as a dotted red line.
        ci: ``(lower, upper)`` bounds, drawn as dash-dotted orange lines.
        results_folder_name: Folder the figure is saved to.
        experiment_name: Used in the title and in the file name.

    Returns:
        The pyplot module, so that the caller can close the figure.
    """
    # Clear the current figure. The call parentheses were missing before, so the
    # figure was never cleared and a reused figure kept its earlier plots.
    plt.clf()
    plt.hist(scores_l, bins=30, density=True, edgecolor='black', alpha=0.6, color='lightblue')
    plt.xlim(0, 100)
    plt.title(f"F1-Scores for {experiment_name} - (Bootstraps: {BOOTSTRAP_CNT} - Sample Size: {SAMPLE_SIZE})", fontsize=10)
    plt.xlabel("F1-Score")
    plt.ylabel("Density")

    # Add a vertical line for the mean
    plt.axvline(mean, color='red', linestyle='dotted', linewidth=2, label=f'Mean F1: {round(mean, 2)}')

    # Add vertical lines for the 95% confidence interval
    plt.axvline(ci[0], color='orange', linestyle='dashdot', linewidth=1.5, label='95% CI Lower')
    plt.axvline(ci[1], color='orange', linestyle='dashdot', linewidth=1.5, label='95% CI Upper')

    plt.legend(loc='upper right', fontsize=10)
    # No file extension is given, so the image format is matplotlib's savefig default
    plt.savefig(os.path.join(results_folder_name, f"{experiment_name}_{BOOTSTRAP_CNT}_{SAMPLE_SIZE}"))
    return plt

In [25]:
# Persist the metrics for an experiment in a CSV file
def persist_metrics(results_folder_name, precision_l, recall_l, f1_l, experiment_name):
    """Write the per-iteration metrics of an experiment to a CSV file.

    The file ``{experiment_name}_metrics_{BOOTSTRAP_CNT}_{SAMPLE_SIZE}.csv`` is
    created in ``results_folder_name`` with the columns precision, recall, f1.

    Args:
        results_folder_name: Existing folder (relative or absolute).
        precision_l, recall_l, f1_l: Metric lists that are combined row by row.
            zip() stops at the shortest list, so they should have equal length.
        experiment_name: Used in the file name.
    """
    # os.path.join instead of a "./" prefix: the prefix turned an absolute
    # results folder into a non-existent relative path.
    csv_file_path = os.path.join(
        results_folder_name,
        f"{experiment_name}_metrics_{BOOTSTRAP_CNT}_{SAMPLE_SIZE}.csv",
    )
    with open(csv_file_path, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["precision", "recall", "f1"])        # header
        writer.writerows(zip(precision_l, recall_l, f1_l))   # one row per iteration


##### Conduct all experiments

In [26]:
# Remember when the experiment run started
start_timestamp = datetime.now()

# Compact textual form of the start time (YYYYMMDD_HHMMSS)
start_timestamp_str = f"{start_timestamp:%Y%m%d_%H%M%S}"


In [27]:
# %%capture

# Run every experiment that is marked with conduct=True and persist its results
for experiment in experiments:
    if not experiment["conduct"]:  # Skip experiments that should not be done for the moment
        continue

    print("------------------------------------------------------------------------------------------")
    print(f"Conducting experiment: {experiment['name']}")

    # The folder name must not end with a newline character; the line break
    # belongs to the log output only.
    results_folder_name = f"{RESULTS_ROOT}/results_{start_timestamp_str}/results_{experiment['name']}"
    os.makedirs(results_folder_name)
    print(f"results_folder_name: {results_folder_name}\n")

    precision_l, recall_l, f1_l = conduct_experiment(experiment,
                                                     persist_answer_samples = PERSIST_ANSWER_SAMPLES,
                                                     results_folder_name = results_folder_name)

    # Remove 2.5% from each tail of the F1 scores (precision_l and recall_l are not taken into account).
    # The same number of values is dropped at both ends; the explicit end index
    # also stays correct when remove_cnt is 0.
    remove_cnt = math.ceil(BOOTSTRAP_CNT * 0.025)  # take ceiling for very low values of BOOTSTRAP_CNT
    f1_sorted = sorted(f1_l)
    f1_trimmed = f1_sorted[remove_cnt:len(f1_sorted) - remove_cnt]
    mean, ci = calculate_mean_confidence_interval(f1_trimmed)

    histogram_plt = generate_histogram(f1_trimmed, mean, ci, results_folder_name, experiment["name"])
    # Persist the untrimmed, unsorted F1 scores so that every CSV row holds the
    # precision, recall and F1 of the same bootstrap iteration.
    persist_metrics(results_folder_name, precision_l, recall_l, f1_l, experiment["name"])
    histogram_plt.close()

------------------------------------------------------------------------------------------
Conducting experiment: BASIC_RAG_HYDE
results_folder_name: ../../../deh_data_results/results/results_20241125_231741/results_BASIC_RAG_HYDE

current_query_prompt = 
                You are an assistant for question-answering tasks.
                Use the following pieces of retrieved context to answer the question.
                If you don't know the answer, just return 'DONT KNOW'. 
                If you know the answer keep it as short and concise as possible,
                i.e. to a maximum of a couple of words.

                Question: {question}
                Context: {context}

                Answer:
                

Number of questions with Hyde context in the dataset: 4183
BASIC_RAG_HYDE - Bootstrap: 1 of 200...
sample_size: 30
BASIC_RAG_HYDE - Bootstrap: 2 of 200...
sample_size: 30
BASIC_RAG_HYDE - Bootstrap: 3 of 200...
sample_size: 30
BASIC_RAG_HYDE - Bootstrap: 4 of 200...

In [28]:
# Report how long conducting all experiments took
end_timestamp = datetime.now()
elapsed_time = end_timestamp - start_timestamp

print(f"Elapsed time for conducting all experiments: {elapsed_time}")

Elapsed time for conducting all experiments: 0:42:36.092618


0