##### Imports Section 

In [231]:
import torch

In [None]:
# Check for GPU
# Report the installed torch version and what CUDA hardware (if any) is visible.
print(torch.__version__)
print(torch.cuda.is_available())
print(torch.cuda.device_count())
# current_device() and get_device_name() raise when no CUDA device can be
# initialised, so they are only queried when CUDA is actually available.
if torch.cuda.is_available():
    print(torch.cuda.current_device())
    print(torch.cuda.get_device_name(0))
else:
    print("No CUDA device available")

2.5.1+cu124
True
1
0
NVIDIA GeForce RTX 3080


In [233]:
import numpy as np
import os
import sys
import matplotlib.pyplot as plt
import random
import scipy.stats as stats

from csv import DictReader
#from tqdm.autonotebook import tqdm
import csv
from datetime import datetime

In [234]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

from langchain_ollama import OllamaEmbeddings
from langchain_ollama import ChatOllama
from langchain_chroma import Chroma

# Prompts
# from langchain_core.prompts import ChatPromptTemplate
from langchain.prompts import PromptTemplate

# from langchain_core.runnables import RunnablePassthrough
from langchain.schema.runnable import RunnableSequence

In [235]:
# Do imports for squad_scoring and prompts
from pathlib import Path

# Two directories are appended to sys.path (paths are relative to the current
# working directory): the parent folder and "../src/deh".
# `utils_folder` is reused for both entries, so after this cell it holds the second path.
utils_folder = Path("..")
sys.path.append(str(utils_folder))

utils_folder = Path("../src/deh")
sys.path.append(str(utils_folder))

# Project-local modules; presumably resolved through the sys.path entries added above.
import squad_scoring
import prompts

##### Set Global variables

In [None]:
# Vector Store Parameters
# Name of the embedding model handed to OllamaEmbeddings; `embeddings` is later
# passed to Chroma as its embedding function.
ollama_embedding_model = "avr/sfr-embedding-mistral"
embeddings = OllamaEmbeddings(model=ollama_embedding_model)
persist_directory = "./chroma_deh_rag_db"
collection_name = "deh_rag"
# When True, the SQuAD contexts are chunked and added to the vector store;
# when False that step is skipped.
VECTORIZE_SQUAD_DATASET = False
# When True, a context is retrieved for every question and written to a CSV file;
# when False the question contexts are read from that CSV file instead.
CREATE_QUESTION_CONTEXTS = False

# LLM Parameters
# These values are passed to the ChatOllama constructor in get_llm().
CHAT_MODEL_NAME = "llama3.1"
MAX_TOKENS = 100
TEMPERATURE = 0.5
TOP_P = 0.95
FREQUENCY_PENALTY = 0.0
PRESENCE_PENALTY = 0.0

# NOTE(review): not referenced by any other code visible in this notebook;
# each experiment selects its prompt through its own "query_prompt_idx".
CURRENT_QUERY_PROMPT_IDX = 0

# Bootstrap Parameters
# SAMPLE_SIZE questions are drawn per round; BOOTSTRAP_CNT rounds are run per experiment.
SAMPLE_SIZE = 30
BOOTSTRAP_CNT = 100

# Experiment Parameters - define all the experiments to run
# "query_prompt_idx" is the index into the `query_prompts` list.
# Entries with "feasible": False are skipped by the experiment loop; they carry no
# "query_prompt_idx", so conduct_experiment() would raise KeyError if one were enabled as-is.
experiments = [{"name": "NO_RAG", "rag": False, "rag_model": None, "query_prompt_idx": 0, "feasible": True},
               {"name": "BASIC_RAG", "rag": True, "rag_model": "basic", "query_prompt_idx": 1, "feasible": True},
               {"name": "BASIC_RAG_DONT_LIE", "rag": True, "rag_model": "basic_dont_lie", "query_prompt_idx": 2, "feasible": True},
               {"name": "BASIC_RAG_HYDE", "rag": True, "rag_model": "basic_hyde", "feasible": False},
               {"name": "BASIC_RAG_MILVUS", "rag": True, "rag_model": "basic_milvus", "feasible": False},
               {"name": "BASIC_RAG_SEMANTIC_CHUNKING", "rag": True, "rag_model": "basic_semantic_chunking", "feasible": False},
               {"name": "BASIC_RAG_SUPPRESS_ANSWSERS", "rag": True, "rag_model": "basic_suppress_answers", "feasible": False},
               {"name": "FULL_RAG", "rag": True, "rag_model": "full", "feasible": False}]


##### Initialize the Vector Store (Chroma; Milvus to be added later)

In [237]:
# Initialize the Chroma vector store
# Collection name, embedding function and persist directory all come from the
# "Vector Store Parameters" in the global-variables cell.
vector_store = Chroma(
    collection_name = collection_name,
    embedding_function = embeddings,
    persist_directory = persist_directory
)

##### Load the SQuAD Dataset

In [238]:
# Load the SQuAD dev set through the project-local squad_scoring helper.
data_file = "./data/dev-v2.0.json"
dataset = squad_scoring.load_dataset(data_file)

# Flat views of the nested dataset:
#   articles - one title per article
#   contexts - one paragraph text per paragraph
#   qas      - one dict per (question, reference answer) pair
articles = []
contexts = []
qas = []

for article in dataset:
    title = article["title"]
    articles.append(title)
    for p in article['paragraphs']:
        context = p["context"]
        contexts.append(context)
        for qa in p['qas']:
            question = qa["question"]
            # One entry is appended per reference answer: a question with several
            # answers appears several times, and a question whose "answers" list
            # is empty contributes no entry at all.
            for a in qa["answers"]:
                answer = a["text"]
                qas.append({"title": title, "context": context, "question": question, "answer": answer})

print(f"#articles in the dataset:     {len(articles)}")
print(f"#contexts in the dataset:   {len(contexts)}")
# len(qas) counts question/answer pairs (see the inner loop above), not distinct questions.
print(f"#questions in the dataset: {len(qas)}")   

#articles in the dataset:     35
#contexts in the dataset:   1204
#questions in the dataset: 20302


##### If VECTORIZE_SQUAD_DATASET --> Chunk the SQuAD dataset and add to vector store

In [239]:
# Optional step: split all SQuAD contexts into overlapping chunks and store
# them in the vector store. Skipped unless VECTORIZE_SQUAD_DATASET is set.
if not VECTORIZE_SQUAD_DATASET:
    print("Not vectorizing the SQuAD dataset...")
else:
    print("Creating contexts for the dataset...")

    # All paragraphs are joined into one text before splitting.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    all_contexts = "\n\n".join(contexts)
    chunks = text_splitter.create_documents([all_contexts])

    print(f"Number of chunks --> {len(chunks)}\n")
    print(chunks[0])

    # Tag every chunk with its origin.
    for chunk in chunks:
        chunk.metadata = {"source": "squad"}

    # Chunk ids are the positional indices rendered as strings.
    ids = list(map(str, range(len(chunks))))
    vector_store.add_documents(documents=chunks, ids=ids)


Not vectorizing the SQuAD dataset...


##### If CREATE_QUESTION_CONTEXTS --> generate question contexts and persist - Else --> Read the question contexts from a .csv file

In [240]:
def persist_contexts(csv_file_path, rows=None):
    """Write question/answer records (including their contexts) to a CSV file.

    Args:
        csv_file_path: Destination path of the CSV file; an existing file is
            overwritten.
        rows: Iterable of dicts providing the keys "title", "context",
            "question" and "answer". Defaults to the notebook-level ``qas``
            list, which is what the single-argument call has always written.
    """
    if rows is None:
        rows = qas

    # Write the list of dictionaries to a CSV file
    with open(csv_file_path, mode="w", newline="", encoding="utf-8") as file:
        # extrasaction="ignore": rows may carry additional keys (for example the
        # "llm_answer" key that generate_llm_answers adds); without it DictWriter
        # raises ValueError on any key that is not listed in fieldnames.
        writer = csv.DictWriter(
            file,
            fieldnames=["title", "context", "question", "answer"],
            extrasaction="ignore",
        )
        writer.writeheader()  # Write the header row
        writer.writerows(rows)  # Write the data rows

    print(f"Data successfully written to {csv_file_path}")

In [241]:
# Location of the CSV file holding one row per qas entry (title, context, question, answer).
csv_file_path = "./data/qas_with_contexts.csv"

if CREATE_QUESTION_CONTEXTS:
    print("Creating question contexts for the dataset and persisting these in a csv file...")
    for i, qa in enumerate(qas):
        if i % 100 == 0:
            print(f"Processing question {i}...")
        question = qa["question"]

        # Replace the original paragraph with the text of the 5 most similar
        # chunks found in the vector store, joined by single spaces.
        top_docs = vector_store.similarity_search(
            query = question,
            k = 5
        )
        qa["context"] = " ".join([top_doc.page_content for top_doc in top_docs])

    persist_contexts(csv_file_path)

else:
    print("Reading qas and question contexts from the csv file...")

    # Read the qas (including contexts) from CSV file.
    # The file is written as UTF-8 with newline="" by persist_contexts, so it is
    # read back the same way; relying on the platform default encoding can
    # mis-decode or fail on non-ASCII text.
    with open(csv_file_path, mode="r", newline="", encoding="utf-8") as file:
        csv_reader = csv.reader(file)
        next(csv_reader, None)  # skip the header row (no-op for an empty file)
        # Columns are read by position, in the order they were written.
        qas = [
            {"title": row[0], "context": row[1], "question": row[2], "answer": row[3]}
            for row in csv_reader
        ]
    

Reading qas and question contexts from the csv file...


##### Define prompts and functions that are needed for experiments

In [242]:
# Prompt templates, selected per experiment through its "query_prompt_idx":
#   index 0 - question only            (used by the NO_RAG experiment)
#   index 1 - context + question       (used by the BASIC_RAG experiment)
#   index 2 - context + question, asks for the literal 'DONT KNOW' when the
#             answer is unknown        (used by the BASIC_RAG_DONT_LIE experiment)
# The templates for index 0 and 1 come from the project-local `prompts` module.
query_prompts = [
    PromptTemplate(
        template=prompts.rag_text_prompts[2],
        input_variables = ["question"]
    ),
    PromptTemplate(
        template=prompts.rag_text_prompts[1],
        input_variables = ["context", "question"]
    ),
    # generate_llm_answers() compares the model output against 'DONT KNOW'
    # and stores an empty answer when it matches.
    PromptTemplate(
        template=("""
                You are an assistant for question-answering tasks.
                Use the following pieces of retrieved context to answer the question.
                If you don't know the answer, just return 'DONT KNOW'. 
                If you know the answer keep it as short and concise as possible,
                i.e. to a maximum of a couple of words.

                Question: {question}
                Context: {context}

                Answer:
                """
        ),
        input_variables=["context", "question"]
    )
]

In [None]:
# Create the llm instance, based on the current query prompt
def get_llm(current_query_prompt):
    """Build the ChatOllama chat model used for an experiment.

    Args:
        current_query_prompt: Kept for backward compatibility with existing
            callers. The prompt is combined with the model when the chain is
            built (see get_runnable_chain), not inside the model object.

    Returns:
        A ChatOllama instance configured from the notebook-level LLM parameters
        (CHAT_MODEL_NAME, MAX_TOKENS, TEMPERATURE, TOP_P).
    """
    llm = ChatOllama(
        model = CHAT_MODEL_NAME,
        # ChatOllama names its output-length cap `num_predict`; the previously
        # used `max_tokens` keyword is not one of its fields, so MAX_TOKENS
        # never limited the generation.
        num_predict = MAX_TOKENS,
        temperature = TEMPERATURE,
        top_p = TOP_P,
        # NOTE(review): the former `prompt_template`, `frequency_penalty`,
        # `presence_penalty` and `gpu_use` keywords are not among ChatOllama's
        # documented parameters either and were dropped - confirm against the
        # installed langchain_ollama version (GPU offload is `num_gpu` there).
    )

    return llm

In [244]:
# Create the runnable chain
def get_runnable_chain(current_query_prompt, llm):
    """Pipe the prompt template into the chat model and return the resulting chain.

    Args:
        current_query_prompt: Prompt template forming the first step.
        llm: Chat model that receives the formatted prompt.

    Returns:
        A RunnableSequence wrapping ``current_query_prompt | llm``.
    """
    return RunnableSequence(current_query_prompt | llm)

In [None]:
# generate the LLM answers, using a runnable chain and a sample of questions
def generate_llm_answers(runnable_chain, qas_sample):
    """Ask the chain every question in the sample and collect the answers.

    Args:
        runnable_chain: Chain that is invoked with a dict holding "context" and
            "question" and returns an object with a ``content`` attribute
            (assumed to be a string - TODO confirm for other model back-ends).
        qas_sample: List of dicts with at least "context" and "question".
            Each dict gets an "llm_answer" key added in place.

    Returns:
        Dict mapping the question id (looked up through
        squad_scoring.get_qid_from_question) to the model's answer. An answer
        that is just the 'DONT KNOW' sentinel is stored as the empty string.
    """
    preds = {}

    sample_size = len(qas_sample)
    print(f"sample_size: {sample_size}\n")

    for qa in qas_sample:
        question = qa["question"]

        response = runnable_chain.invoke({"context": qa["context"], "question": question})
        qid = squad_scoring.get_qid_from_question(question, dataset)

        llm_answer = response.content.strip()

        # The prompt asks for the literal 'DONT KNOW'. Models frequently wrap it
        # in quotes, add a trailing period or change the case, so the comparison
        # ignores surrounding whitespace/quotes/periods and letter case instead
        # of requiring an exact match.
        if llm_answer.strip(" \t\r\n'\"`.").upper() == "DONT KNOW":
            llm_answer = ""

        preds[qid] = llm_answer
        qa["llm_answer"] = llm_answer

    # print(f"\nFinished generating predictions for {sample_size} questions...")
    return preds


In [None]:
#%%capture

# Get the metrics for a set of predictions (preds) that have been generated in a run
def get_squad_metrics(dataset, preds, verbose=False):
    """Score predictions against the dataset.

    Args:
        dataset: Dataset object handed through to squad_scoring.calc_squad_metrics.
        preds: Predictions handed through to squad_scoring.calc_squad_metrics.
        verbose: Accepted but not used by this function.

    Returns:
        Tuple (precision, recall, f1) read from the metrics mapping.
    """
    scores = squad_scoring.calc_squad_metrics(dataset, preds)
    return tuple(scores[metric] for metric in ("precision", "recall", "f1"))


In [None]:
# Utility function to persist a qas sample, including a copy with the LLM answers
def persist_qas_sample_results(qas_sample, run_id, results_folder_name):
    """Write one sample, with the LLM answers, to two CSV files.

    Two files are created inside ``results_folder_name``:
      * ``qas_sample_with_llm_answers_{run_id}.csv`` - all five columns
        (title, context, question, answer, llm_answer);
      * ``qas_sample_with_llm_answers_squad_format_{run_id}.csv`` - only
        question and answer, where "answer" holds the LLM answer.

    Previously both paths were identical, so the second write replaced the
    first file and the full five-column output was lost.

    Args:
        qas_sample: List of dicts with the keys "title", "context", "question",
            "answer" and "llm_answer".
        run_id: Value embedded in both file names.
        results_folder_name: Existing directory that receives the files.
    """
    csv_file_path = f"{results_folder_name}/qas_sample_with_llm_answers_{run_id}.csv"
    csv_file_path_squad_format = f"{results_folder_name}/qas_sample_with_llm_answers_squad_format_{run_id}.csv"

    # Write to CSV
    with open(csv_file_path, mode="w", newline="", encoding="utf-8") as file:
        writer = csv.DictWriter(file, fieldnames=["title", "context", "question", "answer", "llm_answer"])
        writer.writeheader()
        writer.writerows(qas_sample)

    # Reduced view: the question together with the answer the LLM produced.
    qas_sample_squad_format = [{"question": qa["question"], "answer": qa["llm_answer"]} for qa in qas_sample]

    # Write to CSV in SQuAD format
    with open(csv_file_path_squad_format, mode="w", newline="", encoding="utf-8") as file:
        writer = csv.DictWriter(file, fieldnames=["question", "answer"])
        writer.writeheader()
        writer.writerows(qas_sample_squad_format)

    print(f"Data has been written to {csv_file_path}")
    print(f"Data has been written to {csv_file_path_squad_format}")

In [None]:
# Conduct one single experiment (e.g. NO_RAG, BASIC_RAG, etc.)
def conduct_experiment(experiment, persist_answser_samples=False, results_folder_name=None):
    """Run BOOTSTRAP_CNT sampling rounds for one experiment and collect the metrics.

    Each round draws SAMPLE_SIZE entries from the notebook-level ``qas`` list
    (random.sample, i.e. without replacement within a round), lets the LLM
    answer them and scores the answers.

    Args:
        experiment: Dict with at least "name" and "query_prompt_idx" (index into
            ``query_prompts``).
        persist_answser_samples: When True, every round's sample and answers are
            written through persist_qas_sample_results.
        results_folder_name: Folder for the persisted samples; required when
            ``persist_answser_samples`` is True.

    Returns:
        Tuple (precision_l, recall_l, f1_l) with one value per round.

    Raises:
        ValueError: If persisting is requested without a results folder.
        KeyError: If the experiment has no "query_prompt_idx".
    """
    # Fail before any LLM call is made rather than after the first full round.
    if persist_answser_samples and results_folder_name is None:
        raise ValueError("results_folder_name is required when persist_answser_samples is True")

    current_query_prompt = query_prompts[experiment["query_prompt_idx"]]
    print(f"current_query_prompt = {current_query_prompt.template}\n")
    llm = get_llm(current_query_prompt)
    runnable_chain = get_runnable_chain(current_query_prompt, llm)

    # Lists that will contain experiment metrics
    precision_l = []
    recall_l = []
    f1_l = []

    for i in range(BOOTSTRAP_CNT):
        # Single quotes inside the f-string: reusing the enclosing double quote
        # is a syntax error on Python versions before 3.12.
        print(f"{experiment['name']} - Bootstrap: {i + 1} of {BOOTSTRAP_CNT}...")

        # Work on copies so the "llm_answer" key added while answering does not
        # end up in the shared `qas` entries.
        qas_sample = [dict(qa) for qa in random.sample(qas, SAMPLE_SIZE)]
        preds = generate_llm_answers(runnable_chain, qas_sample)
        precision, recall, f1 = get_squad_metrics(dataset, preds)

        precision_l.append(precision)
        recall_l.append(recall)
        f1_l.append(f1)

        if persist_answser_samples:
            persist_qas_sample_results(qas_sample, i, results_folder_name)

    return precision_l, recall_l, f1_l

In [None]:
# Calculate the mean and confidence interval for a list of scores
def calculate_mean_confidence_interval(scores_l, confidence=0.95):
    """Return the mean of the scores and a Student-t confidence interval for it.

    The interval is ``mean +/- t * sem`` where ``sem`` is the standard error of
    the mean (scipy.stats.sem) and ``t`` is the two-sided critical value of the
    t distribution with ``n - 1`` degrees of freedom. This is an interval for
    the *mean* of the scores, not a percentile interval of the scores themselves.

    Args:
        scores_l: Sequence of numeric scores. With fewer than two values the
            standard error is undefined and the interval bounds are nan.
        confidence: Confidence level, strictly between 0 and 1 (default 0.95).

    Returns:
        Tuple ``(mean, (lower, upper))``.

    Raises:
        ValueError: If ``confidence`` is not strictly between 0 and 1.
    """
    if not 0 < confidence < 1:
        raise ValueError("confidence must be strictly between 0 and 1")

    # Calculate mean
    mean = np.mean(scores_l)

    # Calculate the confidence interval
    n = len(scores_l)
    std_err = stats.sem(scores_l)  # Standard error of the mean
    h = std_err * stats.t.ppf((1 + confidence) / 2, n - 1)  # Margin of error
    ci = (mean - h, mean + h)

    return mean, ci

In [None]:
# Generate a histogram for a list of scores and persist it
def generate_histogram(scores_l, mean, ci, results_folder_name, experiment_name):
    """Plot a density histogram of the scores, mark mean and CI, and save it.

    Args:
        scores_l: Scores to plot (the x axis is fixed to the range 0..100).
        mean: Value marked with a dotted red vertical line.
        ci: Pair (lower, upper) marked with dash-dotted orange vertical lines.
        results_folder_name: Existing folder that receives the image.
        experiment_name: Used in the title and in the file name.

    Returns:
        The ``matplotlib.pyplot`` module, so the caller can close the figure.
    """
    # Start from an empty figure. The former `plt.clf` lacked the call
    # parentheses, so it was a no-op and nothing was ever cleared.
    plt.clf()
    plt.hist(scores_l, bins=30, density=True, edgecolor='black', alpha=0.6, color = 'lightblue' ) # color='aquamarine')
    plt.xlim(0, 100)
    plt.title(f"F1-Scores for {experiment_name} - (Bootstraps: {BOOTSTRAP_CNT} - Sample Size: {SAMPLE_SIZE})", fontsize=10)
    plt.xlabel("F1-Score")
    plt.ylabel("Density")

    # Add a vertical line for the mean
    plt.axvline(mean, color='red', linestyle='dotted', linewidth=2, label=f'Mean F1: {round(mean, 2)}')

    # Add vertical lines for the 95% confidence interval
    plt.axvline(ci[0], color='orange', linestyle='dashdot', linewidth=1.5, label='95% CI Lower')
    plt.axvline(ci[1], color='orange', linestyle='dashdot', linewidth=1.5, label='95% CI Upper')

    plt.legend(loc='upper right', fontsize=10)
    # No file extension is given, so the image format is left to matplotlib's default.
    plt.savefig(os.path.join(results_folder_name, f"{experiment_name}_{BOOTSTRAP_CNT}_{SAMPLE_SIZE}"))
    return plt

In [None]:
# Persist the metrics for an experiment in a CSV file
def persist_metrics(results_folder_name, precision_l, recall_l, f1_l, experiment_name):
    """Save the per-round precision, recall and F1 values as CSV rows.

    The file is named ``{experiment_name}_metrics_{BOOTSTRAP_CNT}_{SAMPLE_SIZE}.csv``
    and placed in ``results_folder_name``. The three lists are zipped, so the
    number of data rows equals the length of the shortest list.
    """
    target_path = f"./{results_folder_name}/{experiment_name}_metrics_{BOOTSTRAP_CNT}_{SAMPLE_SIZE}.csv"

    with open(target_path, mode='w', newline='', encoding='utf-8') as out_file:
        metrics_writer = csv.writer(out_file)
        # Header first, then one row per round.
        metrics_writer.writerow(["precision", "recall", "f1"])
        for precision, recall, f1 in zip(precision_l, recall_l, f1_l):
            metrics_writer.writerow([precision, recall, f1])


##### Conduct all experiments

In [252]:
# Remember when the run started; the compact text form (YYYYMMDD_HHMMSS)
# is used to name this run's results folder.
start_timestamp = datetime.now()
start_timestamp_str = f"{start_timestamp:%Y%m%d_%H%M%S}"


In [None]:
# %%capture

# Run every feasible experiment: collect metrics, then persist a histogram of
# the F1 scores and a CSV file with the raw per-round metrics.
for experiment in experiments:
    if not experiment["feasible"]:  # Skip experiments that are not feasible for the moment
        continue

    print("------------------------------------------------------------------------------------------")
    print(f"Conducting experiment: {experiment['name']}")

    # No trailing newline here: it used to become part of the directory name.
    results_folder_name = f"./results/results_{start_timestamp_str}/results_{experiment['name']}"
    os.makedirs(results_folder_name, exist_ok=True)
    print(f"results_folder_name: {results_folder_name}")

    precision_l, recall_l, f1_l = conduct_experiment(experiment) #, persist_answser_samples=True, results_folder_name=results_folder_name)

    # Remove 2.5% from each tail of the F1 scores (precision_l and recall_l are not taken into account).
    # The scores must be sorted first - slicing the list in run order only drops
    # the first and last rounds, not the extreme values. An explicit end index is
    # used because `[k:-k]` yields an empty list when k is 0.
    remove_cnt = int(len(f1_l) * 0.025)
    f1_sorted = sorted(f1_l)
    f1_trimmed = f1_sorted[remove_cnt:len(f1_sorted) - remove_cnt]
    mean, ci = calculate_mean_confidence_interval(f1_trimmed)

    generate_histogram(f1_trimmed, mean, ci, results_folder_name, experiment["name"])
    # Persist the untrimmed F1 list so every row still pairs the precision,
    # recall and F1 of the same round.
    persist_metrics(results_folder_name, precision_l, recall_l, f1_l, experiment["name"])
    plt.close()

------------------------------------------------------------------------------------------
Conducting experiment: NO_RAG
results_folder_name: ./results/results_20241124_231452/results_NO_RAG

current_query_prompt = 
    You are an assistant for question-answering tasks.
    Use ten words maximum and keep the answer concise.

    Question: {question}

    Answer:
    

NO_RAG - Bootstrap: 1 of 100...
sample_size: 30

NO_RAG - Bootstrap: 2 of 100...
sample_size: 30

NO_RAG - Bootstrap: 3 of 100...
sample_size: 30

NO_RAG - Bootstrap: 4 of 100...
sample_size: 30

NO_RAG - Bootstrap: 5 of 100...
sample_size: 30

NO_RAG - Bootstrap: 6 of 100...
sample_size: 30

NO_RAG - Bootstrap: 7 of 100...
sample_size: 30

NO_RAG - Bootstrap: 8 of 100...
sample_size: 30

NO_RAG - Bootstrap: 9 of 100...
sample_size: 30

NO_RAG - Bootstrap: 10 of 100...
sample_size: 30

NO_RAG - Bootstrap: 11 of 100...
sample_size: 30

NO_RAG - Bootstrap: 12 of 100...
sample_size: 30

NO_RAG - Bootstrap: 13 of 100...
sampl

In [None]:
# Calculate the time elapsed for conducting all experiments
# (measured from `start_timestamp`, which is taken in the cell just before the experiment loop).
end_timestamp = datetime.now()

# Subtracting two datetimes yields a timedelta, printed as H:MM:SS.ffffff.
print(f"Elapsed time for conducting all experiments: {end_timestamp - start_timestamp}")

Elapsed time for conducting all experiments: 1:00:07.267766
