1.) Change chunking strategy

- change chunking --> chunk per article
- recreate SQuAD database

2.) Generate Hyde-based contexts

- all files in one directory: hyde_based_contexts
- each file called hyde_based_contexts.csv
- each file contains:

   - qid
   - question
   - hyde_article
   - hyde_based_context

Currently, files contain the following information:

    - filename : hyde_contexts_<n>
    - question, hyde_context
 
 ==> change strategy for hyde_generated_contexts

 a) read in complete dataset ("title", "context", "qid", "question", "is_impossible", "answer")
 b) read in all hyde_passages that already exist
 c) take a sample of size HYDE_SAMPLE_SIZE from the complete dataset
 d) 
 create new list: hyde_based_contexts (dict elements:qid, question, hyde_article, hyde_based_context)

 for each member of the sample:

   - get question qid and add it
   - add question to dataset
   - if not hyde_article exists:
      generate hyde_article
   - get hyde_based_context
   - add new dict element with: (qid, question, hyde_article, hyde_based_context)

persist new dataset to file hyde_based_contexts_<n>

ALTERNATIVE:

use chain from video course


In [1]:
import numpy as np
import pandas as pd
import os
import sys
import matplotlib.pyplot as plt
import random
import scipy.stats as stats
from csv import DictReader
import csv
from datetime import datetime
import math

In [2]:
# Chunking
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Ollama
from langchain_ollama import OllamaEmbeddings
from langchain_ollama import ChatOllama
from langchain_chroma import Chroma

# Prompts
from langchain.prompts import PromptTemplate

# Runnables
from langchain.schema.runnable import RunnableSequence

In [3]:
# Do imports for squad_scoring and prompts
from pathlib import Path

utils_folder = Path("..")
sys.path.append(str(utils_folder))

utils_folder = Path("../src/deh")
sys.path.append(str(utils_folder))

import squad_scoring
import prompts

##### Set Global variables

In [4]:
# Folders for storing data and the results
DATA_ROOT = "../../../deh_data_results/data"         # Set to your own data folder
RESULTS_ROOT = "../../../deh_data_results/results"   # Set to your own results folder
HYDE_BASED_CONTEXTS_ROOT = F"{DATA_ROOT}/hyde_based_contexts"   # Set to your own hyde-based contexts folder

# Vector Store Parameters
ollama_embedding_model = "avr/sfr-embedding-mistral"
embeddings = OllamaEmbeddings(model=ollama_embedding_model)
persist_directory = f"{DATA_ROOT}/chroma_deh_rag_db"
# NOTE(review): the Chroma store in this notebook is created with the
# hard-coded collection name "deh_rag_per_article", so this value appears
# to be unused there -- confirm which collection is intended.
collection_name = "deh_rag"
VECTORIZE_SQUAD_DATASET = False      # Set to True to vectorize the squad dataset. If False,
                                    # then the documents and their embeddings should already
                                    # exist in the vector store.
CREATE_QUESTION_CONTEXTS = False     # Set to True to create question contexts from the vector store;
                                    # if False, the question contexts are loaded from a csv file.
CREATE_HYDE_CONTEXTS = True         # Set to True to create hyde contexts; if False,
                                    # the hyde contexts are loaded from a csv file.

# LLM Parameters (consumed by get_llm)
CHAT_MODEL_NAME = "llama3.1"
MAX_TOKENS = 100
TEMPERATURE = 0.5
TOP_P = 0.95
FREQUENCY_PENALTY = 0.0
PRESENCE_PENALTY = 0.0

# Index into query_prompts
CURRENT_QUERY_PROMPT_IDX = 0

# Bootstrap Parameters
SAMPLE_SIZE = 1000
BOOTSTRAPS_N = 10000
#TODO check if this code is ok for setting the seed
# NOTE(review): random.seed() returns None, so assigning its result has no
# use; calling random.seed(SEED) on its own line is sufficient.
# SEED = 42
# set_seed = random.seed(SEED)

# Experiment Parameters - define all the experiments to run
# Each entry names an experiment, states whether it uses RAG and which RAG
# variant, and (where present) which entry of query_prompts it uses.
# "conduct" presumably selects the experiments that are actually run; only
# BASIC_RAG_HYDE has it set to True.
# NOTE(review): BASIC_RAG_MILVUS, BASIC_RAG_SEMANTIC_CHUNKING and FULL_RAG
# define no "query_prompt_idx" key -- code reading that key must handle its
# absence.
experiments = [{"name": "NO_RAG", "rag": False, "rag_model": None, "query_prompt_idx": 0, "conduct": False},
               {"name": "BASIC_RAG", "rag": True, "rag_model": "basic", "query_prompt_idx": 1, "conduct": False},
               {"name": "BASIC_RAG_DONT_LIE", "rag": True, "rag_model": "basic_dont_lie", "query_prompt_idx": 2, "conduct": False},
               {"name": "BASIC_RAG_HYDE", "rag": True, "rag_model": "basic_hyde", "query_prompt_idx": 2, "conduct": True},
               {"name": "BASIC_RAG_MILVUS", "rag": True, "rag_model": "basic_milvus", "conduct": False},
               {"name": "BASIC_RAG_SEMANTIC_CHUNKING", "rag": True, "rag_model": "basic_semantic_chunking", "conduct": False},
               {"name": "BASIC_RAG_SUPPRESS_ANSWERS", "rag": True, "rag_model": "basic_suppress_answers", "query_prompt_idx": 2, "conduct": False},
               {"name": "FULL_RAG", "rag": True, "rag_model": "full", "conduct": False}]

PERSIST_ANSWER_SAMPLES = False   # Set to True to persist the llm answers for each sample, for each experiment


##### Define the prompts and a function to get the LLM

In [5]:
# Prompt templates, addressed by position (see "query_prompt_idx" in the
# experiments list and the Hyde generation cell, which uses index 3).
query_prompts = [
    # Index 0: question-only prompt (no "context" input variable); referenced
    # by the NO_RAG experiment.
    PromptTemplate(
        template=prompts.rag_text_prompts[2],
        input_variables = ["question"]
    ),
    # Disabled alternative that took its text from prompts.rag_text_prompts[1]:
    # PromptTemplate(
    #     template=prompts.rag_text_prompts[1],
    #     input_variables = ["context", "question"]
    # ),

    # Index 1: basic RAG prompt -- answer from the retrieved context only,
    # in at most ten words.
    PromptTemplate(
        template = """
    You are an assistant for question-answering tasks.
    Please only use the following pieces of retrieved context to answer the question.
    Use ten words maximum and keep the answer concise.

    Question: {question}
    Context: {context}

    Answer:
    """,
        input_variables = ["context", "question"]
    ),

    # Index 2: RAG prompt that asks the model to return 'DONT KNOW' instead
    # of guessing.
    PromptTemplate(
        template=("""
                You are an assistant for question-answering tasks.
                Use the following pieces of retrieved context to answer the question.
                If you don't know the answer, just return 'DONT KNOW'. 
                If you know the answer, keep it as short and concise as possible,
                i.e. to a maximum of a couple of words.

                Question: {question}
                Context: {context}

                Answer:
                """
        ),
        input_variables=["context", "question"]
    ),
    # Index 3: Hyde prompt (text taken from prompts.hyde_prompts[1]); takes
    # only the question and is used to generate the Hyde article.
    PromptTemplate(
        template = prompts.hyde_prompts[1],
        input_variables = ["question"]
    )
]

In [6]:
# Create the llm instance, based on the current query prompt
def get_llm(current_query_prompt):
    """Create the Ollama chat model configured from the notebook's LLM parameters.

    Args:
        current_query_prompt: The prompt template the model is going to be
            chained with. It is forwarded as ``prompt_template`` exactly as
            before.

    Returns:
        A ``ChatOllama`` instance for ``CHAT_MODEL_NAME``.
    """
    llm = ChatOllama(
        model = CHAT_MODEL_NAME,
        # ChatOllama caps the generation length through ``num_predict``; it
        # documents no ``max_tokens`` field, so MAX_TOKENS passed under that
        # name appears to have had no effect.
        num_predict = MAX_TOKENS,
        temperature = TEMPERATURE,
        top_p = TOP_P,
        # NOTE(review): the keyword arguments below are not among the
        # documented ChatOllama fields and are presumably ignored -- confirm
        # and remove them (the prompt is applied via the runnable chain).
        prompt_template = current_query_prompt,
        frequency_penalty = FREQUENCY_PENALTY,
        presence_penalty = PRESENCE_PENALTY,
        gpu_use = True
    )

    return llm

##### Initialize the Vector Store (Chroma; Milvus to be added later)

In [8]:
# Initialize the Chroma vector store.
# The collection name is hard-coded to the per-article collection; the
# configured `collection_name` variable is deliberately not passed here
# (see the disabled line below).
vector_store = Chroma(
    #collection_name = collection_name,
    collection_name = "deh_rag_per_article",
    embedding_function = embeddings,
    persist_directory = persist_directory
)

##### Load the SQuAD Dataset

In [9]:
data_file = f"{DATA_ROOT}/dev-v2.0.json"
dataset = squad_scoring.load_dataset(data_file)

articles = []   # article titles, one entry per article
contexts = []   # paragraph texts, one entry per paragraph
qas = []        # one record per (question, answer) pair

for article in dataset:
    title = article["title"]
    articles.append(title)
    for p in article['paragraphs']:
        context = p["context"]
        contexts.append(context)
        for qa in p['qas']:
            question = qa["question"]
            # Named `qid` rather than `id` so the builtin id() is not shadowed
            # for the rest of the notebook.
            qid = qa["id"]
            is_impossible = qa["is_impossible"]
            # Impossible questions list their texts under "plausible_answers",
            # answerable ones under "answers"; the record layout is the same.
            answer_key = "plausible_answers" if is_impossible else "answers"
            for a in qa[answer_key]:
                answer = a["text"]
                qas.append({"title": title, "context": context, "qid": qid, "question": question,
                            "is_impossible": is_impossible, "answer": answer})

# Store dataset as a csv file.
# newline='' is what the csv module expects for files it writes, and the
# explicit UTF-8 encoding keeps non-ASCII text independent of the platform
# default encoding.
csv_file = f"{DATA_ROOT}/dev-v2.0.csv"
with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
    fieldnames = ['title', 'context', 'qid', 'question', 'is_impossible', 'answer']
    writer = csv.DictWriter(file, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(qas)

# De-duplicate while keeping first-seen order. A set of strings iterates in an
# order that can differ between interpreter runs (hash randomization), which
# would make any later random sampling irreproducible even with a fixed seed.
unique_questions = list(dict.fromkeys(qa["question"] for qa in qas))
unique_qas = [dict(t) for t in dict.fromkeys(tuple(d.items()) for d in qas)]

# Note: len(qas) counts (question, answer) records, not distinct questions.
print(f"#articles in the dataset:     {len(articles)}")
print(f"#contexts in the dataset:   {len(contexts)}")
print(f"#questions in the dataset: {len(qas)}")   
print(f"#unique questions in the dataset: {len(unique_questions)}")
print(f"#unique qas in the dataset: {len(unique_qas)}")

#articles in the dataset:     35
#contexts in the dataset:   1204
#questions in the dataset: 26232
#unique questions in the dataset: 11849
#unique qas in the dataset: 16209


##### If configured, chunk the SQuAD dataset and add the chunks to the vector store

In [10]:
if not VECTORIZE_SQUAD_DATASET:
    print("Not vectorizing the squad dataset...")
else:
    # Chunk every article as a whole (all of its paragraphs joined together)
    # and store the chunks in the vector store under consecutive string ids.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    article_contexts_lens = []
    curr_id = 0

    for article in dataset:
        title = article["title"]

        # Concatenate the article's paragraphs, separated by blank lines.
        article_contexts = [p["context"] for p in article['paragraphs']]
        all_article_contexts = "\n\n".join(article_contexts)

        article_chunks = text_splitter.create_documents([all_article_contexts])
        for article_chunk in article_chunks:
            article_chunk.metadata = {"source": "squad", "article": title}

        # Ids continue where the previous article stopped.
        next_id = curr_id + len(article_chunks)
        ids = [str(i) for i in range(curr_id, next_id)]
        curr_id = next_id

        vector_store.add_documents(documents=article_chunks, ids=ids)
     

Not vectorizing the squad dataset...


##### Hyde-based Contexts

Hyde:

- if configured: generate Hyde-based contexts and then persist
- else: read the Hyde-based contexts from a .csv file


In [11]:
# Load the SQuAD dev set again. These two statements repeat the load done in
# the "Load the SQuAD Dataset" cell, presumably so that the Hyde section can
# be run without re-executing that cell.
data_file = f"{DATA_ROOT}/dev-v2.0.json"
dataset = squad_scoring.load_dataset(data_file)

In [12]:
def persist_hyde_based_contexts(csv_file_path, hyde_baased_contexts):
    """Write the Hyde-based context records to a CSV file.

    The file is (over)written with a header row followed by one row per
    record, and a confirmation message is printed afterwards.

    Args:
        csv_file_path: Destination path of the CSV file.
        hyde_baased_contexts: Iterable of dicts with the keys "qid",
            "question", "hyde_article" and "hyde_based_context".
    """
    columns = ["qid", "question", "hyde_article", "hyde_based_context"]

    with open(csv_file_path, mode="w", newline="", encoding="utf-8") as out_file:
        csv_writer = csv.DictWriter(out_file, fieldnames=columns)
        csv_writer.writeheader()
        for record in hyde_baased_contexts:
            csv_writer.writerow(record)

    print(f"Data successfully written to {csv_file_path}")


In [13]:
# hyde_based_contexts_path = f"{HYDE_BASED_CONTEXTS_ROOT}/hyde_contexts_2_MASTER_COPY_UNBEDINGT_BEHALTEN.csv"

# hyde_based_contexts = []
# with open(hyde_based_contexts_path, mode='r') as file:
#     csv_reader = csv.reader(file)
    
#     for i, row in enumerate(csv_reader):
#         if i == 0:
#             continue

#         hyde_based_contexts.append({"question": row[0], "hyde_context": row[1]})
        
# #print(f"#hyde_based_contexts before removing duplicates: {len(hyde_based_contexts)}")
# hyde_based_contexts_unique  = list({frozenset(item.items()): item for item in hyde_based_contexts}.values())
# #print(f"#hyde_based_contexts before removing duplicates: {len(hyde_based_contexts_unique)}")


In [14]:
# new_hyde_based_contexts = []

# for i, hcb in enumerate(hyde_based_contexts_unique):

#     if i %100 == 0:
#         print(f"Processing question {i}...")

#     question = hcb["question"]
    
#     top_docs = vector_store.similarity_search(
#         query = question,
#         k = 5
#     )
#     hyde_based_context = " ".join([top_doc.page_content for top_doc in top_docs])

#     new_hyde_based_context = {}
#     new_hyde_based_context["qid"] = squad_scoring.get_qid_from_question(hcb["question"], dataset)
#     new_hyde_based_context["question"] = question
#     new_hyde_based_context["hyde_article"] = hcb["hyde_context"]
#     new_hyde_based_context["hyde_based_context"] = hyde_based_context
#     new_hyde_based_contexts.append(new_hyde_based_context)

# print(f"#new_hyde_based_contexts: {len(new_hyde_based_contexts)}")


In [15]:
# hyde_based_contexts_persist_path = f"{HYDE_BASED_CONTEXTS_ROOT}/hyde_based_contexts.csv"
# persist_hyde_based_contexts(hyde_based_contexts_persist_path, new_hyde_based_contexts)

In [16]:
hyde_based_contexts_path = f"{HYDE_BASED_CONTEXTS_ROOT}/hyde_based_contexts.csv"

# Load the Hyde-based contexts persisted by earlier runs.
# The file is written as UTF-8 with newline="" by persist_hyde_based_contexts,
# so it is read back the same way instead of relying on the platform default
# encoding.
hyde_based_contexts = []
with open(hyde_based_contexts_path, mode='r', newline='', encoding='utf-8') as file:
    csv_reader = csv.reader(file)
    next(csv_reader, None)   # skip the header row (no-op for an empty file)

    for row in csv_reader:
        if not row:
            # csv.reader yields [] for blank lines; indexing them would fail.
            continue
        hyde_based_contexts.append({"qid": row[0], "question": row[1],
                                    "hyde_article": row[2], "hyde_based_context": row[3]})

# Bookkeeping of what has already been generated, used to skip repeated work.
qids_already_processed = [hbc["qid"] for hbc in hyde_based_contexts]
questions_already_processed = [hbc["question"] for hbc in hyde_based_contexts]

In [18]:
# Names of datasets : unique_questions, unique_qas
# Draw HYDE_SAMPLE_SIZE random records from unique_qas and generate a
# Hyde-based context for every sampled question that has not been processed yet.
HYDE_SAMPLE_SIZE = 50
sample = random.sample(unique_qas, HYDE_SAMPLE_SIZE)

current_query_prompt = query_prompts[3]
llm = get_llm(current_query_prompt)
runnable_chain = RunnableSequence(current_query_prompt | llm)

# Set copy of the processed questions for constant-time membership tests.
# It is updated inside the loop because unique_qas can hold the same question
# several times (once per distinct answer), so one sample may repeat a question.
seen_questions = set(questions_already_processed)

hyde_based_contexts_persist_path = f"{HYDE_BASED_CONTEXTS_ROOT}/hyde_based_contexts.csv"

try:
    for i, qa in enumerate(sample):
        print(f"Processing question {i}...")

        question = qa["question"]
        if question in seen_questions:
            print(f"Question {question} already processed. Skipping...")
            continue
        qid = qa["qid"]

        # Generate the Hyde article (hypothetical passage answering the question)
        response = runnable_chain.invoke({"question": question})
        hyde_based_article = response.content

        # Retrieve the context with the Hyde article as the query. Querying
        # with the raw question would make this identical to plain retrieval
        # and leave the generated article unused.
        # NOTE(review): rows persisted by earlier runs may have been retrieved
        # with the question as the query -- consider regenerating them.
        top_docs = vector_store.similarity_search(
            query = hyde_based_article,
            k = 5
        )
        hyde_based_context = " ".join([top_doc.page_content for top_doc in top_docs])

        new_hyde_based_context = {
            "qid": qid,
            "question": question,
            "hyde_article": hyde_based_article,
            "hyde_based_context": hyde_based_context,
        }
        hyde_based_contexts.append(new_hyde_based_context)

        # Keep all bookkeeping structures in sync with the new record.
        seen_questions.add(question)
        questions_already_processed.append(question)
        qids_already_processed.append(qid)
finally:
    # Persist whatever has been generated so far, even if an LLM or vector
    # store call failed part-way, so finished (expensive) articles are kept.
    persist_hyde_based_contexts(hyde_based_contexts_persist_path, hyde_based_contexts)

Processing question 0...
Question Upon what chemical characteristic is oxygen's solubility dependent? already processed. Skipping...
Processing question 1...
Processing question 2...
Processing question 3...
Processing question 4...
Processing question 5...
Processing question 6...
Processing question 7...
Processing question 8...
Processing question 9...
Processing question 10...
Processing question 11...
Processing question 12...
Question What is an example of a problem to which effective algorithms have provided a solution in spite of the intractability associated with the breadth of sizes? already processed. Skipping...
Processing question 13...
Question What does it mean for a knot to be considered indecomposable? already processed. Skipping...
Processing question 14...
Processing question 15...
Question How was this possible  already processed. Skipping...
Processing question 16...
Processing question 17...
Question What is the largest suspension bridge in Germany? already proces

In [None]:
def get_hyde_contexts_already_done(csv_file_path, suffixes_l):
    """Collect the Hyde contexts stored in a family of suffixed CSV files.

    For each suffix in ``suffixes_l`` the file ``<base><suffix>.csv`` is read,
    where ``<base>`` is ``csv_file_path`` without its extension (for example
    ``hyde_contexts.csv`` with suffix ``_1`` gives ``hyde_contexts_1.csv``).
    The first row of every file is treated as a header and skipped; blank
    rows are ignored.

    Args:
        csv_file_path: Base path from which the per-suffix file names are
            derived.
        suffixes_l: Iterable of suffix strings, one per file to read.

    Returns:
        A list of dicts with the keys "question" (first column) and
        "hyde_context" (second column), in file and row order.

    Raises:
        FileNotFoundError: If one of the derived files does not exist.
        IndexError: If a non-blank row has fewer than two columns.
    """
    # if not os.path.exists(csv_file_path):
    #     return []

    # Strip the extension properly instead of assuming it is 4 characters long.
    base_path, _ = os.path.splitext(csv_file_path)

    hyde_contexts_already_done = []
    for suffix in suffixes_l:
        csv_file_path_iter = f"{base_path}{suffix}.csv"
        print(f"Reading hyde contexts from a csv file: {csv_file_path_iter}")

        # newline='' is what the csv module expects; UTF-8 is explicit so that
        # decoding does not depend on the platform default encoding.
        with open(csv_file_path_iter, mode='r', newline='', encoding='utf-8') as file:
            csv_reader = csv.reader(file)
            next(csv_reader, None)   # skip the header row
            hyde_contexts_already_done.extend(
                {"question": row[0], "hyde_context": row[1]}
                for row in csv_reader
                if row   # csv.reader yields [] for blank lines
            )
    print(f"len(hyde_contexts_already_done) --> {len(hyde_contexts_already_done)}")
    return hyde_contexts_already_done

In [None]:
# Base path; the loader derives the actual file names from it by inserting
# each suffix before ".csv" (here: hyde_contexts_1.csv and hyde_contexts_2.csv).
csv_file_path = f"{DATA_ROOT}/hyde_contexts.csv"

hyde_contexts_already_done = get_hyde_contexts_already_done(csv_file_path, ['_1', '_2'])
# Only the question texts, for checking which questions already have a Hyde context.
hyde_questions_already_done = [hc["question"] for hc in hyde_contexts_already_done]
# Show the first 100 questions as a quick sanity check.
print(hyde_questions_already_done[:100])

In [None]:
# CREATE_HYDE_CONTEXTS = False
# print(DATA_ROOT)
# if CREATE_HYDE_CONTEXTS:
#     print(f"Creating Hyde contexts for the dataset and persisting these in a csv file...")
#     unique_questions = list(set([qa["question"] for qa in qas]))
#     print(f"Number of unique questions: {len(unique_questions)}")

#     current_query_prompt = query_prompts[3]
#     llm = get_llm(current_query_prompt)
#     runnable_chain = RunnableSequence(current_query_prompt | llm)

#     for i, question in enumerate(unique_questions[:3000]):
#         #if i %10 == 0:
#         print("---------------------------------------------------------------")
#         print(f"Processing question {i}...")
#         print(f"Question: {question}")
#         if question in hyde_questions_already_done:
#             print("Question already done...")
#             continue
#         response = runnable_chain.invoke({"question": question})
#         hyde_contexts_already_done.append({"question": question, "hyde_context": response.content})
#         hyde_questions_already_done.append(question)
#         print("")
    
#     csv_file_path = f"{DATA_ROOT}/hyde_contexts_2.csv"
#     persist_hyde_contexts(csv_file_path, hyde_contexts_already_done)
#     hyde_contexts = hyde_contexts_already_done

# else:
#     print(f"Reading the Hyde contexts from a csv file...")

#     hyde_contexts = []
#     #with open(csv_file_path, mode='r') as file:
#     with open(f"{DATA_ROOT}/hyde_contexts_2_MASTER_COPY_UNBEDINGT_BEHALTEN.csv", mode='r') as file:
#         csv_reader = csv.reader(file)
        
#         for i, row in enumerate(csv_reader):
#             if i == 0:
#                 continue
#             hyde_contexts.append({"question": row[0], "hyde_context": row[1]})


In [None]:
# hyde_s = set([hyde_context["question"] for hyde_context in hyde_contexts])
# len(hyde_s)