In [2]:
# import libraries
import json
import os
import random
import pandas as pd
from datetime import datetime

# configure
# Directory holding the embedding artefacts; the chunk file is read from it.
embedding_path = "../embeddings/guidance_framework_2/"
chunk_path = embedding_path + 'chunk_data.json'

n = 20                  # number of chunks requested for QA generation
min_chunk_length = 500  # chunks whose text is not longer than this (characters) are dropped

# load chunk data
# Fail fast with the offending path: merely printing a message would let the
# cell continue and die on the filter step with an unrelated NameError.
if not os.path.exists(chunk_path):
    raise FileNotFoundError(f"Chunk data not found: {chunk_path}")
# Explicit encoding so decoding does not depend on the platform default.
with open(chunk_path, 'r', encoding='utf-8') as f:
    chunk_data = json.load(f)

# filter short chunks
chunk_data = [chunk for chunk in chunk_data if len(chunk['text']) > min_chunk_length]

# sample n random chunks
# random.sample raises ValueError when asked for more items than exist, so cap
# the request at the number of chunks that survived the length filter.
sample_size = min(n, len(chunk_data))
if sample_size < n:
    print(f"Only {sample_size} chunks are longer than {min_chunk_length} characters; sampling all of them.")
random_chunks = random.sample(chunk_data, sample_size)

# generate QA pairs
from langchain.chat_models import init_chat_model
# Candidate "provider:model" identifiers; the index below picks the one used for this run.
full_LLM_models = ["openai:gpt-4o-mini", "google_genai:gemini-2.0-flash", "mistralai:ministral-3b-latest"]
LLM = full_LLM_models[1]
chat_model = init_chat_model(LLM)

# One row per sampled chunk: [question, answer, chunk_id, chunk_text].
eval_data = []
for chunk in random_chunks:
    chunk_text = chunk['text']
    chunk_id = chunk['vector_id']

    try:
        # Two calls per chunk: first derive a question from the chunk, then
        # answer that question with the same chunk supplied as context.
        q_prompt = f"Generate a question from the following text:\n\n{chunk_text}\n\nQuestion:"
        question = chat_model.invoke(q_prompt).content
        a_prompt = f"Answer the following question:\n\n{question}\n\nText:\n\n{chunk_text}\n\nAnswer:"
        answer = chat_model.invoke(a_prompt).content
    except Exception as exc:
        # Best effort: stop at the first failed call but keep the pairs already
        # generated. Catching Exception (not a bare except) lets
        # KeyboardInterrupt/SystemExit propagate, and the cause is reported
        # instead of being silently discarded.
        n_questions = len(eval_data)
        print(f"Error after generating {n_questions} questions and answers.")
        print(f"{type(exc).__name__}: {exc}")
        break

    eval_data.append([question, answer, chunk_id, chunk_text])

# May hold fewer rows than sampled chunks when the loop stopped early.
df = pd.DataFrame(eval_data, columns=['question', 'answer', 'chunk_id', 'chunk_text'])

Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..


Error after generating 8 questions and answers.


In [3]:
# view full df
# Ad-hoc inspection of the complete QA DataFrame through xlwings' `view` helper.
# NOTE(review): presumably this opens the frame in a new Excel workbook, so it
# needs a local Excel installation and is not usable headless — confirm.
from xlwings import view
view(df)

In [31]:
# save eval
# Take a single clock reading so the file-name id and the recorded save date
# always describe the same instant (two separate now() calls can straddle a
# second boundary and disagree).
save_time = datetime.now()
timestamp_id = save_time.strftime("%Y%m%d%H%M%S")[2:]  # drop the century: YYMMDDHHMMSS
timestamp = save_time.strftime("%Y-%m-%d %H:%M:%S")

eval_dir = './QA_sets/'
# exist_ok avoids the check-then-create race and is a no-op when the directory exists.
os.makedirs(eval_dir, exist_ok=True)

# The CSV and its metadata share the same timestamp id so they can be paired later.
csv_path = eval_dir + f"{timestamp_id}.csv"
metadata_path = eval_dir + f"{timestamp_id}_metadata.json"

metadata = {
    "embedding_path": embedding_path,
    "n": n,
    # QA generation may stop early, so also record how many pairs were actually saved.
    "n_generated": len(df),
    "min_chunk_length": min_chunk_length,
    "LLM": LLM,
    "save_date": timestamp
}

df.to_csv(csv_path, index=False)
with open(metadata_path, 'w', encoding='utf-8') as f:
    json.dump(metadata, f)