In [1]:
import os
# Load KEY=VALUE pairs from the local .env file into os.environ.
# Lines may optionally start with "export " (shell style).
with open(".env", "r") as f:
    for line in f:
        line = line.strip()
        # Blank lines and comments carry no assignment; unpacking them would fail.
        if not line or line.startswith("#"):
            continue
        # Drop only a leading "export " so the value itself is never rewritten.
        if line.startswith("export "):
            line = line[len("export "):]
        # Split on the first "=" only: values (e.g. base64 secrets) may contain "=".
        key, separator, value = line.partition("=")
        if not separator:
            continue
        os.environ[key.strip()] = value.strip()

In [2]:
# Hand-written evaluation set: every entry is a (question, reference_answer) tuple.
# The question is sent to the RAG pipeline and the reference answer is shown to the
# judge model as the answer that earns a score of 5.
# The comment above each group names a file — presumably the rag_data document the
# answers were taken from (TODO confirm).
# NOTE(review): a few strings contain typos ("Authentiication", "concensus"). They are
# left untouched because these strings are fed to the models at runtime.
eval_QAs = [
    # GauntletAI Project 1 - ChatGenius.txt
    ("What features do we need in the Slack clone?", """Authentiication, Real-time messaging, Channel/DM organization, File sharing & search, User presence & status, Thread support, Emoji reactions"""),
    ("When is the first MVP due?", "There is a checkpoint deadline on January 7th, and the full MVP is due January 10th"),
    ("What are some AI features we could add after the MVP?", "AI that talks for the user, incorporate context into the AI, make a voice and video avatar for the user, mirror the user's personality (not an exhaustive list)"),

    # gauntletai.com.txt
    ("What is Gauntlet AI?", "Gauntlet AI is an extremely intensive 12-week AI training to turn engineers into the most sought-after builders and entrepreneurs on the planet"),
    ("How much will Gauntlet graduates get paid?", "$200k/yr"),
    ("Where will Gauntlet grads work when it's done?", "Austin, TX"),
    ("Can Gauntlet grads work remotely when it's done?", "In the vast majority of cases, jobs are in Austin, TX, not remote. There are a few exceptions for European students"),

    # GMT20250106-150124_Recording.cc.vtt
    ("Who are the hiring companies?", "Zax software, Alpha School, and the third has yet to be announced"),
    ("How do we override the concensus view of an LLM?", "We make a brainlift with some Spiky POVs and feed that to the model"),
    ("How can we protect our privacy while using WorkSmart?", "You can turn it off while doing personal stuff, use a separate user on your computer, and/or delete logs that you don't want stored"),

    # GMT20250106-200212_Recording.cc.vtt
    ("What is Alpha School?", "Virtual charter school that recently launched. Location such as Arizona, Austin, Miami, and Brownsville. Aims to fix education. Students have 2 hours of academic per day, and still outperform other schools"),
    ("How is Alpha School different from competitors?", "It's designed for the kids who don't typically love school, and aren't already conscientious learners"),

    # GMT20250107-160241_Recording.cc.vtt
    ("Is there a limit to what we can use on AWS?", "Gauntlet wants to give you control, but please use discretion. For example, don't spin up a ton of EC2 instances"),
    ("What should we do if AI is unable to solve a problem?", "Go to a checkpoint that was working, have AI look at the specific lines causing the issue, and if you keep trying for a while, don't be afraid to ask for help"),

    # GMT20250108-160105_Recording.cc.vtt
    ("What is a Spiky POV?", "A belief that is non-consensus. LLMs would not typically output a Spiky POV because it wasn't in their training data"),
    ("How do we make a BrainLift?", "Follow a specific format in a Workflowy document"),

    # GMT20250108-225810_Recording.cc.vtt
    ("Can we use other cloud services besides AWS?", "Yes, you can use any cloud services. If you want a free option, Gauntlet pays for AWS for you"),

    # GMT20250109-190113_Recording.cc.vtt
    ("What is ZAX Software?", "Zax software is going to buy businesses and run them extremely efficiently using AI. With automation, 1 person can run a whole company"),
    ("Why should someone work for ZAX?", "The compensation will include a profit share, enabling you to possibly make a million dollars a year"),
    ("Why should I build a company in ZAX instead of on my own?", "It's less risky because we pay cash compensation in addition to the profit share"),
]

In [None]:
# Sanity check: number of (question, reference_answer) pairs in the evaluation set.
len(eval_QAs)

In [None]:
# https://huggingface.co/learn/cookbook/en/rag_evaluation

from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import AutoTokenizer

# Splitter configuration. Chunk length is measured by the Hugging Face tokenizer
# named below, and consecutive chunks overlap by a tenth of the chunk size.
chunk_size = 200
tokenizer_name = "sentence-transformers/all-MiniLM-L6-v2"
text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
    AutoTokenizer.from_pretrained(tokenizer_name),
    # Separators are tried in this order: paragraph, line, sentence, word, character.
    separators=["\n\n", "\n", ".", " ", ""],
    chunk_size=chunk_size,
    chunk_overlap=chunk_size // 10,
    strip_whitespace=True,
    add_start_index=True,
)

In [5]:
def parse_vtt(filename, data_dir="rag_data"):
    """Return the caption text of a WebVTT transcript, one caption line per line.

    Captions are found by structure rather than by position: a line containing
    "-->" opens a cue, and every following non-blank line up to the next blank
    line is caption text. The "WEBVTT" header, cue identifiers and timestamps
    are dropped. Multi-line cues and digit-only captions are therefore kept
    intact.

    Args:
        filename: Name of the .vtt file inside ``data_dir``.
        data_dir: Directory holding the transcript. Defaults to "rag_data".

    Returns:
        The caption lines, stripped of surrounding whitespace and joined with "\n".
    """
    # WebVTT files are UTF-8 by specification, so do not rely on the locale default.
    with open(f"{data_dir}/{filename}", "r", encoding="utf-8") as f:
        lines = [line.strip() for line in f]

    caption_lines = []
    inside_cue = False
    for line in lines:
        if not line:
            # A blank line terminates the current cue.
            inside_cue = False
        elif "-->" in line:
            # Timestamp line: the cue payload starts on the next line.
            inside_cue = True
        elif inside_cue:
            caption_lines.append(line)
        # Anything else (header, cue identifier) sits outside a cue and is skipped.
    return "\n".join(caption_lines)
# print(parse_vtt("GMT20250106-150124_Recording.cc.vtt"))

In [None]:
import os

# Read every document in rag_data: .vtt transcripts go through parse_vtt, and
# everything else is read as plain text.
texts = []
# Sorted so the chunk order is reproducible; os.listdir order is arbitrary.
for file in sorted(os.listdir("rag_data")):
    path = os.path.join("rag_data", file)
    # Skip sub-directories and hidden files (e.g. .DS_Store), which are not documents.
    if file.startswith(".") or not os.path.isfile(path):
        continue
    if file.endswith(".vtt"):
        texts.append(parse_vtt(file))
    else:
        # Context manager closes the handle, which the bare open().read() never did.
        with open(path, "r", encoding="utf-8") as f:
            texts.append(f.read())

# Normalise the spellings "Zach's" and "Zach Software" to "Zax", the spelling the
# evaluation questions use.
texts = [
    text.replace("Zach's", "Zax").replace("Zach Software", "Zax Software")
    for text in texts
]

# Split into chunks and keep only the raw text of each chunk.
chunks = [document.page_content for document in text_splitter.create_documents(texts)]

print(len(chunks))

In [None]:
from server.rag import add_documents, similarity_search

# Hand the chunk texts to the project's RAG store so similarity_search can use them.
# NOTE(review): re-running this cell may add the same chunks again — confirm whether
# add_documents de-duplicates.
add_documents(chunks)

### Answer the questions

In [None]:
from server.completion import bedrock_completion
import pandas as pd

# Answer every evaluation question with the RAG pipeline and collect the results
# in `df` (columns: question, reference_answer, llm_answer).

# Set to False to retrieve with the bare question instead of question + HyDE draft.
USE_HYDE = True

# System prompt for the HyDE draft: a confident hypothetical answer that is appended
# to the question before retrieval.
HYDE_SYSTEM_PROMPT = """You are a question answering assistant for Gauntlet AI, an intensive AI training program for engineers.
Answer length MUST be 1 sentence to 1 paragraph in length. Answer questions with a decisive and convincing answer.
Do NOT express uncertainty, NEVER say you don't know something.
"""

# Rows are collected in a list and turned into a DataFrame once; concatenating
# inside the loop re-copies the whole frame on every iteration.
records = []

for question, reference_answer in eval_QAs:
    print(question)

    if USE_HYDE:
        hypothetical_document = bedrock_completion(
            HYDE_SYSTEM_PROMPT,
            [{"role": "user", "content": question}],
            "llama3-2-3b",
        )
        hypothetical_document = f"\n{hypothetical_document}"
    else:
        hypothetical_document = ""
    print(hypothetical_document)

    # Retrieve with the question plus the hypothetical answer, one document per line.
    docs = similarity_search(question + hypothetical_document)
    context = "".join(f"{doc}\n" for doc in docs)

    prompt = f"""### Instructions
You are a question-answering assistant. You will be given a question and context.
For questions involving dates or times, give absolute answers instead of relative answers if possible (e.g. "3pm" instead of "in 2 hours").
Answer the question ONLY using the context. If the context does not contain the answer, say "I don't know".

### Question
{question}

### Context
{context}

### Answer
"""
    llm_answer = bedrock_completion(
        "You are a question-answering assistant.",
        [{"role": "user", "content": prompt}],
        "llama3-3-70b",
    )

    records.append({
        "question": question,
        "reference_answer": reference_answer,
        "llm_answer": llm_answer,
    })

# Explicit columns keep the schema stable even when eval_QAs is empty.
df = pd.DataFrame(records, columns=["question", "reference_answer", "llm_answer"])

### Judge the answers

In [23]:
# Create (or reset) the two judge-output columns. The judging loop fills them row
# by row and skips rows where both are already set.
for judge_column in ("feedback", "score"):
    df[judge_column] = None

In [None]:
import re


def _parse_judge_response(judge_response):
    """Extract ``(feedback, score)`` from a judge completion.

    The expected shape is "Feedback: <text> [RESULT] <1-5>". The "Feedback:"
    prefix is optional (the prompt already ends with "###Feedback:"), and any
    extra text after the score is ignored.

    Raises:
        ValueError: if "[RESULT]" is missing or no integer 1-5 follows it.
    """
    feedback_part, marker, result_part = judge_response.partition("[RESULT]")
    if not marker:
        raise ValueError("judge response does not contain [RESULT]")
    score_match = re.search(r"\b[1-5]\b", result_part)
    if score_match is None:
        raise ValueError("no integer score between 1 and 5 after [RESULT]")
    feedback = feedback_part.strip()
    if feedback.startswith("Feedback:"):
        feedback = feedback[len("Feedback:"):].strip()
    return feedback, int(score_match.group())


# Score every answer against its reference answer with an LLM judge.
# Rows that already have both feedback and a score are skipped, so the cell can be
# re-run to retry only the rows that failed.
# Iterate over index labels (not positions) because df.loc below is label-based.
for i in df.index:
    question = df.at[i, "question"]
    reference_answer = df.at[i, "reference_answer"]
    llm_answer = df.at[i, "llm_answer"]
    feedback = df.at[i, "feedback"] if "feedback" in df.columns else None
    score = df.at[i, "score"] if "score" in df.columns else None
    # Null checks come first so a missing value is never evaluated for truthiness.
    if pd.notna(feedback) and pd.notna(score) and feedback and score:
        continue

    prompt = f"""###Task Description:
You will be given a question, a response to evaluate, a reference answer that gets a score of 5, and a score rubric.
1. Write detailed feedback that assess the quality of the response ONLY based on the given score rubric, not evaluating in general.
2. After writing feedback, write a score that is an integer between 1 and 5.
3. The output format must be: \"Feedback: {{write a feedback for criteria}} [RESULT] {{an integer number between 1 and 5}}\"
4. Follow the output format exactly as I asked for, no more and no less. Be sure to include [RESULT] in your output.

### Question
{question}

### Response to evaluate
{llm_answer}

### Reference Answer (This would earn a score of 5)
{reference_answer}

###Score Rubrics
[Is the response correct, accurate, and factual based on the reference answer?]
Score 1: Completely incorrect, inaccurate, and/or not factual.
Score 2: Mostly incorrect, inaccurate, and/or not factual.
Score 3: Somewhat correct, accurate, and/or factual.
Score 4: Mostly correct, accurate, and factual.
Score 5: Completely correct, accurate, and factual.

###Feedback:"""
    judge_response = bedrock_completion(
        "You are a fair judge.",
        [{"role": "user", "content": prompt}],
        "llama3-3-70b",
    )
    try:
        feedback, score = _parse_judge_response(judge_response)
    except ValueError as e:
        # Leave the row unjudged, rather than storing a half-parsed result, so a
        # re-run retries it.
        print(e)
        print(judge_response)
        print()
        continue
    print(score)
    df.loc[i, "feedback"] = feedback
    df.loc[i, "score"] = score

In [None]:
# Total of the judge scores over all rows; the judge prompt asks for an integer
# between 1 and 5 per answer.
df.score.sum()

In [None]:
# Show every row the judge scored below 3, separated by blank lines, for manual review.
low_scoring_rows = df[df.score < 3]
for row in low_scoring_rows.values:
    print(row, end="\n\n")

In [None]:
from server.completion import bedrock_completion
import random

# for i in range(40):
#     print(i)
# Generate one synthetic (question, answer) pair from a randomly chosen chunk.
# NOTE(review): QA_generation_prompt is not defined in the visible cells — confirm it
# is defined before this cell runs on a fresh kernel.
random_chunk = random.choice(chunks)

qa = bedrock_completion(QA_generation_prompt, [
    {
        "role": "user",
        "content": f"Context: {random_chunk}\n\nOutput:::",
    },
], model_id="llama3-3-70b")

# Parse tolerantly: ignore anything before the "Factoid question:" label, skip blank
# lines, and allow the answer to span several lines. A plain two-value unpack of
# qa.split("\n") fails on all three.
_, question_label, labelled_output = qa.partition("Factoid question:")
qa_body = labelled_output if question_label else qa
qa_lines = [line.strip() for line in qa_body.splitlines() if line.strip()]
if len(qa_lines) < 2:
    raise ValueError(f"Could not find a question and an answer in: {qa!r}")

factoid_question = qa_lines[0]
answer = "\n".join(qa_lines[1:])
if answer.startswith("Answer:"):
    answer = answer[len("Answer:"):].strip()

print(factoid_question)
print()
print(answer)
print()
print(random_chunk)

In [44]:
# Manual override of `answer` before the pair is appended to the dataset —
# presumably a hand-corrected answer for the chunk sampled on a particular run
# (TODO confirm it still matches the current factoid_question).
answer = "Whatever you want on it, for example MongoDB or Postgres"

In [None]:
# Append the sampled question/answer/context triple as a new row and save the frame.
# NOTE(review): these columns ("answer", "context") differ from the evaluation frame's
# ("reference_answer", "llm_answer", ...), so concat fills the gaps with NaN —
# confirm `df` is meant to be the same frame here.
df = pd.concat(
    [
        df,
        pd.DataFrame(
            {
                "question": [factoid_question],
                "answer": [answer],
                "context": [random_chunk],
            }
        ),
    ],
    ignore_index=True,
)
print(len(df))
df.to_csv("rag_evaluation.csv", index=False)