In [None]:
%%capture
# Colab setup: install the Gemini client and fetch the two result files.
# `%pip` installs into the environment of the running kernel.
# The URLs are quoted so that the `?` in `?raw=1` is never treated as a shell
# glob character.
# NOTE(review): the package version is not pinned; consider pinning it once the
# version used for the recorded results is known.
%pip install google-generativeai
!wget -O llm-results-prompt.csv "https://github.com/spencer18001/llm_zoomcamp_project_2024/blob/main/llm-results-prompt.csv?raw=1"
!wget -O llm-results-prompt2.csv "https://github.com/spencer18001/llm_zoomcamp_project_2024/blob/main/llm-results-prompt2.csv?raw=1"

In [None]:
import json
from tqdm.auto import tqdm
import pandas as pd
from google.colab import userdata
import google.generativeai as genai

In [None]:
# Authenticate the Gemini client with the key stored under 'GEMINI_API_KEY' in
# Colab's userdata, then create the model handle used by the evaluation loops.
genai.configure(api_key=userdata.get('GEMINI_API_KEY'))
model = genai.GenerativeModel(model_name='gemini-1.5-flash')

In [None]:
# Judge prompt filled in with `str.format`: `{question}` and `{answer}` are the
# placeholders, while the doubled braces render as literal `{` / `}` around the
# JSON skeleton. The leading backslash keeps the text free of a leading newline.
prompt_template = """\
You are an expert evaluator for a Retrieval-Augmented Generation (RAG) system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks (not including json quotes):

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}"""

In [None]:
# Read the answers produced with the first prompt and also keep them as a list
# of per-row dicts, which is the shape the evaluation loop iterates over.
df_results_prompt = pd.read_csv("llm-results-prompt.csv")
results_prompt = df_results_prompt.to_dict("records")
df_results_prompt

Unnamed: 0,question,answer,document
0,Why did the narrator's companion lock the door...,According to context: The narrator's companion...,44
1,What was the sound that the narrator heard aft...,After hearing Holmes make something resembling...,47
2,What is stored in the safe?,"According to the context provided, Dr. Grimesb...",110
3,Who is the speaker threatening and why?,The speaker is addressing Sherlock Holmes befo...,77
4,Why would the man in the story need to stand o...,The man in the story needed to stand on a chai...,159
5,What is the source of the woman's fear and ter...,The woman's fear and terror stem from hearing ...,11
6,Are they referring to the windows of a specifi...,"Based on the CONTEXT provided from ""The Advent...",116
7,Who was the half-pay major of marines that the...,"According to the original text ""When Dr. Roylo...",35
8,What did Holmes hit with such force that cause...,The CONTEXT does not provide specific informat...,143
9,How did the lady react to the man's statement?,Given that there was no direct dialogue or exp...,14


In [None]:
# Raw model replies keyed by document id. It is initialised in its own cell so
# that re-running the evaluation loop keeps what was already collected: the
# loop skips every document id that is already present here.
evaluations_prompt = {}

In [None]:
# Ask the model to grade each record that has no stored reply yet; the raw
# reply text is kept under the record's document id.
# NOTE(review): records that share a document id are evaluated only once.
harm_categories = (
    "HARM_CATEGORY_DANGEROUS_CONTENT",
    "HARM_CATEGORY_SEXUALLY_EXPLICIT",
    "HARM_CATEGORY_HATE_SPEECH",
    "HARM_CATEGORY_HARASSMENT",
)

for record in tqdm(results_prompt):
    doc_id = record["document"]
    if doc_id not in evaluations_prompt:
        prompt = prompt_template.format(**record)
        # Every listed harm category is sent with the BLOCK_NONE threshold.
        response = model.generate_content(
            prompt,
            safety_settings=[
                {"category": category, "threshold": "BLOCK_NONE"}
                for category in harm_categories
            ],
        )
        evaluations_prompt[doc_id] = response.text
len(evaluations_prompt)

  0%|          | 0/20 [00:00<?, ?it/s]

20

In [None]:
# Spot-check one raw reply: document id 110 is the "What is stored in the safe?"
# row of the results table shown earlier.
evaluations_prompt[110]

'{\n  "Relevance": "NON_RELEVANT",\n  "Explanation": "The generated answer describes the contents of Dr. Roylott\'s chamber and mentions the safe, but it explicitly states that there is no information about what\'s inside the safe. This makes the answer non-relevant as it doesn\'t provide any information about the contents of the safe."\n} \n'

In [None]:
# Decode every raw model reply into a Python object, one entry per document.
#
# The prompt asks for bare JSON, but a reply wrapped in a Markdown code fence or
# surrounded by extra text would make a plain `json.loads` abort the whole cell.
# When strict parsing fails, the outermost `{...}` span is tried instead; if
# that fails too, the error names the offending document id.
json_evaluations_prompt = []
for doc_id, str_eval in evaluations_prompt.items():
    try:
        json_eval = json.loads(str_eval)
    except json.JSONDecodeError:
        start, end = str_eval.find("{"), str_eval.rfind("}")
        # An empty fragment (no usable braces) fails below with a clear message.
        fragment = str_eval[start:end + 1] if 0 <= start < end else ""
        try:
            json_eval = json.loads(fragment)
        except json.JSONDecodeError as exc:
            raise json.JSONDecodeError(
                f"evaluation for document {doc_id!r} is not valid JSON ({exc.msg})",
                exc.doc,
                exc.pos,
            ) from exc
    json_evaluations_prompt.append(json_eval)
len(json_evaluations_prompt)

20

In [None]:
# One row per parsed evaluation; the columns come from the keys of each entry.
df_evaluations_prompt = pd.DataFrame(data=json_evaluations_prompt)

In [None]:
# Number of answers per relevance label for the first prompt.
df_evaluations_prompt["Relevance"].value_counts()

Unnamed: 0_level_0,count
Relevance,Unnamed: 1_level_1
NON_RELEVANT,8
RELEVANT,6
PARTLY_RELEVANT,6


In [None]:
# Read the answers produced with the second prompt and keep them as a list of
# per-row dicts for the evaluation loop.
df_results_prompt2 = pd.read_csv("llm-results-prompt2.csv")
results_prompt2 = df_results_prompt2.to_dict("records")

In [None]:
# Raw model replies for the second prompt, keyed by document id. It is
# initialised in its own cell so that re-running the evaluation loop keeps what
# was already collected: the loop skips document ids already present here.
evaluations_prompt2 = {}

In [None]:
# Ask the model to grade each second-prompt record that has no stored reply
# yet; the raw reply text is kept under the record's document id.
# NOTE(review): records that share a document id are evaluated only once.
harm_categories = (
    "HARM_CATEGORY_DANGEROUS_CONTENT",
    "HARM_CATEGORY_SEXUALLY_EXPLICIT",
    "HARM_CATEGORY_HATE_SPEECH",
    "HARM_CATEGORY_HARASSMENT",
)

for record in tqdm(results_prompt2):
    doc_id = record["document"]
    if doc_id not in evaluations_prompt2:
        prompt = prompt_template.format(**record)
        # Every listed harm category is sent with the BLOCK_NONE threshold.
        response = model.generate_content(
            prompt,
            safety_settings=[
                {"category": category, "threshold": "BLOCK_NONE"}
                for category in harm_categories
            ],
        )
        evaluations_prompt2[doc_id] = response.text

  0%|          | 0/20 [00:00<?, ?it/s]

In [None]:
# Decode every raw second-prompt reply into a Python object, one per document.
#
# The prompt asks for bare JSON, but a reply wrapped in a Markdown code fence or
# surrounded by extra text would make a plain `json.loads` abort the whole cell.
# When strict parsing fails, the outermost `{...}` span is tried instead; if
# that fails too, the error names the offending document id.
json_evaluations_prompt2 = []
for doc_id, str_eval in evaluations_prompt2.items():
    try:
        json_eval = json.loads(str_eval)
    except json.JSONDecodeError:
        start, end = str_eval.find("{"), str_eval.rfind("}")
        # An empty fragment (no usable braces) fails below with a clear message.
        fragment = str_eval[start:end + 1] if 0 <= start < end else ""
        try:
            json_eval = json.loads(fragment)
        except json.JSONDecodeError as exc:
            raise json.JSONDecodeError(
                f"evaluation for document {doc_id!r} is not valid JSON ({exc.msg})",
                exc.doc,
                exc.pos,
            ) from exc
    json_evaluations_prompt2.append(json_eval)
len(json_evaluations_prompt2)

20

In [None]:
# One row per parsed evaluation; the columns come from the keys of each entry.
df_evaluations_prompt2 = pd.DataFrame(data=json_evaluations_prompt2)

In [None]:
# Number of answers per relevance label for the second prompt.
df_evaluations_prompt2["Relevance"].value_counts()

Unnamed: 0_level_0,count
Relevance,Unnamed: 1_level_1
NON_RELEVANT,11
PARTLY_RELEVANT,5
RELEVANT,4
