In [1]:
# Install minsearch
# NOTE(review): the URL points at the `main` branch, so the downloaded module is not pinned
# to a specific revision — consider pinning to a commit for reproducibility.
!wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py

--2024-10-08 02:02:17--  https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3832 (3.7K) [text/plain]
Saving to: ‘minsearch.py’


2024-10-08 02:02:17 (41.1 MB/s) - ‘minsearch.py’ saved [3832/3832]



In [2]:
# Import required modules.
import os
import json
import random
import minsearch
import pandas as pd
from openai import OpenAI
from tqdm.auto import tqdm
from dotenv import load_dotenv
load_dotenv()

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/codespace/.config/sagemaker/config.yaml


True

## Ingestion

In [3]:
# Import the data for the project.
df = pd.read_csv('../data/data.csv')

In [4]:
# Show the head of df.
df.head()

Unnamed: 0,id,question,answer,focus,source
0,0,What is (are) Glaucoma?,Glaucoma is a group of diseases that can damag...,Glaucoma,NIHSeniorHealth
1,1,What causes Glaucoma?,"Nearly 2.7 million people have glaucoma, a lea...",Glaucoma,NIHSeniorHealth
2,2,What are the symptoms of Glaucoma?,Symptoms of Glaucoma Glaucoma can develop in ...,Glaucoma,NIHSeniorHealth
3,3,What are the treatments for Glaucoma?,"Although open-angle glaucoma cannot be cured, ...",Glaucoma,NIHSeniorHealth
4,4,Who is at risk for Glaucoma?,Anyone can develop glaucoma. Some people are a...,Glaucoma,NIHSeniorHealth


In [5]:
# Create the documents for analysis.
documents = df.to_dict(orient='records')

In [6]:
# Index the documents.
# Configure the minsearch index with three text fields and `id` as the only keyword field.
# NOTE(review): the `df.head()` output recorded in this notebook shows a column named
# `focus`, not `focus_area` — confirm the field names match the columns of data.csv.
index = minsearch.Index(
    text_fields=['answer', 'source', 'focus_area'],
    keyword_fields=['id']
)

In [7]:
# Fit the index on the documents.
index.fit(documents)

<minsearch.Index at 0x743b33c4fc50>

## RAG Flow

In [8]:
# Extract the key.
# Read the API key from the process environment; the value is None when the variable is unset.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

In [9]:
# Create the client.
# Pass the key read above explicitly so the dependency on OPENAI_API_KEY is visible here
# instead of relying on the client's implicit environment lookup.
client = OpenAI(api_key=OPENAI_API_KEY)

In [10]:
# Define search function.
def search(query):
    """Return up to 10 matches for `query` from the global index, with no filters or boosts."""
    return index.search(
        query=query,
        filter_dict={},
        boost_dict={},
        num_results=10,
    )

In [11]:
# Create the prompt_template.
# Answer-generation prompt; `{question}` and `{context}` are filled in via str.format.
prompt_template = "\n".join([
    "You're an excellent medical assistant. Answer the QUESTION based on the CONTEXT from our medical database.",
    "Use only the facts from the CONTEXT when answering the QUESTION.",
    "",
    "QUESTION: {question}",
    "CONTEXT: {context}",
])

In [12]:
# Create the entry_template.
# One context entry per retrieved document. Each label is paired with the field it names
# (previously `focus:` was filled with the source and `source:` with the focus area).
entry_template = """
answer: {answer}
focus: {focus_area}
source: {source}
""".strip()

In [13]:
# Define build_prompt function.
def build_prompt(query, search_results):
    """Render every search result with `entry_template` and embed them in `prompt_template`.

    Each rendered entry is followed by a blank line; the finished prompt is stripped of
    leading and trailing whitespace before being returned.
    """
    rendered_entries = [entry_template.format(**doc) + "\n\n" for doc in search_results]
    filled = prompt_template.format(question=query, context="".join(rendered_entries))
    return filled.strip()

In [14]:
# Define llm function.
def llm(prompt, model='gpt-4o-mini'):
    """Send `prompt` as a single user message to `model` and return the first choice's content."""
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(model=model, messages=[user_message])
    first_choice = completion.choices[0]
    return first_choice.message.content

In [15]:
# Define rag function.
def rag(query, model='gpt-4o-mini'):
    """Run the full pipeline for `query`: search, build the prompt, ask `model`, return its answer."""
    retrieved = search(query)
    return llm(build_prompt(query, retrieved), model=model)

In [16]:
# Ask a question and print the answer.
question = 'What are the treatments for Glaucoma?'
answer = rag(question)
print(answer)

The treatments for Glaucoma include:

1. **Medications**: These may be in the form of eye drops or pills. Some medications reduce pressure by slowing the flow of fluid into the eye, while others improve fluid drainage. Regular use of medications can control increased fluid pressure, although they may stop working over time or cause side effects.

2. **Surgery**: Laser surgery is another treatment option. It involves focusing a strong beam of light on the part of the anterior chamber where the fluid exits, making it easier for the fluid to exit the eye. However, the effects of laser surgery may wear off over time, and patients may need to continue taking glaucoma medications afterwards.

Overall, while treatments can help save remaining vision, they do not restore vision already lost due to glaucoma. Early diagnosis and treatment are critical.


## Retrieval Evaluation

In [17]:
# Create df_question.
df_question = pd.read_csv('../data/ground-truth-retrieval.csv')

In [18]:
# Show the head of df_question.
df_question.head()

Unnamed: 0,id,question
0,0,What is (are) Glaucoma?
1,1,What causes Glaucoma?
2,2,What are the symptoms of Glaucoma?
3,3,What are the treatments for Glaucoma?
4,4,Who is at risk for Glaucoma?


In [20]:
# Create the ground_truth documents.
ground_truth = df_question.to_dict(orient='records')

In [21]:
# Show the first document in ground_truth.
ground_truth[0]

{'id': 0, 'question': 'What is (are) Glaucoma?'}

In [22]:
# Define the hit_rate function.
def hit_rate(relevance_total):
    """Fraction of queries whose result list contains at least one relevant hit.

    Args:
        relevance_total: one list of booleans per query, in rank order.

    Returns:
        A float in [0, 1]; 0.0 when `relevance_total` is empty (instead of dividing by zero).
    """
    if len(relevance_total) == 0:
        return 0.0

    hits = sum(1 for line in relevance_total if True in line)
    return hits / len(relevance_total)

In [23]:
# Define the mrr function.
def mrr(relevance_total):
    """Mean reciprocal rank: the average of 1 / rank of the first relevant hit per query.

    Args:
        relevance_total: one list of booleans per query, in rank order.

    Returns:
        A float in [0, 1]. Queries with no relevant hit contribute 0; an empty
        `relevance_total` yields 0.0 (instead of dividing by zero).
    """
    if len(relevance_total) == 0:
        return 0.0

    total_score = 0.0
    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / rank
                # Only the first relevant hit counts; without this, several hits in one
                # list would be summed and the score could exceed 1.
                break

    return total_score / len(relevance_total)

In [24]:
# Define the minsearch_search function.
def minsearch_search(query, boost=None):
    """Search the global index for `query` and return up to 10 results.

    Args:
        query: the query string.
        boost: optional dict passed to the index as `boost_dict`; when omitted an empty
            dict is used, which matches the previous no-boost behavior.

    Returns:
        Whatever `index.search` returns for the query.
    """
    return index.search(
        query=query,
        filter_dict={},
        # A fresh dict per call avoids sharing a mutable default between calls.
        boost_dict={} if boost is None else boost,
        num_results=10
    )

In [25]:
# Define the evaluate function.
def evaluate(ground_truth, search_function):
    """Score `search_function` against `ground_truth` and return its hit rate and MRR.

    For every ground-truth record the search function is called with the record itself,
    and each returned document is marked relevant when its `id` equals the record's `id`.
    """
    relevance_total = []

    for query_record in tqdm(ground_truth):
        expected_id = query_record['id']
        hits = search_function(query_record)
        relevance_total.append([hit['id'] == expected_id for hit in hits])

    return dict(
        hit_rate=hit_rate(relevance_total),
        mrr=mrr(relevance_total),
    )

In [26]:
# Run the evaluation function.
evaluate(ground_truth, lambda q: minsearch_search(q['question']))

  0%|          | 0/1800 [00:00<?, ?it/s]

{'hit_rate': 1.0, 'mrr': 0.9153796296296295}

## Finding Best Parameters

In [27]:
# Split df_question into df_validation and df_test.
# First 100 rows go to validation, the remainder to test (positional split).
# NOTE(review): df_test is not referenced again in the visible cells — confirm whether the
# final evaluation was meant to run on it.
df_validation = df_question.iloc[:100]
df_test = df_question.iloc[100:]

In [28]:
# Define the simple_optimize function.
def simple_optimize(param_ranges, objective_function, n_iterations=10, seed=None):
    """Random search that MAXIMIZES `objective_function` over `param_ranges`.

    Args:
        param_ranges: dict mapping parameter name to a `(min_val, max_val)` pair. When both
            bounds are ints the value is drawn with `randint` (inclusive); otherwise it is
            drawn with `uniform`.
        objective_function: callable taking a dict of sampled parameters and returning a
            score; higher is better.
        n_iterations: number of random parameter sets to try.
        seed: optional seed for a private random generator so the search is reproducible.
            When None, the module-level `random` state is used, as before.

    Returns:
        `(best_params, best_score)`; `(None, float('-inf'))` when no iteration ran or no
        score exceeded negative infinity.
    """
    rng = random if seed is None else random.Random(seed)

    best_params = None
    best_score = float('-inf')  # Maximizing: any real score beats negative infinity.

    for _ in range(n_iterations):
        # Generate random parameters
        current_params = {}
        for param, (min_val, max_val) in param_ranges.items():
            if isinstance(min_val, int) and isinstance(max_val, int):
                current_params[param] = rng.randint(min_val, max_val)
            else:
                current_params[param] = rng.uniform(min_val, max_val)

        # Evaluate the objective function
        current_score = objective_function(current_params)

        # Keep the highest score seen so far (strict '>' keeps the earliest on ties).
        if current_score > best_score:
            best_score = current_score
            best_params = current_params

    return best_params, best_score

In [29]:
# Create the gt_val documents.
gt_val = df_validation.to_dict(orient='records')

In [30]:
# Define a new minsearch_search function with boost.
def minsearch_search(query, boost=None):
    """Search the global index for `query`, optionally weighting fields with `boost`.

    A missing `boost` is replaced by an empty dict; up to 10 results are returned.
    """
    field_weights = {} if boost is None else boost
    hits = index.search(query=query, filter_dict={}, boost_dict=field_weights, num_results=10)
    return hits

In [31]:
# Define the parameter ranges.
# Every field shares the same (min, max) search range for its boost weight.
param_ranges = dict.fromkeys(('answer', 'source', 'focus_area'), (0.0, 3.0))

In [32]:
# Define the objective function used by the optimizer.
def objective(boost_params):
    """Return the validation-set MRR obtained when searching with `boost_params` as boosts."""
    scores = evaluate(
        gt_val,
        lambda q: minsearch_search(q['question'], boost_params),
    )
    return scores['mrr']

In [33]:
# Run the optimization.
simple_optimize(param_ranges, objective, n_iterations=20)

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

({'question': 1.8464899585935077,
  'answer': 0.19808490956809177,
  'focus': 0.11211288460808078,
  'source': 2.3984112320171542},
 1.0)

In [34]:
# Define the minsearch_improved function.
def minsearch_improved(query):
    """Search the global index for `query` with fixed per-field boost weights (top 10 results).

    NOTE(review): the optimizer output recorded in this notebook pairs 0.1121 with 'focus'
    and 2.3984 with 'source', whereas here 0.1121 is on 'source' and 2.3984 on 'focus_area'
    — confirm the weights are attached to the intended fields.
    """
    tuned_boost = dict(answer=0.1981, source=0.1121, focus_area=2.3984)
    return index.search(
        query=query,
        filter_dict={},
        boost_dict=tuned_boost,
        num_results=10,
    )

In [35]:
# Evaluate the ground_truth.
evaluate(ground_truth, lambda q: minsearch_improved(q['question']))

  0%|          | 0/1800 [00:00<?, ?it/s]

{'hit_rate': 1.0, 'mrr': 0.996851851851852}

## RAG Evaluation

In [36]:
# Create the prompt2_template.
# Judge prompt: asks the model to classify the generated answer's relevance to the question
# and reply as JSON with "Relevance" and "Explanation" keys. `{question}` and `{answer_llm}`
# are filled via str.format; the doubled braces render as literal braces.
prompt2_template = """
You are an expert evaluator for a RAG system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

In [37]:
# Show the ground_truth length.
len(ground_truth)

1800

In [38]:
# Import the first ground_truth document to record.
record = ground_truth[0]

In [39]:
# Get the question from record.
question = record['question']

In [40]:
# Get answer_llm using the rag function.
answer_llm = rag(question)

In [41]:
# Print answer_llm.
print(answer_llm)

Glaucoma is a group of diseases that can damage the eye's optic nerve, leading to vision loss and blindness. It can affect anyone, but individuals over 60 have a higher risk. There are different types of glaucoma, with open-angle glaucoma being the most common. In glaucoma, the drainage system within the eye is impaired, causing fluid to build up and increasing pressure within the eye. If this pressure remains uncontrolled, it may cause damage to the optic nerve, resulting in loss of vision. There are no cures for glaucoma, but treatments such as medications and surgery can help save remaining vision. Early diagnosis is crucial to manage the disease effectively.


In [42]:
# Print the prompt.
prompt = prompt2_template.format(question=question, answer_llm=answer_llm)
print(prompt)

You are an expert evaluator for a RAG system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: What is (are) Glaucoma?
Generated Answer: Glaucoma is a group of diseases that can damage the eye's optic nerve, leading to vision loss and blindness. It can affect anyone, but individuals over 60 have a higher risk. There are different types of glaucoma, with open-angle glaucoma being the most common. In glaucoma, the drainage system within the eye is impaired, causing fluid to build up and increasing pressure within the eye. If this pressure remains uncontrolled, it may cause damage to the optic nerve, resulting in loss of vision. There are no cures for glaucoma, but treatments such as medications and surgery can help save remaining vision. Early diagnosis is crucial to manage the disease ef

In [None]:
# Get the sample dataframe for the evaluation.
# NOTE(review): the outputs recorded below show a (180, 5) frame with answer/focus/source
# columns, which does not match sampling 200 rows from df_question (id and question only) —
# confirm the intended source frame and sample size before re-running.
df_sample = df_question.sample(n=200, random_state=2)

In [46]:
# Check the head of df_sample.
df_sample.head()

Unnamed: 0,id,question,answer,focus,source
2,2,What are the symptoms of Glaucoma?,Symptoms of Glaucoma Glaucoma can develop in ...,Glaucoma,NIHSeniorHealth
3,3,What are the treatments for Glaucoma?,"Although open-angle glaucoma cannot be cured, ...",Glaucoma,NIHSeniorHealth
10,10,How to prevent High Blood Pressure?,Steps You Can Take You can take steps to preve...,High Blood Pressure,NIHSeniorHealth
12,12,How to diagnose High Blood Pressure?,"If you are diagnosed with high blood pressure,...",High Blood Pressure,NIHSeniorHealth
13,13,What are the treatments for High Blood Pressure?,High blood pressure is treated with lifestyle ...,High Blood Pressure,NIHSeniorHealth


In [47]:
# Check the shape of df_sample.
df_sample.shape

(180, 5)

In [48]:
# Get the sample documents for the evaluation.
sample = df_sample.to_dict(orient='records')

## Evaluate with gpt-4o-mini

In [50]:
# Evaluate the questions with gpt-4o-mini.
# Collect (record, generated answer, judge verdict) for every sampled question.
evaluations = []

for record in tqdm(sample):
    question = record['question']
    answer_llm = rag(question)

    prompt = prompt2_template.format(
        question=question,
        answer_llm=answer_llm
    )

    raw_evaluation = llm(prompt)
    try:
        evaluation = json.loads(raw_evaluation)
    except (json.JSONDecodeError, TypeError):
        evaluation = None

    # A malformed judge reply must not abort the whole (slow, paid) run: record it under
    # an explicit PARSE_ERROR label, keeping the raw text, so it shows up in the counts.
    if not isinstance(evaluation, dict) or 'Relevance' not in evaluation or 'Explanation' not in evaluation:
        evaluation = {'Relevance': 'PARSE_ERROR', 'Explanation': raw_evaluation}

    evaluations.append((record, answer_llm, evaluation))

  0%|          | 0/180 [00:00<?, ?it/s]

In [51]:
# Build a dataframe from the collected evaluations.
# Flatten the (record, answer, evaluation) tuples into plain columns, then drop the
# nested dict columns they were extracted from.
df_eval = (
    pd.DataFrame(evaluations, columns=['record', 'answer', 'evaluation'])
    .assign(
        id=lambda frame: frame['record'].apply(lambda rec: rec['id']),
        question=lambda frame: frame['record'].apply(lambda rec: rec['question']),
        relevance=lambda frame: frame['evaluation'].apply(lambda verdict: verdict['Relevance']),
        explanation=lambda frame: frame['evaluation'].apply(lambda verdict: verdict['Explanation']),
    )
    .drop(columns=['record', 'evaluation'])
)

In [52]:
# Extract required data and show raw data.
df_eval.relevance.value_counts()

relevance
RELEVANT           165
PARTLY_RELEVANT     13
NON_RELEVANT         2
Name: count, dtype: int64

In [53]:
# Extract required data and show normalized data.
df_eval.relevance.value_counts(normalize=True)

relevance
RELEVANT           0.916667
PARTLY_RELEVANT    0.072222
NON_RELEVANT       0.011111
Name: proportion, dtype: float64

In [54]:
# Show the NON_RELEVANT rows in the dataframe.
df_eval[df_eval.relevance == 'NON_RELEVANT']

Unnamed: 0,answer,id,question,relevance,explanation
16,"To diagnose surviving cancer, it is essential ...",174,How to diagnose Surviving Cancer?,NON_RELEVANT,The generated answer does not address the ques...
117,Individuals at risk for leukemia include:\n\n-...,1182,Who is at risk for Aneurysm?,NON_RELEVANT,The generated answer discusses risk factors fo...


In [55]:
# Save the dataframe to CSV in the data folder.
df_eval.to_csv('../data/rag-eval-gpt-4o-mini.csv', index=False)

## Evaluate with gpt-4o

In [56]:
# Evaluate the questions with gpt-4o.
# Collect (record, gpt-4o answer, judge verdict) for every sampled question.
# The judge call uses llm()'s default model; only the answer is generated with gpt-4o.
evaluations_gpt4o = []

for record in tqdm(sample):
    question = record['question']
    answer_llm = rag(question, model='gpt-4o')

    prompt = prompt2_template.format(
        question=question,
        answer_llm=answer_llm
    )

    raw_evaluation = llm(prompt)
    try:
        evaluation = json.loads(raw_evaluation)
    except (json.JSONDecodeError, TypeError):
        evaluation = None

    # A malformed judge reply must not abort the whole (slow, paid) run: record it under
    # an explicit PARSE_ERROR label, keeping the raw text, so it shows up in the counts.
    if not isinstance(evaluation, dict) or 'Relevance' not in evaluation or 'Explanation' not in evaluation:
        evaluation = {'Relevance': 'PARSE_ERROR', 'Explanation': raw_evaluation}

    evaluations_gpt4o.append((record, answer_llm, evaluation))

  0%|          | 0/180 [00:00<?, ?it/s]

In [57]:
# Build a dataframe from the collected gpt-4o evaluations.
# Flatten the (record, answer, evaluation) tuples into plain columns, then drop the
# nested dict columns they were extracted from.
df_eval = (
    pd.DataFrame(evaluations_gpt4o, columns=['record', 'answer', 'evaluation'])
    .assign(
        id=lambda frame: frame['record'].apply(lambda rec: rec['id']),
        question=lambda frame: frame['record'].apply(lambda rec: rec['question']),
        relevance=lambda frame: frame['evaluation'].apply(lambda verdict: verdict['Relevance']),
        explanation=lambda frame: frame['evaluation'].apply(lambda verdict: verdict['Explanation']),
    )
    .drop(columns=['record', 'evaluation'])
)

In [58]:
# Extract required data and show raw data.
df_eval.relevance.value_counts()

relevance
RELEVANT           165
PARTLY_RELEVANT     15
Name: count, dtype: int64

In [59]:
# Extract required data and show normalized data.
df_eval.relevance.value_counts(normalize=True)

relevance
RELEVANT           0.916667
PARTLY_RELEVANT    0.083333
Name: proportion, dtype: float64

In [60]:
# Show the NON_RELEVANT rows in the dataframe.
df_eval[df_eval.relevance == 'NON_RELEVANT']

Unnamed: 0,answer,id,question,relevance,explanation


In [61]:
# Save the dataframe to CSV in the data folder.
df_eval.to_csv('../data/rag-eval-gpt-4o.csv', index=False)

## The run metrics

In [73]:
# Collect the run metrics.
result_data = {'gpt-4o-mini':  ['$0.17', '12:28'], 'gpt-4o': ['$1.74', '28:45']}

In [76]:
# Convert collected metrics to dataframe.
df_result_data = pd.DataFrame.from_dict(data=result_data, orient='index', columns=['total_cost', 'run_time'])

In [77]:
# Show the dataframe.
df_result_data

Unnamed: 0,total_cost,run_time
gpt-4o-mini,$0.17,12:28
gpt-4o,$1.74,28:45
