In [1]:
# Install minsearch
!wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py

--2024-10-16 12:00:51--  https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3832 (3.7K) [text/plain]
Saving to: ‘minsearch.py’


2024-10-16 12:00:52 (22.3 MB/s) - ‘minsearch.py’ saved [3832/3832]



In [2]:
# Import required modules.
import os
import json
import random
import minsearch
import pandas as pd
from openai import OpenAI
from tqdm.auto import tqdm
from dotenv import load_dotenv
load_dotenv()

True

## Ingestion

In [3]:
# Import the data for the project.
df = pd.read_csv('../data/data.csv')

In [4]:
# Show the head of df.
df.head()

Unnamed: 0,id,answer,source,focus_area
0,0,There are many different types and designs of ...,NIHSeniorHealth,Knee Replacement
1,1,"- a need to urinate frequently, especially at ...",NIHSeniorHealth,Prostate Cancer
2,2,Who Should Be Tested? The United States Preven...,NIHSeniorHealth,Osteoporosis
3,3,Risk Factors Diabetes and high blood pressure ...,NIHSeniorHealth,Kidney Disease
4,4,Kidney Disease Kidney disease is often called ...,NIHSeniorHealth,Kidney Disease


In [5]:
# Create the documents for analysis.
documents = df.to_dict(orient='records')

In [6]:
# Create the minsearch index: `answer`, `source` and `focus_area` are registered
# as text fields and `id` as a keyword field. The index is empty until it is
# fitted on the documents.
index = minsearch.Index(
    text_fields=['answer', 'source', 'focus_area'],
    keyword_fields=['id']
)

In [7]:
# Fit the index on the documents.
index.fit(documents)

<minsearch.Index at 0x74099c09ecf0>

## RAG Flow

In [8]:
# Extract the key.
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

In [9]:
# Create the client.
client = OpenAI()

In [10]:
# Define search function.
def search(query):
    """Return up to 10 matches for `query` from the module-level minsearch index.

    No filters and no field boosts are applied.
    """
    return index.search(
        query=query,
        filter_dict={},
        boost_dict={},
        num_results=10,
    )

In [11]:
# Prompt sent to the answering model. The `{question}` and `{context}`
# placeholders are filled in by build_prompt; `.strip()` removes the leading and
# trailing newlines introduced by the triple-quoted literal.
prompt_template = """
You're an excellent medical assistant. Answer the QUESTION based on the CONTEXT from our medical database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}
CONTEXT: {context}
""".strip()

In [12]:
# Template for one retrieved document inside the CONTEXT section of the prompt.
# build_prompt formats it with each search result's fields.
entry_template = """
answer: {answer}
source: {source}
focus_area: {focus_area}
""".strip()

In [13]:
# Define build_prompt function.
def build_prompt(query, search_results):
    """Assemble the final LLM prompt for `query`.

    Each document in `search_results` is rendered with `entry_template` and
    followed by a blank line; the concatenated entries become the CONTEXT of
    `prompt_template`. Surrounding whitespace is stripped from the result.
    """
    rendered_entries = [
        entry_template.format(**doc) + "\n\n"
        for doc in search_results
    ]

    return prompt_template.format(
        question=query,
        context="".join(rendered_entries),
    ).strip()

In [14]:
# Define llm function.
def llm(prompt, model='gpt-4o-mini'):
    """Send `prompt` as a single user message to `model` and return the reply text.

    Uses the module-level `client`; only the first choice of the completion is
    returned.
    """
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(
        model=model,
        messages=[user_message],
    )
    first_choice = completion.choices[0]

    return first_choice.message.content

In [15]:
# Define rag function.
def rag(query, model='gpt-4o-mini'):
    """Answer `query` with retrieval-augmented generation.

    Retrieves documents with `search`, builds the prompt with `build_prompt`
    and returns the text produced by `llm` using `model`.
    """
    retrieved_docs = search(query)
    full_prompt = build_prompt(query, retrieved_docs)

    return llm(full_prompt, model=model)

In [17]:
# Ask a question and print the answer.
question = 'What are some risk factors that might prompt a doctor to evaluate a patient for osteoporosis?'
answer = rag(question)
print(answer)

Some risk factors that might prompt a doctor to evaluate a patient for osteoporosis include:

- Being a woman aged 65 or older
- Being a woman younger than 65 and at high risk for fractures
- Being a man or woman over age 50 who has broken a bone
- Experiencing a loss of height, a stooped or hunched posture, or sudden back pain with no apparent cause
- Taking glucocorticoid medications (such as prednisone, cortisone, or dexamethasone) for 2 months or longer or taking other medications known to cause bone loss
- Having a chronic illness or taking medication known to cause bone loss
- Having anorexia nervosa or a history of this eating disorder
- Being a premenopausal woman, not pregnant, with stopped, irregular, or never-started menstrual periods upon reaching puberty.


## Retrieval Evaluation

In [18]:
# Create df_question.
df_question = pd.read_csv('../data/ground-truth-retrieval.csv')

In [19]:
# Show the head of df_question.
df_question.head()

Unnamed: 0,id,question
0,0,What are the main components of an artificial ...
1,0,What is the difference between total knee repl...
2,0,How are the components of a knee joint attache...
3,0,What are the advantages of minimally invasive ...
4,0,What should I consider if I am interested in h...


In [20]:
# Create the ground_truth documents.
ground_truth = df_question.to_dict(orient='records')

In [21]:
# Show the first document in ground_truth.
ground_truth[0]

{'id': 0, 'question': 'What are the main components of an artificial knee?'}

In [22]:
# Define the hit_rate function.
def hit_rate(relevance_total):
    """Fraction of queries whose result list contains at least one relevant hit.

    Args:
        relevance_total: one list per query; each inner list holds a boolean
            per returned result, True when that result is the expected document.

    Returns:
        A float between 0 and 1. Returns 0.0 when `relevance_total` is empty.
    """
    total = len(relevance_total)
    if total == 0:
        # Nothing was evaluated: report 0.0 instead of raising ZeroDivisionError.
        return 0.0

    hits = sum(1 for line in relevance_total if True in line)

    return hits / total

In [23]:
# Define the mrr function.
def mrr(relevance_total):
    """Mean reciprocal rank over all queries.

    For each query the score is 1 / rank of the FIRST relevant result
    (ranks start at 1), or 0 when no result is relevant. The scores are
    averaged over all queries.

    Args:
        relevance_total: one list per query; each inner list holds a flag per
            returned result, truthy when that result is the expected document.

    Returns:
        A float between 0 and 1. Returns 0.0 when `relevance_total` is empty.
    """
    total = len(relevance_total)
    if total == 0:
        # Nothing was evaluated: report 0.0 instead of raising ZeroDivisionError.
        return 0.0

    total_score = 0.0

    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / rank
                # Only the first relevant result counts; without this break a
                # list with several relevant results would add several
                # reciprocal ranks and could push the metric above 1.
                break

    return total_score / total

In [24]:
# Define the minsearch_search function.
def minsearch_search(query):
    """Baseline retrieval: top 10 index matches for `query`, unfiltered and unboosted."""
    search_kwargs = {
        'query': query,
        'filter_dict': {},
        'boost_dict': {},
        'num_results': 10,
    }

    return index.search(**search_kwargs)

In [25]:
# Define the evaluate function.
def evaluate(ground_truth, search_function):
    """Score `search_function` against `ground_truth` with hit rate and MRR.

    Args:
        ground_truth: records with at least an 'id' key; each whole record is
            passed to `search_function`.
        search_function: callable taking one ground-truth record and returning
            result documents that each carry an 'id'.

    Returns:
        dict with the keys 'hit_rate' and 'mrr'.
    """
    def relevance_for(q):
        # Flag every returned document that matches the expected id.
        expected_id = q['id']
        hits = search_function(q)
        return [hit['id'] == expected_id for hit in hits]

    relevance_total = [relevance_for(q) for q in tqdm(ground_truth)]

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [26]:
# Run the evaluation function.
evaluate(ground_truth, lambda q: minsearch_search(q['question']))

  0%|          | 0/990 [00:00<?, ?it/s]

{'hit_rate': 0.9878787878787879, 'mrr': 0.8851875901875906}

## Finding Best Parameters

In [27]:
# Split df_question into df_validation (first 100 rows) and df_test (the rest).
# NOTE(review): the split is positional, not shuffled, and df_test is not
# referenced again in the visible notebook. Confirm whether the final
# evaluation was meant to run on it.
df_validation = df_question[:100]
df_test = df_question[100:]

In [28]:
# Define the simple_optimize function.
def simple_optimize(param_ranges, objective_function, n_iterations=10, seed=None):
    """Random search that MAXIMIZES `objective_function`.

    Args:
        param_ranges: dict mapping a parameter name to a `(min_val, max_val)`
            tuple. When both bounds are ints the parameter is drawn with
            `randint` (both ends inclusive); otherwise with `uniform`.
        objective_function: callable taking a dict of sampled parameters and
            returning a comparable score; higher is better.
        n_iterations: number of random parameter sets to try.
        seed: optional seed for a private random generator, making the search
            reproducible. With the default `None` the module-level `random`
            generator is used, as before.

    Returns:
        `(best_params, best_score)`. On ties the earliest sample wins. With
        `n_iterations=0` the result is `(None, float('-inf'))`.
    """
    # A private generator keeps seeded runs reproducible without touching the
    # global `random` state; without a seed the global generator is used.
    rng = random.Random(seed) if seed is not None else random

    best_params = None
    best_score = float('-inf')  # Maximizing, so start below any real score.

    for _ in range(n_iterations):
        # Draw one random value per parameter.
        current_params = {}
        for param, (min_val, max_val) in param_ranges.items():
            if isinstance(min_val, int) and isinstance(max_val, int):
                current_params[param] = rng.randint(min_val, max_val)
            else:
                current_params[param] = rng.uniform(min_val, max_val)

        current_score = objective_function(current_params)

        # Keep the sample only when it strictly improves on the best so far.
        if current_score > best_score:
            best_score = current_score
            best_params = current_params

    return best_params, best_score

In [29]:
# Create the gt_val documents.
gt_val = df_validation.to_dict(orient='records')

In [30]:
# Define a new minsearch_search function with boost.
def minsearch_search(query, boost=None):
    """Top 10 index matches for `query`, optionally weighting fields with `boost`.

    `boost` is passed to the index as `boost_dict`; when it is None an empty
    dict is used.
    """
    boost_dict = {} if boost is None else boost

    return index.search(
        query=query,
        filter_dict={},
        boost_dict=boost_dict,
        num_results=10,
    )

In [31]:
# Search space for the boost weights: one (min, max) range per text field.
# The bounds are floats, so simple_optimize samples each weight uniformly
# from its range rather than drawing an integer.
param_ranges = {
    'answer': (0.0, 3.0),
    'source': (0.0, 3.0),
    'focus_area': (0.0, 3.0),
}

In [32]:
# Define the objective function for the optimizer.
def objective(boost_params):
    """Return the validation-set MRR obtained when searching with `boost_params`.

    The score comes from `evaluate` run on `gt_val`, with every question
    searched through `minsearch_search` using the given boosts.
    """
    metrics = evaluate(
        gt_val,
        lambda q: minsearch_search(q['question'], boost_params),
    )

    return metrics['mrr']

In [33]:
# Run the optimization.
simple_optimize(param_ranges, objective, n_iterations=20)

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

({'answer': 1.0812377449535782,
  'source': 2.6054399406994126,
  'focus_area': 0.6008972544485423},
 0.8778333333333334)

In [34]:
# Define the minsearch_improved function.
def minsearch_improved(query):
    """Top 10 index matches for `query` using the tuned per-field boosts.

    The boost values are hard-coded copies of the best parameters reported by
    the random-search run.
    """
    tuned_boost = dict(
        answer=1.0812377449535782,
        source=2.6054399406994126,
        focus_area=0.6008972544485423,
    )

    return index.search(
        query=query,
        filter_dict={},
        boost_dict=tuned_boost,
        num_results=10,
    )

In [35]:
# Evaluate retrieval with the tuned boosts on the full ground_truth set.
# NOTE(review): ground_truth is built from all of df_question, so it includes
# the first 100 rows that served as the validation set for tuning. Confirm
# whether a held-out subset should be used instead.
evaluate(ground_truth, lambda q: minsearch_improved(q['question']))

  0%|          | 0/990 [00:00<?, ?it/s]

{'hit_rate': 0.990909090909091, 'mrr': 0.9296151996152001}

## RAG Evaluation

In [36]:
# Judge prompt for the LLM-based relevance evaluation. `{question}` and
# `{answer_llm}` are filled in with str.format; the doubled braces around the
# JSON example are rendered as literal single braces.
prompt2_template = """
You are an expert evaluator for a RAG system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

In [37]:
# Show the ground_truth length.
len(ground_truth)

990

In [38]:
# Take the first ground_truth document as the example record.
record = ground_truth[0]

In [39]:
# Get the question from record.
question = record['question']

In [40]:
# Get answer_llm from the rag function.
answer_llm = rag(question)

In [41]:
# Print answer_llm.
print(answer_llm)

The main components of an artificial knee are:

1. The femoral component, which attaches to the thigh bone.
2. The tibial component, which attaches to the shin bone.
3. The patellar component, which is the knee cap.


In [42]:
# Print the prompt.
prompt = prompt2_template.format(question=question, answer_llm=answer_llm)
print(prompt)

You are an expert evaluator for a RAG system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: What are the main components of an artificial knee?
Generated Answer: The main components of an artificial knee are:

1. The femoral component, which attaches to the thigh bone.
2. The tibial component, which attaches to the shin bone.
3. The patellar component, which is the knee cap.

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}


In [43]:
# Draw 200 questions from df_question for the LLM-judged evaluation; the fixed
# random_state makes the draw repeatable.
df_sample = df_question.sample(n=200, random_state=2)

In [44]:
# Check the head of df_sample.
df_sample.head()

Unnamed: 0,id,question
880,176,What is urinary retention and how does it affe...
291,58,How does low health literacy affect individual...
974,194,What symptoms might indicate that someone has ...
556,111,What lifestyle changes should be made to help ...
158,31,What are the common signs or symptoms indicati...


In [45]:
# Check the shape of df_sample.
df_sample.shape

(200, 2)

In [46]:
# Get the sample documents for the evaluation.
sample = df_sample.to_dict(orient='records')

## Evaluate with gpt-4o-mini

In [47]:
# Evaluate the questions with gpt-4o-mini.
# Collect one (record, generated answer, judge evaluation) tuple per sampled question.
evaluations = []

for record in tqdm(sample):
    question = record['question']
    answer_llm = rag(question)

    prompt = prompt2_template.format(
        question=question,
        answer_llm=answer_llm
    )

    # The judge is asked for bare JSON, but nothing forces it to comply. One
    # malformed reply must not abort the run and discard the answers already
    # collected, so it is kept under a PARSE_ERROR label for later inspection.
    raw_evaluation = llm(prompt)
    try:
        evaluation = json.loads(raw_evaluation)
    except (json.JSONDecodeError, TypeError):
        evaluation = {'Relevance': 'PARSE_ERROR', 'Explanation': raw_evaluation}

    evaluations.append((record, answer_llm, evaluation))

  0%|          | 0/200 [00:00<?, ?it/s]

In [48]:
# Build a dataframe from the collected evaluations.
# Flatten the (record, answer, evaluation) tuples into plain columns, then drop
# the nested dict columns they were extracted from.
df_eval = (
    pd.DataFrame(evaluations, columns=['record', 'answer', 'evaluation'])
    .assign(
        id=lambda frame: frame['record'].map(lambda rec: rec['id']),
        question=lambda frame: frame['record'].map(lambda rec: rec['question']),
        relevance=lambda frame: frame['evaluation'].map(lambda ev: ev['Relevance']),
        explanation=lambda frame: frame['evaluation'].map(lambda ev: ev['Explanation']),
    )
    .drop(columns=['record', 'evaluation'])
)

In [49]:
# Extract required data and show raw data.
df_eval.relevance.value_counts()

relevance
RELEVANT           186
PARTLY_RELEVANT      9
NON_RELEVANT         5
Name: count, dtype: int64

In [50]:
# Extract required data and show normalized data.
df_eval.relevance.value_counts(normalize=True)

relevance
RELEVANT           0.930
PARTLY_RELEVANT    0.045
NON_RELEVANT       0.025
Name: proportion, dtype: float64

In [51]:
# Show the NON_RELEVANT rows in the dataframe.
df_eval[df_eval.relevance == 'NON_RELEVANT']

Unnamed: 0,answer,id,question,relevance,explanation
27,The provided context does not contain any info...,13,What is the address of the U.S. Food and Drug ...,NON_RELEVANT,The generated answer does not provide any info...
37,The provided context does not contain any info...,13,Which organization can provide assistance with...,NON_RELEVANT,The generated answer indicates that there is n...
83,The context provided does not specify the freq...,77,What frequency of seizures is observed in pati...,NON_RELEVANT,The generated answer does not address the ques...
105,The provided context does not contain specific...,100,What regions are most affected by African Tryp...,NON_RELEVANT,The generated answer states that it cannot pro...
143,The provided context does not contain specific...,100,What treatment options are available for someo...,NON_RELEVANT,The generated answer explicitly states that it...


In [52]:
# Save the dataframe to csv in the data folder.
df_eval.to_csv('../data/rag-eval-gpt-4o-mini.csv', index=False)

## Evaluate with gpt-4o

In [53]:
# Evaluate the questions with gpt-4o.
# Collect one (record, generated answer, judge evaluation) tuple per sampled
# question. Answers are generated with gpt-4o; the judge call uses llm's default
# model.
evaluations_gpt4o = []

for record in tqdm(sample):
    question = record['question']
    answer_llm = rag(question, model='gpt-4o')

    prompt = prompt2_template.format(
        question=question,
        answer_llm=answer_llm
    )

    # The judge is asked for bare JSON, but nothing forces it to comply. One
    # malformed reply must not abort the run and discard the answers already
    # collected, so it is kept under a PARSE_ERROR label for later inspection.
    raw_evaluation = llm(prompt)
    try:
        evaluation = json.loads(raw_evaluation)
    except (json.JSONDecodeError, TypeError):
        evaluation = {'Relevance': 'PARSE_ERROR', 'Explanation': raw_evaluation}

    evaluations_gpt4o.append((record, answer_llm, evaluation))

  0%|          | 0/200 [00:00<?, ?it/s]

In [54]:
# Build a dataframe from the collected gpt-4o evaluations.
# Flatten the (record, answer, evaluation) tuples into plain columns, then drop
# the nested dict columns they were extracted from.
df_eval = (
    pd.DataFrame(evaluations_gpt4o, columns=['record', 'answer', 'evaluation'])
    .assign(
        id=lambda frame: frame['record'].map(lambda rec: rec['id']),
        question=lambda frame: frame['record'].map(lambda rec: rec['question']),
        relevance=lambda frame: frame['evaluation'].map(lambda ev: ev['Relevance']),
        explanation=lambda frame: frame['evaluation'].map(lambda ev: ev['Explanation']),
    )
    .drop(columns=['record', 'evaluation'])
)

In [55]:
# Extract required data and show raw data.
df_eval.relevance.value_counts()

relevance
RELEVANT           184
PARTLY_RELEVANT     10
NON_RELEVANT         6
Name: count, dtype: int64

In [56]:
# Extract required data and show normalized data.
df_eval.relevance.value_counts(normalize=True)

relevance
RELEVANT           0.92
PARTLY_RELEVANT    0.05
NON_RELEVANT       0.03
Name: proportion, dtype: float64

In [57]:
# Show the NON_RELEVANT rows in the dataframe.
df_eval[df_eval.relevance == 'NON_RELEVANT']

Unnamed: 0,answer,id,question,relevance,explanation
27,I'm unable to provide the address of the U.S. ...,13,What is the address of the U.S. Food and Drug ...,NON_RELEVANT,The generated answer does not provide any rele...
94,The provided context does not contain informat...,154,What role does glycophorin A play in the trans...,NON_RELEVANT,The generated answer does not address the ques...
105,The provided CONTEXT does not include specific...,100,What regions are most affected by African Tryp...,NON_RELEVANT,The generated answer does not provide any info...
141,"I'm sorry, but the CONTEXT provided does not c...",181,What are the main functions of the kidneys in ...,NON_RELEVANT,The generated answer does not address the ques...
142,The context provided from your medical databas...,197,What types of childhood nephrotic syndrome exist?,NON_RELEVANT,The generated answer does not provide any info...
143,"I'm sorry, but the context provided does not c...",100,What treatment options are available for someo...,NON_RELEVANT,The generated answer does not provide any info...


In [58]:
# Save the dataframe to csv in the data folder.
df_eval.to_csv('../data/rag-eval-gpt-4o.csv', index=False)

## The run metrics

In [59]:
# Run metrics per model. The values are hard-coded here, not computed in this
# notebook; each list is later labelled as [total_cost, run_time].
result_data = {'gpt-4o-mini':  ['$0.19', '22:13'], 'gpt-4o': ['$1.62', '28:15']}

In [60]:
# Convert collected metrics to dataframe.
df_result_data = pd.DataFrame.from_dict(data=result_data, orient='index', columns=['total_cost', 'run_time'])

In [61]:
# Show the dataframe.
df_result_data

Unnamed: 0,total_cost,run_time
gpt-4o-mini,$0.19,22:13
gpt-4o,$1.62,28:15
