In [10]:
import os
import dspy

In [None]:
print(os.getcwd())

In [12]:
import sys
from pathlib import Path

# Append the directory above "Evaluation" (i.e., the parent directory) to the sys.path
# This assumes you're running this from within the "Evaluation" directory

# Add the parent directory to sys.path
sys.path.append("..")

from db_retriever_module import ChromadbRetrieverModule

# Get the current working directory (where your notebook is)
current_dir = Path().resolve()

# Go two levels up to reach the base directory ('some/')
base_dir = current_dir.parent

# Add the base directory to the system path
sys.path.append(str(base_dir))

In [13]:
class AnswerGenerator(dspy.Signature):
    """
    A class to generate answers to questions based on provided context using DSPy's Signature.
    
    Attributes:
        context (dspy.InputField): Text that may contain relevant facts for generating an answer.
        question (dspy.InputField): The question for which an answer is sought.
        answer (dspy.OutputField): The generated short factual answer, aimed to be between 1 to 15 words.
    """
    context = dspy.InputField(description="May contain relevant facts for answering.")
    question = dspy.InputField(description="The question to be answered.")
    answer = dspy.OutputField(description="Short factual answer to the question, typically 1-15 words.")

class RetrievalAugmentedGenerator(dspy.Module):
    """
    Implements a Retrieval-Augmented Generation (RAG) model by combining retrieval and generation capabilities.

    This model first retrieves relevant passages based on the input question and then generates an answer
    by considering both the question and the retrieved context.
    
    Attributes:
        num_passages (int): Number of passages to retrieve for context enrichment.
        retrieve (dspy.Retrieve): The retrieval component of the RAG model.
        generate_answer (dspy.ChainOfThought): The answer generation component based on retrieved context.
    """
    def __init__(self, num_passages=5):
        super().__init__()
        self.retrieve = dspy.Retrieve(k=num_passages)
        self.generate_answer = dspy.ChainOfThought(AnswerGenerator)
    
    def forward(self, question):
        """
        Retrieves relevant passages based on the question and generates an answer.

        Args:
            question (str): The input question for which an answer is needed.
        
        Returns:
            dspy.Prediction: The context used for answering and the generated answer.
        """
        context = self.retrieve(question).passages
        prediction = self.generate_answer(context=context, question=question)
        return dspy.Prediction(context=context, answer=prediction.answer)

In [14]:
import os
import logging
import dspy

# Assuming RAG and ChromadbRM classes are defined elsewhere in your code
# from your_module import RAG, ChromadbRM

def setup():
    """
    Initializes and configures the DSPy library for a Retrieval-Augmented Generation (RAG) setup.
    
    This involves configuring a language model and a retrieval model with specified settings,
    and then initializing the RAG module with these configurations.

    Returns:
        An instance of the RAG class, ready for use.
        
    Raises:
        EnvironmentError: If the OPENAI_API_KEY environment variable is not set.
        Exception: For general exceptions related to DSPy configuration or RAG initialization.
    """
    try:

        from dotenv import load_dotenv
        load_dotenv()
        
        # Retrieve the OpenAI API key safely
        openai_api_key = os.getenv("OPENAI_API_KEY")
        if not openai_api_key:
            raise EnvironmentError("OPENAI_API_KEY not set in environment variables.")
        
        # Configuration parameters
        MODEL_NAME = 'gpt-3.5-turbo'
        COLLECTION_NAME = "test-medical_abstract_collection"
        PERSIST_DIR = "local_chroma.db"
        LOCAL_EMBED_MODEL = "sentence-transformers/paraphrase-MiniLM-L6-v2"
        
        # Configuring the language model
        turbo = dspy.OpenAI(model=MODEL_NAME)
        
        # Configuring the retrieval model
        chroma_rm = ChromadbRetrieverModule(db_collection_name=COLLECTION_NAME, persist_directory=PERSIST_DIR,
                               local_embed_model=LOCAL_EMBED_MODEL, api_key=openai_api_key)
        
        # Apply the configurations
        dspy.settings.configure(lm=turbo, rm=chroma_rm)
        
        # Initialize the RAG module
        rag_instance = RetrievalAugmentedGenerator()
        
        logging.info("RAG setup completed successfully.")
        
        return rag_instance
        
    except EnvironmentError as env_err:
        logging.error(f"Environment error during setup: {env_err}")
        raise
    except Exception as e:
        logging.error(f"Failed to complete RAG setup: {e}")
        raise

In [15]:
rag = setup()

Collection Count: 578


In [16]:
# You saved it in the validate synthetics data ipynb  - synthetic_dataset.csv
import pandas as pd
df = pd.read_csv(os.path.join(os.getcwd(),"processed","synthetic_dataset.csv"))
df = df[['question', 'ground_truths']]

In [17]:
df.head(10)

Unnamed: 0,question,ground_truths
0,What was the effect of oropharyngeal anesthesi...,['Oropharyngeal anesthesia led to an increase ...
1,What was the prognostic value of low neutrophi...,"['Low neutrophil function, particularly defect..."
2,What was the treatment that resulted in both c...,['']
3,What was the conclusion regarding the role of ...,['The conclusion was that CNS prophylaxis with...
4,What are the advantages of using duplex Dopple...,"['The advantages include absence of toxicity, ..."
5,Why is congenital hypertrophy of the retinal p...,['Congenital hypertrophy of the RPE serves as ...
6,What is the main conclusion drawn from the stu...,['The study suggests an abnormality in an intr...
7,What is the purpose of the back isometric dyna...,['']
8,What was the effect of prophylactic peroral ac...,['Prophylactic peroral ACV prevented the devel...
9,What unique myopathic changes were observed in...,['Vacuolar changes with periodic acid-Schiff-p...


In [18]:
from sklearn.model_selection import train_test_split

In [19]:
# split the data into train and test
train, test = train_test_split(df, test_size=0.2)

In [20]:
# save the train and test data
train.to_csv(os.path.join(os.getcwd(),"processed","train_synthetic.csv"), index=False)
test.to_csv(os.path.join(os.getcwd(),"processed","test_synthetic.csv"), index=False)

# load the train and test data
train = pd.read_csv(os.path.join(os.getcwd(),"processed","train_synthetic.csv"))
test = pd.read_csv(os.path.join(os.getcwd(),"processed","test_synthetic.csv"))

In [21]:
import pandas as pd
from tqdm.auto import tqdm  # Use tqdm.auto to automatically select a suitable interface (notebook, terminal, etc.)

# Assuming 'test' is your DataFrame and 'rag' is a function defined elsewhere that takes a question and returns a response object

# Create an empty list to store rows
eval_results_rows = []

# Wrap test.iterrows() with tqdm to display a progress bar
for index, row in tqdm(test.iterrows(), total=test.shape[0], desc="Processing questions"):
    # Get the question
    question = row['question']
    
    # Response from rag
    response = rag(question)
    
    # Create a dictionary to represent a row
    row_dict = {
        'question': question, 
        'contexts': response.context, 
        'answer': response.answer, 
        'ground_truths': row['ground_truths']
    }
    
    # Append the row dictionary to the list
    eval_results_rows.append(row_dict)

# Create the df_eval_results DataFrame from the list of rows
df_eval_results = pd.DataFrame(eval_results_rows)


Processing questions:   0%|          | 0/5 [00:00<?, ?it/s]

In [22]:
df_eval_results

Unnamed: 0,question,contexts,answer,ground_truths
0,What is the purpose of the back isometric dyna...,[bretylium tosylate versus lidocaine in experi...,Measure and improve back muscle strength and f...,['']
1,What was the effect of dietary calcium on bloo...,[laser angioplasty in peripheral vascular dise...,Lowered blood pressure in hypertensive rats wi...,['Rats receiving high dietary calcium showed a...
2,What were the main measurements taken during t...,[ultrasonographic assessment of placental abno...,"Intramuscular meperidine, promethazine, and ch...",['The main measurements included serial respir...
3,What was the conclusion regarding the role of ...,[three mixed venous saturation catheters in pa...,The conclusion regarding the role of CNS radio...,['The conclusion was that CNS prophylaxis with...
4,What was the prognostic value of low neutrophi...,[trichothiodystrophy with chronic neutropenia ...,Significant in predicting late pyogenic infect...,"['Low neutrophil function, particularly defect..."


In [23]:
import ast

# df_eval_results ground_truths to list
df_eval_results['ground_truths'] = df_eval_results['ground_truths'].apply(lambda x: ast.literal_eval(x))

### Now, that we have answers for all the questions, we can evaluate the RAG model.

In [None]:
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
    answer_similarity,
    context_relevancy
)
from datasets import Dataset
from ragas import evaluate

ds = Dataset.from_pandas(df_eval_results)

result = evaluate(
    ds,
    metrics=[
        context_precision,
        faithfulness,
        answer_relevancy,
        context_recall,
        answer_similarity,
        context_relevancy
    ],
)

In [25]:
result

{'context_precision': 0.4233, 'faithfulness': 0.0000, 'answer_relevancy': 0.5440, 'context_recall': 0.8000, 'answer_similarity': 0.8402, 'context_relevancy': 0.0000}

In [None]:
result.to_pandas()

## Log to Wandb

In [27]:
import wandb

In [None]:
# ok lets login to wandb
# wandb.login(key="your - api - key - here")

In [None]:
def start_wandb_run(dataset, result, project_name="medical_abstract-rag-synthetic-data-eval",
                     chunk_size=128, sentence_chunk_overlap=16):
    """
    Initializes a Weights & Biases run to log metrics, parameters, and results for tracking experiments.

    Parameters:
    - dataset: The dataset being evaluated or processed in the run.
    - result: A dictionary containing the results to log in the run.
    - project_name (str): The name of the Weights & Biases project where the run will be logged.
    - chunk_size (int): The size of the chunks to split the dataset into.
    - sentence_chunk_overlap (int): The overlap size between consecutive dataset chunks.

    Returns:
    None
    """
    try:
        # Start a new Weights & Biases run
        wandb.init(project=project_name, config={
        "number_of_questions": len(ds),
        "comments": "Simple QA RAG model with no teleprompter - chunk overlap size 0",
        "model": "RAG",
        "dataset": "Synthetic",
        "num_passages": 5,
        "openai_model": "gpt-3.5-turbo",
        "chroma_collection_name": "test-medical_abstract_collection",
        "chroma_persist_directory": "local_chroma.db",
        "chroma_local_embed_model": "sentence-transformers/paraphrase-MiniLM-L6-v2",

    })

        # Log the result to the current run
        wandb.log(result)

        # Finish the current run to ensure all data is synced
        wandb.finish()
    except Exception as e:
        print(f"An error occurred during the Weights & Biases operation: {e}")
        # Optionally, handle exceptions such as retrying the operation or logging the error to a file

# Start and log the wandb run
start_wandb_run(ds, result)


### Now, let's compile the RAG using teleprompters.

In [30]:
train.reset_index(inplace=True, drop=True)

In [31]:
train = train[:10]

In [32]:
train

Unnamed: 0,question,ground_truths
0,What are some alternative dosage regimens for ...,['Some alternative dosage regimens for rt-PA i...
1,What are the major drawbacks associated with u...,['The major drawbacks of using 125I for therap...
2,What was the effect of oropharyngeal anesthesi...,['Oropharyngeal anesthesia led to an increase ...
3,What was the overall risk of developing second...,['']
4,What is the role of neural cell adhesion molec...,['N-CAM is involved in direct cell-cell adhesi...
5,What are the advantages of using duplex Dopple...,"['The advantages include absence of toxicity, ..."
6,What was the effect of prophylactic peroral ac...,['Prophylactic peroral ACV prevented the devel...
7,What was the predominant type of IgA found in ...,"[""Secretory IgA comprised 92%, 81.6%, and 76.7..."
8,What is the incidence of maternal varicella in...,['Maternal varicella occurs in fewer than five...
9,Why is congenital hypertrophy of the retinal p...,['Congenital hypertrophy of the RPE serves as ...


In [33]:
import ast

trainset = []
for i in range(5):
    ex = dspy.Example(
        question=train['question'].iloc[i],
        answer=ast.literal_eval(train['ground_truths'].iloc[i])[0]
    )
    ex = ex.with_inputs('question')
    trainset.append(ex)

In [34]:
trainset

[Example({'question': 'What are some alternative dosage regimens for rt-PA in patients with myocardial infarction?', 'answer': 'Some alternative dosage regimens for rt-PA in patients with myocardial infarction include bolus, front-loaded, and accelerated infusions.'}) (input_keys={'question'}),
 Example({'question': 'What are the major drawbacks associated with using 125I for therapy in tumours?', 'answer': 'The major drawbacks of using 125I for therapy in tumours include its long 60-day half-life, leading to radiological and waste disposal problems, and the extreme short range of its radiotoxic effects.'}) (input_keys={'question'}),
 Example({'question': 'What was the effect of oropharyngeal anesthesia on obstructive sleep apnea in the study subjects?', 'answer': 'Oropharyngeal anesthesia led to an increase in obstructive apneas and hypopneas, as well as a higher frequency of oxyhemoglobin desaturations during sleep.'}) (input_keys={'question'}),
 Example({'question': "What was the ov

In [39]:
from dspy.teleprompt import BootstrapFewShot
# Validation logic: check that the predicted answer is correct.
# Also check that the retrieved context does actually contain that answer.
def check_answer_and_context_validity(example, pred, trace=None):
    answer_EM = dspy.evaluate.answer_exact_match(example, pred)
    answer_PM = dspy.evaluate.answer_passage_match(example, pred)
    return answer_EM and answer_PM

# Set up a basic teleprompter, which will compile our RAG program.
teleprompter = BootstrapFewShot(metric=check_answer_and_context_validity)

# Compile!
compiled_rag = teleprompter.compile(RetrievalAugmentedGenerator(), trainset=trainset)

100%|██████████| 5/5 [00:00<00:00, 22.63it/s]

Bootstrapped 0 full traces after 5 examples in round 0.





In [40]:
import pandas as pd
import ast

def compile_evaluation_metrics(data, model_response):
    """
    Compiles evaluation metrics from a dataset using responses from a specified model.

    Args:
        data (pd.DataFrame): The dataset containing questions and ground truths.
        model_response (function): The model function to generate responses based on questions.

    Returns:
        pd.DataFrame: A DataFrame containing original questions, model contexts, model answers, and ground truths.
    """
    compiled_data = []

    for _, entry in data.iterrows():
        try:
            question_text = entry['question']
            model_output = model_response(question_text)
            result_entry = {
                'question': question_text,
                'contexts': model_output.context,
                'answer': model_output.answer,
                'ground_truths': entry['ground_truths']
            }
            compiled_data.append(result_entry)
        except Exception as e:
            print(f"Error processing question: {question_text}, Error: {e}")

    evaluation_df = pd.DataFrame(compiled_data)

    # Safely convert 'ground_truths' from string representation to list
    def safe_literal_eval(value):
        try:
            return ast.literal_eval(value)
        except (ValueError, SyntaxError):
            return []  # or return a default value such as ['error_parsing']

    evaluation_df['ground_truths'] = evaluation_df['ground_truths'].apply(safe_literal_eval)

    return evaluation_df


In [41]:
df_eval_results = compile_evaluation_metrics(test, compiled_rag)

In [None]:
ds = Dataset.from_pandas(df_eval_results)

result = evaluate(
    ds,
    metrics=[
        context_precision,
        faithfulness,
        answer_relevancy,
        context_recall,
        answer_similarity,
        context_relevancy
    ],
)

In [None]:
result.to_pandas()