In [1]:
import os
import dspy

In [8]:
import sys
from pathlib import Path

# Resolve the directory the notebook kernel is running from.
current_dir = Path().resolve()

# The base directory is the immediate parent of the notebook directory (one level up).
base_dir = current_dir.parent

# Make modules that live in the base directory importable. The membership check
# keeps the cell idempotent: re-running it does not pile up duplicate entries.
if str(base_dir) not in sys.path:
    sys.path.append(str(base_dir))

from db_retriever_module import ChromadbRetrieverModule

In [None]:
class AnswerGenerator(dspy.Signature):
    """Answer the question with a short factual answer, using the context when it is relevant."""

    # NOTE: DSPy uses a Signature's docstring as the task instructions given to the
    # language model, so the docstring above is phrased as an instruction rather
    # than as API documentation. Developer-facing notes live in these comments:
    #   context  -- input: text that may contain facts relevant to the answer.
    #   question -- input: the question for which an answer is sought.
    #   answer   -- output: short factual answer, aimed at 1 to 15 words.
    # `desc` is DSPy's own keyword for a field description.
    context = dspy.InputField(desc="May contain relevant facts for answering.")
    question = dspy.InputField(desc="The question to be answered.")
    answer = dspy.OutputField(desc="Short factual answer to the question, typically 1-15 words.")

class RetrievalAugmentedGenerator(dspy.Module):
    """
    Retrieval-Augmented Generation (RAG) pipeline: retrieve passages, then answer.

    For each question the module first asks the configured retriever for passages,
    then hands the question together with those passages to a chain-of-thought
    generator built from the AnswerGenerator signature.

    Args:
        num_passages (int): Number of passages to retrieve per question. Defaults to 5.

    Attributes:
        num_passages (int): Number of passages requested from the retriever.
        retrieve (dspy.Retrieve): The retrieval component, created with k=num_passages.
        generate_answer (dspy.ChainOfThought): The answer generation component.
    """
    def __init__(self, num_passages=5):
        super().__init__()
        # Store the setting so the documented attribute actually exists on instances.
        self.num_passages = num_passages
        self.retrieve = dspy.Retrieve(k=num_passages)
        self.generate_answer = dspy.ChainOfThought(AnswerGenerator)

    def forward(self, question):
        """
        Retrieve passages for the question and generate an answer from them.

        Args:
            question (str): The input question for which an answer is needed.

        Returns:
            dspy.Prediction: Carries `context` (the retrieved passages) and
            `answer` (the generated answer).
        """
        context = self.retrieve(question).passages
        prediction = self.generate_answer(context=context, question=question)
        # Return the passages alongside the answer so callers can inspect what
        # the answer was based on.
        return dspy.Prediction(context=context, answer=prediction.answer)


In [None]:
import os
import logging
import dspy

# RetrievalAugmentedGenerator is defined in the cell above, and ChromadbRetrieverModule
# is imported from db_retriever_module near the top of this notebook.

def setup():
    """
    Initializes and configures the DSPy library for a Retrieval-Augmented Generation (RAG) setup.
    
    This involves configuring a language model and a retrieval model with specified settings,
    and then initializing the RAG module with these configurations.

    Returns:
        An instance of the RAG class, ready for use.
        
    Raises:
        EnvironmentError: If the OPENAI_API_KEY environment variable is not set.
        Exception: For general exceptions related to DSPy configuration or RAG initialization.
    """
    try:

        from dotenv import load_dotenv
        load_dotenv()
        
        # Retrieve the OpenAI API key safely
        openai_api_key = os.getenv("OPENAI_API_KEY")
        if not openai_api_key:
            raise EnvironmentError("OPENAI_API_KEY not set in environment variables.")
        
        # Configuration parameters
        MODEL_NAME = 'gpt-3.5-turbo'
        COLLECTION_NAME = "test-overlap-0"
        PERSIST_DIR = "local_chroma.db"
        LOCAL_EMBED_MODEL = "sentence-transformers/paraphrase-MiniLM-L6-v2"
        
        # Configuring the language model
        turbo = dspy.OpenAI(model=MODEL_NAME)
        
        # Configuring the retrieval model
        chroma_rm = ChromadbRetrieverModule(db_collection_name=COLLECTION_NAME, persist_directory=PERSIST_DIR,
                               local_embed_model=LOCAL_EMBED_MODEL, api_key=openai_api_key)
        
        # Apply the configurations
        dspy.settings.configure(lm=turbo, rm=chroma_rm)
        
        # Initialize the RAG module
        rag_instance = RetrievalAugmentedGenerator()
        
        logging.info("RAG setup completed successfully.")
        
        return rag_instance
        
    except EnvironmentError as env_err:
        logging.error(f"Environment error during setup: {env_err}")
        raise
    except Exception as e:
        logging.error(f"Failed to complete RAG setup: {e}")
        raise



In [None]:
# Build the RAG module. As a side effect, setup() registers the language model
# and the retriever with dspy.settings.
rag = setup()

In [None]:
# Load the synthetic dataset that was saved as synthetic_dataset.csv by the
# "validate synthetics data" notebook.
import pandas as pd

# NOTE(review): file_path is not used anywhere in this cell -- confirm whether a
# later cell still needs it.
file_path = os.path.join(os.getcwd(), 'medical_tc_test.csv')

# Read the CSV from ./processed and keep only the question / ground-truth columns.
df = pd.read_csv(
    os.path.join(os.getcwd(), "processed", "synthetic_dataset.csv")
)[['question', 'ground_truths']]