In [1]:
import gradio as gr
import chromadb
import uuid
from pypdf import PdfReader
import google.generativeai as genai
import os
import time
from dotenv import load_dotenv
import re

# Configure the generative AI model.
# load_dotenv() reads a local .env file, which is expected to define
# GOOGLE_API_KEY.
load_dotenv()
try:
    # os.environ[...] raises KeyError when GOOGLE_API_KEY is not set; that
    # case is caught below as well.
    genai.configure(api_key=os.environ['GOOGLE_API_KEY'])
except Exception as e:
    # Best effort: report the problem and keep going so the rest of the cell
    # still runs.
    print(f"Error configuring Google AI. Please ensure your API key is correct. Error: {e}")

# Define file paths for the two source documents. They are resolved relative
# to the current working directory: <cwd>/../documents/.
current_dir = os.getcwd()
FILE_PATH_MDD = os.path.join(current_dir, '..', 'documents', 'MDD.txt')
FILE_PATH_MDR = os.path.join(current_dir, '..', 'documents', 'MDR.txt')
# Name of the ChromaDB collection that holds the chunks of both documents.
COLLECTION_NAME = "mdr_gap_analysis_txt"

# Data processing function. Splits the text into paragraphs on blank lines and
# skips very short ones (25 characters or fewer), such as stray lines or short titles.
def process_txt_file(file_path, source_name):
    """
    Load a UTF-8 text file and turn its paragraphs into chunks.

    Paragraphs are the pieces obtained by splitting the file content on a
    double newline. Each piece is stripped of surrounding whitespace and kept
    only when it is longer than 25 characters.

    Read errors are reported with print() and not raised; in that case the
    returned lists are empty.

    Args:
        file_path (str): The path to the .txt file.
        source_name (str): The name of the source document, stored in the
            metadata of every chunk under the 'source' key.

    Returns:
        tuple: (text_chunks, metadatas), two lists of equal length.
    """
    kept_paragraphs = []
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            raw_text = handle.read()

        # Split on blank lines, trim each piece, drop the short/empty ones.
        trimmed = (piece.strip() for piece in raw_text.split('\n\n'))
        kept_paragraphs = [piece for piece in trimmed if len(piece) > 25]
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
    except Exception as exc:
        print(f"An error occurred while processing {file_path}: {exc}")

    # One metadata dict per chunk; a txt file only provides the source name.
    chunk_metadatas = [{'source': source_name} for _ in kept_paragraphs]
    return kept_paragraphs, chunk_metadatas

# Initialize the RAG System (Vector Database with ChromaDB)
# Everything below runs once when the cell is executed and leaves `collection`
# as a module-level global that get_gap_analysis() reads at query time.

print("Initializing RAG system... This may take a minute.")
start_time = time.time()
client = chromadb.Client()
# Alternative client that stores the database on disk (currently disabled):
#client = chromadb.PersistentClient(path="chroma_db_txt")

# Define the local embedding function (Chroma has its own default, but the
# Sentence Transformers model works well here).
from chromadb.utils import embedding_functions
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")

# Delete the collection if it already exists to ensure a fresh start, then
# create the new one.
# NOTE(review): this assumes list_collections() yields objects with a `.name`
# attribute - confirm against the installed chromadb version.
if COLLECTION_NAME in [c.name for c in client.list_collections()]:
    client.delete_collection(name=COLLECTION_NAME)
    print(f"Deleted existing collection: {COLLECTION_NAME}")

collection = client.get_or_create_collection(
    name=COLLECTION_NAME,
    embedding_function=sentence_transformer_ef
)

# Process and load both documents into the database
try:
    chunks_mdd, metas_mdd = process_txt_file(FILE_PATH_MDD, 'MDD 93/42/EEC')
    print(f"Processed MDD: {len(chunks_mdd)} chunks.")

    chunks_mdr, metas_mdr = process_txt_file(FILE_PATH_MDR, 'MDR 2017/745')
    print(f"Processed MDR: {len(chunks_mdr)} chunks.")

    # Both documents go into the same collection; the 'source' metadata value
    # is what tells them apart later.
    text_chunks = chunks_mdd + chunks_mdr
    metadatas = metas_mdd + metas_mdr
    collection.add(
        # Random UUIDs: one fresh id per chunk on every run.
        ids=[str(uuid.uuid4()) for _ in text_chunks],
        documents=text_chunks,
        metadatas=metadatas
    )
    end_time = time.time()
    print(f"RAG system initialized successfully in {end_time - start_time:.2f} seconds.")
except Exception as e:
    print(f"\nAn error occurred during initialization: {e}")
    # Mark the system as unavailable; get_gap_analysis() checks `collection`
    # and returns an error message instead of querying.
    collection = None

# Core Function for Querying and Generation

def get_gap_analysis(question): # The actual function called by Gradio at runtime
    """
    Answer a regulatory question with an MDD-vs-MDR gap analysis.

    Retrieves the 10 closest chunks from the module-level `collection`,
    groups them by source document, builds a prompt and asks the Gemini
    model for the analysis.

    The Gradio click handler is wired to two output components, so every
    code path returns exactly two values. Returning a single string on the
    failure paths would make Gradio fail with a "not enough output values"
    error instead of showing the message.

    Args:
        question (str): The user's question; may be None or blank.

    Returns:
        tuple: (analysis_markdown, sources_markdown). On validation or
        retrieval failure the first item is the message to display and the
        second is an empty string. On an LLM failure the first item is the
        error message and the second still lists the retrieved sources.
    """
    if not question or not question.strip():
        return "Please enter a question before submitting.", ""
    # Initialization sets `collection` to None when loading the documents failed.
    if collection is None:
        return "Error: The RAG system is not initialized. Please check the file paths and restart the notebook.", ""

    print(f"\nReceived question: {question}")

    # Query the vector store to get relevant context from both documents
    try:
        results = collection.query(query_texts=[question], include = ["documents", "metadatas", "distances"], n_results=10)
    except Exception as e:
        print(f"Error querying the vector store: {e}")
        return f"An error occurred while retrieving context: {e}", ""

    # One query text was sent, so the hits for it are at index 0.
    retrieved_documents = results['documents'][0]
    retrieved_metadatas = results['metadatas'][0]
    retrieved_distances = results['distances'][0]

    source_sections = []
    mdr_parts = []
    mdd_parts = []
    for i, (doc, meta, dist) in enumerate(
        zip(retrieved_documents, retrieved_metadatas, retrieved_distances), start=1
    ):
        source = meta.get('source', 'N/A')
        # Chunks loaded by process_txt_file carry only a 'source' key, so
        # 'page' falls back to 'N/A' for them.
        page = meta.get('page', 'N/A')

        # Convert distance to a more intuitive similarity score (1 - distance).
        # NOTE(review): this is a real similarity only when the collection's
        # distance metric is bounded in [0, 1]; otherwise the score can be
        # negative - confirm the collection's distance setting.
        relevance_score = 1 - dist
        source_sections.append(
            f"**Source {i}:** {source}, Page {page}\n"
            f"**Relevance Score:** {relevance_score:.2f}\n\n"
            f"```\n{doc}\n```\n\n---\n\n"
        )

        # Separate the context by source
        context_entry = f"[Page {page}]: {doc}, distance: {dist}\n\n"
        if source == 'MDR 2017/745':
            mdr_parts.append(context_entry)
        elif source == 'MDD 93/42/EEC':
            mdd_parts.append(context_entry)

    sources_markdown = "### Sources Used for Analysis\n\n" + "".join(source_sections)
    context_mdr = "".join(mdr_parts)
    context_mdd = "".join(mdd_parts)

    # Engineer the prompt for the LLM (mid complexity prompt)
    prompt = f"""
    Act as a Senior Regulatory Consultant for a MedTech company. Your task is to perform a gap analysis based on the user's question, using **exclusively** the provided context from two documents: the old Medical Device Directive (MDD) and the new Medical Device Regulation (MDR).

    Follow this structure for your response:
    1.  **Summary of MDD Requirements:** Based on the MDD context, briefly summarize the old requirements.
    2.  **Summary of MDR Requirements:** Based on the MDR context, summarize the new, more demanding requirements.
    3.  **Gap Analysis:** Clearly highlight the key differences and new obligations the company needs to address.
    4.  **Strategic Recommendation:** Provide a concise, actionable recommendation.

    **Crucial Rule:** If the context for one of the documents is missing or insufficient, state that clearly. Do not invent information.

    ---
    **CONTEXT FROM MDD 93/42/EEC:**
    {context_mdd if context_mdd else "No specific context retrieved."}

    ---
    **CONTEXT FROM MDR 2017/745:**
    {context_mdr if context_mdr else "No specific context retrieved."}

    ---
    **USER QUESTION:** {question}

    **CONSULTANT'S ANALYSIS:**
    """

    # Call the LLM to generate the analysis
    try:
        model = genai.GenerativeModel('gemini-1.5-flash')
        response = model.generate_content(prompt)
        print("Successfully generated response from LLM.")
        return response.text, sources_markdown
    except Exception as e:
        print(f"Error calling the LLM: {e}")
        # Retrieval succeeded, so still show which sources were found.
        return f"An error occurred while generating the response: {e}", sources_markdown

# Create and Launch the Gradio App
# Components are created in the order they should appear on the page.

with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# MDR Gap Analysis AI Companion")
    gr.Markdown("### Welcome.\nPose your question below to analyze the regulatory differences between the MDD and MDR.")

    # Free-text question box
    with gr.Row():
        question_input = gr.Textbox(
            label="Your Question",
            placeholder="e.g., How have the requirements for post-market surveillance changed from the MDD to the MDR?",
            lines=3
        )

    submit_button = gr.Button("Get Analysis!")

    # First output: the analysis text returned by the LLM
    with gr.Row():
        answer_output = gr.Markdown(label="Consultant's Analysis")

    # Second output: the retrieved chunks, collapsed by default
    with gr.Accordion("View Sources and Relevance Scores", open=False):
        sources_output = gr.Markdown(label="Retrieved Context")

    # Define the examples to show in the UI
    examples = [
        "How have the requirements for post-market surveillance changed from the MDD to the MDR?",
        "What are the new requirements for the Unique Device Identification (UDI) system under the MDR, and how does this compare to the MDD?",
        "What are the most significant new clinical evaluation requirements for a Class IIa device to be compliant with the MDR?"
    ]
    gr.Examples(examples=examples, inputs=question_input)

    # Link the button to the function. Two output components are listed, so
    # get_gap_analysis is expected to return two values (analysis, sources).
    submit_button.click(
        fn=get_gap_analysis,
        inputs=question_input,
        outputs=[answer_output, sources_output]
    )

# Launch the app. In a Jupyter Notebook, the interface will appear directly in the cell output.
# NOTE(review): share=True also exposes the app on a temporary public
# gradio.live URL - confirm that is intended for this demo.
demo.launch(debug=True, share=True)

Initializing RAG system... This may take a minute.
Processed MDD: 724 chunks.
Processed MDR: 2795 chunks.
RAG system initialized successfully in 17.71 seconds.
Running on local URL:  http://127.0.0.1:7860
Running on public URL: https://a7be6ff73162c9686d.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)



Received question: What are the most significant new clinical evaluation requirements for a Class IIa device to be compliant with the MDR?
Successfully generated response from LLM.

Received question: What are the new requirements for the Unique Device Identification (UDI) system under the MDR, and how does this compare to the MDD?
Successfully generated response from LLM.

Received question: What are the most significant new clinical evaluation requirements for a Class IIa device to be compliant with the MDR?
Successfully generated response from LLM.

Received question: What does the MDR in Article 83 require for a manufacturer's post-market surveillance system, and what is the equivalent requirement in the MDD?
Successfully generated response from LLM.

Received question: Describe the general requirements for a clinical evaluation as outlined in MDR Annex XIV, Part A, and compare them to the clinical data requirements in MDD Annex X. 
Successfully generated response from LLM.
Keyboa

