In [27]:
import os
import uuid

import chromadb
import google.generativeai as genai
from chromadb.utils import embedding_functions
from dotenv import load_dotenv
from pypdf import PdfReader

# --- 1. Setup: API Key, Embedding Model, and ChromaDB Client ---

# Configure the Gemini SDK with the key stored in the GOOGLE_API_KEY environment variable.
# load_dotenv() runs first; presumably it populates os.environ from a local .env file
# (python-dotenv) -- confirm a .env with GOOGLE_API_KEY exists next to the notebook.
# The key is read with os.environ[...], so a missing variable raises KeyError here
# instead of failing later at the first API call.
load_dotenv()
genai.configure(api_key=os.environ['GOOGLE_API_KEY'])

In [28]:
# Initialize the ChromaDB client (this uses a local, in-memory database)
client = chromadb.Client()

# Embedding function shared by documents and queries.
# The model name is pinned explicitly (it is the library's documented default) so
# the vectors stay comparable even if a future ChromaDB release changes that default.
# `embedding_functions` is imported in the notebook's top import cell.
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="all-MiniLM-L6-v2"
)

# Create (or reuse) the ChromaDB collection, telling it which embedding function to use.
# ChromaDB will use this to automatically create vectors for the documents and queries.
collection = client.get_or_create_collection(
    name='mdd_mdr_chroma',
    embedding_function=sentence_transformer_ef
)


def process_pdf_with_smart_chunking(reader, source_name, min_title_len=5, max_title_len=80):
    """
    Split a PDF into text chunks, starting a new chunk at every line that
    looks like a title so that titles stay with the paragraphs that follow.

    Chunks never span pages: whatever has been accumulated when a page ends
    is emitted as its own chunk. Chunks that contain only whitespace (for
    example the blank lines that precede a title) are dropped instead of
    being emitted as empty strings.

    Args:
        reader (PdfReader): An initialized pypdf PdfReader object. Only
            `reader.pages` and each page's `extract_text()` are used.
        source_name (str): The name of the source document for metadata.
        min_title_len (int): A title must be strictly longer than this many
            characters (after stripping). Defaults to 5.
        max_title_len (int): A title must be strictly shorter than this many
            characters (after stripping). Defaults to 80.

    Returns:
        tuple: A tuple containing two parallel lists: (text_chunks, metadatas).
            Each metadata entry is {'source': source_name, 'page': <1-based page>}.
    """
    all_text_chunks = []
    all_metadatas = []

    def _flush(chunk, page_number):
        """Record `chunk` for `page_number` unless it is blank after stripping."""
        cleaned = chunk.strip()
        if cleaned:
            all_text_chunks.append(cleaned)
            all_metadatas.append({'source': source_name, 'page': page_number})

    # Pages are numbered from 1 in the metadata.
    for page_num, page in enumerate(reader.pages, start=1):
        text = page.extract_text()
        if not text:
            # Nothing extractable on this page (None or empty string).
            continue

        current_chunk = ""

        # Work line by line to have granular control over chunk boundaries.
        for line in text.split('\n'):
            stripped_line = line.strip()

            # Heuristic for titles: short, fully uppercase, not ending with a period.
            is_potential_title = (
                min_title_len < len(stripped_line) < max_title_len
                and stripped_line.isupper()
                and not stripped_line.endswith('.')
            )

            if is_potential_title:
                # Close the chunk collected so far, then open a new one with the title.
                _flush(current_chunk, page_num)
                current_chunk = stripped_line + "\n"
            else:
                # Ordinary text: append to the chunk in progress.
                current_chunk += line + " "

        # Emit whatever is left when the page ends.
        _flush(current_chunk, page_num)

    return all_text_chunks, all_metadatas

# --- 2. Load and Split the Document (Manual "Chunking") ---

# Both PDFs are looked up in ../documents, relative to the current working directory.
current_dir = os.getcwd()
file_path1, file_path2 = (
    os.path.join(current_dir, '..', 'documents', pdf_name)
    for pdf_name in ('MDD.pdf', 'MDR.pdf')
)

# Start from empty results for each document.
text_chunks1, metadatas1 = [], []
text_chunks2, metadatas2 = [], []

# Open one reader per PDF.
reader1, reader2 = PdfReader(file_path1), PdfReader(file_path2)

# Chunk each document, tagging every chunk with its source name.
print("Processing MDD document...")
text_chunks1, metadatas1 = process_pdf_with_smart_chunking(reader1, 'MDD 93/42/EEC')

print("Processing MDR document...")
text_chunks2, metadatas2 = process_pdf_with_smart_chunking(reader2, 'MDR 2017/745')

# Merge both documents into parallel lists: MDD chunks first, then MDR chunks.
text_chunks = [*text_chunks1, *text_chunks2]
metadatas = [*metadatas1, *metadatas2]

Processing MDD document...
Processing MDR document...


In [29]:
# --- 3. Add Documents to the Collection ---

# Give every chunk a deterministic ID derived from its source, page and position.
# Random IDs would make each re-run of this cell append another full copy of the
# corpus (the collection is obtained with get_or_create_collection, so it survives
# between runs within the same kernel). Stable IDs plus upsert keep the cell idempotent.
chunk_ids = [
    str(uuid.uuid5(uuid.NAMESPACE_URL, f"{meta['source']}|{meta['page']}|{position}"))
    for position, meta in enumerate(metadatas)
]

# Insert new chunks / overwrite existing ones with the same ID.
# ChromaDB uses the collection's embedding function to vectorize the plain text.
collection.upsert(
    ids=chunk_ids,
    documents=text_chunks,
    metadatas=metadatas
)

print(f"Successfully added {len(text_chunks)} text chunks to the collection.")

Successfully added 361 text chunks to the collection.


In [30]:
# --- 4. Query the Collection and Generate a Response ---

question = "My company has a Class IIa device that was compliant with the MDD. What are the most significant new clinical evaluation requirements I need to address to be compliant with the MDR?"

# Ask ChromaDB for the ten chunks closest to the question; it embeds the question
# with the collection's embedding function.
# NOTE(review): the query is not filtered by source, so the top ten hits are not
# guaranteed to include chunks from both the MDD and the MDR.
results = collection.query(query_texts=[question], n_results=10)

# Results are nested per query text; index 0 is the single question asked above.
retrieved_documents, retrieved_metadatas = results['documents'][0], results['metadatas'][0]
retrieved_documents, retrieved_metadatas

(['Ar ticle 9  Common specif ications  1. Without prejudice to Ar ticle 1(2) and 17(5) and the deadline laid do wn in those pro visions, where no har monised  standards exist or where relevant har monised standards are not suff icient, or where there is a need to address public  health concer ns, the Commission, af ter hav ing consulted the MDCG, ma y , by means of im plementing acts, adopt  common specifications (CS) in respect of the g eneral safety and performa nce requirements set out in Annex I, the  tec hnical documentation set out in Annex es II and III, the clinical evaluation and post-market clinical f ollo w-up set out  in Annex XIV or the requirements regarding clinical investig ation set out in Annex XV . Those implementing acts shall be  adopt ed in accordance with the examination procedure refer red to in Ar ticle 114(3).  2. Devices that are in conf or mity with the CS refe r red to in paragraph 1 shall be presumed to be in conf or mity with  the requirements of this Reg

In [31]:
# --- 5. Combine Results and Call the LLM ---

# Group the retrieved chunks by the document they came from, using the 'source'
# metadata. Chunks with no 'source' or an unrecognised one are left out.
chunks_by_source = {'MDR 2017/745': [], 'MDD 93/42/EEC': []}

for doc, meta in zip(retrieved_documents, retrieved_metadatas):
    if 'source' in meta and meta['source'] in chunks_by_source:
        # Each chunk is followed by a blank line in the final context.
        chunks_by_source[meta['source']].append(doc + "\n\n")

context_mdr = "".join(chunks_by_source['MDR 2017/745'])
context_mdd = "".join(chunks_by_source['MDD 93/42/EEC'])

# Build the gap-analysis prompt: instructions first, then the MDD and MDR context
# blocks (or a placeholder when nothing was retrieved for a source), then the question.
prompt = f"""
Act as a Senior Regulatory Consultant for a MedTech company. Your task is to perform a gap analysis based on the user's question, using **exclusively** the provided context from two documents: the old Medical Device Directive (MDD) and the new Medical Device Regulation (MDR).

Follow this structure for your response:
1.  **Summary of MDD Requirements:** Based on the MDD context, briefly summarize the old requirements for the topic.
2.  **Summary of MDR Requirements:** Based on the MDR context, summarize the new, more demanding requirements.
3.  **Gap Analysis:** Clearly highlight the key differences and new obligations the company needs to address.
4.  **Strategic Recommendation:** Provide a concise, actionable recommendation.

**Crucial Rule:** If the context for one of the documents is missing or insufficient, state that clearly. Do not invent information.

---
**CONTEXT FROM MDD 93/42/EEC:**
{context_mdd if context_mdd else "No specific context retrieved."}

---
**CONTEXT FROM MDR 2017/745:**
{context_mdr if context_mdr else "No specific context retrieved."}

---
**USER QUESTION:** {question}

**CONSULTANT'S ANALYSIS:**
"""

# Send the prompt to Gemini.
model = genai.GenerativeModel('gemini-1.5-flash')
response = model.generate_content(prompt)

# Report the question, the model's answer, and the metadata of every retrieved chunk.
print("\n--- Question ---", question, sep="\n")
print("\n--- Answer from LLM ---")
print(response.text)
print("\n--- Sources ---", results['metadatas'][0], sep="\n")


--- Question ---
My company has a Class IIa device that was compliant with the MDD. What are the most significant new clinical evaluation requirements I need to address to be compliant with the MDR?

--- Answer from LLM ---
**1. Summary of MDD Requirements:**

The provided MDD text outlines clinical evaluation requirements focusing on confirming device performance and identifying undesirable side effects under normal use.  This could involve a compilation of existing scientific literature or clinical investigations.  The investigations must follow ethical guidelines (Helsinki Declaration), have a well-defined plan, and include sufficient observations for valid conclusions.  All adverse incidents must be recorded and reported.  A written report critically evaluating all data is mandatory.  The MDD text does not specify the extent or detail required for clinical data or investigations for Class IIa devices specifically.


**2. Summary of MDR Requirements:**

The provided MDR text mentio

In [21]:
# Cleanup: drop the collection from the client so a later run starts from scratch.
client.delete_collection(name="mdd_mdr_chroma")