In [12]:
import os
import re
import uuid

import chromadb
import google.generativeai as genai
from dotenv import load_dotenv
from pypdf import PdfReader

# --- 1. Setup: API Key, Embedding Model, and ChromaDB Client ---

# Load variables from a local .env file (if present) into the process
# environment, then hand the Gemini API key to the SDK.
load_dotenv()

# Fail fast with an actionable message instead of a bare KeyError when the
# key is missing or empty. KeyError is kept as the exception type so the
# failure mode is the same as indexing os.environ directly.
if not os.environ.get('GOOGLE_API_KEY'):
    raise KeyError(
        "GOOGLE_API_KEY is not set. Add it to your .env file or export it "
        "in the environment before running this notebook."
    )

genai.configure(api_key=os.environ['GOOGLE_API_KEY'])

In [13]:
# Start a ChromaDB client backed by a local, in-memory database.
client = chromadb.Client()

# Fetch the collection if it already exists, otherwise create it.
# No embedding_function argument is passed here, so the collection falls back
# to ChromaDB's built-in default embedding function when it has to turn
# documents and queries into vectors.
collection = client.get_or_create_collection(name='mdr_plain_chroma')


# --- 2. Load and Split the Document (Manual "Chunking") ---

# Chunks whose stripped length is not greater than this are skipped
# (very short fragments carry little retrievable content).
MIN_CHUNK_CHARS = 50

# Paragraph boundary: a blank line, tolerating whitespace-only lines.
# Extracted PDF text can carry trailing spaces before a newline, so a literal
# '\n\n' split would miss boundaries such as '\n \n'.
PARAGRAPH_BREAK = re.compile(r'\n\s*\n')

# The PDF is resolved relative to the current working directory.
current_dir = os.getcwd()
file_path = os.path.join(current_dir, '..', 'documents', 'MDR.pdf')

# Parallel lists: metadatas[i] describes text_chunks[i].
text_chunks = []
metadatas = []

# Create a reader object
reader = PdfReader(file_path)

# Iterate through each page; start=1 so stored page numbers are 1-based.
for page_num, page in enumerate(reader.pages, start=1):
    text = page.extract_text()
    if not text:  # Nothing was extracted for this page
        continue
    # A simple chunking strategy: split text by paragraphs
    for para in PARAGRAPH_BREAK.split(text):
        chunk = para.strip()
        if len(chunk) > MIN_CHUNK_CHARS:
            text_chunks.append(chunk)
            metadatas.append({'page': page_num})

In [14]:
# --- 3. Add Documents to the Collection ---

# Stop early with a clear message if chunking produced nothing; there would
# be nothing to embed or query.
if not text_chunks:
    raise ValueError(
        "No text chunks were extracted from the PDF; nothing to add to the collection."
    )

# Deterministic ids derived from each chunk's page and position.
# Random ids would insert a second copy of every chunk each time this cell is
# re-run against the same collection; stable ids combined with upsert make
# the cell safe to re-run. The position index keeps ids unique even when two
# chunks share a page.
chunk_ids = [
    str(uuid.uuid5(uuid.NAMESPACE_URL, f"MDR.pdf/page-{meta['page']}/chunk-{position}"))
    for position, meta in enumerate(metadatas)
]

# Write the plain text chunks to the collection. No embeddings are passed,
# so the collection computes them with its own embedding function.
collection.upsert(
    ids=chunk_ids,
    documents=text_chunks,
    metadatas=metadatas
)

print(f"Successfully added {len(text_chunks)} text chunks to the collection.")

Successfully added 175 text chunks to the collection.


In [20]:
# --- 4. Query the Collection and Generate a Response ---

# The user question that drives retrieval (and, later, the LLM prompt).
question = "What are the obligations of a manufacturer?"

# Similarity search: the raw question text is passed in and the collection
# returns the 3 closest stored chunks.
results = collection.query(n_results=3, query_texts=[question])

# Only one query text was sent, so its hits sit at index 0 of each field
# of the result (documents, metadatas, ...).
retrieved_documents = results['documents'][0]

# Bare expression so the notebook displays the retrieved chunks.
retrieved_documents

["3. The person responsible f or regulatory compl iance shall at least be responsible f or ensur ing that: \n(a)  the conf or mity of the devices is appropr iately c heck ed, in accordance with the quality manag ement system under \nwhich the devices are manufact ured, bef ore a device is released; \n(b)  the tec hnical documentation and the EU declaration of conf or mity are dra wn up and kep t up-to-dat e; \n(c)  the post-market sur veillance obligations are compl ied with in accordance with Ar ticle 10(10); \n(d)  the repor ting obligations refer red to in Ar ticles 87 to 91 are fulfilled ; \n(e)  in the case of inve stigational devices, the statement refer red to in Section 4.1 of Chapt er II of Annex XV is issued. \n4. If a number of persons are jointly responsible f or regulator y compliance in accordance with paragraphs 1, 2 and 3, \ntheir respective areas of responsibility shall be stipulat ed in wr iting. \n5. The person responsible f or regulat or y compl iance shall suffer n

In [19]:
# --- 5. Combine Results and Call the LLM ---

# Build the context from the retrieved chunks, labelling each one with the
# page it came from. The prompt below requires the model to cite an article,
# section, or page number "from the provided context"; without these labels
# the page number is simply not available to the model.
# Documents and metadatas are both read from `results` so they stay aligned.
context = "\n\n---\n\n".join(
    f"[Source: page {meta['page']}]\n{document}"
    for document, meta in zip(results['documents'][0], results['metadatas'][0])
)

# Create the prompt for the LLM
prompt = f"""
You are an AI assistant acting as a Senior Regulatory Consultant specializing in the EU MDR. Your task is to answer the user's question clearly and accurately, based **exclusively** on the provided context.

Your tone must be professional, authoritative, and helpful. Explain complex points in simple terms for a client.

Follow this structure for your response:
1.  **Salutation:** Start with a professional greeting (e.g., "Thank you for your question.").
2.  **Direct Answer:** Provide a concise, synthesized answer to the user's question.
3.  **Explanation & Implication:** Briefly explain *why* this is the case or what the implication is for the company.
4.  **Source Citation:** You **MUST** cite the specific article, section, or page number from the provided context that supports your answer.

Do not output the numbers of the structure and the title (as **Salutation**).
**Crucial Rule:** If the answer cannot be found in the provided context, you must state: "The provided context does not contain specific information on this topic." Do not invent an answer.

Context:
{context}

Question: {question}

Answer:
"""

# Call the Gemini API
model = genai.GenerativeModel('gemini-1.5-flash')
response = model.generate_content(prompt)

# `response.text` raises ValueError when the response carries no usable text
# part (for example when the request was blocked). Report that instead of
# ending the cell with a traceback.
try:
    answer_text = response.text
except ValueError:
    answer_text = (
        "[No answer text was returned by the model. "
        f"Prompt feedback: {response.prompt_feedback}]"
    )

print("\n--- Question ---")
print(question)
print("\n--- Answer from LLM ---")
print(answer_text)
print("\n--- Sources ---")
print(results['metadatas'][0])


--- Question ---
What are the obligations of a manufacturer?

--- Answer from LLM ---
Thank you for your question.

Manufacturers must have at least one person responsible for regulatory compliance within their organization who possesses the requisite expertise in the field of medical devices.  This expertise can be demonstrated by either a relevant diploma/certificate and one year of experience or four years of professional experience in regulatory affairs or quality management systems relating to medical devices. Micro and small enterprises are exempt from this requirement but must have such a person permanently and continuously at their disposal.  This is detailed in Article 15, paragraphs 1 and 2.

Furthermore, prior to placing a device on the market, manufacturers must undertake a conformity assessment in accordance with the applicable conformity assessment procedures set out in Annexes IX to XI (Article 52, paragraph 1).


--- Sources ---
[{'page': 29}, {'page': 50}, {'page': 28