In [80]:
from langchain_community.document_loaders import JSONLoader
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain_openai import ChatOpenAI
from langchain.schema import Document
from langchain_community.vectorstores.utils import filter_complex_metadata

import json
    
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

loader = JSONLoader(
    file_path=r"C:\Users\FahRe\Desktop\agentic-LLM-app\backend\data\atc2.json",
    jq_schema=".ATC_Codes[]",
    text_content=False  # Important: this avoids the ValueError you hit
)

# Load raw documents
raw_docs = loader.load()
documents = []



for doc in raw_docs:
    if isinstance(doc.page_content, str):
        data = json.loads(doc.page_content)
    elif isinstance(doc.page_content, dict):
        data = doc.page_content
    else:
        raise ValueError("Unsupported page_content type")

    text = f"""
    Product-Medikament: {data.get("Product-Medikament")}
    Beschreibung: {data.get("Beschreibung")}
    Anwendung: {data.get("Anwendung")}
    Gruppe: {data.get("Gruppe")}
    Hauptkategorie ATC: {data.get("ATC Oberkategorie")}
    Unterkategorie ATC: {data.get("ATC Unterkategorie")}
    """
    
    documents.append(Document(page_content=text.strip(), metadata=data))

llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
db = Chroma.from_documents(
    documents=[
        Document(page_content=doc.page_content, metadata=filter_complex_metadata(doc.metadata))
        for doc in documents
    ],
    embedding=embedding_function
)



AttributeError: 'str' object has no attribute 'metadata'

In [87]:
from langchain_community.document_loaders import JSONLoader
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.schema import Document
import json
from langchain_openai import OpenAIEmbeddings
import os
from dotenv import load_dotenv
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")


# ✅ Utility to clean metadata
def clean_metadata(metadata: dict) -> dict:
    """Ensure all metadata values are Chroma-compatible (str, int, float, bool, None)."""
    allowed_types = (str, int, float, bool, type(None))
    cleaned = {}
    for k, v in metadata.items():
        if isinstance(v, allowed_types):
            cleaned[k] = v
        elif isinstance(v, list):
            cleaned[k] = ", ".join(map(str, v))  # Convert list to CSV string
        else:
            cleaned[k] = str(v)  # Fallback: convert everything else to string
    return cleaned

# Step 1: Initialize embedding model
#embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
embedding_function = OpenAIEmbeddings(model="text-embedding-ada-002", openai_api_key=OPENAI_API_KEY)


# Step 2: Load JSON
loader = JSONLoader(
    file_path=r"C:\Users\FahRe\Desktop\agentic-LLM-app\backend\data\atc2.json",
    jq_schema=".ATC_Codes[]",
    text_content=False
)

# Step 3: Load raw documents
raw_docs = loader.load()
documents = []

# Step 4: Build cleaned Documents
for doc in raw_docs:
    data = json.loads(doc.page_content) if isinstance(doc.page_content, str) else doc.page_content

    # Format the readable content
    text = f"""
    Product-Medikament: {data.get("Product-Medikament")}
    Beschreibung: {data.get("Beschreibung")}
    Anwendung: {data.get("Anwendung")}
    Gruppe: {data.get("Gruppe")}
    ATC Oberkategorie: {data.get("ATC Oberkategorie")}
    ATC Unterkategorie: {data.get("ATC Unterkategorie")}
    """
    safe_metadata = clean_metadata(data)
    safe_metadata["source"] = "atc2.json - ATC-Code sortierte Textbausteine aktuell"
    
    documents.append(Document(page_content=text.strip(), metadata=safe_metadata))

# Step 5: Create vectorstore
db = Chroma.from_documents(documents, embedding_function, persist_directory = r"C:\Users\FahRe\Desktop\agentic-LLM-app\backend\data\chroma_db_openai2" )


In [94]:
from langchain_community.vectorstores import Chroma
db = Chroma(persist_directory = r"C:\Users\FahRe\Desktop\agentic-LLM-app\backend\data\chroma_db_openai2", embedding_function=embedding_function)
query = "Wie sollte ich Chlorhexamed forte answenden?"
docs = db.similarity_search(query, k=3)
retrieved_text = "\n".join([doc.page_content for doc in docs])
top_source = f"{docs[0].metadata.get('source', 'unbekannt')} (Seite {docs[0].metadata.get('page', '?')})"


# Optional: print context
for i, d in enumerate(docs):
    print(f"\n--- Top-{i+1} Similar Doc ---\n{d.page_content}")

# Prompt and chain
prompt = PromptTemplate(
    input_variables=["context", "question", "source"],
    template="""
    Du bist ein medizinischer Assistent. Nutze den folgenden Kontext, um die Frage so präzise wie möglich zu beantworten.
    Wenn die Antwort nicht im Kontext enthalten ist, gib das ehrlich an. Antworte kurz und sachlich.

    Kontext: {context}
    
    Quelle:
    {source}

    Frage: {question}
    Antwort:
    """
)
qa_chain = LLMChain(llm=llm, prompt=prompt)
response = qa_chain.run({
    "context": retrieved_text,
    "source": top_source,
    "question": query
})

print("\nAntwort:", response)


--- Top-1 Similar Doc ---
Product-Medikament: Chlorhexamed / Chlorhexamed forte
    Beschreibung: Lösung: Enthält den Wirkstoff Chlorhexidin und ist ein Mund- und Rachendesinfiziens zur vorübergehenden unterstützenden Behandlung bei Zahnfleischentzündungen (Gingivitis) und nach parodontalchirurgischen Eingriffen (Eingriffe am Zahnfleisch).
    Anwendung: Chlorhexamed (forte) Lösung: Nach dem Zähneputzen mindestens 5 Minuten warten bis zur Anwendung. Mit der unverdünnten Lösung den Mund spülen oder gurgeln.
    Gruppe: None
    ATC Oberkategorie: A Alimentäres System und Stoffwechsel
    ATC Unterkategorie: A01 Stomatologika

--- Top-2 Similar Doc ---
Product-Medikament: Chlorhexamed / Chlorhexamed forte
    Beschreibung: Gel: Zur Behandlung und Vorbeugung von bakteriellen und mykotischen Entzündungen der Mundschleimhaut, des Zahnfleisches, des Zahnbettes und bei Aphten verwendet.
    Anwendung: Chlorhexamed Gel: Mit einem Wattestäbchen eine kleine Menge direkt auf die entzündete Stell

In [1]:
from langchain_community.document_loaders.excel import UnstructuredExcelLoader
from langchain.chains import RetrievalQA
from langchain.vectorstores import FAISS
from langchain_openai import ChatOpenAI
from langchain.text_splitter import CharacterTextSplitter
import os
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings

api_key = os.getenv("OPENAI_API_KEY")

file_path = r"C:\Users\FahRe\Desktop\agentic-LLM-app\backend\data\Interaktionen nach IA-Nummern.xlsx"
file_path2 = r"C:\Users\FahRe\Desktop\agentic-LLM-app\backend\data\ATC-Code sortierte Textbausteine aktuell.xlsx"
loader1 = UnstructuredExcelLoader(file_path=file_path, sheet_name="Tabelle1")
loader2 = UnstructuredExcelLoader(file_path=file_path, sheet_name="Tabelle2")
loader3 = UnstructuredExcelLoader(file_path=file_path2, sheet_name="A")
loader4 = UnstructuredExcelLoader(file_path=file_path2, sheet_name="B")
loader5 = UnstructuredExcelLoader(file_path=file_path2, sheet_name="C")
loader6 = UnstructuredExcelLoader(file_path=file_path2, sheet_name="D")
loader7 = UnstructuredExcelLoader(file_path=file_path2, sheet_name="G")
loader8 = UnstructuredExcelLoader(file_path=file_path2, sheet_name="H")
loader9 = UnstructuredExcelLoader(file_path=file_path2, sheet_name="J")
loader10 = UnstructuredExcelLoader(file_path=file_path2, sheet_name="L")
loader11 = UnstructuredExcelLoader(file_path=file_path2, sheet_name="M")
loader12 = UnstructuredExcelLoader(file_path=file_path2, sheet_name="N")
loader13 = UnstructuredExcelLoader(file_path=file_path2, sheet_name="P")
loader14 = UnstructuredExcelLoader(file_path=file_path2, sheet_name="R")
loader15 = UnstructuredExcelLoader(file_path=file_path2, sheet_name="S")
loader16 = UnstructuredExcelLoader(file_path=file_path2, sheet_name="V")


docs_sheet1 = loader1.load()
docs_sheet2 = loader2.load()
docs_sheet3 = loader3.load()
docs_sheet4 = loader4.load()
docs_sheet5 = loader5.load()
docs_sheet6 = loader6.load()
docs_sheet7 = loader7.load()
docs_sheet8 = loader8.load()
docs_sheet9 = loader9.load()
docs_sheet10 = loader10.load()
docs_sheet11 = loader11.load()
docs_sheet12 = loader12.load()
docs_sheet13 = loader13.load()
docs_sheet14 = loader14.load()
docs_sheet15 = loader15.load()
docs_sheet16 = loader16.load()


docs = docs_sheet1 + docs_sheet2 + docs_sheet3  + docs_sheet4  + docs_sheet5  + docs_sheet6 + docs_sheet7 + docs_sheet8 + docs_sheet9 + docs_sheet10  + docs_sheet11 + docs_sheet12 + docs_sheet13 + docs_sheet14 + docs_sheet15 + docs_sheet16
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=8000,
    chunk_overlap=1000,
    length_function=len
)
split_docs = text_splitter.split_documents(docs)
#embedding_function = OpenAIEmbeddings(api_key=api_key, model="text-embedding-ada-002")
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
db = FAISS.from_documents(split_docs, embedding_function)
retriever = db.as_retriever(search_kwargs={"k": 1})


Created a chunk of size 9698, which is longer than the specified 8000
Created a chunk of size 15270, which is longer than the specified 8000
Created a chunk of size 8758, which is longer than the specified 8000
Created a chunk of size 11226, which is longer than the specified 8000
Created a chunk of size 8639, which is longer than the specified 8000
Created a chunk of size 10245, which is longer than the specified 8000
Created a chunk of size 11622, which is longer than the specified 8000
Created a chunk of size 12239, which is longer than the specified 8000
Created a chunk of size 13803, which is longer than the specified 8000
Created a chunk of size 13682, which is longer than the specified 8000
Created a chunk of size 11496, which is longer than the specified 8000
Created a chunk of size 10869, which is longer than the specified 8000
Created a chunk of size 8642, which is longer than the specified 8000
Created a chunk of size 13449, which is longer than the specified 8000
Created a 

In [2]:
qa = RetrievalQA.from_chain_type(
    llm=ChatOpenAI(temperature=0, model="gpt-4.1-mini", max_tokens=1000),
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,)


query = "Wie sollte ich Vita Hepa einnehmen?"
result = qa({"query": query})
print(result['result'])

  result = qa({"query": query})


In den bereitgestellten Informationen ist keine Angabe zur Einnahme von Vita Hepa enthalten. Daher kann ich Ihnen leider keine genaue Auskunft zur Einnahme von Vita Hepa geben. Bitte konsultieren Sie die Packungsbeilage des Medikaments oder wenden Sie sich an Ihren Arzt oder Apotheker für genaue Anweisungen zur Einnahme.
