In [52]:
import os
from dotenv import load_dotenv
load_dotenv()

True

In [53]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_openai import OpenAIEmbeddings
from langchain.schema import Document
from langchain_community.vectorstores import Chroma
import numpy as np
from typing import List

## Beispieldaten erstellen

In [54]:
import os

os.makedirs("data", exist_ok=True)

sample_docs = {
    "rag_intro.txt": """Retrieval-Augmented Generation (RAG) is a technique 
that combines information retrieval with generative models. 
It allows language models to access external knowledge sources 
to provide more accurate and up-to-date answers.""",

    "rag_architecture.txt": """The RAG architecture typically consists of two parts: 
a retriever and a generator. The retriever finds relevant documents 
from a knowledge base, while the generator (often a large language model) 
uses those documents to produce responses.""",

    "rag_usecases.txt": """RAG can be applied in many areas such as 
question answering, chatbots, document summarization, and enterprise search. 
It improves reliability by grounding responses in factual documents.""",

"rag_challenges.txt": """Some challenges of RAG include retrieving irrelevant documents,
managing large-scale knowledge bases, and ensuring that the generator
uses the retrieved documents correctly without introducing hallucinations.""",

    "rag_future.txt": """The future of RAG research focuses on improving retriever models,
combining structured and unstructured data, and optimizing efficiency
for real-world enterprise applications.""",

    "rag_vs_finetuningvs_promptengineering.txt": """RAG, fine-tuning, and prompt engineering differ in how they improve 
language models.

Fine-tuning changes the model’s weights by training it on additional 
domain-specific data. This makes the model more accurate for that domain, 
but it is costly and needs retraining when knowledge changes.

RAG (Retrieval-Augmented Generation) does not change the model itself. 
Instead, it connects the model to an external retriever that provides 
relevant documents. The model then generates answers based on both its 
own knowledge and the retrieved context. This keeps results more current 
and fact-based.

Prompt engineering does not retrain the model or add external data. 
It only improves how we ask questions by designing effective prompts. 
This method is simple and fast but limited, since it does not add 
new knowledge.

In short: fine-tuning embeds knowledge, RAG retrieves knowledge, 
and prompt engineering shapes the way knowledge is used."""

}

for filename, content in sample_docs.items():
    with open(os.path.join("data", filename), "w", encoding="utf-8") as f:
        f.write(content)

print("Text files created in 'data' folder:", list(sample_docs.keys()))


Text files created in 'data' folder: ['rag_intro.txt', 'rag_architecture.txt', 'rag_usecases.txt', 'rag_challenges.txt', 'rag_future.txt', 'rag_vs_finetuningvs_promptengineering.txt']


## Text in Chunks aufteilen

## Dokumente laden

In [None]:
from langchain_community.document_loaders import DirectoryLoader, TextLoader

loader = DirectoryLoader(
    "data",                     
    glob="*.txt",              
    loader_cls=TextLoader,      
    loader_kwargs={'encoding': 'utf-8'}  
)

documents = loader.load()

print(f"{len(documents)} Dokumente geladen")
print("\nVorschau erstes Dokument:")
print(documents[0].page_content[:200] + "...")

Loaded 6 documents

First document preview:
The RAG architecture typically consists of two parts: 
a retriever and a generator. The retriever finds relevant documents 
from a knowledge base, while the generator (often a large language model) 
u...


In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,     
    chunk_overlap=50,   
    length_function=len,
    separators=["\n\n", "\n", " ", ""]
)

chunks = text_splitter.split_documents(documents)

print(f"{len(chunks)} Chunks aus {len(documents)} Dokumenten erstellt")

print("\nChunk-Beispiel:")
print(f"Inhalt: {chunks[0].page_content[:150]}...")
print(f"Metadaten: {chunks[0].metadata}")

Created 8 chunks from 6 documents

Chunk example:
Content: The RAG architecture typically consists of two parts: 
a retriever and a generator. The retriever finds relevant documents 
from a knowledge base, whi...
Metadata: {'source': 'data\\rag_architecture.txt'}


In [57]:
chunks

[Document(metadata={'source': 'data\\rag_architecture.txt'}, page_content='The RAG architecture typically consists of two parts: \na retriever and a generator. The retriever finds relevant documents \nfrom a knowledge base, while the generator (often a large language model) \nuses those documents to produce responses.'),
 Document(metadata={'source': 'data\\rag_challenges.txt'}, page_content='Some challenges of RAG include retrieving irrelevant documents,\nmanaging large-scale knowledge bases, and ensuring that the generator\nuses the retrieved documents correctly without introducing hallucinations.'),
 Document(metadata={'source': 'data\\rag_future.txt'}, page_content='The future of RAG research focuses on improving retriever models,\ncombining structured and unstructured data, and optimizing efficiency\nfor real-world enterprise applications.'),
 Document(metadata={'source': 'data\\rag_intro.txt'}, page_content='Retrieval-Augmented Generation (RAG) is a technique \nthat combines info

## Embedding-Modell

In [58]:
api_key = os.getenv("OPENAI_API_KEY")

## ChromaDB Vektordatenbank erstellen

In [None]:
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

persist_directory = "./chroma_db"   # Ordner für persistente Speicherung

vectorstore = Chroma.from_documents(
    documents=chunks,                     # Die aufgeteilten Textdokumente
    embedding=OpenAIEmbeddings(),         # Embedding-Modell (Text → Vektoren)
    persist_directory=persist_directory,  # Datenbank auf Festplatte speichern
    collection_name="rag_collection"      # Name der Sammlung in Chroma
)

print(f"Vektordatenbank erstellt mit {vectorstore._collection.count()} Vektoren")
print(f"Gespeichert in: {persist_directory}")

Vector store created with 16 vectors
Persisted to: ./chroma_db


## Ähnlichkeitssuche testen

In [60]:
query = "What is the difference between RAG and fine-tuning?"

results = vectorstore.similarity_search(query, k=3)
results

[Document(metadata={'source': 'data\\rag_vs_finetuningvs_promptengineering.txt'}, page_content='RAG, fine-tuning, and prompt engineering differ in how they improve \nlanguage models.\n\nFine-tuning changes the model’s weights by training it on additional \ndomain-specific data. This makes the model more accurate for that domain, \nbut it is costly and needs retraining when knowledge changes.'),
 Document(metadata={'source': 'data\\rag_vs_finetuningvs_promptengineering.txt'}, page_content='RAG, fine-tuning, and prompt engineering differ in how they improve \nlanguage models.\n\nFine-tuning changes the model’s weights by training it on additional \ndomain-specific data. This makes the model more accurate for that domain, \nbut it is costly and needs retraining when knowledge changes.'),
 Document(metadata={'source': 'data\\rag_usecases.txt'}, page_content='RAG can be applied in many areas such as \nquestion answering, chatbots, document summarization, and enterprise search. \nIt improves

In [None]:
for i, res in enumerate(results, 1):
   print(f"\nErgebnis {i}:")
   print(f"Inhalt: {res.page_content}")
   print(f"Metadaten: {res.metadata}")


Result 1:
Content: RAG, fine-tuning, and prompt engineering differ in how they improve 
language models.

Fine-tuning changes the model’s weights by training it on additional 
domain-specific data. This makes the model more accurate for that domain, 
but it is costly and needs retraining when knowledge changes.
Metadata: {'source': 'data\\rag_vs_finetuningvs_promptengineering.txt'}

Result 2:
Content: RAG, fine-tuning, and prompt engineering differ in how they improve 
language models.

Fine-tuning changes the model’s weights by training it on additional 
domain-specific data. This makes the model more accurate for that domain, 
but it is costly and needs retraining when knowledge changes.
Metadata: {'source': 'data\\rag_vs_finetuningvs_promptengineering.txt'}

Result 3:
Content: RAG can be applied in many areas such as 
question answering, chatbots, document summarization, and enterprise search. 
It improves reliability by grounding responses in factual documents.
Metadata: {'source'

In [None]:
query = "What are some use cases of RAG?"
results = vectorstore.similarity_search(query, k=3)

for i, res in enumerate(results, 1):
    print(f"\nErgebnis {i}:")
    print(f"Inhalt: {res.page_content}")
    print(f"Metadaten: {res.metadata}")


Result 1:
Content: RAG can be applied in many areas such as 
question answering, chatbots, document summarization, and enterprise search. 
It improves reliability by grounding responses in factual documents.
Metadata: {'source': 'data\\rag_usecases.txt'}

Result 2:
Content: RAG can be applied in many areas such as 
question answering, chatbots, document summarization, and enterprise search. 
It improves reliability by grounding responses in factual documents.
Metadata: {'source': 'data\\rag_usecases.txt'}

Result 3:
Content: Some challenges of RAG include retrieving irrelevant documents,
managing large-scale knowledge bases, and ensuring that the generator
uses the retrieved documents correctly without introducing hallucinations.
Metadata: {'source': 'data\\rag_challenges.txt'}


## Ähnlichkeitssuche mit Bewertung

ChromaDB verwendet standardmäßig L2-Distanz:
- 0.0 = perfekte Übereinstimmung  
- Näher zu 0 = relevanter
- Höhere Werte = weniger relevant

In [None]:
query = "What are the main components of the RAG architecture?"
results = vectorstore.similarity_search_with_score(query, k=3)

for i, (doc, score) in enumerate(results, 1):
    print(f"\nErgebnis {i} (Bewertung={score:.4f}):")
    print(f"Inhalt: {doc.page_content}")
    print(f"Metadaten: {doc.metadata}")


Result 1 (score=0.2337):
Content: The RAG architecture typically consists of two parts: 
a retriever and a generator. The retriever finds relevant documents 
from a knowledge base, while the generator (often a large language model) 
uses those documents to produce responses.
Metadata: {'source': 'data\\rag_architecture.txt'}

Result 2 (score=0.2337):
Content: The RAG architecture typically consists of two parts: 
a retriever and a generator. The retriever finds relevant documents 
from a knowledge base, while the generator (often a large language model) 
uses those documents to produce responses.
Metadata: {'source': 'data\\rag_architecture.txt'}

Result 3 (score=0.3461):
Content: Some challenges of RAG include retrieving irrelevant documents,
managing large-scale knowledge bases, and ensuring that the generator
uses the retrieved documents correctly without introducing hallucinations.
Metadata: {'source': 'data\\rag_challenges.txt'}


## Sprachmodell initialisieren

In [64]:
from langchain_openai import ChatOpenAI

llm=ChatOpenAI(
    model_name="gpt-3.5-turbo"
)

In [65]:
test_response=llm.invoke("What is Small Language Models")
test_response

AIMessage(content='Small language models are a type of artificial intelligence model that use a smaller amount of data and computational resources compared to larger language models. They are typically used for tasks such as text classification, sentiment analysis, and language generation. Small language models may have limited capabilities compared to larger models, but they are often more efficient and easier to deploy in production settings.', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 70, 'prompt_tokens': 12, 'total_tokens': 82, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'id': 'chatcmpl-CFhge0YBHDJBTRXy8G3LDftNegRbk', 'service_tier': 'default', 'finish_reason': 'stop', 'logprobs': None}, id='run--a2b32356-74bd

In [66]:
from langchain.chat_models.base import init_chat_model

llm=init_chat_model("openai:gpt-3.5-turbo")
#llm=init_chat_model("groq:")
llm

ChatOpenAI(client=<openai.resources.chat.completions.completions.Completions object at 0x000002108B4EBA60>, async_client=<openai.resources.chat.completions.completions.AsyncCompletions object at 0x000002108B4F3580>, root_client=<openai.OpenAI object at 0x000002108B4EBC40>, root_async_client=<openai.AsyncOpenAI object at 0x000002108B4EBF40>, model_kwargs={}, openai_api_key=SecretStr('**********'))

In [67]:
llm.invoke("What is AI")

AIMessage(content='AI, or artificial intelligence, refers to the simulation of human intelligence processes by machines or computer systems. These processes include learning, reasoning, problem solving, perception, and language understanding. AI technology is used in a wide range of applications, such as virtual assistants, self-driving cars, medical diagnosis, and natural language processing. AI systems can improve over time through the use of algorithms and data, making them increasingly sophisticated and able to perform complex tasks.', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 89, 'prompt_tokens': 10, 'total_tokens': 99, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'id': 'chatcmpl-CFhgfq4CMqORmkaBPJ3nCTlq3IRnv'

## RAG-Kette erstellen

In [68]:
from langchain.chains import create_retrieval_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain

In [None]:
retriever=vectorstore.as_retriever(
    search_kwarg={"k":3} 
)
retriever

VectorStoreRetriever(tags=['Chroma', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x000002108B4F3220>, search_kwargs={})

In [None]:
from langchain_core.prompts import ChatPromptTemplate

system_prompt = """You are a helpful assistant for question-answering tasks.  
Use the retrieved context below to answer the user’s question.  

- If the answer is not in the context, say: "I don’t know based on the provided documents."  
- Be concise (max. 3 sentences).  
- Ground your answer in the context, don’t invent facts.  

Context:
{context}"""

prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{input}")
])
prompt


ChatPromptTemplate(input_variables=['context', 'input'], input_types={}, partial_variables={}, messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], input_types={}, partial_variables={}, template='You are a helpful assistant for question-answering tasks.  \nUse the retrieved context below to answer the user’s question.  \n\n- If the answer is not in the context, say: "I don’t know based on the provided documents."  \n- Be concise (max. 3 sentences).  \n- Ground your answer in the context, don’t invent facts.  \n\nContext:\n{context}'), additional_kwargs={}), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['input'], input_types={}, partial_variables={}, template='{input}'), additional_kwargs={})])

In [None]:
from langchain.chains.combine_documents import create_stuff_documents_chain
document_chain=create_stuff_documents_chain(llm,prompt)
document_chain

RunnableBinding(bound=RunnableBinding(bound=RunnableAssign(mapper={
  context: RunnableLambda(format_docs)
}), kwargs={}, config={'run_name': 'format_inputs'}, config_factories=[])
| ChatPromptTemplate(input_variables=['context', 'input'], input_types={}, partial_variables={}, messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], input_types={}, partial_variables={}, template='You are a helpful assistant for question-answering tasks.  \nUse the retrieved context below to answer the user’s question.  \n\n- If the answer is not in the context, say: "I don’t know based on the provided documents."  \n- Be concise (max. 3 sentences).  \n- Ground your answer in the context, don’t invent facts.  \n\nContext:\n{context}'), additional_kwargs={}), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['input'], input_types={}, partial_variables={}, template='{input}'), additional_kwargs={})])
| ChatOpenAI(client=<openai.resources.chat.completions.com

In [None]:
from langchain.chains import create_retrieval_chain
rag_chain=create_retrieval_chain(retriever,document_chain)
rag_chain

RunnableBinding(bound=RunnableAssign(mapper={
  context: RunnableBinding(bound=RunnableLambda(lambda x: x['input'])
           | VectorStoreRetriever(tags=['Chroma', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x000002108B4F3220>, search_kwargs={}), kwargs={}, config={'run_name': 'retrieve_documents'}, config_factories=[])
})
| RunnableAssign(mapper={
    answer: RunnableBinding(bound=RunnableBinding(bound=RunnableAssign(mapper={
              context: RunnableLambda(format_docs)
            }), kwargs={}, config={'run_name': 'format_inputs'}, config_factories=[])
            | ChatPromptTemplate(input_variables=['context', 'input'], input_types={}, partial_variables={}, messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], input_types={}, partial_variables={}, template='You are a helpful assistant for question-answering tasks.  \nUse the retrieved context below to answer the user’s question.  \n\n- If th

In [73]:
response=rag_chain.invoke({"input":"How can I apply RAG in building a chatbot?"})
response

{'input': 'How can I apply RAG in building a chatbot?',
 'context': [Document(metadata={'source': 'data\\rag_usecases.txt'}, page_content='RAG can be applied in many areas such as \nquestion answering, chatbots, document summarization, and enterprise search. \nIt improves reliability by grounding responses in factual documents.'),
  Document(metadata={'source': 'data\\rag_usecases.txt'}, page_content='RAG can be applied in many areas such as \nquestion answering, chatbots, document summarization, and enterprise search. \nIt improves reliability by grounding responses in factual documents.'),
  Document(metadata={'source': 'data\\rag_architecture.txt'}, page_content='The RAG architecture typically consists of two parts: \na retriever and a generator. The retriever finds relevant documents \nfrom a knowledge base, while the generator (often a large language model) \nuses those documents to produce responses.'),
  Document(metadata={'source': 'data\\rag_architecture.txt'}, page_content='T

In [74]:
response['answer']

'You can apply RAG in building a chatbot by using the retriever component to find relevant documents from a knowledge base, and the generator component (often a large language model) to produce responses based on those documents. This approach improves reliability by grounding the chatbot responses in factual information, making the interactions more accurate and informative.'