In [1]:
import os
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_openai import OpenAIEmbeddings
from langchain.schema import Document
from langchain_community.vectorstores import Chroma
import numpy as np
from typing import List

## Beispieldaten erstellen

In [3]:
import os

os.makedirs("data", exist_ok=True)

sample_docs = {
    "rag_intro.txt": """Retrieval-Augmented Generation (RAG) is a technique 
that combines information retrieval with generative models. 
It allows language models to access external knowledge sources 
to provide more accurate and up-to-date answers.""",

    "rag_architecture.txt": """The RAG architecture typically consists of two parts: 
a retriever and a generator. The retriever finds relevant documents 
from a knowledge base, while the generator (often a large language model) 
uses those documents to produce responses.""",

    "rag_usecases.txt": """RAG can be applied in many areas such as 
question answering, chatbots, document summarization, and enterprise search. 
It improves reliability by grounding responses in factual documents.""",

"rag_challenges.txt": """Some challenges of RAG include retrieving irrelevant documents,
managing large-scale knowledge bases, and ensuring that the generator
uses the retrieved documents correctly without introducing hallucinations.""",

    "rag_future.txt": """The future of RAG research focuses on improving retriever models,
combining structured and unstructured data, and optimizing efficiency
for real-world enterprise applications.""",

    "rag_vs_finetuningvs_promptengineering.txt": """RAG, fine-tuning, and prompt engineering differ in how they improve 
language models.

Fine-tuning changes the model’s weights by training it on additional 
domain-specific data. This makes the model more accurate for that domain, 
but it is costly and needs retraining when knowledge changes.

RAG (Retrieval-Augmented Generation) does not change the model itself. 
Instead, it connects the model to an external retriever that provides 
relevant documents. The model then generates answers based on both its 
own knowledge and the retrieved context. This keeps results more current 
and fact-based.

Prompt engineering does not retrain the model or add external data. 
It only improves how we ask questions by designing effective prompts. 
This method is simple and fast but limited, since it does not add 
new knowledge.

In short: fine-tuning embeds knowledge, RAG retrieves knowledge, 
and prompt engineering shapes the way knowledge is used."""

}

for filename, content in sample_docs.items():
    with open(os.path.join("data", filename), "w", encoding="utf-8") as f:
        f.write(content)

print("Text files created in 'data' folder:", list(sample_docs.keys()))


Text files created in 'data' folder: ['rag_intro.txt', 'rag_architecture.txt', 'rag_usecases.txt', 'rag_challenges.txt', 'rag_future.txt', 'rag_vs_finetuningvs_promptengineering.txt']


## Text in Chunks aufteilen

## Dokumente laden

In [4]:
from langchain_community.document_loaders import DirectoryLoader, TextLoader

loader = DirectoryLoader(
    "data",                     
    glob="*.txt",              
    loader_cls=TextLoader,      
    loader_kwargs={'encoding': 'utf-8'}  
)

documents = loader.load()

print(f"{len(documents)} Dokumente geladen")
print("\nVorschau erstes Dokument:")
print(documents[0].page_content[:200] + "...")

6 Dokumente geladen

Vorschau erstes Dokument:
The RAG architecture typically consists of two parts: 
a retriever and a generator. The retriever finds relevant documents 
from a knowledge base, while the generator (often a large language model) 
u...


In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,     
    chunk_overlap=50,   
    length_function=len,
    separators=["\n\n", "\n", " ", ""]
)

chunks = text_splitter.split_documents(documents)

print(f"{len(chunks)} Chunks aus {len(documents)} Dokumenten erstellt")

print("\nChunk-Beispiel:")
print(f"Inhalt: {chunks[0].page_content[:150]}...")
print(f"Metadaten: {chunks[0].metadata}")

8 Chunks aus 6 Dokumenten erstellt

Chunk-Beispiel:
Inhalt: The RAG architecture typically consists of two parts: 
a retriever and a generator. The retriever finds relevant documents 
from a knowledge base, whi...
Metadaten: {'source': 'data\\rag_architecture.txt'}


In [6]:
chunks

[Document(metadata={'source': 'data\\rag_architecture.txt'}, page_content='The RAG architecture typically consists of two parts: \na retriever and a generator. The retriever finds relevant documents \nfrom a knowledge base, while the generator (often a large language model) \nuses those documents to produce responses.'),
 Document(metadata={'source': 'data\\rag_challenges.txt'}, page_content='Some challenges of RAG include retrieving irrelevant documents,\nmanaging large-scale knowledge bases, and ensuring that the generator\nuses the retrieved documents correctly without introducing hallucinations.'),
 Document(metadata={'source': 'data\\rag_future.txt'}, page_content='The future of RAG research focuses on improving retriever models,\ncombining structured and unstructured data, and optimizing efficiency\nfor real-world enterprise applications.'),
 Document(metadata={'source': 'data\\rag_intro.txt'}, page_content='Retrieval-Augmented Generation (RAG) is a technique \nthat combines info

## Embedding-Modell

In [7]:
api_key = os.getenv("OPENAI_API_KEY")

## ChromaDB Vektordatenbank erstellen

In [8]:
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

persist_directory = "./chroma_db"
embedding = OpenAIEmbeddings()

vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embedding,
    collection_name="rag_collection",
    persist_directory=persist_directory,
)

print(vectorstore._collection.count())
print(f"Vektordatenbank erstellt mit {vectorstore._collection.count()} Vektoren")
print(f"Gespeichert in: {persist_directory}")



8
Vektordatenbank erstellt mit 8 Vektoren
Gespeichert in: ./chroma_db


## Ähnlichkeitssuche testen

In [9]:
query = "What is the difference between RAG and fine-tuning?"

results = vectorstore.similarity_search(query, k=3)
results

[Document(metadata={'source': 'data\\rag_vs_finetuningvs_promptengineering.txt'}, page_content='RAG, fine-tuning, and prompt engineering differ in how they improve \nlanguage models.\n\nFine-tuning changes the model’s weights by training it on additional \ndomain-specific data. This makes the model more accurate for that domain, \nbut it is costly and needs retraining when knowledge changes.'),
 Document(metadata={'source': 'data\\rag_usecases.txt'}, page_content='RAG can be applied in many areas such as \nquestion answering, chatbots, document summarization, and enterprise search. \nIt improves reliability by grounding responses in factual documents.'),
 Document(metadata={'source': 'data\\rag_architecture.txt'}, page_content='The RAG architecture typically consists of two parts: \na retriever and a generator. The retriever finds relevant documents \nfrom a knowledge base, while the generator (often a large language model) \nuses those documents to produce responses.')]

In [10]:
for i, res in enumerate(results, 1):
   print(f"\nErgebnis {i}:")
   print(f"Inhalt: {res.page_content}")
   print(f"Metadaten: {res.metadata}")


Ergebnis 1:
Inhalt: RAG, fine-tuning, and prompt engineering differ in how they improve 
language models.

Fine-tuning changes the model’s weights by training it on additional 
domain-specific data. This makes the model more accurate for that domain, 
but it is costly and needs retraining when knowledge changes.
Metadaten: {'source': 'data\\rag_vs_finetuningvs_promptengineering.txt'}

Ergebnis 2:
Inhalt: RAG can be applied in many areas such as 
question answering, chatbots, document summarization, and enterprise search. 
It improves reliability by grounding responses in factual documents.
Metadaten: {'source': 'data\\rag_usecases.txt'}

Ergebnis 3:
Inhalt: The RAG architecture typically consists of two parts: 
a retriever and a generator. The retriever finds relevant documents 
from a knowledge base, while the generator (often a large language model) 
uses those documents to produce responses.
Metadaten: {'source': 'data\\rag_architecture.txt'}


In [11]:
query = "What are some use cases of RAG?"
results = vectorstore.similarity_search(query, k=3)

for i, res in enumerate(results, 1):
    print(f"\nErgebnis {i}:")
    print(f"Inhalt: {res.page_content}")
    print(f"Metadaten: {res.metadata}")


Ergebnis 1:
Inhalt: RAG can be applied in many areas such as 
question answering, chatbots, document summarization, and enterprise search. 
It improves reliability by grounding responses in factual documents.
Metadaten: {'source': 'data\\rag_usecases.txt'}

Ergebnis 2:
Inhalt: Some challenges of RAG include retrieving irrelevant documents,
managing large-scale knowledge bases, and ensuring that the generator
uses the retrieved documents correctly without introducing hallucinations.
Metadaten: {'source': 'data\\rag_challenges.txt'}

Ergebnis 3:
Inhalt: The future of RAG research focuses on improving retriever models,
combining structured and unstructured data, and optimizing efficiency
for real-world enterprise applications.
Metadaten: {'source': 'data\\rag_future.txt'}


## Ähnlichkeitssuche mit Bewertung

ChromaDB verwendet standardmäßig L2-Distanz:
- 0.0 = perfekte Übereinstimmung  
- Näher zu 0 = relevanter
- Höhere Werte = weniger relevant

In [12]:
query = "What are the main components of the RAG architecture?"
results = vectorstore.similarity_search_with_score(query, k=3)

for i, (doc, score) in enumerate(results, 1):
    print(f"\nErgebnis {i} (Bewertung={score:.4f}):")
    print(f"Inhalt: {doc.page_content}")
    print(f"Metadaten: {doc.metadata}")


Ergebnis 1 (Bewertung=0.2337):
Inhalt: The RAG architecture typically consists of two parts: 
a retriever and a generator. The retriever finds relevant documents 
from a knowledge base, while the generator (often a large language model) 
uses those documents to produce responses.
Metadaten: {'source': 'data\\rag_architecture.txt'}

Ergebnis 2 (Bewertung=0.3461):
Inhalt: Some challenges of RAG include retrieving irrelevant documents,
managing large-scale knowledge bases, and ensuring that the generator
uses the retrieved documents correctly without introducing hallucinations.
Metadaten: {'source': 'data\\rag_challenges.txt'}

Ergebnis 3 (Bewertung=0.3538):
Inhalt: RAG can be applied in many areas such as 
question answering, chatbots, document summarization, and enterprise search. 
It improves reliability by grounding responses in factual documents.
Metadaten: {'source': 'data\\rag_usecases.txt'}


## Sprachmodell initialisieren

In [13]:
from langchain_openai import ChatOpenAI

llm=ChatOpenAI(
    model_name="gpt-3.5-turbo"
)

In [14]:
test_response=llm.invoke("What is Small Language Models")
test_response

AIMessage(content='Small language models refer to natural language processing models that have a relatively small number of parameters compared to larger language models. These models are typically used for simpler tasks that do not require as much computational power or resources, such as text classification, sentiment analysis, or language generation on a small scale.\n\nSmall language models are often trained on fewer examples and may have limited capabilities compared to larger models like GPT-3 or BERT. However, they can still be effective for certain applications and are more accessible for researchers or developers with limited resources. Additionally, small language models can be fine-tuned on specific datasets to improve performance on specific tasks.', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 127, 'prompt_tokens': 12, 'total_tokens': 139, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reas

In [15]:
from langchain.chat_models.base import init_chat_model

llm=init_chat_model("openai:gpt-3.5-turbo")
#llm=init_chat_model("groq:")
llm

ChatOpenAI(client=<openai.resources.chat.completions.completions.Completions object at 0x0000017F41361600>, async_client=<openai.resources.chat.completions.completions.AsyncCompletions object at 0x0000017F413611B0>, root_client=<openai.OpenAI object at 0x0000017F44CE3D60>, root_async_client=<openai.AsyncOpenAI object at 0x0000017F41362A70>, model_kwargs={}, openai_api_key=SecretStr('**********'))

In [16]:
llm.invoke("What is AI")

AIMessage(content='AI, or artificial intelligence, is the simulation of human intelligence processes by machines, especially computer systems. This includes the ability to learn, reason, make decisions, and understand natural language. AI is used in a wide range of applications, including speech recognition, image recognition, autonomous vehicles, and more.', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 60, 'prompt_tokens': 10, 'total_tokens': 70, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'id': 'chatcmpl-CFmG8P4vIY9F2Tf2AWWpflkjFaQra', 'service_tier': 'default', 'finish_reason': 'stop', 'logprobs': None}, id='run--6adf4364-91c1-4bc1-9c0d-9897a01633d8-0', usage_metadata={'input_tokens': 10, 'output_tokens': 60, 'tot

## RAG-Kette erstellen

In [17]:
from langchain.chains import create_retrieval_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain

In [18]:
retriever=vectorstore.as_retriever(
    search_kwarg={"k":3} 
)
retriever

VectorStoreRetriever(tags=['Chroma', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x0000017F41870D30>, search_kwargs={})

In [19]:
from langchain_core.prompts import ChatPromptTemplate

system_prompt = """You are a helpful assistant for question-answering tasks.  
Use the retrieved context below to answer the user’s question.  

- If the answer is not in the context, say: "I don’t know based on the provided documents."  
- Be concise (max. 3 sentences).  
- Ground your answer in the context, don’t invent facts.  

Context:
{context}"""

prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{input}")
])
prompt


ChatPromptTemplate(input_variables=['context', 'input'], input_types={}, partial_variables={}, messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], input_types={}, partial_variables={}, template='You are a helpful assistant for question-answering tasks.  \nUse the retrieved context below to answer the user’s question.  \n\n- If the answer is not in the context, say: "I don’t know based on the provided documents."  \n- Be concise (max. 3 sentences).  \n- Ground your answer in the context, don’t invent facts.  \n\nContext:\n{context}'), additional_kwargs={}), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['input'], input_types={}, partial_variables={}, template='{input}'), additional_kwargs={})])

In [20]:
from langchain.chains.combine_documents import create_stuff_documents_chain
document_chain=create_stuff_documents_chain(llm,prompt)
document_chain

RunnableBinding(bound=RunnableBinding(bound=RunnableAssign(mapper={
  context: RunnableLambda(format_docs)
}), kwargs={}, config={'run_name': 'format_inputs'}, config_factories=[])
| ChatPromptTemplate(input_variables=['context', 'input'], input_types={}, partial_variables={}, messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], input_types={}, partial_variables={}, template='You are a helpful assistant for question-answering tasks.  \nUse the retrieved context below to answer the user’s question.  \n\n- If the answer is not in the context, say: "I don’t know based on the provided documents."  \n- Be concise (max. 3 sentences).  \n- Ground your answer in the context, don’t invent facts.  \n\nContext:\n{context}'), additional_kwargs={}), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['input'], input_types={}, partial_variables={}, template='{input}'), additional_kwargs={})])
| ChatOpenAI(client=<openai.resources.chat.completions.com

In [21]:
from langchain.chains import create_retrieval_chain
rag_chain=create_retrieval_chain(retriever,document_chain)
rag_chain

RunnableBinding(bound=RunnableAssign(mapper={
  context: RunnableBinding(bound=RunnableLambda(lambda x: x['input'])
           | VectorStoreRetriever(tags=['Chroma', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x0000017F41870D30>, search_kwargs={}), kwargs={}, config={'run_name': 'retrieve_documents'}, config_factories=[])
})
| RunnableAssign(mapper={
    answer: RunnableBinding(bound=RunnableBinding(bound=RunnableAssign(mapper={
              context: RunnableLambda(format_docs)
            }), kwargs={}, config={'run_name': 'format_inputs'}, config_factories=[])
            | ChatPromptTemplate(input_variables=['context', 'input'], input_types={}, partial_variables={}, messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], input_types={}, partial_variables={}, template='You are a helpful assistant for question-answering tasks.  \nUse the retrieved context below to answer the user’s question.  \n\n- If th

In [22]:
response=rag_chain.invoke({"input":"How can I apply RAG in building a chatbot?"})
response

{'input': 'How can I apply RAG in building a chatbot?',
 'context': [Document(metadata={'source': 'data\\rag_usecases.txt'}, page_content='RAG can be applied in many areas such as \nquestion answering, chatbots, document summarization, and enterprise search. \nIt improves reliability by grounding responses in factual documents.'),
  Document(metadata={'source': 'data\\rag_architecture.txt'}, page_content='The RAG architecture typically consists of two parts: \na retriever and a generator. The retriever finds relevant documents \nfrom a knowledge base, while the generator (often a large language model) \nuses those documents to produce responses.'),
  Document(metadata={'source': 'data\\rag_challenges.txt'}, page_content='Some challenges of RAG include retrieving irrelevant documents,\nmanaging large-scale knowledge bases, and ensuring that the generator\nuses the retrieved documents correctly without introducing hallucinations.'),
  Document(metadata={'source': 'data\\rag_intro.txt'}, 

In [23]:
response['answer']

'You can apply RAG in building a chatbot by using the retriever to find relevant documents from a knowledge base, and then the generator (like a large language model) can use those documents to produce responses. This combination allows the chatbot to access external knowledge sources to provide more accurate and up-to-date answers, improving the reliability of the responses.'