# Part 2: RAG


## 1. Import

In [130]:
from typing import Literal, Any
from copy import deepcopy

from typing_extensions import TypedDict
import matplotlib.pyplot as plt
import numpy as np
from decouple import config
from pydantic import BaseModel, Field
from IPython.display import Image, display
from tqdm import tqdm
from dotenv import load_dotenv

from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters.markdown import MarkdownHeaderTextSplitter
from langchain.prompts import PromptTemplate
from langchain_ibm import WatsonxEmbeddings
from langchain_ibm import WatsonxLLM
from langgraph.graph import START, StateGraph
from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams

import litellm
from litellm import completion
import instructor
from instructor import Mode


import os

## 2. Load API key and model setup

In [131]:
print(os.path.exists(".env")) 

True


In [132]:
# Load .env file
load_dotenv() 

# Get the API key
WX_API_KEY = os.getenv('WX_API_KEY')
WX_PROJECT_ID = os.getenv('WX_Project_ID')

if WX_API_KEY:
    print("API Key Loaded Successfully")
else:
    print("API Key Not Found! Check your .env file.")

WX_API_URL = "https://us-south.ml.cloud.ibm.com"


API Key Loaded Successfully


In [133]:
from langchain_ibm import WatsonxLLM
from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams


llm = WatsonxLLM(

        model_id= "ibm/granite-3-8b-instruct",
        url=WX_API_URL,
        apikey=WX_API_KEY,
        project_id=WX_PROJECT_ID,

        params={
            GenParams.DECODING_METHOD: "greedy",
            GenParams.TEMPERATURE: 0,
            GenParams.MIN_NEW_TOKENS: 5,
            GenParams.MAX_NEW_TOKENS: 1_000,
            GenParams.REPETITION_PENALTY:1.2
        }

)

In [134]:
llm_result = llm.generate(["Hi how are you?"])

print(type(llm_result))
print(llm_result)

<class 'langchain_core.outputs.llm_result.LLMResult'>
generations=[[Generation(text="\nI'm an artificial intelligence and don't have feelings, but I'm here to help you. How can I assist you today?", generation_info={'finish_reason': 'eos_token'})]] llm_output={'token_usage': {'generated_token_count': 31, 'input_token_count': 5}, 'model_id': 'ibm/granite-3-8b-instruct', 'deployment_id': None} run=[RunInfo(run_id=UUID('e1226eaf-e0e7-4722-bded-9d2e0958faa4'))] type='LLMResult'


## 3. Load the markdown file

In [135]:
from langchain_community.document_loaders import TextLoader


document = TextLoader(r"C:\Users\charl\Documents\CBS\Code Projects_Python\AIML25\AIML25_Project\parsed_markdown\consolidated_markdown_data.md").load()[0]
document.metadata

{'source': 'C:\\Users\\charl\\Documents\\CBS\\Code Projects_Python\\AIML25\\AIML25_Project\\parsed_markdown\\consolidated_markdown_data.md'}

## 4. Chunking the data from the markdown

In [136]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_text_splitters import MarkdownHeaderTextSplitter

#splitting the markdown file based on headers
headers_to_split_on = [("#", "Header 1"), ("##", "Header 2"), ("###", "Header 3")]
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
header_chunks = markdown_splitter.split_text(document.page_content)

#each header chunk is then split again by token size
token_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50)

#applying the token-based split by looping through each header chunk
final_chunks = []
for chunk in header_chunks:
    sub_chunks = token_splitter.split_text(chunk.page_content)
    for sub_chunk in sub_chunks:
        final_chunks.append(Document(page_content=sub_chunk, metadata=chunk.metadata))



In [137]:
#filtering out content under the header "page footer" or data that is mostly links
filtered_chunks = []
for chunk in final_chunks:
    is_footer = (
    chunk.metadata.get("Header 2") == 'Page Header' or
    chunk.metadata.get("Header 2") == "Page Footer" or
    chunk.metadata.get("Header 2") == "Page Number"
)
    is_link = (
        chunk.page_content.count("http") > 1      # mostly links
    )

    if is_footer or is_link:
        continue  # Skip useless headers or content
    filtered_chunks.append(chunk)


In [138]:
for i, chunk in enumerate(filtered_chunks):
    if "574,99 €" in chunk.page_content:
        print(f"FOUND in chunk {i}:\n{chunk.page_content}\n")

FOUND in chunk 2675:
574,99 € <!-- text, from page 0 (l=0.089,t=0.882,r=0.251,b=0.907), with ID d921a42c-dd4f-4fd7-99a0-2d2839878787 -->

FOUND in chunk 2956:
- **Price**: 574,99 €
- **Includes**: MwSt. (Mehrwertsteuer)
- **Delivery**: kostenlose Lieferung ab 50 € <!-- text, from page 0 (l=0.087,t=0.882,r=0.404,b=0.932), with ID 33768bdd-fdd2-4b49-a931-9860c75eebb6 -->



In [139]:
filtered_chunks [:20]

[Document(metadata={}, page_content='<!-- ===== consolidated_markdown_data.md ===== -->  \n<!-- ===== K 2 Battery _ Kärcher.md ===== -->'),
 Document(metadata={'Header 2': 'K 2 BATTERY'}, page_content='Der leistungsstarke Akku-Hochdruckreiniger K 2 Battery für vielseitige, flexible Einsätze ohne Stromanschluss. 36-V-Wechselakku und Ladegerät sind als Sonderzubehör separat erhältlich. <!-- text, from page 0 (l=0.064,t=0.080,r=0.932,b=0.148), with ID 329efb5d-cc5e-4ef2-8d0a-9f47c52857eb -->'),
 Document(metadata={'Header 2': 'K 2 BATTERY', 'Header 3': 'Description'}, page_content='The image shows a yellow and black pressure washer from the brand Kärcher. It is a compact, portable model with a handle at the top for easy carrying. The device features a battery compartment at the top, indicating it is battery-operated. The Kärcher logo is prominently displayed on the front.'),
 Document(metadata={'Header 2': 'K 2 BATTERY', 'Header 3': 'Description'}, page_content='#### Key Features:  \n- **

In [140]:
from copy import deepcopy


def update_documents_with_headers(chunks):
    """
    Creates a new list of Document objects with page_content prepended with headers
    in [Header1/Header2/Header3]: format
    
    Returns new objects rather than modifying the original chunks
    """
    updated_chunks = []
    max_depth=3 
    
    for doc in chunks:
        # Create a deep copy of the document to avoid modifying the original
        new_doc = deepcopy(doc)
        
        # Get all headers that exist in metadata
        headers = []
        for i in range(1, max_depth + 1):
            key = f'Header {i}'
            if key in new_doc.metadata:
                headers.append(f"{key}: {new_doc.metadata[key]}")
        
        # Create the header prefix and update page_content
        if headers:
            prefix = f"[{'/'.join(headers)}]: "
            new_doc.page_content = prefix + "\n" + new_doc.page_content
        
        updated_chunks.append(new_doc)
    
    return updated_chunks


docs = update_documents_with_headers(filtered_chunks)

In [141]:
import re
#clean the data from urls
def clean_content(text):


    text = re.sub(r"http\S+", "", text)  # Remove URLs
    text = re.sub(r"\s+", " ", text).strip()
    text = text.title()

    return text

In [142]:
# Apply clean_content to each document's page_content
for doc in docs:
    doc.page_content = clean_content(doc.page_content)

In [143]:
for doc in docs[:10]:
    print(doc.page_content, end="\n\n")

<!-- ===== Consolidated_Markdown_Data.Md ===== --> <!-- ===== K 2 Battery _ Kärcher.Md ===== -->

[Header 2: K 2 Battery]: Der Leistungsstarke Akku-Hochdruckreiniger K 2 Battery Für Vielseitige, Flexible Einsätze Ohne Stromanschluss. 36-V-Wechselakku Und Ladegerät Sind Als Sonderzubehör Separat Erhältlich. <!-- Text, From Page 0 (L=0.064,T=0.080,R=0.932,B=0.148), With Id 329Efb5D-Cc5E-4Ef2-8D0A-9F47C52857Eb -->

[Header 2: K 2 Battery/Header 3: Description]: The Image Shows A Yellow And Black Pressure Washer From The Brand Kärcher. It Is A Compact, Portable Model With A Handle At The Top For Easy Carrying. The Device Features A Battery Compartment At The Top, Indicating It Is Battery-Operated. The Kärcher Logo Is Prominently Displayed On The Front.

[Header 2: K 2 Battery/Header 3: Description]: #### Key Features: - **Color**: Predominantly Yellow With Black Accents. - **Design**: - The Pressure Washer Has A Sturdy Handle For Portability. - A Battery Compartment Is Visible At The Top, 

## 5. Embedding the chunks

In [144]:
from langchain_ibm import WatsonxEmbeddings


embed_params = {}

watsonx_embedding = WatsonxEmbeddings(
    model_id="ibm/granite-embedding-278m-multilingual",
    url= WX_API_URL,
    project_id=WX_PROJECT_ID,
    apikey=WX_API_KEY,
    params=embed_params,
)

In [145]:
from langchain_chroma import Chroma


local_vector_db = Chroma.from_documents(
    collection_name="my_collection",
    embedding=watsonx_embedding,
    persist_directory="my_vector_db", # This will save the vector database to disk! Delete it if you want to start fresh.
    documents=docs,
    
)

In [146]:
# Use the vectorstore as a retriever
retriever = local_vector_db.as_retriever(
    search_type="similarity",
    search_kwargs={
        "k": 3,
    }
)

## 6. Similarity search

In [147]:
# Retrieve the most similar text for the given query
query = "What is the price in € of the K 7 Power Flex Home?"

# Retrieve the documents using the retriever
retrieved_documents = retriever.invoke(query)

# Print the results
for document in retrieved_documents:
    print(f"{'#' * 80}\nID: {document.id}")
    first_n_of_content = document.page_content[:500].replace('\n\n', ' ')
    print(f"Content: {first_n_of_content}\n")

################################################################################
ID: c82372e4-cc96-430e-a37f-563216ad1007
Content: [Header 1: K 7 Power Flex Home <!-- Title, From Page 0 (L=0.065,T=0.080,R=0.436,B=0.100), With Id 7417F6Ca-Fa9F-401A-9B38-6819C6E6Fbec -->/Header 2: Price Information]: - **Price**: 624,99 € - **Includes**: Mwst. (Vat) - **Delivery**: Kostenlose Lieferung Ab 50 € (Free Delivery From 50 €) <!-- Key_Value, From Page 0 (L=0.087,T=0.882,R=0.404,B=0.932), With Id 0791E3D2-41E5-47F4-Aba9-Ff9E9C94Cfce -->

################################################################################
ID: f851d3c7-ebb4-4881-8626-6aafd8e949af
Content: [Header 1: K 7 Power Flex Home <!-- Title, From Page 0 (L=0.065,T=0.080,R=0.436,B=0.100), With Id 7417F6Ca-Fa9F-401A-9B38-6819C6E6Fbec -->/Header 2: Price Information]: - **Price**: 694,99 € - **Includes**: Mwst. (Vat) - **Delivery**: Kostenlose Lieferung Ab 50 € (Free Delivery From 50 €) <!-- Key_Value, From Page 0 (L=0.087,T=0.882,

In [148]:
from langchain.prompts import PromptTemplate

few_shot_examples = """
Example 1:
Question: What is the price of the product 'K 7 Smart Control Flex Home'?
Answer: 644.99 €

Example 2:
Question: What is the delivery time of the product 'K 5 FJ Home'?
Answer: 3-4 Werktage

Example 3:
Question: What is the field 'Stromart (V/Hz)' for product '	K 4 WCM'?
Answer: 230 / 50
"""

template = """You are an assistant helping to complete missing fields in a product database by extracting accurate information from documentation.

Each question corresponds to a missing field in the product's entry (e.g., price, description, accessories). Use the retrieved context to find and return the correct value.

If the context does not contain the necessary information, respond with "Unknown". Keep answers factual, concise, and suitable for filling into a CSV cell. Do not speculate or include explanations. 

Do not answer in a sentence but just with the value.

{{few_shot_examples}}

Question:
{question}

Context: 
{context}

Answer:
"""
prompt = PromptTemplate.from_template(template)

In [149]:
prompt.invoke(
    input={
        "question": "What is the price in € of the product K 7 Premium Power Flex Home?",
        "context": retrieved_documents[0].page_content,
        
    }
)

StringPromptValue(text='You are an assistant helping to complete missing fields in a product database by extracting accurate information from documentation.\n\nEach question corresponds to a missing field in the product\'s entry (e.g., price, description, accessories). Use the retrieved context to find and return the correct value.\n\nIf the context does not contain the necessary information, respond with "Unknown". Keep answers factual, concise, and suitable for filling into a CSV cell. Do not speculate or include explanations. \n\nDo not answer in a sentence but just with the value.\n\n{few_shot_examples}\n\nQuestion:\nWhat is the price in € of the product K 7 Premium Power Flex Home?\n\nContext: \n[Header 1: K 7 Power Flex Home <!-- Title, From Page 0 (L=0.065,T=0.080,R=0.436,B=0.100), With Id 7417F6Ca-Fa9F-401A-9B38-6819C6E6Fbec -->/Header 2: Price Information]: - **Price**: 624,99 € - **Includes**: Mwst. (Vat) - **Delivery**: Kostenlose Lieferung Ab 50 € (Free Delivery From 50 €) 

In [150]:
question = "What is the price in € of the product K 7 Power Flex Home?"

retrieved_docs = local_vector_db.similarity_search(question)
docs_content = "\n\n".join(f"Document {i+1}:\n{doc.page_content}" for i, doc in enumerate(retrieved_docs))
formated_prompt = prompt.invoke({"question": question, "context": docs_content})

print(formated_prompt.to_string()[:1000])

You are an assistant helping to complete missing fields in a product database by extracting accurate information from documentation.

Each question corresponds to a missing field in the product's entry (e.g., price, description, accessories). Use the retrieved context to find and return the correct value.

If the context does not contain the necessary information, respond with "Unknown". Keep answers factual, concise, and suitable for filling into a CSV cell. Do not speculate or include explanations. 

Do not answer in a sentence but just with the value.

{few_shot_examples}

Question:
What is the price in € of the product K 7 Power Flex Home?

Context: 
Document 1:
[Header 1: K 7 Power Flex Home <!-- Title, From Page 0 (L=0.065,T=0.080,R=0.436,B=0.100), With Id 7417F6Ca-Fa9F-401A-9B38-6819C6E6Fbec -->/Header 2: Price Information]: - **Price**: 624,99 € - **Includes**: Mwst. (Vat) - **Delivery**: Kostenlose Lieferung Ab 50 € (Free Delivery From 50 €) <!-- Key_Value, From Page 0 (L=0.08

In [151]:
answer = llm.invoke(formated_prompt)

print(answer)

The final answer is 624.99€.


In [152]:
import pandas as pd
df_edited = pd.read_csv(r"C:\Users\charl\Documents\CBS\Code Projects_Python\AIML25\AIML25_Project\data\SAP_Produktstammdaten_vfinal.csv", sep=",")

In [153]:
import numpy as np
import random

# Define the columns to exclude from deletion
protected_columns = ["Produktname", "Bestellnummer"]

# Identify non-empty cells excluding the protected column
non_empty_cells = [
    (i, col)
    for i in df_edited.index
    for col in df_edited.columns
    if col not in protected_columns and pd.notna(df_edited.at[i, col])
]

# Define how many values to drop (e.g., 3% of available non-protected cells)
drop_fraction = 0.03
num_to_drop = int(len(non_empty_cells) * drop_fraction)

# Randomly select cells to drop
cells_to_drop = random.sample(non_empty_cells, num_to_drop)

# Set selected cells to NaN
for i, col in cells_to_drop:
    df_edited.at[i, col] = np.nan


In [154]:
missing_fields = []

for idx, row in df_edited.iterrows():
    product_name = row["Produktname"]
    
    for field in df_edited.columns:
        value = row[field]
        if pd.isna(value) or str(value).strip().lower() in [""]:
            missing_fields.append((idx, product_name, field))

# Display all missing values
for idx, product_name, field in missing_fields:
    print(f"Row {idx} – {product_name} is missing: {field}")


Row 2 – K 7 Premium Power Flex is missing: Fördermenge (l/h)
Row 3 – K 7 Premium Power Flex Home is missing: Fördermenge (l/h)
Row 3 – K 7 Premium Power Flex Home is missing: Ausstattung
Row 4 – K 7 Premium Smart Control Flex is missing: Farbe
Row 7 – K 7 Power Flex is missing: Lieferzeit
Row 7 – K 7 Power Flex is missing: Lieferumfang
Row 9 – K 7 Smart Control Flex is missing: Druck (bar/MPa)
Row 10 – K 7 WCM Premium Home is missing: Preis (€ inkl. MwSt.)
Row 10 – K 7 WCM Premium Home is missing: Druck (bar/MPa)
Row 11 – K 7 WCM is missing: Druck (bar/MPa)
Row 14 – K 7 WCM Premium is missing: Druck (bar/MPa)
Row 19 – K 5 WCM Premium is missing: Zulauftemperatur (°C)
Row 24 – K 5 FJ is missing: Ausstattung
Row 25 – K 5 FJ Home is missing: Anschlusskabel (m)
Row 26 – K 5 Premium Smart Control Flex Home is missing: Gewicht ohne Zubehör (kg)
Row 27 – K 5 Power Control Flex Home&Brush Anniversary Edition is missing: Gewicht inkl. Verpackung (kg)
Row 28 – K 5 Premium Power Control Flex is m

In [155]:
# Define state for application
class State(TypedDict):
    """ A langgraph state for the application """
    question: str
    context: list[Document]
    answer: str


# Define application steps
def retrieve(state: State):
    """ Our retrieval step. We use our local vector database to retrieve similar documents to the question """
    retrieved_docs = local_vector_db.similarity_search(state["question"], k=3) # NOTE: You can change k to retrieve fewer or more documents
    return {"context": retrieved_docs} 


def generate(state: State):
    """ Our generation step. We use the retrieved documents to generate an answer to the question """

    # Format the prompt
    docs_content = "\n\n".join(doc.page_content for doc in state["context"])
    formated_prompt = prompt.invoke({"question": state["question"], "context": docs_content})

    # Generate the answer
    response = llm.invoke(formated_prompt)
    return {"answer": response}


# Compile application and test
graph_builder = StateGraph(State).add_sequence([retrieve, generate])
graph_builder.add_edge(START, "retrieve") # Start at the retrieve step
graph = graph_builder.compile() # Compile the graph

In [156]:
response = graph.invoke({"question": "What is the price in € of the product K 7 Premium Power Flex?"})

response

{'question': 'What is the price in € of the product K 7 Premium Power Flex?',
 'context': [Document(id='c82372e4-cc96-430e-a37f-563216ad1007', metadata={'Header 1': 'K 7 POWER FLEX HOME <!-- title, from page 0 (l=0.065,t=0.080,r=0.436,b=0.100), with ID 7417f6ca-fa9f-401a-9b38-6819c6e6fbec -->', 'Header 2': 'Price Information'}, page_content='[Header 1: K 7 Power Flex Home <!-- Title, From Page 0 (L=0.065,T=0.080,R=0.436,B=0.100), With Id 7417F6Ca-Fa9F-401A-9B38-6819C6E6Fbec -->/Header 2: Price Information]: - **Price**: 624,99 € - **Includes**: Mwst. (Vat) - **Delivery**: Kostenlose Lieferung Ab 50 € (Free Delivery From 50 €) <!-- Key_Value, From Page 0 (L=0.087,T=0.882,R=0.404,B=0.932), With Id 0791E3D2-41E5-47F4-Aba9-Ff9E9C94Cfce -->'),
  Document(id='f851d3c7-ebb4-4881-8626-6aafd8e949af', metadata={'Header 1': 'K 7 POWER FLEX HOME <!-- title, from page 0 (l=0.065,t=0.080,r=0.436,b=0.100), with ID 7417f6ca-fa9f-401a-9b38-6819c6e6fbec -->', 'Header 2': 'Price Information'}, page_conte

In [157]:
df_original= pd.read_csv(r"C:\Users\charl\Documents\CBS\Code Projects_Python\AIML25\AIML25_Project\data\SAP_Produktstammdaten_vfinal.csv", sep=",")

In [168]:
def normalize_answer(answer):
    # Remove unnecessary words and units like "kg", "kW", etc.
        normalized_answer = re.sub(r"(kg|kW|L/h|bar|MPa|M²/H|°C|cm|mm)", "", answer)
    # Remove extra spaces, newlines, or any text that isn't the core numeric value
        normalized_answer = re.sub(r"[^a-zA-Z0-9\s./-]", "", normalized_answer)
        normalized_answer = re.sub(r'\n.*', '', normalized_answer)

    # Normalize the format (strip, lowercased, etc.)
        normalized_answer = normalized_answer.lower()
        
        return normalized_answer

results = []

for idx, product_name, field in missing_fields:
    question = f"What is the {field} for product '{product_name}'?"

    # Run your LangGraph app
    state_input = {"question": question}
    output_state = graph.invoke(state_input)

    answer = output_state["answer"]

    answer= normalize_answer(answer)

    # Lookup ground truth from the original DataFrame
    try:
        ground_truth_value = df_original.at[idx, field]
    except KeyError:
        ground_truth_value = None

    is_correct = (
        str(answer).strip().lower() == str(ground_truth_value).strip().lower()
        if pd.notna(ground_truth_value) else None
    )

    results.append({
        "index": idx,
        "product": product_name,
        "field": field,
        "answer": answer,
        "ground_truth": ground_truth_value,
        "correct": is_correct,
    })


In [169]:
results[:20]

[{'index': 2,
  'product': 'K 7 Premium Power Flex',
  'field': 'Fördermenge (l/h)',
  'answer': 'max. 600',
  'ground_truth': 'max. 600',
  'correct': True},
 {'index': 3,
  'product': 'K 7 Premium Power Flex Home',
  'field': 'Fördermenge (l/h)',
  'answer': 'max. 600',
  'ground_truth': 'max. 600',
  'correct': True},
 {'index': 3,
  'product': 'K 7 Premium Power Flex Home',
  'field': 'Ausstattung',
  'answer': 'the provided context does not mention any specific details about the ausstattung of the k 7 premium power flex home. therefore i cannot provide this information based on the given data.',
  'ground_truth': "Vario Power Jet, Integrierte HD-Schlauchtrommel, Integriertes Aufbewahrungsnetz, Quick Connect, Plug 'n' Clean-System, Teleskopgriff, Wassergekühlter Motor, Integrierter Wasserfilter",
  'correct': False},
 {'index': 4,
  'product': 'K 7 Premium Smart Control Flex',
  'field': 'Farbe',
  'answer': 'gelb',
  'ground_truth': 'gelb',
  'correct': True},
 {'index': 7,
  'pro

In [170]:
# Convert results to DataFrame
eval_df = pd.DataFrame(results)

# Accuracy
if "correct" in eval_df.columns:
    accuracy = eval_df["correct"].dropna().mean()
    print(f"✅ LLM Accuracy: {accuracy:.2%}")


✅ LLM Accuracy: 20.00%
