In [102]:
# imports for the project

import pandas as pd
from decouple import config
from ibm_watsonx_ai import APIClient
from ibm_watsonx_ai import Credentials
from ibm_watsonx_ai.foundation_models import ModelInference
from dotenv import load_dotenv
from sklearn.metrics import classification_report 
from tqdm import tqdm
from ibm_watsonx_ai.foundation_models.schema import TextGenParameters

In [103]:
import os
print(os.path.exists(".env")) 

True


In [104]:
# Load .env file
load_dotenv()

# Get the API key
WX_API_KEY = os.getenv('WX_API_KEY')
WX_PROJECT_ID = os.getenv('WX_Project_ID')

if WX_API_KEY:
    print("API Key Loaded Successfully")
else:
    print("API Key Not Found! Check your .env file.")

WX_API_URL = "https://us-south.ml.cloud.ibm.com"


API Key Loaded Successfully


In [105]:
from langchain_ibm import WatsonxLLM
from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams


llm = WatsonxLLM(

        model_id= "ibm/granite-3-8b-instruct",
        url=WX_API_URL,
        apikey=WX_API_KEY,
        project_id=WX_PROJECT_ID,

        params={
            GenParams.DECODING_METHOD: "greedy",
            GenParams.TEMPERATURE: 0,
            GenParams.MIN_NEW_TOKENS: 5,
            GenParams.MAX_NEW_TOKENS: 1_000,
            GenParams.REPETITION_PENALTY:1.2
        }

)

In [106]:
from langchain_core.documents import Document
from langchain_community.document_loaders import TextLoader


document = TextLoader(r"C:\Users\charl\Documents\CBS\Code Projects_Python\AIML25\AIML25_Project\consolidated.md").load()[0]
document.metadata

{'source': 'C:\\Users\\charl\\Documents\\CBS\\Code Projects_Python\\AIML25\\AIML25_Project\\consolidated.md'}

In [107]:
from langchain_text_splitters import MarkdownHeaderTextSplitter

In [108]:
# Step 1: Structural split by headers
from langchain_text_splitters import RecursiveCharacterTextSplitter


headers_to_split_on = [("#", "Header 1"), ("##", "Header 2"), ("###", "Header 3")]
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
header_chunks = markdown_splitter.split_text(document.page_content)

# Step 2: Token-based split inside each header chunk
token_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50)

# Step 3: Apply token-based split to each chunk
final_chunks = []
for chunk in header_chunks:
    sub_chunks = token_splitter.split_text(chunk.page_content)
    for sub_chunk in sub_chunks:
        final_chunks.append(Document(page_content=sub_chunk, metadata=chunk.metadata))

In [109]:
filtered_chunks = []
for chunk in final_chunks:
    is_footer = chunk.metadata.get("Header 3") == "Page Footer"
    is_mostly_link_or_boilerplate = (
        len(chunk.page_content.strip()) < 150 or  # too short
        chunk.page_content.count("http") > 1      # mostly links
    )

    if is_footer and is_mostly_link_or_boilerplate:
        continue  # Skip useless footers
    filtered_chunks.append(chunk)

In [110]:
for i, chunk in enumerate(final_chunks):
    if "574,99 €" in chunk.page_content:
        print(f"FOUND in chunk {i}:\n{chunk.page_content}\n")


FOUND in chunk 8:
574,99 € <!-- text, from page 0 (l=0.089,t=0.882,r=0.251,b=0.907), with ID 75d48396-2c0c-44cb-9f01-3c0dc89b119b -->

FOUND in chunk 565:
- **Price**: 574,99 €
- **Includes**: MwSt. (VAT)
- **Delivery**: kostenlose Lieferung ab 50 € (free delivery from 50 €) <!-- text, from page 0 (l=0.087,t=0.882,r=0.404,b=0.932), with ID 6d1d2a2d-6b64-434d-94a7-941377fdaf41 -->



In [111]:
filtered_chunks

[Document(metadata={'Header 2': 'Page Header'}, page_content='The image displays the logo of Kärcher, a company known for its cleaning equipment. The logo consists of the word "KÄRCHER" in bold, black uppercase letters. Below the text, there is a yellow horizontal bar. The background is white, providing contrast to the black text and yellow bar. <!--'),
 Document(metadata={'Header 2': 'Page Header'}, page_content='contrast to the black text and yellow bar. <!-- page_header, from page 0 (l=0.062,t=0.031,r=0.223,b=0.063), with ID b870019e-e912-44a9-b958-687bea146dcc -->'),
 Document(metadata={'Header 1': 'K 7 POWER FLEX HOME <!-- title, from page 0 (l=0.065,t=0.080,r=0.436,b=0.100), with ID 7417f6ca-fa9f-401a-9b38-6819c6e6fbec -->', 'Header 2': 'Text'}, page_content='Für mehr Power: K 7 Power Flex Home Hochdruckreiniger mit PremiumFlex-Schlauch, G 180 Q-Pistole für hartnäckige Verschmutzungen rund ums Haus. Inkl. Home Kit. <!-- text, from page 0 (l=0.064,t=0.121,r=0.850,b=0.148), with ID

In [112]:
from copy import deepcopy


def update_documents_with_headers(chunks):
    """
    Creates a new list of Document objects with page_content prepended with headers
    in [Header1/Header2/Header3]: format
    
    Returns new objects rather than modifying the original chunks
    """
    updated_chunks = []
    max_depth=3 
    
    for doc in chunks:
        # Create a deep copy of the document to avoid modifying the original
        new_doc = deepcopy(doc)
        
        # Get all headers that exist in metadata
        headers = []
        for i in range(1, max_depth + 1):
            key = f'Header {i}'
            if key in new_doc.metadata:
                headers.append(f"{key}: {new_doc.metadata[key]}")
        
        # Create the header prefix and update page_content
        if headers:
            prefix = f"[{'/'.join(headers)}]: "
            new_doc.page_content = prefix + "\n" + new_doc.page_content
        
        updated_chunks.append(new_doc)
    
    return updated_chunks


docs = update_documents_with_headers(filtered_chunks)

In [119]:
import re

product_names = [
 'K 7 Premium Smart Control Flex eco!B',
 'K 7 Premium Smart Control Flex Home',
 'K 7 Premium Power Flex',
 'K 7 Premium Power Flex Home',
 'K 7 Premium Smart Control Flex',
 'K 7 Smart Control Flex eco!Booster',
 'K 7 Smart Control Flex Home',
 'K 7 Power Flex',
 'K 7 Power Flex Home',
 'K 7 Smart Control Flex',
 'K 7 WCM Premium Home',
 'K 7 WCM',
 'K 7 WCM Car&Home',
 'K 7 WCM FJ',
 'K 7 WCM Premium',
 'K 5 Premium Smart Control Flex eco!Booster',
 'K 5 Power Control Flex',
 'K 5 Power Control Flex Home',
 'K 5 WCM',
 'K 5 WCM Premium',
 'K 5 WCM Premium Home',
 'K 5 Classic',
 'K 5 Classic Car & Home',
 'K 5 Classic Home',
 'K 5 FJ',
 'K 5 FJ Home',
 'K 5 Premium Smart Control Flex Home',
 'K 5 Power Control Flex Home&Brush Anniversary Edition',
 'K 5 Premium Power Control Flex',
 'K 5 Premium Power Control Flex Home',
 'K 5 Premium Smart Control Flex',
 'K 5 Smart Control Flex eco!Booster',
 'K 4 Premium Power Control Flex',
 'K 4 FJ',
 'K 4 FJ Home',
 'K 4 WCM Premium',
 'K 4 Classic',
 'K 4 Classic Car',
 'K 4 Classic Home',
 'K 4 WCM Premium Home',
 'K Silent Anniversary Edition',
 'K Silent eco!Booster',
 'K 4 Power Control Flex',
 'K 4 Power Control Flex Home',
 'K 4 Premium Power Control Flex Home',
 'K 4 WCM',
 'K 3 Power Control',
 'K 3 Classic',
 'K 3 Classic Car',
 'K 3 FJ',
 'K 3 FJ Home',
 'K 3 Horizontal Plus Home',
 'K 3 Power Control Home T 5',
 'K 3 Premium Power Control',
 'K 3 Horizontal Plus',
 'K 2 Premium FJ',
 'K 2 Battery',
 'K 2 Classic',
 'K 2 Power Control',
 'K 2 Power Control Car & Home',
 'K 2 Premium FJ Home',
 'K 2 Premium Horizontal VPS Home',
 'K 2 Battery Set',
 'K 2 Horizontal VPS',
 'K 2 Power Control Home',
 'K 2 Universal Edition',
 'K 2 Universal Edition Home',
 'K Mini'
]


def clean_content(text):


    text = re.sub(r"http\S+", "", text)  # Remove URLs
    text = re.sub(r"\s+", " ", text).strip()
    # Step 1: Temporarily replace product names with placeholders
    placeholders = {}
    for i, name in enumerate(product_names):
        placeholder = f"__PRODUCT_{i}__"
        placeholders[placeholder] = name
        text = text.replace(name, placeholder)

    # Step 2: Clean the rest of the text (lowercase, remove links, etc.)
    text = text.lower()
    

    # Step 3: Restore product names with original casing
    for placeholder, name in placeholders.items():
        text = text.replace(placeholder, name)

    return text


In [120]:
# Apply clean_content to each document's page_content
for doc in docs:
    doc.page_content = clean_content(doc.page_content)


In [121]:
for doc in docs:
    print(doc.page_content, end="\n\n")

[header 2: page header]: the image displays the logo of kärcher, a company known for its cleaning equipment. the logo consists of the word "kärcher" in bold, black uppercase letters. below the text, there is a yellow horizontal bar. the background is white, providing contrast to the black text and yellow bar. <!--

[header 2: page header]: contrast to the black text and yellow bar. <!-- page_header, from page 0 (l=0.062,t=0.031,r=0.223,b=0.063), with id b870019e-e912-44a9-b958-687bea146dcc -->

[header 1: k 7 power flex home <!-- title, from page 0 (l=0.065,t=0.080,r=0.436,b=0.100), with id 7417f6ca-fa9f-401a-9b38-6819c6e6fbec -->/header 2: text]: für mehr power: __product_7__ home hochdruckreiniger mit premiumflex-schlauch, g 180 q-pistole für hartnäckige verschmutzungen rund ums haus. inkl. home kit. <!-- text, from page 0 (l=0.064,t=0.121,r=0.850,b=0.148), with id 9ee6b603-ece9-4526-85da-a653334c1b50 -->

[header 1: k 7 power flex home <!-- title, from page 0 (l=0.065,t=0.080,r=0.43

In [59]:
from langchain_ibm import WatsonxEmbeddings


embed_params = {}

watsonx_embedding = WatsonxEmbeddings(
    model_id="ibm/granite-embedding-278m-multilingual",
    url= WX_API_URL,
    project_id=WX_PROJECT_ID,
    apikey=WX_API_KEY,
    params=embed_params,
)

In [60]:
from langchain_chroma import Chroma


local_vector_db = Chroma.from_documents(
    collection_name="my_collection",
    embedding=watsonx_embedding,
    persist_directory="my_vector_db", # This will save the vector database to disk! Delete it if you want to start fresh.
    documents=docs,
    
)

In [61]:
# Use the vectorstore as a retriever
retriever = local_vector_db.as_retriever(
    search_type="similarity",
    search_kwargs={
        "k": 3,
    }
)

In [71]:
# Retrieve the most similar text
retrieved_documents = retriever.invoke("What is the price in € of the product K 7 Power Flex Home")

for document in retrieved_documents:
    print(f"{'#' * 80}\nID: {document.id}")
    first_n_of_content = document.page_content[:500].replace('\n\n', ' ')
    print(f"Content: {first_n_of_content}\n")

################################################################################
ID: 81a3864a-d4f6-4692-8776-2a2a2c8e8735
Content: [header 1: k 7 power flex home /header 2: price information]: 
- **price**: 504,99 €
- **includes**: inkl. mwst.
- **delivery**: kostenlose lieferung ab 50 €

################################################################################
ID: 61688763-bb05-490e-9e8d-be75eed289a5
Content: [header 1: k 7 power flex home /header 2: price information]: 
- **price**: 694,99 €
- **includes**: mwst. (vat)
- **delivery**: kostenlose lieferung ab 50 € (free delivery from 50 €)

################################################################################
ID: a53525f5-9676-4eac-86e9-d4c5661ca1a2
Content: [header 1: k 7 power flex home /header 2: price information]: 
- **price**: 624,99 €
- **includes**: mwst. (vat)
- **delivery**: kostenlose lieferung ab 50 € (free delivery from 50 €)



In [63]:
from langchain.prompts import PromptTemplate

template = """You are an assistant helping to complete missing fields in a product database by extracting accurate information from documentation.

Each question corresponds to a missing field in the product's entry (e.g., price, description, accessories). Use the retrieved context to find and return the correct value.

If the context does not contain the necessary information, respond with "Unknown". Keep answers factual, concise, and suitable for filling into a CSV cell. Do not speculate or include explanations.

Question:
{question}

Context: 
{context}

Answer:
"""
prompt = PromptTemplate.from_template(template)

In [67]:
prompt.invoke(
    input={
        "question": "What is the price in € of the product K 7 Premium Power Flex?",
        "context": retrieved_documents[0].page_content,
        
    }
)

StringPromptValue(text='You are an assistant helping to complete missing fields in a product database by extracting accurate information from documentation.\n\nEach question corresponds to a missing field in the product\'s entry (e.g., price, description, accessories). Use the retrieved context to find and return the correct value.\n\nIf the context does not contain the necessary information, respond with "Unknown". Keep answers factual, concise, and suitable for filling into a CSV cell. Do not speculate or include explanations.\n\nQuestion:\nWhat is the price in € of the product K 7 Premium Power Flex?\n\nContext: \n[header 1: k 7 power flex home /header 2: price information]: \n- **price**: 504,99 €\n- **includes**: inkl. mwst.\n- **delivery**: kostenlose lieferung ab 50 €\n\nAnswer:\n')

In [70]:
question = "What is the price in € of the product K 7 Power Flex Home?"

retrieved_docs = local_vector_db.similarity_search(question)
docs_content = "\n\n".join(f"Document {i+1}:\n{doc.page_content}" for i, doc in enumerate(retrieved_docs))
formated_prompt = prompt.invoke({"question": question, "context": docs_content})

print(formated_prompt.to_string()[:1000])

You are an assistant helping to complete missing fields in a product database by extracting accurate information from documentation.

Each question corresponds to a missing field in the product's entry (e.g., price, description, accessories). Use the retrieved context to find and return the correct value.

If the context does not contain the necessary information, respond with "Unknown". Keep answers factual, concise, and suitable for filling into a CSV cell. Do not speculate or include explanations.

Question:
What is the price in € of the product K 7 Power Flex Home?

Context: 
Document 1:
[header 1: k 7 power flex home /header 2: price information]: 
- **price**: 504,99 €
- **includes**: inkl. mwst.
- **delivery**: kostenlose lieferung ab 50 €

Document 2:
[header 1: k 7 power flex home /header 2: price information]: 
- **price**: 694,99 €
- **includes**: mwst. (vat)
- **delivery**: kostenlose lieferung ab 50 € (free delivery from 50 €)

Document 3:
[header 1: k 7 power flex home /h

In [69]:
answer = llm.invoke(formated_prompt)

print(answer)

The price in € of the product K 7 Premium Power Flex Home is 504.99.


In [87]:
df = pd.read_csv(r"C:\Users\charl\Documents\CBS\Code Projects_Python\AIML25\AIML25_Project\data\SAP_Produktstammdaten_vfinal.csv", sep=",")

In [92]:
print(df["Produktname"].unique()) 

['K 7 Premium Smart Control Flex eco!B'
 'K 7 Premium Smart Control Flex Home' 'K 7 Premium Power Flex'
 'K 7 Premium Power Flex Home' 'K 7 Premium Smart Control Flex'
 'K 7 Smart Control Flex eco!Booster' 'K 7 Smart Control Flex Home'
 'K 7 Power Flex' 'K 7 Power Flex Home' 'K 7 Smart Control Flex'
 'K 7 WCM Premium Home' 'K 7 WCM' 'K 7 WCM Car&Home' 'K 7 WCM FJ'
 'K 7 WCM Premium' 'K 5 Premium Smart Control Flex eco!Booster'
 'K 5 Power Control Flex' 'K 5 Power Control Flex Home' 'K 5 WCM'
 'K 5 WCM Premium' 'K 5 WCM Premium Home' 'K 5 Classic'
 'K 5 Classic Car & Home' 'K 5 Classic Home' 'K 5 FJ' 'K 5 FJ Home'
 'K 5 Premium Smart Control Flex Home'
 'K 5 Power Control Flex Home&Brush Anniversary Edition'
 'K 5 Premium Power Control Flex' 'K 5 Premium Power Control Flex Home'
 'K 5 Premium Smart Control Flex' 'K 5 Smart Control Flex eco!Booster'
 'K 4 Premium Power Control Flex' 'K 4 FJ' 'K 4 FJ Home' 'K 4 WCM Premium'
 'K 4 Classic' 'K 4 Classic Car' 'K 4 Classic Home' 'K 4 WCM Prem

In [41]:
def query_product_info(product_name, field, markdown_chunks):
    relevant_docs = find_relevant_docs(product_name, markdown_chunks)
    if not relevant_docs:
        return "No documentation found mentioning this product.", ""

    context = "\n\n".join([doc.page_content for doc in relevant_docs])
    context = context[:8000]  # Truncate for token limit

    prompt = SYSTEM_PROMPT.format(
        product_name=product_name,
        context=context,
        field=field
    )

    response = model.generate(prompt)
    answer = response["results"][0]["generated_text"].strip()
    return answer, context


In [42]:
missing_fields = []

for idx, row in df.iterrows():
    product_name = row["Produktname"]
    
    for field in df.columns:
        value = row[field]
        if pd.isna(value) or str(value).strip().lower() in ["", "n/a", "–", "nicht verfügbar", "unbekannt", "nicht angegeben"]:
            missing_fields.append((idx, product_name, field))

# Display all missing values
for idx, product_name, field in missing_fields:
    print(f"Row {idx} – {product_name} is missing: {field}")


Row 2 – K 7 Premium Power Flex is missing: Preis (€ inkl. MwSt.)
Row 25 – K 5 FJ Home is missing: Preis (€ inkl. MwSt.)
Row 29 – K 5 Premium Power Control Flex Home is missing: Preis (€ inkl. MwSt.)
Row 35 – K 4 WCM Premium is missing: Anschlusskabel (m)
Row 36 – K 4 Classic is missing: Preis (€ inkl. MwSt.)
Row 36 – K 4 Classic is missing: Lieferzeit
Row 38 – K 4 Classic Home is missing: Preis (€ inkl. MwSt.)
Row 38 – K 4 Classic Home is missing: Lieferzeit
Row 39 – K 4 WCM Premium Home is missing: Anschlusskabel (m)
Row 47 – K 3 Classic is missing: Preis (€ inkl. MwSt.)
Row 47 – K 3 Classic is missing: Lieferzeit
Row 54 – K 3 Horizontal Plus is missing: Preis (€ inkl. MwSt.)
Row 54 – K 3 Horizontal Plus is missing: Lieferzeit
Row 56 – K 2 Battery is missing: Flächenleistung (m²/h)
Row 56 – K 2 Battery is missing: Anschlussleistung (kW)
Row 61 – K 2 Premium Horizontal VPS Home is missing: Anschlussleistung (kW)
Row 61 – K 2 Premium Horizontal VPS Home is missing: Anschlusskabel (m)
Ro

In [43]:
def find_relevant_docs(product_name, field, markdown_chunks):
    field_lower = field.lower()
    product_lower = product_name.lower()

    relevant = []
    for chunk in markdown_chunks:
        text = chunk.page_content.lower()
        
        if product_lower in text and field_lower in text:
            relevant.append(chunk)

    return relevant
