## 1. LLM

In [141]:
# imports for the project

import pandas as pd
from decouple import config
from ibm_watsonx_ai import APIClient
from ibm_watsonx_ai import Credentials
from ibm_watsonx_ai.foundation_models import ModelInference
from dotenv import load_dotenv
from sklearn.metrics import classification_report 
from tqdm import tqdm
from ibm_watsonx_ai.foundation_models.schema import TextGenParameters

In [142]:
import os
print(os.path.exists(".env")) 

True


In [143]:
# Load .env file
load_dotenv()

# Get the API key
WX_API_KEY = os.getenv('WX_API_KEY')

if WX_API_KEY:
    print("API Key Loaded Successfully")
else:
    print("API Key Not Found! Check your .env file.")


API Key Loaded Successfully


In [144]:
credentials = Credentials(
    url = "https://us-south.ml.cloud.ibm.com",
    api_key = WX_API_KEY,
)

client = APIClient(
    credentials=credentials, 
    project_id="c109b911-7caf-4fe6-b111-b766c2039951"
)

## 2. Load csv file

In [145]:
df = pd.read_csv(r"C:\Users\charl\Documents\CBS\Code Projects_Python\AIML25\AIML25_Project\data\SAP_Produktstammdaten_vfinal.csv", sep=",")

In [146]:
missing_fields = []

for idx, row in df.iterrows():
    product_name = row["Produktname"]
    
    for field in df.columns:
        value = row[field]
        if pd.isna(value) or str(value).strip().lower() in ["", "n/a", "–", "nicht verfügbar", "unbekannt", "nicht angegeben"]:
            missing_fields.append((idx, product_name, field))

# Display all missing values
for idx, product_name, field in missing_fields:
    print(f"Row {idx} – {product_name} is missing: {field}")


Row 2 – K 7 Premium Power Flex is missing: Preis (€ inkl. MwSt.)
Row 25 – K 5 FJ Home is missing: Preis (€ inkl. MwSt.)
Row 29 – K 5 Premium Power Control Flex Home is missing: Preis (€ inkl. MwSt.)
Row 35 – K 4 WCM Premium is missing: Anschlusskabel (m)
Row 36 – K 4 Classic is missing: Preis (€ inkl. MwSt.)
Row 36 – K 4 Classic is missing: Lieferzeit
Row 38 – K 4 Classic Home is missing: Preis (€ inkl. MwSt.)
Row 38 – K 4 Classic Home is missing: Lieferzeit
Row 39 – K 4 WCM Premium Home is missing: Anschlusskabel (m)
Row 47 – K 3 Classic is missing: Preis (€ inkl. MwSt.)
Row 47 – K 3 Classic is missing: Lieferzeit
Row 54 – K 3 Horizontal Plus is missing: Preis (€ inkl. MwSt.)
Row 54 – K 3 Horizontal Plus is missing: Lieferzeit
Row 56 – K 2 Battery is missing: Flächenleistung (m²/h)
Row 56 – K 2 Battery is missing: Anschlussleistung (kW)
Row 61 – K 2 Premium Horizontal VPS Home is missing: Anschlussleistung (kW)
Row 61 – K 2 Premium Horizontal VPS Home is missing: Anschlusskabel (m)
Ro

## 3. Load markdown data

In [147]:
file_path= r"C:\Users\charl\Documents\CBS\Code Projects_Python\AIML25\AIML25_Project\consolidated.md"
markdown_data = load_markdown_file(file_path)

In [148]:
import re

def clean_markdown(markdown_data: str) -> str:
    # 1. Remove HTML comments (e.g. <!-- something -->)
    cleaned = re.sub(r'<!--.*?-->', '', markdown_data, flags=re.DOTALL)
    cleaned = re.sub(r"http\S+", "", cleaned)

    # 2. Remove duplicate or irrelevant headers (e.g. "Page Header", "Page Number", etc.)
    irrelevant_headers = [
        r'^#+\s*Page Header\s*$',
        r'^#+\s*Page Number\s*$',
        r'^#+\s*Page Footer\s*$',
        r'^\d{1,2}/\d{1,2}/\d{4},\s*\d{2}:\d{2}\s*$',
        r'^\d+/\d+\s*$'
    ]
    for pattern in irrelevant_headers:
        cleaned = re.sub(pattern, '', cleaned, flags=re.IGNORECASE | re.MULTILINE)

    # 3. Normalize spacing
    cleaned = re.sub(r'\n{2,}', '\n\n', cleaned, flags=re.IGNORECASE)  # Collapse excessive line breaks
    cleaned = re.sub(r'[ \t]+', ' ', cleaned, flags=re.IGNORECASE)     # Remove extra spaces
    cleaned = cleaned.strip()

    return cleaned


In [149]:
def load_markdown_file(file_path):
    filename = os.path.basename(file_path)  # ✅ get the file name from full path

    with open(file_path, 'r', encoding='utf-8') as f:
        raw_content = f.read()
        cleaned_content = clean_markdown(raw_content)
        product_name = filename.replace(".md", "").replace("_", " ").strip()
        
        markdown_data = [{
            "filename": filename,
            "product_name": product_name,
            "content": cleaned_content
        }]
    
    return markdown_data


In [150]:
for doc in markdown_data:
    print(f"📄 File: {doc['filename']}")
    print("-" * 60)
    print(doc['content'][:3000])  # Print first 1000 characters (optional)
    print("\n" + "=" * 60 + "\n")


📄 File: consolidated.md
------------------------------------------------------------
The image displays the logo of Kärcher, a company known for its cleaning equipment. The logo consists of the word "KÄRCHER" in bold, black uppercase letters. Below the text, there is a yellow horizontal bar. The background is white, providing contrast to the black text and yellow bar. 

# K 7 POWER FLEX HOME 

## Text

Für mehr Power: K 7 Power Flex Home Hochdruckreiniger mit PremiumFlex-Schlauch, G 180 Q-Pistole für hartnäckige Verschmutzungen rund ums Haus. Inkl. Home Kit. 

## Description

The image displays a Kärcher K7 pressure washer set, which includes several components:

- **Main Unit**: The central piece is the Kärcher K7 pressure washer, characterized by its yellow and black color scheme. It has a sturdy handle for easy maneuverability and large wheels for transport. The model name "K7" is prominently displayed on the side.

- **Accessories**:
 - **Surface Cleaner**: A circular attachment de

## 3. Parameters

In [151]:
PARAMS = TextGenParameters(
    temperature=0,              # Higher temperature means more randomness - In this case we don't want randomness
    max_new_tokens= 100,          # Maximum number of tokens to generate
    #min_new_tokens=100, # Minimum number of tokens to generate
    top_k= 10,
    top_p = 0.8,
    stop_sequences= None, # Stop generating text when these sequences are encountered
    )

model = ModelInference(
    api_client=client,
    model_id="ibm/granite-3-8b-instruct",  # We could also try a larger model!
    params=PARAMS
)

## 4. System prompt

In [152]:
SYSTEM_PROMPT = """
You are a multilingual assistant helping to complete missing information about products.

INSTRUCTIONS:
Given the product documentation below, extract the value for the missing field "{field}" for the product "{product_name}".


PRODUCT DOCUMENTATION:
{context}

ANSWER:
"""


In [153]:
def find_relevant_docs(product_name, product_docs):
    for doc in product_docs:
        print(f"Checking product name: '{product_name}' against '{doc['product_name']}'")
        if product_name.strip().lower() == doc["product_name"].strip().lower():
            return doc["content"]
    return ""


In [154]:
from rapidfuzz import fuzz

def find_relevant_docs(product_name, product_docs, threshold=85):
    """Finds relevant markdown doc block using fuzzy match."""
    best_match = ""
    best_score = 0

    for doc in product_docs:
        content = doc["content"]
        # Search all lines for best matching line to product_name
        for line in content.splitlines():
            score = fuzz.token_sort_ratio(product_name.lower(), line.strip().lower())
            if score > best_score and score > threshold:
                best_match = content
                best_score = score

    return best_match


In [155]:
for _, row in df.iterrows():  # This iterates over each row as a Series
    product_name = row["Produktname"]
    md_entry = find_relevant_docs(product_name, markdown_data)


In [172]:
def query_product_info(product_name, field, markdown_data):
    relevant_doc = find_relevant_docs(product_name, markdown_data)
    if not relevant_doc:
        return "No documentation found mentioning this product."

    context = relevant_doc[:8000]  # Just use the one doc directly
    prompt = SYSTEM_PROMPT.format(
        product_name=product_name,
        context=context,
        field=field
    )

    response = model.generate(prompt)  # Make sure your LLM client returns this format
    answer = response["results"][0]["generated_text"].strip()
    return answer


In [173]:
product_name = "K 7 Premium Power Flex"
field = "Preis (€ inkl. MwSt.)"
question = f"What is the {field} of the product {product_name}?"

answer = query_product_info(product_name, field, markdown_data)
print("Answer:", answer)


Answer: The missing field "Preis (€ inkl. MwSt.)" for the product "K 7 Premium Power Flex" is 574,99 €.


In [174]:
product_name = "K 5 FJ Home"
field = "Preis (€ inkl. MwSt.)"
question = f"What is the {field} of the product {product_name}?"

answer = query_product_info(product_name, field, markdown_data)
print("Answer:", answer)

Answer: No documentation found mentioning this product.


In [175]:
# Show the missing fields and query the LLM for each
for idx, product_name, field in missing_fields:
    # Query the LLM to fill the missing value
    print(f"\nFetching answer for missing field '{field}' for product '{product_name}'...")

    answer = query_product_info(product_name, field, markdown_data)
    
    # Show the answer from the LLM without updating the DataFrame
    print(f"Answer for product '{product_name}' ({field}):")
    print(answer)
    print("-" * 60)  # Divider for better readability


Fetching answer for missing field 'Preis (€ inkl. MwSt.)' for product 'K 7 Premium Power Flex'...
Answer for product 'K 7 Premium Power Flex' (Preis (€ inkl. MwSt.)):
The missing field "Preis (€ inkl. MwSt.)" for the product "K 7 Premium Power Flex" is 574,99 €.
------------------------------------------------------------

Fetching answer for missing field 'Preis (€ inkl. MwSt.)' for product 'K 5 FJ Home'...
Answer for product 'K 5 FJ Home' (Preis (€ inkl. MwSt.)):
No documentation found mentioning this product.
------------------------------------------------------------

Fetching answer for missing field 'Preis (€ inkl. MwSt.)' for product 'K 5 Premium Power Control Flex Home'...
Answer for product 'K 5 Premium Power Control Flex Home' (Preis (€ inkl. MwSt.)):
No documentation found mentioning this product.
------------------------------------------------------------

Fetching answer for missing field 'Anschlusskabel (m)' for product 'K 4 WCM Premium'...
Answer for product 'K 4 WCM 

In [176]:
completed_data = []

for _, row in df.iterrows():
    product_name = row["Produktname"]
    context = find_relevant_docs(product_name, markdown_data)
    
    for field in df.columns:
        value = row[field]
        if pd.isna(value) or value in ["", "N/A", "–"]:
            prompt = SYSTEM_PROMPT.format(field=field, context=context)
            response = model.generate(prompt)
            answer = response["results"][0]["generated_text"].strip()

            completed_data.append({
                "product": product_name,
                "field": field,
                "predicted_value": answer
            })


In [177]:
fields_to_check = [
    "Preis (€ inkl. MwSt.)",
    "Lieferzeit",
    "Stromart (V/Hz)",
    "Druck (bar/MPa)",
    "Fördermenge (l/h)",
    "Flächenleistung (m²/h)", 
    "Zulauftemperatur (°C)",
    "Anschlussleistung (kW)",
    "Anschlusskabel (m)",
    "Farbe",
    "Gewicht ohne Zubehör (kg)",
    "Gewicht inkl. Verpackung (kg)",
    "Abmessungen (L × B × H) (mm)",
    "Lieferumfang",
    "Ausstattung",
]


In [180]:
results = []

for _, row in df.iterrows():
    product = row["Produktname"]

    for field in fields_to_check:
        question = f"What is the {field} of {product}?"
        llm_answer = query_product_info(product, field, markdown_data)
        ground_truth = str(row[field])

        # Optional: Normalize answers
        is_correct = llm_answer.strip().lower() == ground_truth.strip().lower()

        results.append({
            "product": product,
            "field": field,
            "llm_answer": llm_answer,
            "ground_truth": ground_truth,
            "correct": is_correct
        })


In [182]:
results

[{'product': 'K 7 Premium Smart Control Flex eco!B',
  'field': 'Preis (€ inkl. MwSt.)',
  'llm_answer': 'The missing field "Preis (€ inkl. MwSt.)" for the product "K 7 Premium Smart Control Flex eco!B" is not provided in the given documentation. However, based on the similar product "K 7 Power Flex Home" which is priced at 574,99 €, it can be inferred that the price for "K 7 Premium Smart Control Flex eco!B" might be around 574,9',
  'ground_truth': '664.99',
  'correct': False},
 {'product': 'K 7 Premium Smart Control Flex eco!B',
  'field': 'Lieferzeit',
  'llm_answer': 'The missing field "Lieferzeit" for the product "K 7 Premium Smart Control Flex eco!B" is not explicitly mentioned in the provided documentation. However, based on the information available for the "K 7 Power Flex Home" product, which is similar, the delivery time is indicated as "2-3 Werktagen" (2-3 working days). Therefore, it can be inferred that the delivery time for "K 7 Premium Smart Control Flex eco!',
  'grou

In [None]:
results_df = pd.DataFrame(results)
accuracy = results_df["correct"].mean()
print(f"Overall Accuracy: {accuracy:.2%}")


Overall Accuracy: 0.00%
