# Part 2: RAG


In [None]:
from typing import Literal, Any
from copy import deepcopy

from typing_extensions import TypedDict
import matplotlib.pyplot as plt
import numpy as np
from decouple import config
from pydantic import BaseModel, Field
from IPython.display import Image, display
from tqdm import tqdm

from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters.markdown import MarkdownHeaderTextSplitter
from langchain.prompts import PromptTemplate
from langchain_ibm import WatsonxEmbeddings
from langchain_ibm import WatsonxLLM
from langgraph.graph import START, StateGraph
from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams

import litellm
from litellm import completion
import instructor
from instructor import Mode


import os

In [None]:
import warnings

# Watsonx connection settings.
#
# The API key and project id are read from the environment instead of being
# written into the notebook, so that no secret is committed with the source.
# Set WATSONX_APIKEY and WATSONX_PROJECT_ID (and optionally WATSONX_URL)
# before starting the kernel.
# NOTE(review): a key that was previously committed in this file should be
# treated as leaked and rotated.
WX_API_KEY = os.environ.get("WATSONX_APIKEY", "")
PROJECT_ID = os.environ.get("WATSONX_PROJECT_ID", "")
WX_API_URL = os.environ.get("WATSONX_URL", "https://us-south.ml.cloud.ibm.com")

# Warn right here rather than letting the first remote call fail with a less
# obvious authentication error.
_missing_settings = [
    env_name
    for env_name, value in (
        ("WATSONX_APIKEY", WX_API_KEY),
        ("WATSONX_PROJECT_ID", PROJECT_ID),
    )
    if not value
]
if _missing_settings:
    warnings.warn(
        "Missing watsonx settings, set these environment variables: "
        + ", ".join(_missing_settings),
        stacklevel=2,
    )

In [None]:
# Text-generation client for the Granite instruct model hosted on watsonx.
# Connection details come from the WX_API_URL / WX_API_KEY / PROJECT_ID
# settings defined earlier in the notebook.
llm = WatsonxLLM(
    project_id=PROJECT_ID,
    apikey=WX_API_KEY,
    url=WX_API_URL,
    model_id="ibm/granite-13b-instruct-v2",
    params={
        # NOTE(review): with "greedy" decoding the temperature is presumably
        # not used by the service; confirm before tuning it.
        GenParams.DECODING_METHOD: "greedy",
        GenParams.TEMPERATURE: 0,
        # Bounds on the length of each generated answer.
        GenParams.MIN_NEW_TOKENS: 5,
        GenParams.MAX_NEW_TOKENS: 1_000,
        GenParams.REPETITION_PENALTY: 1.2,
    },
)

In [None]:
# Smoke test: send a single prompt (generate() is given a list of prompts)
# and show both the type of the returned object and the object itself.
llm_result = llm.generate(["Hi how are you?"])

print(type(llm_result), llm_result, sep="\n")

In [None]:
from pathlib import Path

def consolidate_markdown(source_dir: Path, target_file: Path) -> "list[Path]":
    """Concatenate every ``*.md`` file in *source_dir* into *target_file*.

    Each source file is preceded by an HTML comment holding its file name and
    followed by a blank line. Files are written in sorted path order.

    *target_file* itself is never used as an input. It normally lives inside
    *source_dir* and matches ``*.md``; without this exclusion a second run
    would list the (just truncated) output as one of its own sources, adding
    a header for it and inflating the reported file count.

    Args:
        source_dir: Directory containing the individual Markdown files.
        target_file: Path of the consolidated file to (over)write.

    Returns:
        The source files that were written, in the order they were written.
    """
    target_resolved = target_file.resolve()
    sources = sorted(
        path
        for path in source_dir.glob("*.md")
        if path.resolve() != target_resolved
    )

    with target_file.open("w", encoding="utf-8") as out:
        for md in sources:
            out.write(f"<!-- ===== {md.name} ===== -->\n\n")
            out.write(md.read_text(encoding="utf-8"))
            out.write("\n\n")  # add a blank line between files

    return sources


project_root = Path().cwd()

desc_dir      = project_root / "parsed_markdown"
output_file   = project_root / "parsed_markdown" / "consolidated_markdown_data.md"

if desc_dir.is_dir():
    md_files = consolidate_markdown(desc_dir, output_file)
    print(f" Consolidated {len(md_files)} files into {output_file}")
else:
    # Nothing to consolidate: say so explicitly instead of failing inside
    # open() with a bare "No such file or directory".
    md_files = []
    print(f" Directory {desc_dir} not found - nothing was consolidated")


In [None]:
from langchain_core.documents import Document
from langchain_community.document_loaders import TextLoader

# Load the consolidated Markdown file as a LangChain document.
project_root = Path.cwd()
file_path = project_root / "parsed_markdown" / "consolidated_markdown_data.md"

# The consolidated file is written as UTF-8, so it is read back as UTF-8
# explicitly. Without the argument the loader falls back to the platform's
# default encoding, which can fail or garble non-ASCII text such as "€".
loader = TextLoader(str(file_path), encoding="utf-8")
documents = loader.load()

# Only the first loaded document is used by the following cells.
# NOTE(review): presumably the loader returns exactly one Document for the
# whole file - confirm.
document = documents[0]
print(document.metadata)

In [None]:
from langchain_text_splitters import MarkdownHeaderTextSplitter


In [None]:
# Step 1: Structural split by headers
from langchain_text_splitters import RecursiveCharacterTextSplitter


# Each tuple maps a Markdown heading marker to the metadata key under which
# the heading text of a section is stored on the resulting chunk.
headers_to_split_on = [("#", "Header 1"), ("##", "Header 2"), ("###", "Header 3")]
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
header_chunks = markdown_splitter.split_text(document.page_content)

# Step 2: size-based split inside each header chunk.
# NOTE: despite the variable name, chunk_size / chunk_overlap are measured in
# characters here (no token-based length function is configured). The name is
# kept so that other cells referring to `token_splitter` keep working.
token_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50)

# Step 3: split every header chunk and carry its header metadata over to each
# piece. split_documents() gives every resulting Document its own copy of the
# metadata, so editing one chunk's metadata cannot leak into its siblings.
final_chunks = token_splitter.split_documents(header_chunks)


In [None]:
# Drop chunks that sit under a "Page Footer" level-3 heading AND look like
# boilerplate: either very short (< 150 characters once stripped) or
# containing more than one link. Everything else is kept, in order.
filtered_chunks = [
    chunk
    for chunk in final_chunks
    if not (
        chunk.metadata.get("Header 3") == "Page Footer"
        and (
            len(chunk.page_content.strip()) < 150
            or chunk.page_content.count("http") > 1
        )
    )
]


In [None]:
# Sanity check: report every (unfiltered) chunk that mentions this price.
for i, chunk in enumerate(final_chunks):
    if "574,99 €" not in chunk.page_content:
        continue
    print(f"FOUND in chunk {i}:\n{chunk.page_content}\n")

In [None]:
# Bare expression: shows the filtered chunk list as the notebook cell output.
filtered_chunks

In [None]:
from copy import deepcopy


def update_documents_with_headers(chunks, max_depth=3):
    """Return copies of *chunks* whose text starts with their header trail.

    For every document, the metadata keys ``"Header 1"`` ... ``"Header
    <max_depth>"`` that are present are joined with ``/`` and put in front of
    the text on a line of their own, e.g.::

        [Header 1: Products/Header 3: Details]: 
        <original page_content>

    Documents without any of those metadata keys keep their text unchanged.

    Args:
        chunks: Iterable of document-like objects exposing ``metadata``
            (a mapping) and ``page_content`` (a string).
        max_depth: Deepest header level to include (defaults to 3, matching
            the three header levels used when splitting).

    Returns:
        A new list of deep-copied documents; the objects in *chunks* are
        never modified.
    """
    updated_chunks = []

    for doc in chunks:
        # Work on a deep copy so neither the text nor the metadata of the
        # caller's documents is touched.
        new_doc = deepcopy(doc)

        # Collect the headers that exist, from the outermost level inwards.
        header_keys = (f"Header {level}" for level in range(1, max_depth + 1))
        headers = [
            f"{key}: {new_doc.metadata[key]}"
            for key in header_keys
            if key in new_doc.metadata
        ]

        if headers:
            prefix = f"[{'/'.join(headers)}]: "
            new_doc.page_content = prefix + "\n" + new_doc.page_content

        updated_chunks.append(new_doc)

    return updated_chunks


# Build the working document list: copies of the filtered chunks with their
# header trail prepended to the text.
docs = update_documents_with_headers(filtered_chunks)

In [None]:
import re

# Product names whose exact casing must survive the lower-casing done by
# clean_content().
product_names = [
    # NOTE(review): this first entry looks truncated (compare the other
    # "... eco!Booster" names) - confirm; it is kept verbatim here.
    'K 7 Premium Smart Control Flex eco!B',
    'K 7 Premium Smart Control Flex Home',
    'K 7 Premium Power Flex',
    'K 7 Premium Power Flex Home',
    'K 7 Premium Smart Control Flex',
    'K 7 Smart Control Flex eco!Booster',
    'K 7 Smart Control Flex Home',
    'K 7 Power Flex',
    'K 7 Power Flex Home',
    'K 7 Smart Control Flex',
    'K 7 WCM Premium Home',
    'K 7 WCM',
    'K 7 WCM Car&Home',
    'K 7 WCM FJ',
    'K 7 WCM Premium',
    'K 5 Premium Smart Control Flex eco!Booster',
    'K 5 Power Control Flex',
    'K 5 Power Control Flex Home',
    'K 5 WCM',
    'K 5 WCM Premium',
    'K 5 WCM Premium Home',
    'K 5 Classic',
    'K 5 Classic Car & Home',
    'K 5 Classic Home',
    'K 5 FJ',
    'K 5 FJ Home',
    'K 5 Premium Smart Control Flex Home',
    'K 5 Power Control Flex Home&Brush Anniversary Edition',
    'K 5 Premium Power Control Flex',
    'K 5 Premium Power Control Flex Home',
    'K 5 Premium Smart Control Flex',
    'K 5 Smart Control Flex eco!Booster',
    'K 4 Premium Power Control Flex',
    'K 4 FJ',
    'K 4 FJ Home',
    'K 4 WCM Premium',
    'K 4 Classic',
    'K 4 Classic Car',
    'K 4 Classic Home',
    'K 4 WCM Premium Home',
    'K Silent Anniversary Edition',
    'K Silent eco!Booster',
    'K 4 Power Control Flex',
    'K 4 Power Control Flex Home',
    'K 4 Premium Power Control Flex Home',
    'K 4 WCM',
    'K 3 Power Control',
    'K 3 Classic',
    'K 3 Classic Car',
    'K 3 FJ',
    'K 3 FJ Home',
    'K 3 Horizontal Plus Home',
    'K 3 Power Control Home T 5',
    'K 3 Premium Power Control',
    'K 3 Horizontal Plus',
    'K 2 Premium FJ',
    'K 2 Battery',
    'K 2 Classic',
    'K 2 Power Control',
    'K 2 Power Control Car & Home',
    'K 2 Premium FJ Home',
    'K 2 Premium Horizontal VPS Home',
    'K 2 Battery Set',
    'K 2 Horizontal VPS',
    'K 2 Power Control Home',
    'K 2 Universal Edition',
    'K 2 Universal Edition Home',
    'K Mini',
]

# One alternation over all product names, longest first: the regex engine
# tries alternatives in order, so "K 7 Premium Power Flex Home" is matched as
# a whole instead of stopping at its prefix "K 7 Premium Power Flex".
_PRODUCT_NAME_RE = re.compile(
    "|".join(
        re.escape(name)
        for name in sorted(product_names, key=len, reverse=True)
    )
)


def clean_content(text):
    """Normalise chunk text while keeping product names in original casing.

    Steps, in order:
      1. remove URLs (anything starting with ``http`` up to the next
         whitespace);
      2. collapse every run of whitespace to a single space and strip the
         ends;
      3. lower-case everything except exact occurrences of the entries in
         ``product_names``.

    Product names are protected by matching them directly and lower-casing
    only the text between matches. No placeholder tokens are inserted, so
    nothing can be left behind in the output by the lower-casing step.

    Args:
        text: Raw chunk text.

    Returns:
        The cleaned text.
    """
    text = re.sub(r"http\S+", "", text)  # Remove URLs
    text = re.sub(r"\s+", " ", text).strip()

    pieces = []
    position = 0
    for match in _PRODUCT_NAME_RE.finditer(text):
        pieces.append(text[position:match.start()].lower())
        pieces.append(match.group(0))  # product name: keep casing as-is
        position = match.end()
    pieces.append(text[position:].lower())

    return "".join(pieces)


In [None]:
# Apply clean_content to each document's page_content
# Normalise the text of every document in place (the Document objects in
# `docs` are reused; only their page_content is replaced).
for doc in docs:
    cleaned_text = clean_content(doc.page_content)
    doc.page_content = cleaned_text

In [None]:
# Dump every cleaned document, separated by one empty line, for inspection.
for doc in docs:
    print(doc.page_content)
    print()

In [None]:
from langchain_ibm import WatsonxEmbeddings


# No embedding parameters are overridden; the empty dict is passed through
# unchanged.
embed_params = {}

# Embedding client for the multilingual Granite embedding model, using the
# same watsonx endpoint, project and API key as the text-generation client.
watsonx_embedding = WatsonxEmbeddings(
    project_id=PROJECT_ID,
    apikey=WX_API_KEY,
    url=WX_API_URL,
    model_id="ibm/granite-embedding-278m-multilingual",
    params=embed_params,
)