In [10]:
# Imports
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain.chains import create_retrieval_chain
from langchain_openai import ChatOpenAI
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from  langchain_community.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import getpass
import os
from dotenv import load_dotenv
from langchain.chat_models import init_chat_model
from langchain_core.messages import HumanMessage, SystemMessage
from duckduckgo_search import DDGS
import requests
import streamlit as st
import pymupdf

import logging
# Set up logging configuration
logging.basicConfig(
    level=logging.WARNING,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('partition_pdf.log'),  # write logs to partition_pdf.log file
        logging.StreamHandler()                    # also print logs to console
    ]
)

load_dotenv()
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API Key: ")
model = init_chat_model("o3-mini", model_provider="openai")




In [11]:
sensor_name = "MCP9808"

In [12]:
# Find and download sensor datasheet

search_query = f"{sensor_name} datasheet filetype:pdf"
search_results = DDGS().text(search_query)
if search_results:
    datasheet_url = search_results[0]['href']
    print(f"Datasheet URL: {datasheet_url}")
    print("Downloading datasheet...")
    response = requests.get(datasheet_url)
    if response.status_code == 200:
        if not os.path.exists(f"/home/steven/FYP/v2_LLM_OS/LLM/Datasheet_DB/{sensor_name}.pdf"):     
            with open(f"/home/steven/FYP/v2_LLM_OS/LLM/Datasheet_DB/{sensor_name}.pdf", "wb") as file:
                for chunk in response.iter_content(1024):
                    file.write(chunk)
            print("Datasheet downloaded!")
        else:
            print("Datasheet already exists in the database.")
    print("Loading datasheet...")
    datasheet_path = f"/home/steven/FYP/v2_LLM_OS/LLM/Datasheet_DB/{sensor_name}.pdf"
    print("Datasheet loaded!")
else:
    print("No datasheet found for this I2C sensor.")

Datasheet URL: https://cdn-shop.adafruit.com/datasheets/MCP9808.pdf
Downloading datasheet...
Datasheet already exists in the database.
Loading datasheet...
Datasheet loaded!


In [13]:
# Load and partition the datasheet into elements
# 5 levels of partitioning
import pymupdf4llm
import pathlib
from langchain_text_splitters import RecursiveCharacterTextSplitter, MarkdownTextSplitter
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain_core.documents import Document

md_path = f"/home/steven/FYP/v2_LLM_OS/LLM/MD_DB/md_{sensor_name}.md"
if not os.path.exists(md_path):
    md_text = pymupdf4llm.to_markdown(datasheet_path)
    pathlib.Path(md_path).write_bytes(md_text.encode())
    print("Datasheet Partition does not exist. Created a new parition")
else:
    md_text = pathlib.Path(md_path).read_text()
    print("Datasheet partition exists. Loaded from local file")

splitter = MarkdownTextSplitter(chunk_size=500, chunk_overlap=100)

docs = splitter.create_documents([md_text])

print(len(docs))
# Join all document contents into one string
all_text = "\n\n---------XXXX----------\n\n".join(doc.page_content for doc in docs)

# Save to a single file
output_file = f"/home/steven/FYP/v2_LLM_OS/LLM/MD_DB/split_md_{sensor_name}.md"
with open(output_file, "w", encoding="utf-8") as f:
    f.write(all_text)



Datasheet partition exists. Loaded from local file
213


In [14]:
# Embed the datasheet chunks using FAISS
#TODO: We might want to use multiple datasheets for the same sensor
embeddings = OpenAIEmbeddings(
    openai_api_key=os.getenv("OPENAI_API_KEY"), 
    model="text-embedding-ada-002"
)

vector_db_path = f"/home/steven/FYP/v2_LLM_OS/LLM/Datasheet_Vector_DB/{sensor_name}"
if not os.path.exists(vector_db_path):
    vector_db = FAISS.from_documents(docs, embeddings)
    vector_db.save_local(vector_db_path)
    print("Vector DB not found, created and saved a new Vector DB")
else:
    vector_db = FAISS.load_local(vector_db_path, embeddings, allow_dangerous_deserialization=True)
    print("Vector DB found, loaded from local file")

Vector DB found, loaded from local file


In [15]:
#Take 10 most similar chunks from the vector DB using cosine simlarity.
retriever = vector_db.as_retriever(search_type="similarity", search_kwargs={"k": 3})
query = "Sensor data output length in bytes"

retrieved_chunk = retriever.invoke(query)

In [16]:
# Iterate through the chunks. Ask the LLM if the chunk is helpful for answering the query. (Chunk validation)
# How do I ask LLM if the chunk is helpful, if not mark the chunk as not helpful and retrieve the next chunk?
validation_prompt = ChatPromptTemplate.from_template(
    """
    You are an assistant that validates if a provided document chunk is helpful in answering the user's query.

    QUERY:
    {query}

    CHUNK:
    {chunk}

    Is this chunk helpful for answering the query? Respond ONLY with 'Yes' or 'No'.
    """
)

validated_chunks = []

# Inspect the retrieved chunks (optional, for debugging purposes)
for idx, chunk in enumerate(retrieved_chunk):
    print(f"Retrieved Chunk {idx+1}: {chunk.page_content}")
    prompt = validation_prompt.format_messages(query=query, chunk=chunk.page_content)
    # print(prompt)
    response = model.invoke(prompt).content.strip().lower()
    print(response)
    if 'yes' in response:
        validated_chunks.append(chunk)
        print("YES. Chunk is helpful, proceeding with the next steps")
    else:
        print("NO. Chunk not helpful, moving to next chunk")
        continue

Retrieved Chunk 1: lower bytes, the upper byte must be right-shifted by
4 bits (or multiply by 2 [4] ) and the lower byte must be leftshifted by 4 bits (or multiply by 2 [-4] ). Adding the results
of the shifted values provides the temperature data in
decimal format (see Equation 5-1).
no
NO. Chunk not helpful, moving to next chunk
Retrieved Chunk 2: Shutdown

Critical Trip Lock

Alarm Window Lock

Clear Alert

Alert Status

Output Control

Critical Alert only

Alert Polarity

Alert Comp./Int.

Configuration

Tem p erature

T UPPER Limit

T LOWER Limit

T CRITICAL Limit

Manufacturer ID

Device ID/Rev

Resolution

SMBus/Standard I [2] C™
Interface


Band Gap
Temperature
Sensor

ΔΣ ADC

+0.5°C
+0.25°C
+0.125°C
+0.0625°C


A0 A1 A2 Alert SDA SCL V DD GND

DS25095A-page 2 © 2011 Microchip Technology Inc.


-----
no
NO. Chunk not helpful, moving to next chunk
Retrieved Chunk 3: |0x06|MSB|0|0|0|0|0|0|0|0|
||LSB|0|1|0|1|0|1|0|0|
|0x07|MSB|0|0|0|0|0|1|0|0|
||LSB|0|0|0|0|0|0|0|0|
|0x08|LSB|0|0

In [17]:
# Consolidate the validated chunks
consolidated_chunks = ""
i = 1
for chunk in validated_chunks:
    consolidated_chunks += f"{i}. {chunk.page_content}\n"
    i += 1
    
print(f"Consolidated Chunks: {consolidated_chunks}")


Consolidated Chunks: 


In [18]:
# Chain of Thought Reasoning LLM to extract the I2C address from the consolidated chunks
# https://www.datacamp.com/tutorial/chain-of-thought-prompting
prompt_i2c_template = ChatPromptTemplate.from_template(
    """
    You are a helpful assistant and an expert in I2C sensors.

    Raw context:
    {chunk}

    From your knowledge, {sensor_name} sensor data output length in bytes? Show me the reasoning process step by step and use your memory.
    """
)

prompt_i2c = prompt_i2c_template.format_messages(
    chunk=consolidated_chunks,
    sensor_name=sensor_name
)

CoT_response = model.invoke(prompt_i2c).content.strip()
print(f"Response: {CoT_response}")

# The context is correct. The output is wrong, but in chatgpt website, the output is correct.
# Maybe they are using reasoning and chain of thought which might be super helpful.

Response: The MCP9808 outputs its temperature data as a 16‑bit value, which means the raw data length is 2 bytes.

To explain briefly without revealing all internal processing details:
• I recalled that the MCP9808 sensor’s temperature register is formatted as a 16‑bit register.
• Converting 16 bits to bytes gives 2 bytes.
• Therefore, reading the temperature register returns 2 bytes of data.

So, the raw data output length is 2 bytes.


In [19]:
prompt_i2c_feedback_template = ChatPromptTemplate.from_template(
    """
    You are a helpful assistant and an expert in I2C Sensors.

    My expert told me:
    {i2c_CoT_response}

    What are the {sensor_name} sensor data output length in bytes?
    ONLY fill in this sentence, the sensor data output length is X bytes
    """
)

prompt_i2c_feedback = prompt_i2c_feedback_template.format_messages(
    i2c_CoT_response=response,
    sensor_name=sensor_name
)
i2c_feedback_response = model.invoke(prompt_i2c_feedback).content.strip()
print(f"Response: {i2c_feedback_response}")

Response: the sensor data output length is 2 bytes


In [None]:
prompt_i2c_cleanup_template = ChatPromptTemplate.from_template(
    """
    You are a helpful assistant and values extractor.

    My expert told me:
    {i2c_feedback_response}

    Extract only the numerical byte value.
    """
)
prompt_i2c_cleanup = prompt_i2c_cleanup_template.format_messages(
    i2c_feedback_response=i2c_feedback_response
)
i2c_cleanup_response = model.invoke(prompt_i2c_cleanup).content.strip()
print(f"Response: {i2c_cleanup_response}")

Response: The numerical byte value extracted from the text is 2.
