In [15]:
# Imports
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain.chains import create_retrieval_chain
from langchain_openai import ChatOpenAI
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from  langchain_community.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import getpass
import os
from dotenv import load_dotenv
from langchain.chat_models import init_chat_model
from langchain_core.messages import HumanMessage, SystemMessage
from duckduckgo_search import DDGS
import requests
import streamlit as st
import pymupdf

import logging
# Set up logging configuration
logging.basicConfig(
    level=logging.WARNING,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('partition_pdf.log'),  # write logs to partition_pdf.log file
        logging.StreamHandler()                    # also print logs to console
    ]
)

load_dotenv()
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API Key: ")
model = init_chat_model("o3-mini", model_provider="openai")




In [16]:
sensor_name = "MCP9808"
size = '2'

In [17]:
# Find and download sensor datasheet

search_query = f"{sensor_name} datasheet filetype:pdf"
search_results = DDGS().text(search_query)
if search_results:
    datasheet_url = search_results[0]['href']
    print(f"Datasheet URL: {datasheet_url}")
    print("Downloading datasheet...")
    response = requests.get(datasheet_url)
    if response.status_code == 200:
        if not os.path.exists(f"/home/steven/FYP/v2_LLM_OS/LLM/Datasheet_DB/{sensor_name}.pdf"):     
            with open(f"/home/steven/FYP/v2_LLM_OS/LLM/Datasheet_DB/{sensor_name}.pdf", "wb") as file:
                for chunk in response.iter_content(1024):
                    file.write(chunk)
            print("Datasheet downloaded!")
        else:
            print("Datasheet already exists in the database.")
    print("Loading datasheet...")
    datasheet_path = f"/home/steven/FYP/v2_LLM_OS/LLM/Datasheet_DB/{sensor_name}.pdf"
    print("Datasheet loaded!")
else:
    print("No datasheet found for this I2C sensor.")

Datasheet URL: https://cdn-shop.adafruit.com/datasheets/MCP9808.pdf
Downloading datasheet...
Datasheet already exists in the database.
Loading datasheet...
Datasheet loaded!


In [18]:
# Load and partition the datasheet into elements
# 5 levels of partitioning
import pymupdf4llm
import pathlib
from langchain_text_splitters import RecursiveCharacterTextSplitter, MarkdownTextSplitter
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain_core.documents import Document

md_path = f"/home/steven/FYP/v2_LLM_OS/LLM/MD_DB/md_{sensor_name}.md"
if not os.path.exists(md_path):
    md_text = pymupdf4llm.to_markdown(datasheet_path)
    pathlib.Path(md_path).write_bytes(md_text.encode())
    print("Datasheet Partition does not exist. Created a new parition")
else:
    md_text = pathlib.Path(md_path).read_text()
    print("Datasheet partition exists. Loaded from local file")

splitter = MarkdownTextSplitter(chunk_size=500, chunk_overlap=100)

docs = splitter.create_documents([md_text])

print(len(docs))
# Join all document contents into one string
all_text = "\n\n---------XXXX----------\n\n".join(doc.page_content for doc in docs)

# Save to a single file
output_file = f"/home/steven/FYP/v2_LLM_OS/LLM/MD_DB/split_md_{sensor_name}.md"
with open(output_file, "w", encoding="utf-8") as f:
    f.write(all_text)



Datasheet partition exists. Loaded from local file
213


In [19]:
# Embed the datasheet chunks using FAISS
#TODO: We might want to use multiple datasheets for the same sensor
embeddings = OpenAIEmbeddings(
    openai_api_key=os.getenv("OPENAI_API_KEY"), 
    model="text-embedding-ada-002"
)

vector_db_path = f"/home/steven/FYP/v2_LLM_OS/LLM/Datasheet_Vector_DB/{sensor_name}"
if not os.path.exists(vector_db_path):
    vector_db = FAISS.from_documents(docs, embeddings)
    vector_db.save_local(vector_db_path)
    print("Vector DB not found, created and saved a new Vector DB")
else:
    vector_db = FAISS.load_local(vector_db_path, embeddings, allow_dangerous_deserialization=True)
    print("Vector DB found, loaded from local file")

Vector DB found, loaded from local file


In [20]:
#Take 10 most similar chunks from the vector DB using cosine simlarity.
retriever = vector_db.as_retriever(search_type="similarity", search_kwargs={"k": 3})
query = "sensor measurement data"
retrieved_chunk = retriever.invoke(query)

In [21]:
# Iterate through the chunks. Ask the LLM if the chunk is helpful for answering the query. (Chunk validation)
# How do I ask LLM if the chunk is helpful, if not mark the chunk as not helpful and retrieve the next chunk?
validation_prompt = ChatPromptTemplate.from_template(
    """
    You are an assistant that validates if a provided document chunk is helpful in answering the user's query.

    QUERY:
    {query}

    CHUNK:
    {chunk}

    Is this chunk helpful for answering the query? Respond ONLY with 'Yes' or 'No'.
    """
)

validated_chunks = []

# Inspect the retrieved chunks (optional, for debugging purposes)
for idx, chunk in enumerate(retrieved_chunk):
    print(f"Retrieved Chunk {idx+1}: {chunk.page_content}")
    prompt = validation_prompt.format_messages(query=query, chunk=chunk.page_content)
    # print(prompt)
    response = model.invoke(prompt).content.strip().lower()
    print(response)
    if 'yes' in response:
        validated_chunks.append(chunk)
        print("YES. Chunk is helpful, proceeding with the next steps")
    else:
        print("NO. Chunk not helpful, moving to next chunk")
        continue

Retrieved Chunk 1: |SENSOR SERIAL INTERFACE TIMING SPECIFICATIONS|Col2|Col3|Col4|Col5|Col6|
|---|---|---|---|---|---|
|Electrical Specifications: Unless otherwise indicated, V = 2.7V to 5.5V, T = -40°C to +125°C, GND = Ground DD A and C = 80 pF. (Note 1) L||||||
|Parameters|Sym|Min|Max|Units|Conditions|
|2-Wire SMBus/Standard Mode I2C™ Compatible Interface (Note 1)||||||
|Serial Port Clock Frequency|f SC|0|400|kHz|(Note 2, 4)|
|Low Clock|t LOW|1300|—|ns|(Note 2)|
|High Clock|t HIGH|600|—|ns|(Note 2)|
no
NO. Chunk not helpful, moving to next chunk
Retrieved Chunk 2: This sensor has an industry standard 400 kHz, 2-wire,
SMBus/I [2] C compatible serial interface, allowing up to
eight or sixteen sensors to be controlled with a single
serial bus (see Table 3-2 for available Address codes).
These features make the MCP9808 ideal for

sophisticated, multi-zone, temperature-monitoring
applications. **Packa g e T yp es**


**8-Pin 2x3 DFN***

SDA 1 8 V DD

SCL 2 EP 7 A0

9

Alert 3 6 A1

GND 4 5

In [22]:
# Consolidate the validated chunks
consolidated_chunks = ""
i = 1
for chunk in validated_chunks:
    consolidated_chunks += f"{i}. {chunk.page_content}\n"
    i += 1
    
print(f"Consolidated Chunks: {consolidated_chunks}")


Consolidated Chunks: 1. - Food Processing

- Personal Computers and Servers

- PC Peripherals

- Consumer Electronics

- Handheld/Portable Devices **Tem p erature Accurac y**

##### **Description**

Microchip Technology Inc.’s MCP9808 digital
temperature sensor converts temperatures between
-20°C and +100°C to a digital word with
±0.25°C/±0.5°C (typical/maximum) accuracy.



In [23]:
# Chain of Thought Reasoning LLM to extract the I2C address from the consolidated chunks
# https://www.datacamp.com/tutorial/chain-of-thought-prompting
prompt_i2c_template = ChatPromptTemplate.from_template(
    """
    You are a helpful assistant and an expert in I2C Sensors. Assume ideal and default condition.

    Raw context (might be inaccurate):
    {chunk}

    1. What physical quantities or parameters does the {sensor_name} measure?
    2. Assuming raw_bytes is a {size}-byte array received from the sensor, provide the bit range for each physical parameter in the format: ParameterName[start_bit:end_bit] (For example, Temperature[0:12])
    3. The raw_bytes is big-endian.
    4. Omit anything that is unrelated to the raw data such as alert, config, or crc.
    5. Do not describe MSB/LSB or byte-level structure, rather combine them into one parameter.
    6. Please explain your reasoning step by step, using both the context and your internal knowledge.
    """
)

prompt_i2c = prompt_i2c_template.format_messages(
    chunk=consolidated_chunks,
    sensor_name=sensor_name,
    size=size,
)

CoT_response = model.invoke(prompt_i2c).content.strip()
print(f"Response: {CoT_response}")

# The context is correct. The output is wrong, but in chatgpt website, the output is correct.
# Maybe they are using reasoning and chain of thought which might be super helpful.

Response: Below is the answer with a step‐by‐step explanation:

1. The MCP9808 sensor measures temperature.

2. When you receive the two-byte (16‑bit) big‑endian raw data from the sensor, the digital temperature value is encoded across 12 bits. In the 16‑bit word, the temperature information occupies bits 15 down to 4. In the requested format, you would express that as:  
  Temperature[15:4]

Step‐by‐step reasoning:
• From the provided context we know the sensor is a digital temperature sensor designed to measure temperature between –20°C and +100°C.

• Internal documentation (and the sensor’s datasheet) tells us that the temperature is represented with a resolution of 0.0625°C, which implies that the value is stored as a scaled 12‑bit two’s complement number within a 16‑bit register. The two bytes that come from the sensor (in big‑endian order) are combined to form this 16‑bit word.

• The sensor uses the upper 12 bits (bit positions 15 down to 4) to encode temperature, while the lowe

In [24]:
prompt_i2c_feedback_template = ChatPromptTemplate.from_template(
    """
    You are a helpful assistant and an expert in I2C Sensors.

    My expert told me:
    {i2c_CoT_response}

    How is the raw measurement values arranged in {sensor_name}? Extract only the measurement parameters.
    ONLY FILL IN the sentence, the raw measurement values are arranged as: (parameter1: [index1:index1], parameter2: [index2:index2], ...)
    """
)

prompt_i2c_feedback = prompt_i2c_feedback_template.format_messages(
    i2c_CoT_response=CoT_response,
    sensor_name=sensor_name
)
i2c_feedback_response = model.invoke(prompt_i2c_feedback).content.strip()
print(f"Response: {i2c_feedback_response}")

Response: the raw measurement values are arranged as: (temperature: [15:4])


In [25]:
import re

prompt_i2c_cleanup_template = ChatPromptTemplate.from_template(
    """
    You are a helpful assistant and an expert in I2C Sensors.

    My expert told me:
    {i2c_feedback_response}

    Convert the arrangement to the correct format.
    If the value spans multiple bytes, only use the first and the last index.
    ONLY FILL IN the sentence, the raw measurement values are arranged as: (parameter1: [index1:index1], parameter2: [index2:index2], ...)
    """
)
prompt_i2c_cleanup = prompt_i2c_cleanup_template.format_messages(
    i2c_feedback_response=i2c_feedback_response
)
i2c_cleanup_response = model.invoke(prompt_i2c_cleanup).content.strip()
print(f"Response: {i2c_cleanup_response}")


Response: the raw measurement values are arranged as: (temperature: [15:4])


In [26]:
# Extract only content within parentheses
matches = re.findall(r'\((.*?)\)', i2c_cleanup_response)

extracted_content = matches[0] if matches else ""

print(f"Response: {extracted_content}")


Response: temperature: [15:4]
