In [67]:
# Imports
import time
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain.chains import create_retrieval_chain
from langchain_openai import ChatOpenAI
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from  langchain_community.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import getpass
import os
from dotenv import load_dotenv
from langchain.chat_models import init_chat_model
from langchain_core.messages import HumanMessage, SystemMessage
from duckduckgo_search import DDGS
import requests
import streamlit as st
import pymupdf
import re
import logging

# Set up logging configuration
log_filename = f"logs/LLM_RAG_{time.strftime('%Y-%m-%d-%H-%M-%S')}.log"
# logging.FileHandler opens the file immediately and does not create missing
# parent directories, so make sure the log folder exists first (otherwise a
# fresh checkout fails here with FileNotFoundError).
os.makedirs(os.path.dirname(log_filename), exist_ok=True)
# Configure the root logger: the module-level logging.debug()/logging.info()
# calls used throughout the notebook go through it and end up in this file.
logger = logging.getLogger()
fhandler = logging.FileHandler(filename=log_filename, mode='a')
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
fhandler.setFormatter(formatter)
logger.addHandler(fhandler)
logger.setLevel(logging.DEBUG)

# Load variables from a .env file (if present), then fall back to an interactive
# prompt so the API key never has to be hard-coded in the notebook.
load_dotenv()
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API Key: ")
# Chat model shared by every extraction function below.
model = init_chat_model("o3-mini", model_provider="openai")
logging.debug(f"Loaded model {model}")



In [68]:
# Part number of the I2C sensor to process. The cells below interpolate it into
# the datasheet search query, the cache file paths and the LLM prompts.
sensor_name = "TMP102"
logging.info(f"Sensor name: {sensor_name}")

In [69]:
# Find and download sensor datasheet

# Resolve the cache location once, before searching, so that `datasheet_path`
# is defined for the following cells even when the web search returns nothing
# (a datasheet cached by an earlier run can then still be used).
datasheet_path = f"/home/steven/FYP/v2_LLM_OS/LLM/Datasheet_DB/{sensor_name}.pdf"

search_query = f"{sensor_name} datasheet filetype:pdf"
search_results = DDGS().text(search_query)
if search_results:
    # Only the first search hit is tried.
    datasheet_url = search_results[0]['href']
    logging.debug(f"Datasheet URL: {datasheet_url}")
    logging.debug("Downloading datasheet...")
    # Without a timeout, requests waits indefinitely on an unresponsive host.
    response = requests.get(datasheet_url, timeout=30)
    if response.status_code != 200:
        logging.warning(f"Datasheet download failed with HTTP status {response.status_code}")
    elif os.path.exists(datasheet_path):
        logging.debug("Datasheet already exists in the database.")
    elif b"%PDF-" not in response.content[:1024]:
        # The first hit is not guaranteed to be a PDF (it can be an HTML landing
        # page). Caching such a file under a .pdf name would break the parsing
        # step on this and every later run, so refuse to store it.
        logging.warning("Downloaded file is not a PDF; nothing was saved.")
    else:
        with open(datasheet_path, "wb") as file:
            file.write(response.content)
        logging.debug("Datasheet downloaded!")
else:
    logging.debug("No datasheet found for this I2C sensor.")

# Report whether a datasheet is actually present on disk for the next cell.
if os.path.exists(datasheet_path):
    logging.debug("Loading datasheet...")
    logging.debug("Datasheet loaded!")
else:
    logging.error(f"No datasheet available at {datasheet_path}")

In [70]:
# Load and partition the datasheet into elements
# 5 levels of partitioning
import pymupdf4llm
import pathlib
from langchain_text_splitters import RecursiveCharacterTextSplitter, MarkdownTextSplitter
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain_core.documents import Document

# Markdown conversion of the PDF is cached on disk so it only runs once per sensor.
md_path = f"/home/steven/FYP/v2_LLM_OS/LLM/MD_DB/md_{sensor_name}.md"
if not os.path.exists(md_path):
    md_text = pymupdf4llm.to_markdown(datasheet_path)
    pathlib.Path(md_path).write_bytes(md_text.encode("utf-8"))
    logging.debug("Datasheet Partition does not exist. Created a new parition")
else:
    # The cache is written as UTF-8 bytes above, so it must be decoded as UTF-8
    # as well; relying on the locale default encoding can fail or corrupt
    # non-ASCII characters on systems whose default is not UTF-8.
    md_text = pathlib.Path(md_path).read_text(encoding="utf-8")
    logging.debug("Datasheet partition exists. Loaded from local file")

# Split the markdown into overlapping chunks for embedding.
splitter = MarkdownTextSplitter(chunk_size=500, chunk_overlap=100)

docs = splitter.create_documents([md_text])

logging.debug(len(docs))
# Join all document contents into one string
all_text = "\n\n---------XXXX----------\n\n".join(doc.page_content for doc in docs)

# Save to a single file (lets a human inspect where the chunk boundaries fell)
output_file = f"/home/steven/FYP/v2_LLM_OS/LLM/MD_DB/split_md_{sensor_name}.md"
with open(output_file, "w", encoding="utf-8") as f:
    f.write(all_text)



In [71]:
# Embed the datasheet chunks using FAISS
#TODO: We might want to use multiple datasheets for the same sensor
vector_db_path = f"/home/steven/FYP/v2_LLM_OS/LLM/Datasheet_Vector_DB/{sensor_name}"

embeddings = OpenAIEmbeddings(
    model="text-embedding-ada-002",
    openai_api_key=os.getenv("OPENAI_API_KEY"),
)

# The index is cached per sensor: reuse it when present, otherwise build it from
# the chunks produced by the partitioning cell and store it for the next run.
if os.path.exists(vector_db_path):
    # NOTE(review): allow_dangerous_deserialization=True opts in to loading
    # pickled data; only acceptable for index folders this notebook wrote itself.
    vector_db = FAISS.load_local(vector_db_path, embeddings, allow_dangerous_deserialization=True)
    logging.debug("Vector DB found, loaded from local file")
else:
    vector_db = FAISS.from_documents(docs, embeddings)
    vector_db.save_local(vector_db_path)
    logging.debug("Vector DB not found, created and saved a new Vector DB")

In [72]:
def retrieve_chunk(query, k=3):
    """Return the ``k`` datasheet chunks most similar to ``query``.

    The search runs against the module-level ``vector_db`` index using its
    "similarity" search type.

    Args:
        query: Natural-language description of the information to look up.
        k: Number of chunks to retrieve. Defaults to 3, the value that was
            previously hard-coded.

    Returns:
        The documents returned by the retriever for ``query``.
    """
    retriever = vector_db.as_retriever(search_type="similarity", search_kwargs={"k": k})
    return retriever.invoke(query)

In [73]:
# Chunk validation: iterate through the retrieved chunks and ask the LLM whether each one is helpful for answering the query.
# Chunks the LLM judges unhelpful are skipped; only the helpful ones are kept for the next steps.

def validate_chunk(chunks, query):
    """Keep only the chunks the chat model considers helpful for ``query``.

    Each chunk is sent to the module-level ``model`` together with the query.
    A chunk is kept when the lower-cased reply contains the substring "yes".

    Args:
        chunks: Iterable of documents exposing a ``page_content`` attribute.
        query: The question the chunks are supposed to help answer.

    Returns:
        A list with the accepted chunks, in their original order.
    """
    validation_prompt = ChatPromptTemplate.from_template(
        """
        You are an assistant that validates if a provided document chunk is helpful in answering the user's query.

        QUERY:
        {query}

        CHUNK:
        {chunk}

        Is this chunk helpful for answering the query? Respond ONLY with 'Yes' or 'No'.
        """
    )

    helpful = []
    for position, candidate in enumerate(chunks, start=1):
        # The chunk text is logged so retrieval quality can be inspected afterwards.
        logging.debug(f"Retrieved Chunk {position}: {candidate.page_content}")
        messages = validation_prompt.format_messages(query=query, chunk=candidate.page_content)
        verdict = model.invoke(messages).content.strip().lower()
        logging.debug(verdict)
        if 'yes' not in verdict:
            logging.debug("NO. Chunk not helpful, moving to next chunk")
            continue
        helpful.append(candidate)
        logging.debug("YES. Chunk is helpful, proceeding with the next steps")
    return helpful

In [74]:
# Consolidate the validated chunks

def consolidate_chunks(validated_chunks):
    """Merge the chunk texts into one numbered block of text.

    Args:
        validated_chunks: Sequence of objects exposing ``page_content``.

    Returns:
        A string with one ``"<n>. <text>\\n"`` entry per chunk, numbered from 1;
        an empty string when there are no chunks.
    """
    return "".join(
        f"{number}. {item.page_content}\n"
        for number, item in enumerate(validated_chunks, start=1)
    )

In [75]:
def extract_i2c_address(consolidated_chunks):
    """Ask the chat model for the I2C address of the current sensor.

    The prompt combines the retrieved datasheet context with the module-level
    ``sensor_name`` and instructs the model to answer with the hexadecimal
    value only.

    Background on the prompting approach:
    https://www.datacamp.com/tutorial/chain-of-thought-prompting

    Args:
        consolidated_chunks: Numbered datasheet excerpts used as raw context.

    Returns:
        The model's reply with surrounding whitespace stripped.
    """
    address_template = ChatPromptTemplate.from_template(
        """
        You are a helpful assistant and an expert in I2C Sensors. Assume ideal and default condition.

        Raw context (might be inaccurate):
        {chunk}

        From your knowledge, what is the I2C address of {sensor_name}? Respond ONLY the hexadecimal value.
        """
    )
    messages = address_template.format_messages(sensor_name=sensor_name, chunk=consolidated_chunks)

    answer = model.invoke(messages).content.strip()
    logging.debug(f"Response: {answer}")
    return answer

In [76]:
def extract_i2c_sensor_init_cmd(consolidated_chunks):
    """Extract the initialization bytes of the current sensor via chained prompts.

    Up to three calls are made to the module-level ``model``:
      1. step-by-step reasoning about the sensor's init registers,
      2. a follow-up asking for the hexadecimal values (or "INOP"),
      3. a cleanup pass reducing the answer to comma-separated hex values.

    Args:
        consolidated_chunks: Numbered datasheet excerpts used as raw context.

    Returns:
        "INOP" when the second reply contains that marker, otherwise the
        stripped text of the cleanup reply.
    """
    # Stage 1: free-form reasoning about the initialization registers.
    reasoning_template = ChatPromptTemplate.from_template(
        """
        You are a helpful assistant and an expert in I2C Sensors. Assume ideal and default condition.

        Raw context (might be inaccurate):
        {chunk}

        From your knowledge, what is the init or initialize registers of {sensor_name}? Show me the reasoning process step by step and use your memory.
        If it is not needed, please say so.
        """
    )
    reasoning_messages = reasoning_template.format_messages(sensor_name=sensor_name, chunk=consolidated_chunks)
    reasoning = model.invoke(reasoning_messages).content.strip()
    logging.debug(f"Response: {reasoning}")

    # Stage 2: feed the reasoning back and ask for the hexadecimal values.
    feedback_template = ChatPromptTemplate.from_template(
        """
        You are a helpful assistant and an expert in I2C Sensors. Assume ideal and default condition.

        My expert told me:
        {i2c_CoT_response}

        What are the hexadecimal values to write to the i2c address to initialize the {sensor_name} sensor?
        If it is not needed, output "INOP".
        Finish the sentence, the hexadecimal values are:
        """
    )
    feedback_messages = feedback_template.format_messages(sensor_name=sensor_name, i2c_CoT_response=reasoning)
    feedback = model.invoke(feedback_messages).content.strip()
    logging.debug(f"Response: {feedback}")

    if "INOP" not in feedback:
        # Stage 3: strip everything but the hexadecimal values.
        cleanup_template = ChatPromptTemplate.from_template(
            """
            You are a helpful assistant and hexadecimal values extractor.

            My expert told me:
            {i2c_feedback_response}

            Extract only the hexadecimal values separated by commas.
            If it is blank, output blank.
            """
        )
        cleanup_messages = cleanup_template.format_messages(i2c_feedback_response=feedback)
        cleaned = model.invoke(cleanup_messages).content.strip()
        logging.debug(f"Response: {cleaned}")
        return cleaned

    logging.debug("No initialization needed")
    return "INOP"

In [77]:
def extract_i2c_sensor_read_cmd(consolidated_chunks):
    """Extract the trigger/read command bytes of the current sensor via chained prompts.

    Up to three calls are made to the module-level ``model``:
      1. step-by-step reasoning about the trigger-measurement/read commands,
      2. a follow-up asking for the hexadecimal values (or "INOP"),
      3. a cleanup pass reducing the answer to comma-separated hex values.

    Args:
        consolidated_chunks: Numbered datasheet excerpts used as raw context.

    Returns:
        "INOP" when the second reply contains that marker, otherwise the
        stripped text of the cleanup reply.
    """
    # Stage 1: free-form reasoning about the read / trigger commands.
    reasoning_template = ChatPromptTemplate.from_template(
        """
        You are a helpful assistant and an expert in I2C Sensors. Assume ideal and default condition.

        Raw context (might be inaccurate):
        {chunk}

        From your knowledge, what is the trigger measurement read commands of {sensor_name}? Show me the reasoning process step by step and use your memory.
        If it is not needed, please say so.
        """
    )
    reasoning_messages = reasoning_template.format_messages(sensor_name=sensor_name, chunk=consolidated_chunks)
    reasoning = model.invoke(reasoning_messages).content.strip()
    logging.debug(f"Response: {reasoning}")

    # Stage 2: feed the reasoning back and ask for the hexadecimal values.
    feedback_template = ChatPromptTemplate.from_template(
        """
        You are a helpful assistant and an expert in I2C Sensors. Assume ideal and default condition.

        My expert told me:
        {i2c_CoT_response}

        What are the hexadecimal values to write to the i2c address to trigger measurement or read data from {sensor_name} sensor?
        If it is not needed, output "INOP".
        Finish the sentence, the hexadecimal values are:
        """
    )
    feedback_messages = feedback_template.format_messages(sensor_name=sensor_name, i2c_CoT_response=reasoning)
    feedback = model.invoke(feedback_messages).content.strip()
    logging.debug(f"Response: {feedback}")

    if "INOP" not in feedback:
        # Stage 3: strip everything but the hexadecimal values.
        cleanup_template = ChatPromptTemplate.from_template(
            """
            You are a helpful assistant and hexadecimal values extractor.

            My expert told me:
            {i2c_feedback_response}

            Extract only the hexadecimal values separated by commas.
            """
        )
        cleanup_messages = cleanup_template.format_messages(i2c_feedback_response=feedback)
        cleaned = model.invoke(cleanup_messages).content.strip()
        logging.debug(f"Response: {cleaned}")
        return cleaned

    logging.debug("No read command needed")
    return "INOP"

In [78]:
def extract_i2c_sensor_data_len(consolidated_chunks):
    """Extract the sensor's data output length (in bytes) via chained prompts.

    Three calls are made to the module-level ``model``:
      1. step-by-step reasoning about the output length,
      2. a follow-up asking to fill in "the sensor data output length is X bytes",
      3. a cleanup pass asking for the numerical byte value only.

    Args:
        consolidated_chunks: Numbered datasheet excerpts used as raw context.

    Returns:
        The stripped text of the cleanup reply (a string, not an int).
    """
    # CoT reasoning to extract the data length
    prompt_i2c_template = ChatPromptTemplate.from_template(
        """
        You are a helpful assistant and an expert in I2C Sensors. Assume ideal and default condition.

        Raw context (might be inaccurate):
        {chunk}

        From your knowledge, {sensor_name} sensor data output length in bytes? Show me the reasoning process step by step and use your memory.
        """
    )

    prompt_i2c = prompt_i2c_template.format_messages(
        chunk=consolidated_chunks,
        sensor_name=sensor_name
    )

    CoT_response = model.invoke(prompt_i2c).content.strip()
    logging.debug(f"Response: {CoT_response}")

    # Feedback to get length in bytes
    prompt_i2c_feedback_template = ChatPromptTemplate.from_template(
        """
        You are a helpful assistant and an expert in I2C Sensors. Assume ideal and default condition.

        My expert told me:
        {i2c_CoT_response}

        What are the {sensor_name} sensor data output length in bytes?
        ONLY fill in this sentence, the sensor data output length is X bytes
        """
    )

    # Bug fix: the reasoning produced above must be forwarded here. The previous
    # code passed `response`, which is not a local of this function; it resolved
    # to an unrelated module-level variable (or raised NameError when none
    # existed), so the model never saw the step-by-step reasoning.
    prompt_i2c_feedback = prompt_i2c_feedback_template.format_messages(
        i2c_CoT_response=CoT_response,
        sensor_name=sensor_name
    )
    i2c_feedback_response = model.invoke(prompt_i2c_feedback).content.strip()
    logging.debug(f"Response: {i2c_feedback_response}")

    # Cleanup the response to get only the numerical values
    prompt_i2c_cleanup_template = ChatPromptTemplate.from_template(
        """
        You are a helpful assistant and values extractor.

        My expert told me:
        {i2c_feedback_response}

        Extract only the numerical byte value.
        """
    )
    prompt_i2c_cleanup = prompt_i2c_cleanup_template.format_messages(
        i2c_feedback_response=i2c_feedback_response
    )
    i2c_cleanup_response = model.invoke(prompt_i2c_cleanup).content.strip()
    logging.debug(f"Response: {i2c_cleanup_response}")

    return i2c_cleanup_response

In [None]:
def extract_i2c_data_key_val(consolidated_chunks, size):
    """Extract which measurement parameters the sensor reports and their positions.

    Three calls are made to the module-level ``model``:
      1. step-by-step reasoning about the measured quantities and their bit
         ranges inside a ``size``-byte raw array,
      2. a follow-up that lists only the measurement parameters,
      3. a formatting pass producing ``(name: [first:last], ...)``.

    Args:
        consolidated_chunks: Numbered datasheet excerpts used as raw context.
        size: Length of the raw sensor data in bytes, inserted into the prompt.

    Returns:
        The text of the first parenthesised group in the final reply, wrapped
        in parentheses again; "()" when the reply contains no such group.
    """
    # Stage 1: reasoning about parameters and bit ranges.
    reasoning_template = ChatPromptTemplate.from_template(
        """
        You are a helpful assistant and an expert in I2C Sensors. Assume ideal and default condition.

        Raw context (might be inaccurate):
        {chunk}

        1. What physical quantities or parameters does the {sensor_name} measure?
        2. Assuming raw_bytes is a {size}-byte array received from the sensor, provide the bit range for each physical parameter in the format: ParameterName[start_bit:end_bit] (For example, Temperature[0:11])
        3. The raw_bytes is big-endian.
        4. Omit anything that is unrelated to the raw data such as alert, config, or crc.
        5. Do not describe MSB/LSB or byte-level structure, rather combine them into one parameter.
        6. Please explain your reasoning step by step, using both the context and your internal knowledge.
        """
    )
    reasoning_messages = reasoning_template.format_messages(
        size=size, sensor_name=sensor_name, chunk=consolidated_chunks
    )
    reasoning = model.invoke(reasoning_messages).content.strip()
    logging.debug(f"Response: {reasoning}")

    # Stage 2: reduce the reasoning to the measurement parameters only.
    feedback_template = ChatPromptTemplate.from_template(
        """
        You are a helpful assistant and an expert in I2C Sensors. Assume ideal and default condition.

        My expert told me:
        {i2c_CoT_response}

        How is the measurement values arranged in {sensor_name}? Extract only the measurement parameters.
        ONLY FILL IN the sentence, the measurement values are arranged as: (parameter1: [index1:index1], parameter2: [index2:index2], ...)
        """
    )
    feedback_messages = feedback_template.format_messages(sensor_name=sensor_name, i2c_CoT_response=reasoning)
    feedback = model.invoke(feedback_messages).content.strip()
    logging.debug(f"Response: {feedback}")

    # Stage 3: normalise the arrangement into the final format.
    cleanup_template = ChatPromptTemplate.from_template(
        """
        You are a helpful assistant and an expert in I2C Sensors. Assume ideal and default condition.

        My expert told me:
        {i2c_feedback_response}

        Convert the arrangement to the correct format.
        If the value spans multiple bytes, only use the first and the last index, in ascending index.
        ONLY FILL IN the sentence, the raw measurement values are arranged as: (parameter1: [index1_LSB:index1_MSB], parameter2: [index2_LSB:index2_MSB], ...)
        """
    )
    cleanup_messages = cleanup_template.format_messages(i2c_feedback_response=feedback)
    arrangement = model.invoke(cleanup_messages).content.strip()
    logging.debug(f"Response: {arrangement}")

    # Keep only the first parenthesised group of the reply.
    first_group = re.search(r'\((.*?)\)', arrangement)
    inner = first_group.group(1) if first_group else ""
    extracted_content = f"({inner})"
    logging.debug(f"Response: {extracted_content}")

    return extracted_content


In [80]:
def extract_i2c_sensor_data_scale_format(consolidated_chunks, sensor_data_key_val):
    """Derive a reverse-Polish conversion formula for each measurement parameter.

    The index ranges (``: [a:b]``) are first stripped from
    ``sensor_data_key_val`` so only the parameter names remain, e.g.
    ``(humidity, temperature)``. Three calls to the module-level ``model``
    follow:
      1. step-by-step derivation of a conversion formula in the variable x,
      2. translation of the formula into reverse Polish notation,
      3. a formatting pass producing ``(name: "rpn", ...)``.

    Args:
        consolidated_chunks: Numbered datasheet excerpts used as raw context.
        sensor_data_key_val: Parameter arrangement string such as
            ``(Temperature: [0:11])``.

    Returns:
        The first parenthesised group of the final reply, with en/em dashes
        replaced by "-", wrapped in parentheses; "()" when no group is found.
    """
    # Drop the ": [..]" index parts, leaving just the parameter names.
    parameter_names = re.sub(r':\s*\[[^\]]*\]', '', sensor_data_key_val)
    logging.debug(f"Sensor data key value: {parameter_names}")

    # Stage 1: reasoning about the conversion formula.
    reasoning_template = ChatPromptTemplate.from_template(
        """
        You are a helpful assistant and an expert in I2C sensors. Assume sensor in room condition.

        Raw context (might be inaccurate! please double check):
        {chunk}

        Create a MATH formula to convert the {sensor_name} data into measurement units.

        Rules:
        1. I have extracted each {sensor_data_key_val} into uint32_t. We call this variable x.
        2. Valid operators are: arithmetic and bitwise operators and modulo.
        3. DO NOT use IF statements.
        4. Use decimal or float numbers. Do not use hex or binary numbers.
        ONLY use x as variable. From your knowledge, explain your reasoning step by step.

        """
    )
    reasoning_messages = reasoning_template.format_messages(
        sensor_data_key_val=parameter_names,
        sensor_name=sensor_name,
        chunk=consolidated_chunks,
    )
    reasoning = model.invoke(reasoning_messages).content.strip()
    logging.debug(f"Response: {reasoning}")

    # Stage 2: convert the formula into reverse Polish notation.
    feedback_template = ChatPromptTemplate.from_template(
        """
        You are a helpful assistant and an expert in I2C Sensors. Assume sensor in room condition.

        My expert told me:
        {i2c_CoT_response}

        Please provide the reverse polish notation for the conversion formula.
        Represent the raw data as X. If there are bitwise operators, substitute it with programming bitwise operators.
        Provide one reverse polish notation for each parameter: {sensor_data_key_val}.
        """
    )
    # sensor_name is still passed although this template has no such placeholder.
    feedback_messages = feedback_template.format_messages(
        sensor_data_key_val=parameter_names,
        sensor_name=sensor_name,
        i2c_CoT_response=reasoning,
    )
    feedback = model.invoke(feedback_messages).content.strip()
    logging.debug(f"Response: {feedback}")

    # Stage 3: arrange the notation per parameter.
    cleanup_template = ChatPromptTemplate.from_template(
        """
        You are a helpful assistant and an expert in I2C Sensors. Assume sensor in room condition.

        My expert told me:
        {i2c_feedback_response}

        X is the raw data. For each parameter from {sensor_data_key_val}, please arrange it as follows:
        ONLY FILL IN the sentence, the measurement values are arranged as: (parameter1: "reverse_polish_notation1", parameter2: "reverse_polish_notation1", ...)
        """
    )
    cleanup_messages = cleanup_template.format_messages(
        sensor_data_key_val=parameter_names,
        i2c_feedback_response=feedback,
    )
    arranged = model.invoke(cleanup_messages).content.strip()
    logging.debug(arranged)

    # Keep only the first parenthesised group and turn typographic dashes
    # (en/em dash) into a plain minus sign.
    first_group = re.search(r'\((.*?)\)', arranged)
    inner = re.sub(r'[–—]', '-', first_group.group(1)) if first_group else ""
    extracted_content = f"({inner})"
    logging.debug(f"Response: {extracted_content}")

    return extracted_content

In [81]:
# Main function to run the code
# Retrieval queries, one per extraction step; paired by position with `functions`.
queries = [
    "I2C address hexadecimal value",
    "Initialize registers hexadecimal value",
    "Trigger measurement read command hexadecimal value",
    "Sensor data output length in bytes",
    "Sensor measurement data",
    "Formula to convert raw sensor data to measurement units",
]

# Extraction step run for the query at the same index in `queries`.
functions = [
    extract_i2c_address,
    extract_i2c_sensor_init_cmd,
    extract_i2c_sensor_read_cmd,
    extract_i2c_sensor_data_len,
    extract_i2c_data_key_val,
    extract_i2c_sensor_data_scale_format,
]

def main():
    """Run every extraction step and record the answers.

    For each (query, function) pair the relevant chunks are retrieved,
    validated and consolidated, then handed to the extraction function. Query
    and response are written to the log, printed, and appended to a
    timestamped text file under ``logs/``.

    Two steps depend on earlier results: the key/value step receives the data
    length, and the scale-format step receives the key/value arrangement.
    """
    report_path = f"logs/OUT_LLM_RAG_{time.strftime('%Y-%m-%d-%H-%M-%S')}.txt"
    with open(report_path, "w") as report:
        report.write(f"Sensor name: {sensor_name}\n")
        for query, function in zip(queries, functions):
            logging.info(f"Query: {query}")
            print(f"Query: {query}")
            report.write(f"Query: {query}\n")

            # Retrieve -> validate -> consolidate the context for this query.
            candidates = retrieve_chunk(query)
            accepted = validate_chunk(candidates, query)
            context = consolidate_chunks(accepted)

            if function == extract_i2c_data_key_val:
                response = function(context, sensor_data_len)
                sensor_data_key_val = response
            elif function == extract_i2c_sensor_data_scale_format:
                response = function(context, sensor_data_key_val)
            else:
                response = function(context)
                if function == extract_i2c_sensor_data_len:
                    # Remembered for the key/value step that follows.
                    sensor_data_len = response

            logging.info(f"Response: {response}")
            print(f"Response: {response}")
            report.write(f"Response: {response}\n")
            logging.info("\n")
            print("\n")
main()

Query: I2C address hexadecimal value
Response: 0x48


Query: Initialize registers hexadecimal value
Response: INOP


Query: Trigger measurement read command hexadecimal value
Response: INOP


Query: Sensor data output length in bytes
Response: 2


Query: Sensor measurement data
Response: (Temperature: [0:11])


Query: Formula to convert raw sensor data to measurement units
Response: (Temperature: "X 4 >> X 11 >> 1 & 4096 * - 0.0625 *")


