In [3]:
# Imports
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain.chains import create_retrieval_chain
from langchain_openai import ChatOpenAI
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from  langchain_community.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import getpass
import os
from dotenv import load_dotenv
from langchain.chat_models import init_chat_model
from langchain_core.messages import HumanMessage, SystemMessage
from duckduckgo_search import DDGS
import requests
import streamlit as st
import pymupdf

import logging
# Set up logging configuration
logging.basicConfig(
    level=logging.WARNING,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('partition_pdf.log'),  # write logs to partition_pdf.log file
        logging.StreamHandler()                    # also print logs to console
    ]
)

load_dotenv()
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API Key: ")
model = init_chat_model("o3-mini", model_provider="openai")




In [4]:
sensor_name = "Adafruit AHT20"

In [5]:
# Find and download sensor datasheet

search_query = f"{sensor_name} datasheet filetype:pdf"
search_results = DDGS().text(search_query)
if search_results:
    datasheet_url = search_results[0]['href']
    print(f"Datasheet URL: {datasheet_url}")
    print("Downloading datasheet...")
    response = requests.get(datasheet_url)
    if response.status_code == 200:
        if not os.path.exists(f"/home/steven/FYP/v2_LLM_OS/LLM/Datasheet_DB/{sensor_name}.pdf"):     
            with open(f"/home/steven/FYP/v2_LLM_OS/LLM/Datasheet_DB/{sensor_name}.pdf", "wb") as file:
                for chunk in response.iter_content(1024):
                    file.write(chunk)
            print("Datasheet downloaded!")
        else:
            print("Datasheet already exists in the database.")
    print("Loading datasheet...")
    datasheet_path = f"/home/steven/FYP/v2_LLM_OS/LLM/Datasheet_DB/{sensor_name}.pdf"
    print("Datasheet loaded!")
else:
    print("No datasheet found for this I2C sensor.")

Datasheet URL: https://cdn-learn.adafruit.com/downloads/pdf/adafruit-aht20.pdf
Downloading datasheet...
Datasheet already exists in the database.
Loading datasheet...
Datasheet loaded!


In [6]:
# Load and partition the datasheet into elements
# 5 levels of partitioning

from unstructured.partition.pdf import partition_pdf
from unstructured.staging.base import elements_to_dicts, elements_from_dicts
import json

elements_file_path = f"/home/steven/FYP/v2_LLM_OS/LLM/Elements_DB/elements_{sensor_name}"
if not os.path.exists(elements_file_path):
    elements = partition_pdf(
        filename=datasheet_path,

        strategy="hi_res",
        extract_tables = True,
        infer_table_structure=True,
        model_name="yolox"
    )
    elements_dict = elements_to_dicts(elements)
    with open(elements_file_path, 'w', encoding='utf-8') as f:
        json.dump(elements_dict, f, ensure_ascii=False, indent=4)
    print("Datasheet Partition does not exist. Created a new parition")
else:
    with open(elements_file_path, 'r', encoding='utf-8') as f:
        elements_dict = json.load(f)
    elements = elements_from_dicts(elements_dict)
    print("Datasheet partition exists. Loaded from local file")

Datasheet partition exists. Loaded from local file


In [7]:
# Print the chunks into file
'''
from unstructured.chunking.title import chunk_by_title
output_txt_path = f'{sensor_name}_chunks_with_metadata.txt'

with open(output_txt_path, 'w', encoding='utf-8') as f:
    for idx, element in enumerate(elements, 1):
        f.write(f"Chunk {idx}\n")
        f.write(f"Type: {element.category}\n")

        # Include metadata if available
        metadata = element.metadata.to_dict() if element.metadata else {}
        for key, value in metadata.items():
            f.write(f"{key}: {value}\n")

        f.write("Content:\n")
        f.write(element.text.strip() + "\n")

        f.write("\n----------------------------\n\n")

print(f"Chunked text with metadata saved to {output_txt_path}")'
'''

'\nfrom unstructured.chunking.title import chunk_by_title\noutput_txt_path = f\'{sensor_name}_chunks_with_metadata.txt\'\n\nwith open(output_txt_path, \'w\', encoding=\'utf-8\') as f:\n    for idx, element in enumerate(elements, 1):\n        f.write(f"Chunk {idx}\n")\n        f.write(f"Type: {element.category}\n")\n\n        # Include metadata if available\n        metadata = element.metadata.to_dict() if element.metadata else {}\n        for key, value in metadata.items():\n            f.write(f"{key}: {value}\n")\n\n        f.write("Content:\n")\n        f.write(element.text.strip() + "\n")\n\n        f.write("\n----------------------------\n\n")\n\nprint(f"Chunked text with metadata saved to {output_txt_path}")\'\n'

In [8]:
# Remove image from the chunk since our current embedding model does not support image
elements = [element for element in elements if element.category != 'Image']


In [9]:
# Explicit conversion from elements to Langchain Document objects
#FIXME: The text output is very small why? The chunk result itself is actually really small T_T
#TODO: We might want to combine title and content into one chunk, or delete title category.
output_docs_path = f'{sensor_name}_docs_with_metadata.txt'

from langchain.schema import Document
documents = []
with open(output_docs_path, 'w', encoding='utf-8') as f:
    for idx, element in enumerate(elements, 1):
        f.write(f"Chunk {idx}\n")
        metadata = element.metadata.to_dict() if element.metadata else {}
        metadata.update({"type": element.category})
        doc = Document(page_content=element.text.strip(), metadata=metadata)
        documents.append(doc)
        f.write(f'{element.text} \n')
        f.write("\n---------------------------\n\n")

In [10]:
# Embed the datasheet chunks using FAISS
embeddings = OpenAIEmbeddings(
    openai_api_key=os.getenv("OPENAI_API_KEY"), 
    model="text-embedding-ada-002"
)

vector_db_path = f"/home/steven/FYP/v2_LLM_OS/LLM/Datasheet_Vector_DB/{sensor_name}"
if not os.path.exists(vector_db_path):
    vector_db = FAISS.from_documents(documents, embeddings)
    vector_db.save_local(vector_db_path)
    print("Vector DB not found, created and saved a new Vector DB")
else:
    vector_db = FAISS.load_local(vector_db_path, embeddings, allow_dangerous_deserialization=True)
    print("Vector DB found, loaded from local file")

Vector DB found, loaded from local file


In [11]:
#Take 10 most similar chunks from the vector DB using cosine simlarity.
retriever = vector_db.as_retriever(search_type="similarity", search_kwargs={"k": 10})
query = "I2C address hexadecimal value"

retrieved_chunk = retriever.invoke(query)

In [12]:
# Iterate through the chunks. Ask the LLM if the chunk is helpful for answering the query. (Chunk validation)
# How do I ask LLM if the chunk is helpful, if not mark the chunk as not helpful and retrieve the next chunk?
validation_prompt = ChatPromptTemplate.from_template(
    """
    You are an assistant that validates if a provided document chunk is helpful in answering the user's query.

    QUERY:
    {query}

    CHUNK:
    {chunk}

    Is this chunk helpful for answering the query? Respond ONLY with 'Yes' or 'No'.
    """
)

validated_chunks = []

# Inspect the retrieved chunks (optional, for debugging purposes)
for idx, chunk in enumerate(retrieved_chunk):
    print(f"Retrieved Chunk {idx+1}: {chunk.page_content}")
    prompt = validation_prompt.format_messages(query=query, chunk=chunk.page_content)
    # print(prompt)
    response = model.invoke(prompt).content.strip().lower()
    print(response)
    if 'yes' in response:
        validated_chunks.append(chunk)
        print("YES. Chunk is helpful, proceeding with the next steps")
    else:
        print("NO. Chunk not helpful, moving to next chunk")
        continue

Retrieved Chunk 1: The default I2C address is 0x38. It cannot be changed.
yes
YES. Chunk is helpful, proceeding with the next steps
Retrieved Chunk 2: You should see the AHT20's default I2C address of 0x38 pop-up in the I2C scan list.
yes
YES. Chunk is helpful, proceeding with the next steps
Retrieved Chunk 3: I2C Logic Pins
no
NO. Chunk not helpful, moving to next chunk
Retrieved Chunk 4: I don't see the sensor's I2C address listed!
no
NO. Chunk not helpful, moving to next chunk
Retrieved Chunk 5: Here's the Raspberry Pi wired with I2C:
no
NO. Chunk not helpful, moving to next chunk
Retrieved Chunk 6: On the component configuration page, the AHT20's sensor address should be listed along with the sensor's settings.
no
NO. Chunk not helpful, moving to next chunk
Retrieved Chunk 7: • SDA - I2C data pin, connect to your microcontrollers I2C data line. The logic level is the same as VIN. and it has a 10K pullup already on it.
no
NO. Chunk not helpful, moving to next chunk
Retrieved Chunk 8

In [17]:
# Consolidate the validated chunks
consolidated_chunks = ""
for chunk in validated_chunks:
    consolidated_chunks += f"{chunk.page_content}\n"

print(f"Consolidated Chunks: {consolidated_chunks}")


Consolidated Chunks: The default I2C address is 0x38. It cannot be changed.
You should see the AHT20's default I2C address of 0x38 pop-up in the I2C scan list.

Response: 0x38


In [None]:
# Ask the LLM to extract the I2C address from the consolidated chunks
prompt_i2c_template = ChatPromptTemplate.from_template(
    """
    You are a helpful assistant that returns the hexadecimal address of the I2C sensor.

    Helpful context:
    {chunk}

    From your knowledge, what is the I2C address of {sensor_name}? Respond ONLY the hexadecimal value.
    """
)

prompt_i2c = prompt_i2c_template.format_messages(
    chunk=consolidated_chunks,
    sensor_name=sensor_name
)

response = model.invoke(prompt_i2c).content.strip()
print(f"Response: {response}")