In [1]:
# %pip install -Uqqq pip --progress-bar off
# %pip install -qqq ollama --progress-bar off


# %pip install -qqq owlready2 --progress-bar off

# %pip install -qqq langchain-ollama --progress-bar off
# %pip install -qqq langchain-community --progress-bar off
# %pip install -qqq pypdf --progress-bar off

# %pip install -qqq faiss-cpu --progress-bar off
# %pip install -qqq rank_bm25 --progress-bar off
# %pip install -qqq fuzzywuzzy --progress-bar off
# %pip install -qqq scikit-learn --progress-bar off
# %pip install -qqq sentence-transformers --progress-bar off


In [2]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain_ollama import OllamaEmbeddings
import ollama
import json
from enum import Enum
import json
import re
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain.chat_models import ChatOllama
from owlready2 import *
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from fuzzywuzzy import fuzz
from langchain.vectorstores import FAISS
from langchain.retrievers import EnsembleRetriever, BM25Retriever, MultiQueryRetriever
from langchain.embeddings import OllamaEmbeddings

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the sentence-embedding model once; it is passed to semantic_verify as
# `model` in the QA step further down.
entity_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Ollama model tag used for embeddings, chat and text generation in this notebook
MODEL = "deepseek-r1:8b-llama-distill-q8_0"

# PDF to process
file_path = "PDFs/sensors.pdf"

# Base name of the PDF: everything after the last '/' and before the first '.'
file_name = file_path.rpartition('/')[2].partition('.')[0]

# Every artefact written by the notebook is named after the source PDF
response_path = f'responses/R1_responses_{file_name}.json'
generated_path = f'generated_JSON/R1_generated_{file_name}.json'
ontology_path = f'ontology/R1_ontology_{file_name}.owl'
validtion_json_path = f'QA/R1_validation_{file_name}.json'

In [4]:
def clean_text(text):
    """
    Normalise raw page text extracted from the PDF.

    Applies three regex substitutions in order and then trims the result:
    1. drop anything enclosed between two `$` signs on the same line
       (inline LaTeX equations),
    2. re-join words that were hyphenated across a line break,
    3. collapse every run of whitespace into a single space.

    Returns the cleaned string.
    """
    substitutions = (
        (r'\$.*?\$', ''),              # inline equations
        (r'(\w+)-\n(\w+)', r'\1\2'),   # "exam-\nple" -> "example"
        (r'\s+', ' '),                 # whitespace runs, newlines included
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()

In [5]:
# Read the PDF into one document per page (image extraction disabled)
pdf_loader = PyPDFLoader(file_path=file_path, extract_images=False)
docs = pdf_loader.load()

# Normalise each page's text in place before it is split into chunks
for doc in docs:
    doc.page_content = clean_text(doc.page_content)

In [6]:
def analyze_splits(splits):
    """
    Summarise a list of document chunks.

    Parameters:
        splits: sequence of objects exposing `page_content` (str) and
            `metadata` (dict).

    Returns:
        dict with the keys 'total_chunks', 'avg_chunk_length', 'max_length',
        'min_length' (lengths are in characters) and 'metadata_fields'
        (metadata keys of the first chunk). For an empty input every number
        is zero and 'metadata_fields' is an empty list, instead of raising
        ZeroDivisionError / ValueError.
    """
    lengths = [len(chunk.page_content) for chunk in splits]

    # Nothing to average or compare: report an all-zero summary.
    if not lengths:
        return {
            'total_chunks': 0,
            'avg_chunk_length': 0.0,
            'max_length': 0,
            'min_length': 0,
            'metadata_fields': [],
        }

    return {
        'total_chunks': len(lengths),
        'avg_chunk_length': sum(lengths) / len(lengths),
        'max_length': max(lengths),
        'min_length': min(lengths),
        'metadata_fields': list(splits[0].metadata.keys()),
    }

In [7]:
# Recursive splitter whose size limits are counted in words, not characters
# (see length_function), so chunk_size=500 means roughly 500 words per chunk.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
    length_function=lambda text: len(text.split()),  # Word-based counting
    add_start_index=True,
    # Separators are tried in order and interpreted as regular expressions
    # (is_separator_regex=True). Page text has already been passed through
    # clean_text, which collapses all whitespace to single spaces, so the
    # newline-based separators only apply to text that still contains newlines.
    separators=[
        "\n\n## ",    # Section headers
        "\n\n",       # Paragraph breaks
        "\n",         # New lines
        # Raw string: "\d", "\." and "\s" are regex escapes, not Python string
        # escapes. Matches a full stop that is not part of a decimal number.
        r"(?<!\d)\.(?!\d)\s+",  # Sentence ends with space
        ";",          # Semi-colons
        ", ",         # Commas
        " ",
    ],
    keep_separator=True,
    is_separator_regex=True,
)

splits = text_splitter.split_documents(docs)
analyze_splits(splits)

{'total_chunks': 30,
 'avg_chunk_length': 1872.0,
 'max_length': 3228,
 'min_length': 352,
 'metadata_fields': ['producer',
  'creator',
  'creationdate',
  'author',
  'keywords',
  'moddate',
  'subject',
  'title',
  'source',
  'total_pages',
  'page',
  'page_label',
  'start_index']}

In [8]:
# Number of documents each base retriever returns; shared so the vector and
# BM25 retrievers always stay in step.
RETRIEVER_TOP_K = 3

# Step 1: Store Embeddings in FAISS (Faster Retrieval)
# NOTE(review): OllamaEmbeddings is imported twice in the import cell
# (langchain_ollama, then langchain.embeddings); the later import is the one
# bound here. Confirm which implementation is intended.
embeddings = OllamaEmbeddings(model=MODEL)
faiss_store = FAISS.from_documents(splits, embeddings)

# Step 2: Set Up Retrieval Methods
vector_retriever = faiss_store.as_retriever(search_kwargs={"k": RETRIEVER_TOP_K})
bm25_retriever = BM25Retriever.from_documents(splits)
bm25_retriever.k = RETRIEVER_TOP_K  # Match vector retriever

# Dynamic Weights for Different Queries
query_type = "entity"  # or "relation"
weights = [0.8, 0.2] if query_type == "entity" else [0.6, 0.4]

# Ensemble Retriever: weights are given in the same order as `retrievers`
retriever = EnsembleRetriever(
    retrievers=[vector_retriever, bm25_retriever],
    weights=weights
)
llm = ChatOllama(model=MODEL)
# Multi-Query Expansion (Better Recall)
mq_retriever = MultiQueryRetriever.from_llm(retriever=retriever, llm=llm)

# Retrieve Once, Then Filter
combined_query = """
EXTRACT: LiDAR sensors, their components, technical specifications.
FIND: has_part, implements, measurement_properties relationships.
IGNORE: Experimental results, methodology, figures.
FILTER: Only technical specifications sections.
"""
# `invoke` replaces the deprecated `get_relevant_documents`, which emitted a
# deprecation warning when this cell was last run.
retrieved_docs = mq_retriever.invoke(combined_query)

# Concatenate the retrieved passages, separated by blank lines
retrieved_text = "\n\n".join(doc.page_content for doc in retrieved_docs)

  embeddings = OllamaEmbeddings(model=MODEL)
  llm = ChatOllama(model=MODEL)
  retrieved_docs = mq_retriever.get_relevant_documents(combined_query)


In [9]:
# Build an LLM-backed compressor on the local model and wrap the ensemble
# retriever with it.
# NOTE(review): the previous `search_kwargs={"k": 10}` argument was dropped.
# ContextualCompressionRetriever appears to declare only `base_compressor` and
# `base_retriever`, so the extra keyword had no effect on how many documents
# are returned. Set `k` on the base retrievers instead, and confirm against
# the installed LangChain version.
compressor = LLMChainExtractor.from_llm(llm=llm)
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=retriever,
)

In [10]:
# Prompt template for sensor extraction. It is rendered with
# `KG_EXTRACTION_PROMPT.format(text=...)`, so literal braces in the JSON samples
# are doubled ({{ }}) and `{text}` is the only placeholder. The chunk is wrapped
# in a properly closed <text>...</text> pair so the model can tell where the
# input ends.
KG_EXTRACTION_PROMPT = """As a LiDAR sensor expert, analyze this technical text to extract:

**Target Entities**:
1. Sensor Models: Manufacturer-branded names (Velodyne HDL-64E, Livox Horizon)
2. Components: Physical/software parts (e.g., MEMS mirror, FPGA processor)
3. Technical Specs: Quantified values with units (120m range, 0.08° resolution)
4. Implementations: Standards/protocols (IEEE 802.11p, ROS2)
5. Category: Automotive LiDAR, Industrial LiDAR, etc.

**Extraction Rules**:
- Preserve contextual relationships: 
  *Example*:
  "The Velodyne VLP-32C rotating assembly enables 360° coverage" → 
  {{"parts": ["rotating assembly"], "properties": ["field of view: 360°"]}}
- Capture implied properties from comparisons:
  *Example*:
  "Outperforms Ouster OS2 in range" → 
  {{"properties": ["comparative_range: > Ouster OS2"]}}


**IMPORTANT**:
The example block is for demonstration purposes only and should not affect the final extraction.
Do NOT include any sensor data from the example in your final output.
Only extract sensors that are explicitly mentioned in the text below.

<example>
Input: "The Velodyne VLP-32C has a 200m range and 0.2° resolution"
Output:
{{
  "sensors": [
    {{
      "name": "Velodyne VLP-32C",
      "category": "Automotive LiDAR",
      "parts": [],
      "properties": ["range: 200m", "resolution: 0.2°"],
      "implements": []
    }}
  ]
}}
</example>

**Format**:
```json
{{
  "sensors": [
    {{
      "name": "",
      "category": "",
      "parts": [],
      "properties": [],
      "implements": []
    }}
  ]
}}
```
**Critical Instructions**:
1. If no sensors are found, return: {{"sensors": []}}
2. Always maintain valid JSON structure
3. Never add extra text outside the JSON
4. Use exact values from tables when available

<text>
{text}
</text>
"""

In [11]:
class ResponseFormat(Enum):
    """Output modes a caller can request from `call_model`."""
    # Structured output: call_model passes format="json" for this member.
    JSON = "json_object"
    # Free-form text: call_model passes format="" for this member.
    TEXT = "text"
 
 
def call_model(
    prompt: str, response_format: ResponseFormat = ResponseFormat.TEXT
) -> str:
    """
    Send `prompt` to the configured Ollama model and return its reply text.

    Parameters:
        prompt: the full prompt to generate from.
        response_format: ResponseFormat.TEXT (default) sends format="";
            any other member sends format="json".

    Returns:
        The "response" field of the result returned by `ollama.generate`.
    """
    output_format = "json" if response_format != ResponseFormat.TEXT else ""
    result = ollama.generate(
        model=MODEL,
        prompt=prompt,
        keep_alive="1h",
        format=output_format,
    )
    return result["response"]

In [12]:
# Re-split the concatenated retrieved text with the same splitter; each
# resulting string is later formatted into its own extraction prompt.
chunks = text_splitter.split_text(retrieved_text)


In [13]:
# Run the extraction prompt over every retrieved chunk, one model call each.
responses = []

for chunk in chunks:
    final_prompt = KG_EXTRACTION_PROMPT.format(text=chunk)
    # Send the final prompt to Ollama
    response = call_model(final_prompt)
    responses.append(response)

    # Checkpoint after every chunk: the model calls are slow, and a failure
    # part-way through would otherwise discard every response collected so far.
    with open(response_path, "w", encoding="utf-8") as f:
        json.dump(responses, f)

In [14]:


# Persist the collected responses to disk; this cell writes the whole list in
# a single dump.
with open(response_path, "w", encoding="utf-8") as f:
    json.dump(responses, f)

In [15]:
import json
import unicodedata
import re

def clean_string(s):
    """
    Normalise a value into a lower-case comparison string.

    Strings are stripped of surrounding whitespace, NFKC-normalised, have every
    "â" removed (a common mojibake artefact), have "<" and ">" replaced by
    commas, and are lower-cased. Any non-string value is converted with `str()`
    and lower-cased without the other steps.
    """
    if isinstance(s, str):
        text = unicodedata.normalize('NFKC', s.strip())
        for old, new in (("â", ""), ("<", ","), (">", ",")):
            text = text.replace(old, new)
        return text.lower()
    return str(s).lower()

def make_valid_iri_fragment(s):
    """
    Convert a value into a string usable as an IRI fragment.

    The value is first normalised with `clean_string` and then percent-encoded
    with `urllib.parse.quote`. ASCII letters, digits, "_" and "-" are passed as
    the explicit safe set. `quote` additionally never encodes "." and "~";
    every other character, including spaces and "/", is percent-encoded.

    Returns the encoded string.
    """
    # Imported locally because the notebook never imports urllib.parse
    # explicitly, so the name is not guaranteed to be available at module level.
    from urllib.parse import quote

    cleaned = clean_string(s)
    # Allow only alphanumerics, underscore, and hyphen.
    safe_chars = "_-abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
    return quote(cleaned, safe=safe_chars)

def extract_json_part(response_str):
    """
    Extract and parse the JSON payload from a model response string.

    Strategy:
    1. Take the text after the last ```json marker up to the next ``` fence
       (the whole string when no fence is present) and parse it.
    2. If that fails, drop anything up to a closing </think> tag and parse the
       span from the first "{" to the last "}". This recovers responses where
       the model emitted bare JSON surrounded by prose or reasoning text.

    Returns the parsed JSON value, or None (after printing a diagnostic) when
    the input is not a string or no valid JSON can be found.
    """
    if not isinstance(response_str, str):
        print(f"Failed to extract JSON: expected str, got {type(response_str).__name__}")
        return None

    candidate = response_str.split("```json")[-1].split("```")[0].strip()
    try:
        return json.loads(candidate)
    except json.JSONDecodeError as e:
        first_error = e

    # Fallback: ignore any reasoning block, then try the outermost brace span.
    tail = candidate.rsplit("</think>", 1)[-1]
    start, end = tail.find("{"), tail.rfind("}")
    if start != -1 and end > start:
        try:
            return json.loads(tail[start:end + 1])
        except json.JSONDecodeError:
            pass

    print(f"Failed to extract JSON: {str(first_error)}")
    return None

# Read the saved model responses back from disk
with open(response_path, "r", encoding="utf-8") as f:
    raw_data = f.read()

# The file is normally a single JSON array. If it does not parse as one JSON
# document, treat it as one JSON value per non-blank line.
try:
    responses = json.loads(raw_data)
except json.JSONDecodeError:
    responses = list(map(json.loads, filter(str.strip, raw_data.splitlines())))



In [None]:
def _clean_sorted(values):
    """Return the sorted, cleaned, non-empty entries of a list-valued JSON field."""
    if isinstance(values, str):
        # A bare string is one entry, not a sequence of characters.
        values = [values]
    elif not isinstance(values, (list, tuple)):
        # null or any other scalar: treat the field as empty.
        return []
    return sorted(clean_string(v) for v in values if v)


# Use a dictionary to accumulate unique sensors based on (name, category, parts, properties)
sensor_dict = {}

for response in responses:
    # Raw model output is text: pull the JSON payload out of it first.
    if isinstance(response, str):
        response = extract_json_part(response)

    # Only a JSON object can carry a "sensors" list; skip failed extractions,
    # top-level lists and scalars.
    if not isinstance(response, dict):
        continue

    sensors = response.get("sensors", [])
    if not isinstance(sensors, list):
        continue

    for sensor in sensors:
        # Entries that are not objects have no fields to read.
        if not isinstance(sensor, dict):
            continue

        # Validate required fields and clean them
        sensor_name = clean_string(sensor.get("name", "unnamed sensor"))
        category_name = clean_string(sensor.get("category", "uncategorized"))

        cleaned_parts = _clean_sorted(sensor.get("parts", []))
        cleaned_properties = _clean_sorted(sensor.get("properties", []))
        cleaned_implements = _clean_sorted(sensor.get("implements", []))

        # Use key without implements for merging duplicates:
        key = (sensor_name, category_name, tuple(cleaned_parts), tuple(cleaned_properties))

        if key in sensor_dict:
            # Merge the implements list (union of values)
            merged = set(sensor_dict[key]["implements"]) | set(cleaned_implements)
            sensor_dict[key]["implements"] = sorted(merged)
        else:
            sensor_dict[key] = {
                "name": sensor_name,
                "category": category_name,
                "parts": cleaned_parts,
                "properties": cleaned_properties,
                "implements": cleaned_implements,
            }

# Convert the sensor dictionary to a list
final_sensors = list(sensor_dict.values())

# Save the final, deduplicated JSON
with open(generated_path, "w", encoding="utf-8") as out_file:
    json.dump({"sensors": final_sensors}, out_file, ensure_ascii=False, indent=2)

print(f"Processed {len(final_sensors)} unique sensors.")

Processed 29 unique sensors.


In [None]:
# Load the deduplicated sensor JSON written by the previous step
with open(generated_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# Create a new ontology
onto = get_ontology("http://example.org/sensor_ontology.owl")

with onto:
    # --- Basic classes -------------------------------------------------------
    # Root of the sensor hierarchy; categories and sensor models go below it.
    class Sensor(Thing):
        pass

    # Root for component classes linked through has_part_directly.
    class Part(Thing):
        pass

    # Root for specification classes linked through has_property.
    class Property(Thing):
        pass

    # Root for standard/protocol classes linked through implements.
    class Technology(Thing):
        pass

    # --- Object properties (Sensor -> Part / Technology / Property) ----------
    class has_part_directly(ObjectProperty):
        domain = [Sensor]
        range = [Part]

    class implements(ObjectProperty):
        domain = [Sensor]
        range = [Technology]

    class has_property(ObjectProperty):
        domain = [Sensor]
        range = [Property]

    # Define an annotation property for category
    # (declared in the ontology; nothing in this cell assigns it)
    class category(AnnotationProperty):
        pass

    # Caches so that every name maps to exactly one class per kind
    category_classes = {}    # category name -> subclass of Sensor
    part_classes = {}        # IRI fragment  -> subclass of Part
    property_classes = {}    # IRI fragment  -> subclass of Property
    technology_classes = {}  # IRI fragment  -> subclass of Technology

    # One row per relationship: (JSON field, linking property, parent class, cache).
    # Processed in this order for every sensor.
    link_specs = (
        ("parts", has_part_directly, Part, part_classes),
        ("properties", has_property, Property, property_classes),
        ("implements", implements, Technology, technology_classes),
    )

    # NOTE(review): `types` is not imported explicitly in this notebook;
    # presumably it is provided by `from owlready2 import *` -- TODO confirm.

    # Process each sensor from the JSON data
    for sensor_data in data["sensors"]:
        sensor_name = sensor_data["name"]
        sensor_category_name = sensor_data["category"]

        # Avoid creating a sensor if its name equals the category (to prevent a cycle)
        if sensor_name == sensor_category_name:
            print(f"Skipping sensor '{sensor_name}' because it equals its category.")
            continue

        # A blank name would create a class with an empty IRI fragment.
        if not sensor_name.strip():
            print(f"Skipping sensor with blank name in category '{sensor_category_name}'.")
            continue

        # Create or retrieve the category class (as a subclass of Sensor).
        # A blank category gets no class of its own: the sensor is attached
        # directly below Sensor.
        if not sensor_category_name.strip():
            category_class = Sensor
        elif sensor_category_name not in category_classes:
            category_class = types.new_class(sensor_category_name, (Sensor,))
            category_classes[sensor_category_name] = category_class
        else:
            category_class = category_classes[sensor_category_name]

        # Create the sensor model as a subclass of the category class
        sensor_class = types.new_class(sensor_name, (category_class,))

        # Link parts, properties and technologies through existential
        # restrictions ("<relation> some <class>") on the sensor class.
        for field, relation, parent_class, cache in link_specs:
            for raw_name in sensor_data.get(field, []):
                fragment = make_valid_iri_fragment(raw_name)
                if not fragment:
                    # Nothing left after cleaning: no class to create.
                    continue
                linked_class = cache.get(fragment)
                if linked_class is None:
                    linked_class = types.new_class(fragment, (parent_class,))
                    cache[fragment] = linked_class
                sensor_class.is_a.append(relation.some(linked_class))

# Save the ontology to a file
onto.save(file=ontology_path, format="rdfxml")

# Report the path that was actually written
print(f"Ontology created and saved as {ontology_path}")


Processing sensor: 'velarray' in category 'automotive lidar'
Processing sensor: 'blickfeld cube' in category 'automotive lidar'
Processing sensor: 'velodyne vlp-32c' in category 'automotive lidar'
Processing sensor: 'velodyne vlp-32c' in category 'automotive lidar'
Processing sensor: 'ouster os2' in category 'automotive lidar'
Processing sensor: 'livox horizon' in category 'automotive lidar'
Processing sensor: '' in category ''
Skipping sensor '' because it equals its category.
Processing sensor: '' in category 'automotive lidar'
Processing sensor: '360 ° rotary lidar' in category 'automotive lidar'
Processing sensor: 'robosense m1' in category 'automotive lidar'
Processing sensor: 'velodyne velarray h800' in category 'automotive lidar'
Processing sensor: 'livox horizon' in category 'automotive lidar'
Processing sensor: '' in category 'generic lidar'
Processing sensor: 'velodyne velarray h800' in category 'automotive lidar'
Processing sensor: 'livox horizon' in category 'automotive lid

In [18]:
# Fuzzy verification function using fuzzywuzzy
def fuzzy_verify(name, chunks, threshold=75):
    """
    Score how well `name` fuzzy-matches any of the given chunks.

    Parameters:
        name: sensor name to look for (compared case-insensitively).
        chunks: iterable of objects exposing a `page_content` string.
        threshold: minimum score that counts as a match.

    Returns:
        (best_score, matched): the highest `fuzz.token_set_ratio` over all
        chunks (0 when there are no chunks) and whether it reaches `threshold`.
    """
    needle = name.lower()
    best_score = max(
        (fuzz.token_set_ratio(needle, chunk.page_content.lower()) for chunk in chunks),
        default=0,
    )
    return best_score, best_score >= threshold

# Semantic verification using sentence-transformers
def semantic_verify(sensor, chunks, model, threshold=0.65):
    """
    Score how semantically close a sensor description is to any chunk.

    Parameters:
        sensor: dict with optional 'name' and 'category' entries; missing or
            null values are treated as empty strings.
        chunks: sequence of objects exposing a `page_content` string.
        model: object with an `encode` method (the notebook passes a
            SentenceTransformer).
        threshold: minimum cosine similarity that counts as a match.

    Returns:
        (max_similarity, matched): the highest cosine similarity between the
        query and any chunk, and whether it reaches `threshold`. With no chunks
        there is nothing to compare against and the result is (0.0, False).
    """
    if not chunks:
        return 0.0, False

    # Use sensor name and category for additional context
    category = (sensor.get('category') or '').strip()
    name = (sensor.get('name') or '').strip()
    if category:
        query_text = f"LiDAR sensor model {name} used in {category}"
    else:
        query_text = f"LiDAR sensor model {name}"

    query_embedding = model.encode(query_text)

    # Encode all chunks in one batched call rather than one call per chunk
    chunk_embeddings = model.encode([chunk.page_content for chunk in chunks])
    similarities = cosine_similarity([query_embedding], chunk_embeddings)[0]
    max_sim = float(np.max(similarities))
    return max_sim, max_sim >= threshold

In [19]:
# Load the deduplicated sensors that are to be validated
with open(generated_path, "r", encoding="utf-8") as f:
    sensor_data = json.load(f)

# Lower-case the retrieved text chunks once instead of once per sensor
chunks_lower = [chunk.lower() for chunk in chunks]

# Iterate through sensors and validate against the PDF chunks
qa_results = []
for sensor in sensor_data.get("sensors", []):
    name = sensor.get("name", "")

    # Exact string search (case-insensitive). A blank name is never a match:
    # the empty string is a substring of every chunk and would otherwise
    # always report True.
    name_lower = name.lower()
    string_found = bool(name.strip()) and any(name_lower in text for text in chunks_lower)

    # Fuzzy matching against the full set of PDF splits
    fuzzy_score, fuzzy_match = fuzzy_verify(name, splits, threshold=80)

    # Semantic similarity using a contextual query built from name and category
    semantic_score, semantic_match = semantic_verify(sensor, splits, entity_model, threshold=0.65)

    # Compute a weighted confidence score:
    # 0.4 * exact match + 0.3 * fuzzy score (scaled to 0-1) + 0.3 * semantic score
    confidence = (string_found * 0.4) + (fuzzy_score / 100 * 0.3) + (semantic_score * 0.3)

    qa_results.append({
        "sensor": name,
        "string_match": string_found,
        "fuzzy_score": fuzzy_score,
        "fuzzy_match": fuzzy_match,
        "semantic_score": semantic_score,
        "semantic_match": semantic_match,
        "confidence": confidence
    })

In [20]:

# Write the per-sensor QA results as indented JSON.
# `validtion_json_path` (sic) is the name assigned in the configuration cell.
with open(validtion_json_path, "w", encoding="utf-8") as f:
    json.dump(qa_results, f,indent=2)
    