In [20]:
%pip install -Uqqq pip --progress-bar off
%pip install -qqq ollama --progress-bar off
%pip install -qqq pathlib --progress-bar off
%pip install -qqq pandas --progress-bar off
%pip install -qqq PyPDF2 --progress-bar off
%pip install -qqq ollama --progress-bar off
%pip install -qqq owlready2 --progress-bar off


Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [88]:
import json
import os
import re
import unicodedata
from collections import defaultdict
from datetime import datetime
from enum import Enum

import ollama
import PyPDF2

MODEL = "llama3.1:8b-instruct-q8_0"
 

In [22]:
def extract_pdf_text(pdf_path):
    """Return the text of every page of a PDF, joined by single spaces.

    :param pdf_path: Path of the PDF file to read (opened in binary mode).
    :return: One string holding the extracted text of all pages in order,
        with a single space inserted between consecutive pages.
    """
    with open(pdf_path, "rb") as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # Extraction must happen while the file handle is still open.
        page_texts = [page.extract_text() for page in pdf_reader.pages]
        return " ".join(page_texts)

# Read the source PDF once; the chunking cell below slices `pdf_text`.
pdf_text = extract_pdf_text("sensors.pdf")
print("text extracted from pdf")

text extracted from pdf


In [23]:
class ResponseFormat(Enum):
    """Output modes that `call_model` can request from the model."""

    JSON = "json_object"
    TEXT = "text"


def call_model(
    prompt: str, response_format: ResponseFormat = ResponseFormat.TEXT
) -> str:
    """Send a prompt to the configured Ollama model and return its reply text.

    :param prompt: Full prompt text passed to the model.
    :param response_format: ``ResponseFormat.TEXT`` sends an empty ``format``
        value; any other member sends ``format="json"``.
    :return: The ``"response"`` field of the result of ``ollama.generate``.
    """
    if response_format == ResponseFormat.TEXT:
        output_format = ""
    else:
        output_format = "json"

    result = ollama.generate(
        model=MODEL,
        prompt=prompt,
        keep_alive="1h",
        format=output_format,
    )
    return result["response"]

In [82]:

# Prompt template for the per-chunk sensor extraction call.
# It is rendered with `SUMMARIZE_PROMPT.format(text=chunk)`, so `{text}` is the
# only placeholder; every literal brace of the JSON template is doubled
# (`{{` / `}}`) so that `str.format` emits it as a single brace.
SUMMARIZE_PROMPT = """
As a LiDAR sensor expert, your task is to extract, categorize, and group all the **real, specific, and branded** LiDAR sensor names mentioned in the following text.

**Output Requirements**:
1. Provide a concise and structured list of LiDAR sensors.
2. Group and standardize similar categories to avoid duplicates (e.g., "Automotive LiDAR", "Automotive LiDAR Sensors", and "Automotive Sensors" should be grouped under one category, "Automotive LiDAR").
3. Categorize the sensors based on their application, type, or any mentioned specifications (e.g., Automotive, Industrial, Surveying).
4. Include only **specific branded or model names** of sensors (e.g., "Velodyne Velarray", "Livox Horizon"). 

**Exclusion Criteria**:
- Do not include generic terms such as "LiDAR", "LiDAR sensor", "LiDAR sensors", "various LiDAR sensors", or "no specific brand model mentioned".
- Exclude datasets or evaluation tools (e.g., "Kitti", "NuScenes").
- Ignore descriptions or summaries like "solid-state LiDAR", "lidar technology", or "selected sensors are mechanical or solid-state types".
- Avoid placeholder entries like "not mentioned" or "various".

**Formatting**:
- Strictly follow the JSON template below:
{{
    "categories": [
        {{
            "name": "Category Name",
            "sensors": [
                "Sensor 1",
                "Sensor 2"
            ]
        }}
    ],
    "metadata": {{
        "total_categories": <number>,
        "total_unique_sensors": <number>
    }}
}}

**Additional Instructions**:
- Only include a category if it contains sensors.
- Use consistent naming conventions for categories. Avoid redundancy or variations in names.
- If no sensors are found in the text, return an empty JSON: {{}}.
- Do not include any additional explanations or information.

<text>
{text}
</text>

"""

In [83]:
# Step 1: Split the extracted text into fixed-size character chunks
CHUNK_SIZE = 1000
# NOTE(review): the split is purely positional, so a sensor name that straddles
# a chunk boundary is cut in two and may be missed by the model.
chunks = [pdf_text[i:i + CHUNK_SIZE] for i in range(0, len(pdf_text), CHUNK_SIZE)]

# Save chunks to a file if needed. The directory is created first so the
# notebook also runs on a fresh checkout (Restart Kernel -> Run All).
os.makedirs("intermediate_data", exist_ok=True)
with open("intermediate_data/chunks.json", "w", encoding="utf-8") as f:
    json.dump(chunks, f)

print("Chunks created and saved.")


Chunks created and saved.


In [84]:
# Step 2: Process chunks with the model and save responses
responses = []

for chunk in chunks:
    formatted_prompt = SUMMARIZE_PROMPT.format(text=chunk)
    # The prompt asks for a JSON document and the next step parses every
    # response with json.loads, so request JSON-constrained output instead of
    # free text (free-text replies that fail to parse are discarded later).
    response = call_model(formatted_prompt, response_format=ResponseFormat.JSON)
    responses.append(response)

    # Rewrite the checkpoint after every chunk so that an interrupted run
    # keeps all responses collected so far.
    with open("intermediate_data/responses.json", "w", encoding="utf-8") as f:
        json.dump(responses, f)

print("Model responses saved.")


Model responses saved.


In [85]:
# Step 3: Reload the checkpointed model responses from disk
with open("intermediate_data/responses.json", encoding="utf-8") as responses_file:
    responses = json.load(responses_file)

# Accumulator: cleaned category name -> set of cleaned sensor names
categories_dict = defaultdict(set)

# Function to clean strings
def clean_string(s):
    """Normalize a category or sensor name so that variants compare equal.

    Steps: trim, decode literal ``\\uXXXX`` escapes, apply NFKC normalization,
    drop stray "Â" characters, collapse whitespace / "-" / "_" / "/" runs into
    a single space, trim again and lower-case.

    The previous ``s.encode('utf-8').decode('unicode_escape')`` round trip is
    intentionally gone: it re-read UTF-8 bytes as Latin-1 and therefore turned
    every non-ASCII character into mojibake ("é" -> "Ã©"), and it raised
    ``UnicodeDecodeError`` on a trailing backslash or malformed escape.

    :param s: Raw name string.
    :return: The normalized, lower-cased name.
    """
    def _decode_escape(match):
        code_point = int(match.group(1), 16)
        # Leave surrogate halves untouched; a lone surrogate cannot be
        # written out as UTF-8 later on.
        if 0xD800 <= code_point <= 0xDFFF:
            return match.group(0)
        return chr(code_point)

    s = s.strip()  # Remove leading/trailing whitespace
    s = re.sub(r"\\u([0-9a-fA-F]{4})", _decode_escape, s)  # Decode literal \uXXXX escapes
    s = unicodedata.normalize('NFKC', s)  # Normalize special characters
    s = s.replace("\u00c2", "")  # Remove mis-decoding residue already present in the text
    # Collapse separators last and trim again so that names starting or ending
    # with a separator do not keep a stray space (which would defeat dedup).
    s = re.sub(r"[\s\-_/]+", " ", s).strip()
    return s.lower()


# Process responses into categories_dict
for response in responses:
    try:
        response_json = json.loads(response)
    except json.JSONDecodeError as e:
        print(f"Error parsing response: {response}. Details: {e}")
        continue

    # The model output is untrusted: valid JSON may still be a list, a number,
    # or an object whose fields have unexpected types. Skip anything that does
    # not match the requested template instead of crashing the whole cell.
    if not isinstance(response_json, dict):
        continue
    categories = response_json.get("categories")
    if not isinstance(categories, list):
        continue

    for category in categories:
        if not isinstance(category, dict):
            continue

        raw_name = category.get("name", "Unidentified Category")
        if not isinstance(raw_name, str):
            raw_name = "Unidentified Category"

        raw_sensors = category.get("sensors", [])
        if not isinstance(raw_sensors, list):
            continue

        sensors = {clean_string(sensor) for sensor in raw_sensors if isinstance(sensor, str)}
        sensors.discard("")  # Names that clean down to nothing carry no information

        # Only register a category once it has at least one sensor, matching
        # the prompt rule "Only include a category if it contains sensors".
        if sensors:
            categories_dict[clean_string(raw_name)].update(sensors)

print("Categories processed.")


Categories processed.


In [92]:
# Prepare the final JSON structure
categories_list = []
unique_sensors = set()

# Merge categories and deduplicate sensors
for category_name, sensors in categories_dict.items():
    # A category without sensors violates the output template, so drop it.
    if not sensors:
        continue
    categories_list.append({
        "name": category_name.title(),  # Capitalized name for better readability
        "sensors": sorted(sensors)  # Sorted sensors
    })
    unique_sensors.update(sensors)

output_json = {
    "categories": categories_list,
    "metadata": {
        "total_categories": len(categories_list),
        "total_unique_sensors": len(unique_sensors)
    }
}

# Serialize once; the OWL export cell re-parses `final_output`.
final_output = json.dumps(output_json, indent=4, ensure_ascii=False)

# Save the JSON to a file with a timestamp. The directory is created first so
# the notebook also runs on a fresh checkout.
output_dir = "out/JSON"
os.makedirs(output_dir, exist_ok=True)
output_file = f"{output_dir}/Extracted_Data_{datetime.now().strftime('%d.%m.%Y_%H%M%S')}.json"
with open(output_file, "w", encoding="utf-8") as f:
    f.write(final_output)

print(f"Results saved to {output_file}")

Results saved to out/JSON/Extracted_Data_04.12.2024_125703.json


In [97]:
from owlready2 import get_ontology, Thing
import os

# Function to create OWL ontology from JSON
def create_ontology_from_json(json_data, ontology_url="http://example.org/lidar.owl", output_file="lidar_ontology.owl"):
    """
    Convert JSON data to an OWL ontology and save it to a file.

    Each entry of ``json_data["categories"]`` becomes a subclass of
    ``LidarCategory``; each of its sensors becomes a ``LidarSensor`` individual
    linked to that class through ``belongsToCategory``. Spaces in names are
    replaced by underscores.

    :param json_data: JSON dictionary containing the categorized sensor data.
    :param ontology_url: The URL for the ontology namespace.
    :param output_file: Filepath to save the resulting OWL file.
    """
    # NOTE(review): get_ontology is called with the same URL on every call, so
    # re-running this in one kernel presumably reuses the earlier ontology
    # object and its entities; confirm against the Owlready2 docs.
    ontology = get_ontology(ontology_url)

    # Everything that defines entities or asserts relations runs inside the
    # `with ontology:` block, which is the documented way to make Owlready2
    # store the resulting triples in this ontology.
    with ontology:
        class LidarCategory(Thing):
            pass

        class LidarSensor(Thing):
            pass

        class belongsToCategory(LidarSensor >> LidarCategory):
            pass

        # Populate ontology from JSON
        for category in json_data.get("categories", []):
            category_name = category["name"].replace(" ", "_")
            # Create a class for each category
            category_class = type(category_name, (LidarCategory,), {})

            for sensor in category.get("sensors", []):
                sensor_name = sensor.replace(" ", "_")
                # Create individuals for sensors
                sensor_individual = LidarSensor(sensor_name)
                # NOTE(review): the property value is a class, not an
                # individual; confirm this modelling is what consumers expect.
                sensor_individual.belongsToCategory.append(category_class)

    # Make sure the target directory exists before saving.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Save ontology to a file
    ontology.save(file=output_file, format="rdfxml")
    print(f"OWL file created at: {output_file}")
   
# Parse the JSON string serialized by the previous cell back into a dict.
json_data = json.loads(final_output)

# Define output OWL file path (timestamped, so each run writes a new file)
output_owl_file = f"out/OWL/LIDAR{datetime.now().strftime('%d.%m.%Y_%H%M%S')}.owl"
# Generate the OWL ontology
create_ontology_from_json(json_data, output_file=output_owl_file)

OWL file created at: out/OWL/LIDAR04.12.2024_130334.owl
