In [20]:
%pip install -Uqqq pip --progress-bar off
%pip install -qqq ollama==0.3.3 --progress-bar off
%pip install -qqq pathlib --progress-bar off
%pip install -qqq pandas --progress-bar off
%pip install -qqq PyPDF2 --progress-bar off
%pip install -qqq ollama --progress-bar off


Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [21]:
import json
from enum import Enum
import PyPDF2
import ollama
from collections import defaultdict
import re
from datetime import datetime
import unicodedata

MODEL = "llama3.1:8b-instruct-q8_0"
 

In [22]:
def extract_pdf_text(pdf_path):
    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        text = " ".join(page.extract_text() for page in reader.pages)
    return text

pdf_text = extract_pdf_text("sensors.pdf")
print("text extracted from pdf")

text extracted from pdf


In [23]:
class ResponseFormat(Enum):
    JSON = "json_object"
    TEXT = "text"
 
 
def call_model(
    prompt: str, response_format: ResponseFormat = ResponseFormat.TEXT
) -> str:
    response = ollama.generate(
        model=MODEL,
        prompt=prompt,
        keep_alive="1h",
        format="" if response_format == ResponseFormat.TEXT else "json",
    )
    return response["response"]

In [24]:

SUMMARIZE_PROMPT = """
As a LiDAR sensor expert, your task is to extract, categorize, and group all the LiDAR sensor names mentioned in the following text.

**Output Requirements**:
1. Provide a concise and structured list of LiDAR sensors.
2. Group and standardize similar categories to avoid duplicates (e.g., "Automotive LiDAR", "Automotive LiDAR Sensors", and "Automotive Sensors" should be grouped under one category, "Automotive LiDAR").
3. Categorize the sensors based on their application, type, or any mentioned specifications (e.g., Automotive, Industrial, Surveying).
4. Include the brand or model name if mentioned.

**Formatting**:
- Strictly follow the JSON template below:
{{
    "categories": [
        {{
            "name": "Category Name",
            "sensors": [
                "Sensor 1",
                "Sensor 2"
            ]
        }}
    ],
    "metadata": {{
        "total_categories": <number>,
        "total_unique_sensors": <number>
    }}
}}

**Additional Instructions**:
- Only include a category if it contains sensors.
- Use consistent naming conventions for categories. Avoid redundancy or variations in names.
- If no sensors are found in the text, return an empty JSON: {{}}.
- Do not include any additional explanations or information.

<text>
{text}
</text>
"""

In [31]:
# Step 1: Chunk size for processing
CHUNK_SIZE = 1000
chunks = [pdf_text[i:i + CHUNK_SIZE] for i in range(0, len(pdf_text), CHUNK_SIZE)]

# Save chunks to a file if needed
with open("intermediate_data/chunks.json", "w", encoding="utf-8") as f:
    json.dump(chunks, f)

print("Chunks created and saved.")


Chunks created and saved.


In [32]:
# Step 2: Process chunks with the model and save responses
responses = []

for idx, chunk in enumerate(chunks):
    formatted_prompt = SUMMARIZE_PROMPT.format(text=chunk)
    response = call_model(formatted_prompt)  # Call your expensive model
    responses.append(response)

    # Save responses after every chunk to ensure progress is retained
    with open("intermediate_data/responses.json", "w", encoding="utf-8") as f:
        json.dump(responses, f)

print("Model responses saved.")


Model responses saved.


In [27]:
# Step 3: Load responses and process them
with open("responses.json", "r", encoding="utf-8") as f:
    responses = json.load(f)

# Placeholder for final results
categories_dict = defaultdict(set)

# Function to clean strings
def clean_string(s):
    s = s.strip()
    s = s.encode('utf-8').decode('unicode_escape')
    s = unicodedata.normalize('NFKC', s)
    s = re.sub(r"[\s\-_/]+", " ", s)
    s = s.replace("solid state", "solid-state")
    s = re.sub(r"\u00c2", "", s)
    return s.lower()

# Process responses into categories_dict
for response in responses:
    try:
        response_json = json.loads(response)

        if "categories" in response_json and isinstance(response_json["categories"], list):
            for category in response_json["categories"]:
                category_name = clean_string(category.get("name", "Unidentified Category"))
                sensors = {clean_string(sensor) for sensor in category.get("sensors", [])}
                categories_dict[category_name].update(sensors)
    except json.JSONDecodeError as e:
        print(f"Error parsing response: {response}. Details: {e}")

print("Categories processed.")


Error parsing response: {
    "categories": [
        {
            "name": "Automotive LiDAR",
            "sensors": [
                "Velodyne Velarray H800"
            ]
        },
        {
            "name": "Surveying and Mapping LiDAR",
            "sensors": [
                "LiDAR sensors" (no specific brand/model mentioned)
            ]
        }
    ],
    "metadata": {
        "total_categories": 2,
        "total_unique_sensors": 1
    }
}. Details: Expecting ',' delimiter: line 12 column 33 (char 281)
Categories processed.


In [29]:
# Prepare the final JSON structure
categories_list = []
unique_sensors = set()

# Merge categories and deduplicate sensors
for category_name, sensors in categories_dict.items():
    categories_list.append({
        "name": category_name.title(),  # Capitalized name for better readability
        "sensors": sorted(sensors)  # Sorted sensors
    })
    unique_sensors.update(sensors)

output_json = {
    "categories": categories_list,
    "metadata": {
        "total_categories": len(categories_list),
        "total_unique_sensors": len(unique_sensors)
    }
}

# Ensure that the final output strictly follows the required JSON template
final_output = json.dumps(output_json, indent=4)

# Save the JSON to a file with a timestamp
output_file = f"out/Extracted_Data_{datetime.now().strftime('%d.%m.%Y_%H%M%S')}.json"
with open(output_file, "w", encoding="utf-8") as f:
    f.write(final_output)

print(f"Results saved to {output_file}")

Results saved to out/Extracted_Data_23.11.2024_093804.json
