# TODO

- Query Hybrid Database
- Pass to LLM
- Add gear classifier
- Update Query to Weaviate

# PDF Extraction

In [None]:
# Using unstructured locally
from unstructured.partition.pdf import partition_pdf
from unstructured.chunking.title import chunk_by_title

# https://docs.unstructured.io/open-source/core-functionality/chunking
# https://docs.unstructured.io/examplecode/codesamples/apioss/table-extraction-from-pdf
# https://docs.unstructured.io/open-source/concepts/document-elements
raw_elements = partition_pdf(
    "documents/Sequential Trigon 6.pdf",
    strategy="hi_res",
    multipage_sections=True,
    extract_images_in_pdf=False,
    infer_table_structure=True,
    hi_res_model_name="yolox"
)

elements = []
tables = []

skip_list = ["Footer", "Header", "Table"]

for raw_element in raw_elements:
    # Experimenting with not chunking tables
    if raw_element.category == "Table":
        tables.append(raw_element)
    # Elements we don't want in our index
    elif raw_element.category not in skip_list:
        elements.append(raw_element)

# First testing with larger chunks since Claude handles 200k tokens.
# TODO: play around with these numbers
chunks = chunk_by_title(
    elements, 
    max_characters=10000, 
    combine_text_under_n_chars=500
)

print("\n\n ----chunk---- \n\n".join([chunk.text for chunk in chunks]))

In [None]:
# Using unstructured API and proprietary model
from unstructured.partition.pdf import partition_pdf
from unstructured.chunking.title import chunk_by_title
import unstructured_client
from unstructured_client.models import operations, shared

client = unstructured_client.UnstructuredClient(
    api_key_auth="S8kpjp1vPjnZh9WyLr7FDZp5yD0PO3",
    server_url="https://platform.unstructuredapp.io"
)

filename = "documents/Sequential Trigon 6.pdf"
with open(filename, "rb") as f:
    data = f.read()

req = operations.PartitionRequest(
    partition_parameters=shared.PartitionParameters(
        files=shared.Files(
            content=data,
            file_name=filename
        ),
        strategy=shared.Strategy.HI_RES,
        hi_res_model_name="layout_v1.1.0"
    )
)

try:
    res = client.general.partition(request=req)
    print(res.elements[1])
except Exception as e:
    print(e)


In [None]:
import os
import json
from pathlib import Path
import fitz  # PyMuPDF
import anthropic
import time
from tqdm.notebook import tqdm  # Progress bar for Jupyter

# Configuration
input_folder = "document_test"
output_folder = "output/extractions"
prompt_template = "You are a PDF to markdown converter. Convert the following extracted pdf output to markdown. Ignore page headers and page footers. Keep semantic markup like headings, bold, italics, and bullets. Do not respond with anything except the extracted markdown. The input follows: {}" 

# Initialize Claude client
client = anthropic.Client()

# Create output folder if it doesn't exist
Path(output_folder).mkdir(exist_ok=True, parents=True)

# Get all PDF files in the input folder
pdf_files = [f for f in os.listdir(input_folder) if f.lower().endswith('.pdf')]

# Process each PDF file
for pdf_file in tqdm(pdf_files, desc="Processing PDF files"):
    file_path = os.path.join(input_folder, pdf_file)
    print(f"Processing: {pdf_file}")
    
    # Prepare variables
    filename_base = os.path.splitext(pdf_file)[0]
    doc = None
    
    # Prepare the results container for this PDF
    pdf_results = {
        'filename': pdf_file,
        'path': file_path,
        'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
        'pages': []
    }

    markdown_output = ""
    
    try:
        # Open the PDF
        doc = fitz.open(file_path)
        
        # Process each page
        for page_num in tqdm(range(len(doc)), desc=f"Pages in {pdf_file}"):
            # For testing since long docs are spendy.
            #if page_num > 5:
            #   break;
            page = doc.load_page(page_num)
            page_text = page.get_text()
            
            # Skip empty pages
            if not page_text.strip():
                print(f"  Page {page_num+1} is empty, skipping")
                pdf_results['pages'].append({
                    'page_number': page_num + 1,
                    'status': 'skipped',
                    'reason': 'empty page'
                })
                continue
                
            # Prepare the prompt with the page content
            prompt = prompt_template.format(page_text)
            
            # Call Claude API
            try:
                response = client.messages.create(
                    model="claude-3-7-sonnet-latest",
                    max_tokens=8192,
                    system="You are an assistant that analyzes PDF content.",
                    messages=[
                        {"role": "user", "content": prompt}
                    ]
                )
                
                # Store the response in our results
                pdf_results['pages'].append({
                    'page_number': page_num + 1,
                    'status': 'success',
                    'response': response.content[0].text
                })

                markdown_output += response.content[0].text + "\n\n"
                
                # Respect rate limits - add a small delay
                time.sleep(0.5)
                
            except Exception as api_error:
                print(f"  API error on page {page_num+1}: {api_error}")
                # Record the error but continue with next page
                pdf_results['pages'].append({
                    'page_number': page_num + 1,
                    'status': 'error',
                    'error_message': str(api_error)
                })
        
    except Exception as e:
        print(f"Error processing {pdf_file}: {e}")
        pdf_results['error'] = str(e)
    
    finally:
        # Close the document if it was successfully opened
        if doc is not None:
            try:
                doc.close()
            except Exception as close_error:
                print(f"Warning: Could not close document properly: {close_error}")
        
        # Save results for this PDF, even if partial due to errors
        output_file = os.path.join(output_folder, f"{filename_base}.md")
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(markdown_output)
        debug_file = os.path.join(output_folder, f"{filename_base}_results.json")
        with open(debug_file, 'w', encoding='utf-8') as f:
            json.dump(pdf_results, f)
            
        print(f"Saved results for {pdf_file} to {output_file}")

print(f"Processing complete. Results saved to {output_folder}")

In [None]:
import os
import json
from pathlib import Path
import fitz  # PyMuPDF
import anthropic
import time
from tqdm.notebook import tqdm  # Progress bar for Jupyter
import base64

# Configuration
input_folder = "document_test"
output_folder = "output/extractions"
# TODO: Improve prompt because still getting header and footer content.
prompt = "You are a PDF to markdown converter. Convert the attached pdf to markdown. Make sure to ignore page headers at the top of each page and page footers at the bottom of each page that might contain page numbers, the document name, or the section title. Keep semantic markup like headings, bold, italics, and bullets. If you encounter pictures or diagrams, describe their purpose. Do not respond with anything except the extracted markdown."

# Initialize Claude client
client = anthropic.Client()

# Create output folder if it doesn't exist
Path(output_folder).mkdir(exist_ok=True, parents=True)

# Get all PDF files in the input folder
pdf_files = [f for f in os.listdir(input_folder) if f.lower().endswith('.pdf')]

# Process each PDF file
for pdf_file in tqdm(pdf_files, desc="Processing PDF files"):
    file_path = os.path.join(input_folder, pdf_file)
    print(f"Processing: {pdf_file}")
    
    # Prepare variables
    filename_base = os.path.splitext(pdf_file)[0]
    doc = None
    
    # Prepare the results container for this PDF
    pdf_results = {
        'filename': pdf_file,
        'path': file_path,
        'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
        'pages': []
    }

    markdown_output = ""
    
    try:
        # Open the PDF
        doc = fitz.open(file_path)
        
        # Process each page
        for page_num in tqdm(range(len(doc)), desc=f"Pages in {pdf_file}"):
            # For testing since long docs are spendy.
            #if page_num > 10:
            #   break;
            
            # Create empty pdf, insert page, and then get binary info
            page_pdf = fitz.open()
            page_pdf.insert_pdf(doc, from_page=page_num, to_page=page_num)
            page_bytes = page_pdf.tobytes()
            base64_string = base64.b64encode(page_bytes).decode("utf-8")
            
            # https://github.com/anthropics/anthropic-cookbook/blob/main/misc/pdf_upload_summarization.ipynb
            # Call Claude API
            try:
                response = client.messages.create(
                    model="claude-3-7-sonnet-latest",
                    max_tokens=8192,
                    system="You are an assistant that analyzes PDF content.",
                    messages=[
                        {
                            "role": "user", 
                            "content": [
                                {"type": "document", "source": {"type": "base64", "media_type": "application/pdf", "data": base64_string}},
                                {"type": "text", "text": prompt}
                            ]
                         }
                    ]
                )
                
                # Store the response in our results
                pdf_results['pages'].append({
                    'page_number': page_num + 1,
                    'status': 'success',
                    'response': response.content[0].text
                })

                markdown_output += response.content[0].text + "\n\n"
                
                # Respect rate limits - add a small delay
                time.sleep(0.5)
                
            except Exception as api_error:
                print(f"  API error on page {page_num+1}: {api_error}")
                # Record the error but continue with next page
                pdf_results['pages'].append({
                    'page_number': page_num + 1,
                    'status': 'error',
                    'error_message': str(api_error)
                })
        
    except Exception as e:
        print(f"Error processing {pdf_file}: {e}")
        pdf_results['error'] = str(e)
    
    finally:
        # Close the document if it was successfully opened
        if doc is not None:
            try:
                doc.close()
            except Exception as close_error:
                print(f"Warning: Could not close document properly: {close_error}")
        
        # Save results for this PDF, even if partial due to errors
        output_file = os.path.join(output_folder, f"{filename_base}_uploaded.md")
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(markdown_output)
        debug_file = os.path.join(output_folder, f"{filename_base}_results_uploaded.json")
        with open(debug_file, 'w', encoding='utf-8') as f:
            json.dump(pdf_results, f)
            
        print(f"Saved results for {pdf_file} to {output_file}")

print(f"Processing complete. Results saved to {output_folder}")

The following is the winner. We take a PDF, and extract it page by page into a new pdf which is then uploaded to the LLM with specific instructions on how to convert it to markdown. It took some prompt refinement to get it to behave correctly, but now it handles headers, footers, tables, and formatting really well. Image interpretation is a little hit and miss, but given we aren't keeping images I think that's ok.

In [None]:
import os
import json
from pathlib import Path
import fitz  # PyMuPDF
import anthropic
import time
from tqdm.notebook import tqdm  # Progress bar for Jupyter
import base64
import re

# Configuration
input_folder = "document_test"
output_folder = "output/extractions"
prompt = """Please follow these instructions carefully:

1. Analyze the PDF content thoroughly.

2. Convert the content to markdown format, following these rules:
   - Ignore page headers at the top of each page and page footers at the bottom of each page. These often contain page numbers, document names, or section titles.
   - Preserve semantic markup such as headings, bold text, italics, and bullet points.
   - If you encounter pictures or diagrams, describe their purpose in markdown instead of including the actual images.
   - For multi-column layouts, treat the columns as one continuous page, maintaining the logical flow of the content.
   - Do not exclude any sections, summarize them, or truncate for length.

3. Before providing the final markdown output, wrap your analysis in a <pdf_analysis> tag to show your thought process and ensure you've addressed all requirements. In your analysis:
   - List the main sections or chapters of the PDF content.
   - Identify and quote examples of headers and footers you'll be ignoring.
   - List and describe any images or diagrams you've found.
   - Note any special formatting or semantic markup you've encountered.
   - Explain how you'll handle multi-column layouts, if present.
   - Double check that you didn't truncate any content.
   - Outline your plan for converting the content to markdown.

4. After your analysis, provide the converted markdown content in <markdown_output></markdown_output> tags without any additional commentary. Don't forget the closing tag.

Example output structure:

<pdf_analysis>
[Your detailed analysis of the PDF content, including:
- List of main sections or chapters
- Examples of headers and footers
- Description of images or diagrams
- Notes on special formatting or semantic markup
- Approach for handling multi-column layouts
- Check that all required content is present and not truncated.
- Conversion plan]
</pdf_analysis>

<markdown_output>
[Your converted markdown content here]
</markdown_output>

Please proceed with your analysis and conversion of the PDF content."""

# Initialize Claude client
client = anthropic.Client()

# Create output folder if it doesn't exist
Path(output_folder).mkdir(exist_ok=True, parents=True)

# Get all PDF files in the input folder
pdf_files = [f for f in os.listdir(input_folder) if f.lower().endswith('.pdf')]

# Process each PDF file
for pdf_file in tqdm(pdf_files, desc="Processing PDF files"):
    file_path = os.path.join(input_folder, pdf_file)
    print(f"Processing: {pdf_file}")
    
    # Prepare variables
    filename_base = os.path.splitext(pdf_file)[0]
    doc = None
    
    # Prepare the results container for this PDF
    pdf_results = {
        'filename': pdf_file,
        'path': file_path,
        'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
        'pages': []
    }

    markdown_output = ""
    
    try:
        # Open the PDF
        doc = fitz.open(file_path)
        
        # Process each page
        for page_num in tqdm(range(len(doc)), desc=f"Pages in {pdf_file}"):
            # For testing since long docs are spendy.
            #if page_num == 3:
            #   continue;
            
            # Create empty pdf, insert page, and then get binary info
            page_pdf = fitz.open()
            page_pdf.insert_pdf(doc, from_page=page_num, to_page=page_num)
            page_bytes = page_pdf.tobytes()
            base64_string = base64.b64encode(page_bytes).decode("utf-8")
            
            # https://github.com/anthropics/anthropic-cookbook/blob/main/misc/pdf_upload_summarization.ipynb
            # Call Claude API
            try:
                response = client.messages.create(
                    model="claude-3-7-sonnet-latest",
                    max_tokens=8192,
                    system="You are an advanced AI assistant specializing in PDF content analysis and conversion. Your task is to convert the provided PDF content into markdown format while adhering to specific guidelines.",
                    messages=[
                        {
                            "role": "user", 
                            "content": [
                                {"type": "document", "source": {"type": "base64", "media_type": "application/pdf", "data": base64_string}},
                                {"type": "text", "text": prompt}
                            ]
                         }
                    ]
                )
                
                # Store the response in our results
                pdf_results['pages'].append({
                    'page_number': page_num + 1,
                    'status': 'success',
                    'response': response.content[0].text
                })

                pattern = r'<markdown_output>(.*?)</markdown_output>'
                match = re.search(pattern, response.content[0].text, re.DOTALL)

                markdown_output += match.group(1) + "\n"
                
                # Respect rate limits - add a small delay
                time.sleep(0.5)
                
            except Exception as api_error:
                print(f"  API error on page {page_num+1}: {api_error}")
                # Record the error but continue with next page
                pdf_results['pages'].append({
                    'page_number': page_num + 1,
                    'status': 'error',
                    'error_message': str(api_error)
                })
        
    except Exception as e:
        print(f"Error processing {pdf_file}: {e}")
        pdf_results['error'] = str(e)
    
    finally:
        # Close the document if it was successfully opened
        if doc is not None:
            try:
                doc.close()
            except Exception as close_error:
                print(f"Warning: Could not close document properly: {close_error}")
        
        # Save results for this PDF, even if partial due to errors
        output_file = os.path.join(output_folder, f"{filename_base}.md")
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(markdown_output)
        debug_file = os.path.join(output_folder, f"{filename_base}_results.json")
        with open(debug_file, 'w', encoding='utf-8') as f:
            json.dump(pdf_results, f)
            
        print(f"Saved results for {pdf_file} to {output_file}")

print(f"Processing complete. Results saved to {output_folder}")

# Chunking

In [None]:
import os
from pathlib import Path
import time
import json
import uuid
import re
from datetime import datetime
from tqdm.notebook import tqdm  # Progress bar for Jupyter
from unstructured.partition.md import partition_md
from unstructured.chunking.title import chunk_by_title
from unstructured.chunking.basic import chunk_elements
import anthropic

# Initialize Claude client
client = anthropic.Client()

def situate_context(doc: str, chunk: str):
    DOCUMENT_CONTEXT_PROMPT = """
    <document>
    {doc_content}
    </document>
    """

    CHUNK_CONTEXT_PROMPT = """
    Here is the chunk we want to situate within the whole document
    <chunk>
    {chunk_content}
    </chunk>

    Please give a short succinct context to situate this chunk within the overall document for the purposes of improving search retrieval of the chunk.
    Answer only with the succinct context and nothing else.
    """

    response = client.messages.create(
        model="claude-3-haiku-20240307",
        max_tokens=1024,
        temperature=0.0,
        messages=[
            {
                "role": "user", 
                "content": [
                    {
                        "type": "text",
                        "text": DOCUMENT_CONTEXT_PROMPT.format(doc_content=doc),
                        "cache_control": {"type": "ephemeral"} # Cache full document context
                    },
                    {
                        "type": "text",
                        "text": CHUNK_CONTEXT_PROMPT.format(chunk_content=chunk),
                    },
                ]
            },
        ]
    )
    return response.content[0].text

def classify_content(doc: str):
    CLASSIFIER_PROMPT = """
    You will be analyzing a technical manual for a product to extract specific information. The manual content is provided below:

    <technical_manual>
    {doc_content}
    </technical_manual>

    Your task is to carefully read through the manual and extract the following information:
    1. Company name / brand
    2. Model name of the product being documented
    3. Type of product (e.g., synthesizer, guitar pedal, software plugin)
    4. Keywords that describe the purpose and utility of the product (to aid in BM25 search)

    Follow these steps to complete the task:

    1. Thoroughly read the entire technical manual.

    2. Look for the company name or brand. This is often found on the cover page, in headers, or in copyright notices.

    3. Identify the model name of the product. This is typically prominently displayed near the beginning of the manual or in product descriptions.

    4. Determine the type of product based on the descriptions and features mentioned in the manual.

    5. Extract keywords that describe the product's purpose and utility. Focus on terms that highlight its main features, functions, and applications.

    6. Organize your findings into a JSON object with the following structure:
    {{
        "company_name": "",
        "model_name": "",
        "product_type": "",
        "keywords": []
    }}

    Important notes:
    - If you cannot find a specific piece of information, use "Unknown" as the value.
    - For the "keywords" field, include an array of relevant terms (at least 3, but no more than 10).
    - Ensure that the extracted information is accurate and directly supported by the content in the manual.
    - Leave out legal designations like LLC or TM.

    Present your final output within <json_output> tags, formatted as a valid JSON object.
    """

    response = client.messages.create(
        model="claude-3-haiku-20240307",
        max_tokens=1024,
        temperature=0.0,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": CLASSIFIER_PROMPT.format(doc_content=doc),
                        "cache_control": {"type": "ephemeral"} # Cache full document context
                    }
                ]
            },
        ]
    )

    # Pattern to match content between <json_output> tags
    pattern = r'<json_output>(.*?)</json_output>'
    
    # Use re.DOTALL to make '.' match newlines as well
    match = re.search(pattern, response.content[0].text, re.DOTALL)

    empty_object =  {
        "company_name": "",
        "model_name": "",
        "product_type": "",
        "keywords": []
    }

    if match:
        try:
            # Extract the JSON string and parse it
            json_str = match.group(1).strip()
            return json.loads(json_str)
        except json.JSONDecodeError as e:
            print(f"Error parsing JSON: {e}")
            return empty_object
    else:
        return empty_object

# Configuration
input_folder = "output/extractions"
output_folder = "output/chunks"

# Create output folder if it doesn't exist
Path(output_folder).mkdir(exist_ok=True, parents=True)

# Get all Markdown files in the input folder
md_files = [f for f in os.listdir(input_folder) if f.lower().endswith('.md')]

# Process each Markdown file
for md_file in md_files:
    print(f"Processing: {md_file}")
    file_path = os.path.join(input_folder, md_file)
    filename_base = os.path.splitext(md_file)[0]
    output_folder_chunk_path = output_folder + '/' + filename_base
    
    # Create chunk folder if it doesn't exist
    Path(output_folder_chunk_path).mkdir(exist_ok=True, parents=True)

    document = open(file_path, "r").read()
    elements = partition_md(filename=file_path)

    # Not chunking tables, so pulling them out
    tables = []
    for element in elements:
        if element.category == "Table":
            tables.append(element)
        
    for table in tables:
        # Create metadata
        chunk_data = {
            "id": str(uuid.uuid4()),
            "source_file": md_file,
            "category": "Table",
            "content": table.text,
            "contextualization": situate_context(document, table.text),
            "raw_table": table.metadata.text_as_html,
            "created_at": datetime.now().isoformat()
        }

        # Generate unique filename using UUID
        filename = f"{chunk_data['id']}.json"
        output_path = os.path.join(output_folder_chunk_path, filename)

        # Save chunk as JSON file
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(chunk_data, f, ensure_ascii=False, indent=2)

    # "For technical manuals, I recommend larger chunk sizes around 300-500 tokens with semantic boundaries."
    # "Use 10% overlap to preserve cross-references."
    chunks = chunk_by_title(
        elements,
        multipage_sections=True,
        combine_text_under_n_chars=1200,
        max_characters=2000,
        overlap=60
    )

    for chunk in chunks:
        if chunk.category in ["Table", "TableChunk"]:
            continue
        # Create metadata
        chunk_data = {
            "id": str(uuid.uuid4()),
            "source_file": md_file,
            "category": chunk.category,
            "content": chunk.text,
            "contextualization": situate_context(document, chunk.text),
            "created_at": datetime.now().isoformat()
        }

        # Generate unique filename using UUID
        filename = f"{chunk_data['id']}.json"
        output_path = os.path.join(output_folder_chunk_path, filename)

        # Save chunk as JSON file
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(chunk_data, f, ensure_ascii=False, indent=2)

    # Create metadata.json file to classify the pdf as a whole
    doc_metadata = classify_content(document)
    metadata_filename = f"metadata.json"
    metadata_output_path = os.path.join(output_folder_chunk_path, metadata_filename)

    # Save metadata
    with open(metadata_output_path, 'w', encoding='utf-8') as f:
        json.dump(doc_metadata, f, ensure_ascii=False, indent=2)
        
    print(f"Completed processing {md_file}")

# Embedding

In [None]:
import voyageai
import os
import json

embedding_client = voyageai.Client()

root_folder = "output/chunks"  # Replace with your actual path

for subdir_name in os.listdir(root_folder):
    subdir_path = os.path.join(root_folder, subdir_name)
    
    # Check if this is a subfolder (not the root)
    if os.path.isdir(subdir_path):

        # Create output folder if it doesn't exist
        output_folder = os.path.join("output/embeddings", subdir_name)
        Path(output_folder).mkdir(exist_ok=True, parents=True)

        # Look for JSON files in the current subfolder
        json_files = [f for f in os.listdir(subdir_path) if f.lower().endswith('.json')]

        # Process each JSON file found
        for json_file in json_files:

            file_path = os.path.join(subdir_path, json_file)
            print(f"Processing: {file_path}")

            if json_file == "metadata.json":
                filename = f"metadata.json"
                output_path = os.path.join(output_folder, filename)

                # Save chunk as JSON file
                with open(output_path, 'w', encoding='utf-8') as f:
                    json.dump(data, f, ensure_ascii=False, indent=2)
            
            # Read and parse the JSON file
            try:
                with open(file_path, 'r') as f:
                    data = json.load(f)

                    documents = [data['content'] + "\n\n" + data['contextualization']]

                    embeddings = embedding_client.embed(
                        documents,
                        model="voyage-3",
                        input_type="document"
                    ).embeddings[0]

                    data['embeddings'] = embeddings

                    filename = f"{data['id']}.json"
                    output_path = os.path.join(output_folder, filename)

                    # Save chunk as JSON file
                    with open(output_path, 'w', encoding='utf-8') as f:
                        json.dump(data, f, ensure_ascii=False, indent=2)

            except Exception as e:
                print(f"Error processing {file_path}: {e}")

print("Embedding generation completed.")

# Vector Database

In [None]:
import os
import json
import weaviate
import weaviate.classes as wvc

client = weaviate.connect_to_local()

print(client.is_ready())  # Should print: `True`

root_folder = "output/embeddings"  # Replace with your actual path

collection_name = "Manuals"

if client.collections.exists(collection_name):
    client.collections.delete(collection_name)

manuals = client.collections.create(
    "Manuals",
    vectorizer_config=wvc.config.Configure.Vectorizer.none()
)

for subdir_name in os.listdir(root_folder):
    subdir_path = os.path.join(root_folder, subdir_name)
    
    # Check if this is a subfolder (not the root)
    if os.path.isdir(subdir_path):

        metadata_file = os.path.join(subdir_path, 'metadata.json')
        with open(metadata_file, 'r') as f:
            metadata = json.load(f)

        # Look for JSON files in the current subfolder
        json_files = [f for f in os.listdir(subdir_path) if f.lower().endswith('.json')]

        # Process each JSON file found
        for json_file in json_files:

            if json_file == "metadata.json":
                continue

            file_path = os.path.join(subdir_path, json_file)

            with open(file_path, 'r') as f:
                data = json.load(f)

            uuid = manuals.data.insert(
                uuid=data["id"],
                vector=data["embeddings"],
                properties={
                    "content": data["content"],
                    "doc_type": "chunk",
                    "company_name": metadata["company_name"].lower(),
                    "model_name": metadata["model_name"].lower(),
                    "product_type": metadata["product_type"].lower(),
                    "keywords": ",".join(metadata["keywords"]).lower()
                }
            )

            print(uuid)

client.close()  # Free up resources

print ("Completed load")

# Delete existing weaviate collection

In [None]:
import os
import json
import weaviate
import weaviate.classes as wvc

client = weaviate.connect_to_local()

print(client.is_ready())  # Should print: `True`

root_folder = "output/embeddings"  # Replace with your actual path

collection_name = "Manuals"

if client.collections.exists(collection_name):
    client.collections.delete(collection_name)

manuals = client.collections.create(
    "Manuals",
    vectorizer_config=wvc.config.Configure.Vectorizer.none()
)

client.close()

# Querying

## Semantic

In [None]:
import weaviate
import voyageai

weaviate_client = weaviate.connect_to_local()
embedding_client = voyageai.Client()

print(weaviate_client.is_ready())  # Should print: `True`

query = "What waveforms are available on the Trigon 6?"

query_embeddings = embedding_client.embed(
    [query],
    model="voyage-3",
    input_type="query"
).embeddings[0]

collection = weaviate_client.collections.get("Manuals")

response = collection.query.near_vector(
    near_vector=query_embeddings,
    limit=5,
    return_metadata=wvc.query.MetadataQuery(certainty=True)
)

print(response)

weaviate_client.close()

## Hybrid

In [6]:
import weaviate
import voyageai
from weaviate.classes.query import HybridFusion

weaviate_client = weaviate.connect_to_local()
embedding_client = voyageai.Client()

print(weaviate_client.is_ready())  # Should print: `True`

query = "What is a sawtooth oscillator?"

query_embeddings = embedding_client.embed(
    [query],
    model="voyage-3",
    input_type="query"
).embeddings[0]

collection = weaviate_client.collections.get("Manuals")

response = collection.query.hybrid(
    query=query,
    vector=query_embeddings,
    limit=5,
    fusion_type=HybridFusion.RELATIVE_SCORE,
    return_metadata=wvc.query.MetadataQuery(certainty=True)
)

print(response)

weaviate_client.close()

True
QueryReturn(objects=[Object(uuid=_WeaviateUUIDInt('e7d89ba1-7149-4559-a223-d5ca51c52467'), metadata=MetadataReturn(creation_time=None, last_update_time=None, distance=None, certainty=None, score=None, explain_score=None, is_consistent=None, rerank_score=None), properties={'model': 'trigon-6', 'product_type': 'synthesizer', 'doc_type': 'chunk', 'content': 'Oscillator Parameters\n\nOctave: Sets the base oscillator frequency of an oscillator over a 5-octave range from -2 to +2, while Oscillator 3 can be dropped to LFO frequency by setting this parameter to lo.\n\nThe global Master Tune settings affect the pitch of all oscillators. See "Globals - Top Row" on page 13 for more information.\n\nPitch: Fine tune control with a range of 7 semitones (a major 5th) up or down. The 12 o\'clock position is centered. Steps are in cents (50 cents = 1/2 semitone). This can be used to set Oscillators 2 & 3 to different intervals from each other and from Oscillator 1.\n\nWaveshape: Triangle, Sawtooth

## ReRank

In [None]:
import weaviate
import voyageai

weaviate_client = weaviate.connect_to_local()
embedding_client = voyageai.Client()

print(weaviate_client.is_ready())  # Should print: `True`

query = "What waveforms are available on the Trigon 6?"

query_embeddings = embedding_client.embed(
    [query],
    model="voyage-3",
    input_type="query"
).embeddings[0]

collection = weaviate_client.collections.get("Manuals")

response = collection.query.hybrid(
    query=query,
    vector=query_embeddings,
    limit=5,
    return_metadata=wvc.query.MetadataQuery(certainty=True)
)

documents = []
for object in response.objects:
    documents.append(object.properties['content'])

reranked_documents = embedding_client.rerank(
    query=query,
    documents=documents,
    model="rerank-2"
)

print(reranked_documents.results)

weaviate_client.close()

# RAG Agent

In [None]:
import anthropic
import weaviate

import weaviate
import voyageai
from weaviate.classes.query import HybridFusion
import weaviate.classes as wvc

anthropic_client = anthropic.Client()
weaviate_client = weaviate.connect_to_local()
embedding_client = voyageai.Client()

query = "How do I tie notes together with the Trigon 6's sequencer?"

query_embeddings = embedding_client.embed(
    [query],
    model="voyage-3",
    input_type="query"
).embeddings[0]

collection = weaviate_client.collections.get("Manuals")

response = collection.query.hybrid(
    query=query,
    vector=query_embeddings,
    limit=5,
    fusion_type=HybridFusion.RELATIVE_SCORE,
    return_metadata=wvc.query.MetadataQuery(certainty=True)
)

documents = []
for object in response.objects:
    documents.append(object.properties['content'])

system_instructions = "You are a specialized studio assistant for a busy music producer. Your primary purpose is to provide accurate, concise technical information to maximize the producer's efficiency in the studio. You have access to a RAG (Retrieval-Augmented Generation) system containing a comprehensive index of technical manuals for all studio equipment."

RAG_PROMPT = """
Core Responsibilities:

1. Answer technical questions about studio equipment using the retrieved manual excerpts
2. Provide troubleshooting assistance based on technical documentation
3. Suggest optimal equipment settings and configurations
4. Offer workflow tips to improve productivity
5. Translate technical jargon into clear, actionable instructions

Interaction Guidelines:

- Keep responses brief and focused on the immediate need
- Prioritize actionable information over theoretical explanations
- Acknowledge when information is incomplete or unclear in the retrieved documents
- Use appropriate technical terminology but explain it when necessary
- Format responses for quick scanning (concise paragraphs, occasional bullet points)
- Include exact page/section references from manuals when relevant
- When suggesting alternatives, focus only on what's feasible with the existing equipment

Response Structure:

- Direct answer to the question (1-2 sentences)
- Supporting details from relevant manual(s)
- Practical next steps or troubleshooting sequence (when applicable)
- Optional: Quick tip for improved workflow

Input Template

The following template contains:
- <documents> tag containing context from RAG system
- <question> tag containing the query to be answered

<documents>
    {documents}
</documents>

<question>
    {question}
</question>
"""

llm_response = client.messages.create(
    model="claude-3-7-sonnet-latest",
    system=system_instructions,
    max_tokens=8192,
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": RAG_PROMPT.format(documents="\n".join(documents), question=query)                    
                }
            ]
        }
    ]
)

print(llm_response.content[0].text)

weaviate_client.close()

# Classify models and brands

In [None]:
import weaviate
from weaviate.classes.query import Metrics
import anthropic

weaviate_client = weaviate.connect_to_local()
collection = weaviate_client.collections.get("Manuals")

brand_response = collection.aggregate.over_all(
    return_metrics=Metrics("company_name").text(
        top_occurrences_count=True,
        top_occurrences_value=True,
        min_occurrences=5
    )
)

brands = []
for occurence in brand_response.properties['company_name'].top_occurrences:
    brands.append(occurence.value)

model_response = collection.aggregate.over_all(
    return_metrics=Metrics("model_name").text(
        top_occurrences_count=True,
        top_occurrences_value=True,
        min_occurrences=5
    )
)

models = []
for occurence in model_response.properties['model_name'].top_occurrences:
    models.append(occurence.value)

print(brands)
print(models)

anthropic_client = anthropic.Client()

MODEL_CLASSIFIER_PROMPT = """
You will be given a list of brands and models, followed by a user's query. Your task is to determine if the user's query contains mentions of any of the brands or models from the list. Exact matches are not necessary; you should look for close matches or variations as well.

First, here is the list of brands and models:
<brands>
{BRANDS}
</brands>

<models>
{MODELS}
</models>

Now, here is the user's query:
<user_query>
{USER_QUERY}
</user_query>

Analyze the user's query and compare it to the brands and models list. Look for exact matches, close matches, or variations of the brands and models. Consider common misspellings, abbreviations, or partial matches.

Provide your response in the following format:

<analysis>
1. First, list any brands or models you've identified in the user's query. For each match, briefly explain why you consider it a match (e.g., exact match, close spelling, common abbreviation).

2. If you haven't found any matches, state that no matches were found.
</analysis>

<brands>
List the matched brands here, one per line. If no matches were found, write "No matches found."
</brands>

<models>
List the matched models here, one per line. If no matches were found, write "No matches found."
</models>

Remember, your goal is to identify mentions of brands and models from the provided list in the user's query, even if they're not exact matches. Be thorough in your analysis, but avoid false positives by ensuring there's a reasonable connection between the query and the brands/models list.
"""

query = "What oscillators are available on the sequential trigon?"

llm_response = anthropic_client.messages.create(
    model="claude-3-7-sonnet-latest",
    system=system_instructions,
    max_tokens=8192,
    temperature=0,
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": MODEL_CLASSIFIER_PROMPT.format(BRANDS="\n".join(brands), MODELS="\n".join(models), USER_QUERY=query)
                }
            ]
        }
    ]
)

print(llm_response.content[0].text)

weaviate_client.close()


In [None]:
import weaviate
from weaviate.classes.query import Metrics
import anthropic

weaviate_client = weaviate.connect_to_local()
collection = weaviate_client.collections.get("Manuals")

brand_response = collection.aggregate.over_all(
    return_metrics=Metrics("company_name").text(
        top_occurrences_count=True,
        top_occurrences_value=True,
        min_occurrences=5
    )
)

brands = []
for occurence in brand_response.properties['company_name'].top_occurrences:
    brands.append(occurence.value)

model_response = collection.aggregate.over_all(
    return_metrics=Metrics("model_name").text(
        top_occurrences_count=True,
        top_occurrences_value=True,
        min_occurrences=5
    )
)

models = []
for occurence in model_response.properties['model_name'].top_occurrences:
    models.append(occurence.value)

print(brands)
print(models)

anthropic_client = anthropic.Client()

MODEL_CLASSIFIER_PROMPT = """
You will be given a list of brands and models, followed by a user's query. Your task is to determine if the user's query contains mentions of any of the brands or models from the list. Exact matches are not necessary; you should look for close matches or variations as well. Consider common misspellings, abbreviations, or partial matches.

First, here is the list of brands and models:
<brands>
{BRANDS}
</brands>

<models>
{MODELS}
</models>

Now, here is the user's query:
<user_query>
{USER_QUERY}
</user_query>

Provide your response in the following format:

<brands>
List the matched brands here, one per line. If no matches were found, write "none"
</brands>

<models>
List the matched models here, one per line. If no matches were found, write "none"
</models>
"""

query = "What oscillators are available on the trigon?"

llm_response = anthropic_client.messages.create(
    model="claude-3-7-sonnet-latest",
    max_tokens=1024,
    temperature=0,
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": MODEL_CLASSIFIER_PROMPT.format(BRANDS="\n".join(brands), MODELS="\n".join(models), USER_QUERY=query)
                }
            ]
        }
    ]
)

print(llm_response.content[0].text)

weaviate_client.close()
