# Create Chunks for the Description
# Multiple Chunks for the description
[
    {
        "content": "Passive marine flooding...",
        "layer_id": "passive_marine_flooding"
    },
    {
        "content": "Limitations include not...",
        "layer_id": "passive_marine_flooding"
    },
    {
        "content": "Uses 2-meter resolution DEMs...",
        "layer_id": "passive_marine_flooding"
    }
]

In [None]:
import os
import json
import re

In [None]:
documentation_path = "../../data/documentation.json"

In [None]:
documentation = json.load(open(documentation_path))

In [None]:
def create_chunks(layer_id,description,max_sentences=3):
    sentences = re.split(r'(?<=[.!?])\s+', description)
    chunks = []

    for i in range(0, len(sentences), 3):
        chunk = sentences[i:i+2]
        chunks.append({
            "content": " ".join(chunk),
            "layer_id": layer_id
        })

    return chunks

In [None]:
documentation

In [None]:
all_chunks = []
for key, value in documentation.items():
    layer_id = key
    description = value["description"]
    chunks = create_chunks(layer_id, description, max_sentences=3)
    all_chunks.extend(chunks)

In [None]:
all_chunks

In [None]:
from openai import OpenAI

client = OpenAI(
    api_key='ENTER YOUR API KEY HERE'
)

# Function to create embeddings for the chunks
def embed_chunks(chunks):
    response = client.embeddings.create(
        input=chunks,
        model="text-embedding-3-small"
    )
    return response.data[0].embedding

In [None]:
for chunk in all_chunks:
    chunk['embedding'] = embed_chunks(chunk['content'])
all_chunks


In [None]:
# Save chunks to a file
with open('chunks.json', 'w') as f:
    json.dump(all_chunks, f)

## Insert chunks into 

In [79]:
import psycopg2
import numpy as np

conn = psycopg2.connect(
    host="localhost",
    port=5432,
    database="climate_viewer_dev",
    user="dev_user",
    password="dev_password"
)
cur = conn.cursor()

# Create a table for the chunks
cur.execute("""
    CREATE TABLE IF NOT EXISTS public.chunks (
        id SERIAL PRIMARY KEY,
        layer_id VARCHAR(255) NOT NULL,
        content TEXT NOT NULL,
        embedding VECTOR(1536) NOT NULL
    )
""")

# List all tables in the database
cur.execute("SELECT table_name FROM information_schema.tables WHERE table_schema='public'")
tables = cur.fetchall()
print(tables)

# List all columns in the chunks table
cur.execute("SELECT column_name FROM information_schema.columns WHERE table_name='chunks'")
columns = cur.fetchall()
print(columns)

# List all rows in the chunks table
cur.execute(query="SELECT * FROM public.chunks")

[('chunks',)]
[('id',), ('embedding',), ('created_at',), ('layer_id',), ('content',)]


In [80]:
# Insert chunks into the table
for chunk in all_chunks:
    cur.execute("""
        INSERT INTO chunks (layer_id, content, embedding)
        VALUES (%s, %s, %s)
    """, (chunk['layer_id'], chunk['content'], chunk['embedding']))


In [81]:
cur.execute(query="SELECT * FROM public.chunks")
print(cur.fetchall())

[(361, 'passive_marine_flooding', 'Passive marine flooding identifies areas hydrologically connected to the ocean that would be inundated by sea level rise scenarios. Using a modified bathtub approach with DEMs and MHHW tidal datum, the model identifies coastal areas below specified sea level heights that have direct surface connections to marine waters.', '[0.006575726,0.050960317,0.06726562,0.034704875,-0.018611485,-0.009530128,0.00885074,-0.03343336,0.015769277,0.056046378,-0.01188617,-0.0036711872,-0.018225044,0.00387999,0.011668018,0.035627346,-0.0023716243,-0.0014927833,-0.0074109365,0.023809737,0.03525337,0.009523895,-0.00870115,0.01588147,-0.018661348,-0.023398364,0.0045967754,0.034555282,0.022002192,0.045475353,0.044153973,-0.025380433,-0.044951785,0.023797272,0.06751494,0.053303894,-0.04218437,0.04846715,-0.041336697,0.05068607,-0.020169714,-0.006195518,-0.026352767,0.059885852,0.012434666,-0.007853474,0.04557508,0.005257464,0.0064386018,0.031588417,0.029818268,-0.048541944,-

## QA Testing

In [82]:
# Test Vector Search
user_query = "What is the definition of passive marine flooding?"

# Convert to embedding
query_embedding = embed_chunks(user_query)

# Convert to vector type for pgvector
query_vector = f"[{','.join(map(str, query_embedding))}]"

print(f"Query: {user_query}")
print(f"Query vector length: {len(query_embedding)}")
print(f"Query vector (first 5): {query_embedding[:5]}")

# First, let's check if we have any data in the chunks table
cur.execute("SELECT COUNT(*) FROM public.chunks")
count = cur.fetchone()[0]
print(f"Total chunks in database: {count}")

# Check the first chunk's embedding
cur.execute("SELECT layer_id, content FROM public.chunks LIMIT 1")
sample = cur.fetchone()
if sample:
    layer_id, content = sample
    print(f"Sample chunk - layer_id: {layer_id}, content: {content[:100]}...")

# Try a broader similarity search (higher threshold)
print("\n--- Testing with threshold 1.0 ---")
cur.execute("""
    SELECT content, layer_id, embedding <=> %s::vector as similarity_score
FROM public.chunks
WHERE embedding <=> %s::vector < 1.0
ORDER BY embedding <=> %s::vector
LIMIT 5""", (query_vector, query_vector, query_vector))
results = cur.fetchall()
print(f"Results with threshold 1.0: {len(results)}")
for i, result in enumerate(results):
    print(f"{i+1}. Score: {result[2]:.4f}, Layer: {result[1]}, Content: {result[0][:100]}...")

# Try without any threshold to see all similarities
print("\n--- All similarities ---")
cur.execute("""
    SELECT content, layer_id, embedding <=> %s::vector as similarity_score
FROM public.chunks
ORDER BY embedding <=> %s::vector
LIMIT 5""", (query_vector, query_vector))
all_results = cur.fetchall()
print(f"All results: {len(all_results)}")
for i, result in enumerate(all_results):
    print(f"{i+1}. Score: {result[2]:.4f}, Layer: {result[1]}, Content: {result[0][:100]}...")


Query: What is the definition of passive marine flooding?
Query vector length: 1536
Query vector (first 5): [0.019157158210873604, 0.07736074924468994, 0.02231747843325138, 0.036117952316999435, 0.00247395783662796]
Total chunks in database: 40
Sample chunk - layer_id: passive_marine_flooding, content: Passive marine flooding identifies areas hydrologically connected to the ocean that would be inundat...

--- Testing with threshold 1.0 ---
Results with threshold 1.0: 5
1. Score: 0.2321, Layer: passive_marine_flooding, Content: Passive marine flooding identifies areas hydrologically connected to the ocean that would be inundat...
2. Score: 0.3914, Layer: low_lying_flooding, Content: Low-lying area flooding identifies areas that are topographically below sea level rise scenarios but...
3. Score: 0.4353, Layer: passive_marine_flooding, Content: These areas experience direct marine inundation as sea levels rise, with floodwater arriving via sur...
4. Score: 0.4606, Layer: low_lying_floodin