# Create Chunks for the Description
# Multiple Chunks for the description
[
    {
        "content": "Passive marine flooding...",
        "layer_id": "passive_marine_flooding"
    },
    {
        "content": "Limitations include not...",
        "layer_id": "passive_marine_flooding"
    },
    {
        "content": "Uses 2-meter resolution DEMs...",
        "layer_id": "passive_marine_flooding"
    }
]

In [27]:
import os
import json
import re
from dotenv import load_dotenv

load_dotenv()

True

In [None]:
openai_key = os.getenv("OPENAI_API_KEY")

In [19]:
documentation_path = "../../data/documentation.json"

In [20]:
documentation = json.load(open(documentation_path))

In [21]:
def create_chunks(layer_id, description, max_sentences=3):
    sentences = re.split(r'(?<=[.!?])\s+', description)
    chunks = []

    for i in range(0, len(sentences), 3):
        chunk = sentences[i:i+2]
        chunks.append({
            "content": " ".join(chunk),
            "layer_id": layer_id
        })

    return chunks

In [22]:
documentation

{'passive_marine_flooding': {'title': 'Passive Marine Flooding',
  'description': "Passive marine flooding identifies areas hydrologically connected to the ocean that would be inundated by sea level rise scenarios. Using a modified bathtub approach with DEMs and MHHW tidal datum, the model identifies coastal areas below specified sea level heights that have direct surface connections to marine waters. Water levels are shown as they would appear during Mean Higher High Water (MHHW), representing the average higher high water height of each tidal day. These areas experience direct marine inundation as sea levels rise, with floodwater arriving via surface flow paths from the ocean. The modeling uses 2-meter resolution DEMs derived from LiDAR data, with horizontal and vertical accuracies conforming to FEMA flood mapping standards. Limitations include not accounting for wave action, coastal erosion, or dynamic coastal processes that are important along Hawaii's active coastlines.",
  'base_

In [23]:
all_chunks = []
for key, value in documentation.items():
    layer_id = value["base_layer_name"]
    description = value["description"]
    chunks = create_chunks(layer_id, description, max_sentences=3)
    all_chunks.extend(chunks)

In [24]:
all_chunks

[{'content': 'Passive marine flooding identifies areas hydrologically connected to the ocean that would be inundated by sea level rise scenarios. Using a modified bathtub approach with DEMs and MHHW tidal datum, the model identifies coastal areas below specified sea level heights that have direct surface connections to marine waters.',
  'layer_id': 'CRC:HI_State_80prob_{scenario}ft_SCI'},
 {'content': 'These areas experience direct marine inundation as sea levels rise, with floodwater arriving via surface flow paths from the ocean. The modeling uses 2-meter resolution DEMs derived from LiDAR data, with horizontal and vertical accuracies conforming to FEMA flood mapping standards.',
  'layer_id': 'CRC:HI_State_80prob_{scenario}ft_SCI'},
 {'content': 'Low-lying area flooding identifies areas that are topographically below sea level rise scenarios but lack direct hydrological connections to the ocean. These areas may become flooded through indirect pathways such as subsurface connections

In [28]:
from openai import OpenAI

client = OpenAI()

# Function to create embeddings for the chunks
def embed_chunks(chunks):
    response = client.embeddings.create(
        input=chunks,
        model="text-embedding-3-small"
    )
    return response.data[0].embedding

In [29]:
for chunk in all_chunks:
    chunk['embedding'] = embed_chunks(chunk['content'])
all_chunks


[{'content': 'Passive marine flooding identifies areas hydrologically connected to the ocean that would be inundated by sea level rise scenarios. Using a modified bathtub approach with DEMs and MHHW tidal datum, the model identifies coastal areas below specified sea level heights that have direct surface connections to marine waters.',
  'layer_id': 'CRC:HI_State_80prob_{scenario}ft_SCI',
  'embedding': [0.006575725972652435,
   0.05096031725406647,
   0.06726562231779099,
   0.03470487520098686,
   -0.0186114851385355,
   -0.009530127979815006,
   0.008850740268826485,
   -0.03343335911631584,
   0.01576927676796913,
   0.05604637786746025,
   -0.011886170133948326,
   -0.0036711872089654207,
   -0.018225044012069702,
   0.0038799899630248547,
   0.011668018065392971,
   0.035627346485853195,
   -0.002371624344959855,
   -0.0014927833108231425,
   -0.007410936523228884,
   0.023809736594557762,
   0.0352533683180809,
   0.009523894637823105,
   -0.008701150305569172,
   0.015881469473

In [None]:
# Save chunks to a file
with open('chunks.json', 'w') as f:
    json.dump(all_chunks, f)

## Insert chunks into vector database (pgvector)

In [30]:
# Read chunks from file

with open('chunks.json', 'r') as f:
    all_chunks = json.load(f)


In [31]:
import psycopg2
import numpy as np

conn = psycopg2.connect(
    host="localhost",
    port=5432,
    database="climate_viewer_dev",
    user="dev_user",
    password="dev_password"
)
cur = conn.cursor()

# Create a table for the chunks
cur.execute("""
    CREATE TABLE IF NOT EXISTS public.chunks (
        id SERIAL PRIMARY KEY,
        layer_id VARCHAR(255) NOT NULL,
        content TEXT NOT NULL,
        embedding VECTOR(1536) NOT NULL
    )
""")

# List all tables in the database
cur.execute("SELECT table_name FROM information_schema.tables WHERE table_schema='public'")
tables = cur.fetchall()
print(tables)

# List all columns in the chunks table
cur.execute("SELECT column_name FROM information_schema.columns WHERE table_name='chunks'")
columns = cur.fetchall()
print(columns)

# List all rows in the chunks table
cur.execute(query="SELECT * FROM public.chunks")

[('chunks',)]
[('id',), ('embedding',), ('created_at',), ('layer_id',), ('content',)]


In [32]:
# Insert chunks into the table
for chunk in all_chunks:
    cur.execute("""
        INSERT INTO chunks (layer_id, content, embedding)
        VALUES (%s, %s, %s)
    """, (chunk['layer_id'], chunk['content'], chunk['embedding']))


In [33]:
cur.execute(query="SELECT * FROM public.chunks")
print(cur.fetchall())

[(465, 'drainage_backflow', 'Profiles are spaced 20 meters apart along the coast. This approach was used to model the transformation of the wave as it breaks across the reef and includes shallow water wave processes such as wave set-up and overtopping.', '[0.027137034,0.0003140484,0.08560004,0.0007602207,-0.023507487,0.04441315,0.011864321,-0.022739949,0.018212775,0.079095475,-0.014049854,-0.013984809,0.044361115,-0.015675995,0.017406208,-0.00049515977,0.028021654,-0.004712555,-0.009132405,0.030363295,-0.020593444,-0.01812171,-0.041369013,-0.007987603,0.001342379,-0.015988214,-0.005590671,0.017354172,0.019643778,0.03606129,0.013171738,-0.01395879,-0.03926154,-3.1354022e-05,0.0537537,-0.016846815,-0.0063614617,-0.017146027,0.009080369,0.046078317,-0.0023286333,-0.016469551,-0.04053643,0.0012082225,-0.0001979826,0.004237722,0.027111014,0.00041263315,0.038272843,0.05156817,0.012599337,0.023208277,-0.0424878,-0.045818135,0.00794207,0.004081613,0.049304582,-0.017848518,0.0033531017,0.010608

## QA Testing

In [34]:
# Test Vector Search
user_query = "What is the definition of passive marine flooding?"

# Convert to embedding
query_embedding = embed_chunks(user_query)

# Convert to vector type for pgvector
query_vector = f"[{','.join(map(str, query_embedding))}]"

print(f"Query: {user_query}")
print(f"Query vector length: {len(query_embedding)}")
print(f"Query vector (first 5): {query_embedding[:5]}")

# First, let's check if we have any data in the chunks table
cur.execute("SELECT COUNT(*) FROM public.chunks")
count = cur.fetchone()[0]
print(f"Total chunks in database: {count}")

# Check the first chunk's embedding
cur.execute("SELECT layer_id, content FROM public.chunks LIMIT 1")
sample = cur.fetchone()
if sample:
    layer_id, content = sample
    print(f"Sample chunk - layer_id: {layer_id}, content: {content[:100]}...")

# Try a broader similarity search (higher threshold)
print("\n--- Testing with threshold 1.0 ---")
cur.execute("""
    SELECT content, layer_id, embedding <=> %s::vector as similarity_score
FROM public.chunks
WHERE embedding <=> %s::vector < 1.0
ORDER BY embedding <=> %s::vector
LIMIT 5""", (query_vector, query_vector, query_vector))
results = cur.fetchall()
print(f"Results with threshold 1.0: {len(results)}")
for i, result in enumerate(results):
    print(f"{i+1}. Score: {result[2]:.4f}, Layer: {result[1]}, Content: {result[0][:100]}...")

# Try without any threshold to see all similarities
print("\n--- All similarities ---")
cur.execute("""
    SELECT content, layer_id, embedding <=> %s::vector as similarity_score
FROM public.chunks
ORDER BY embedding <=> %s::vector
LIMIT 5""", (query_vector, query_vector))
all_results = cur.fetchall()
print(f"All results: {len(all_results)}")
for i, result in enumerate(all_results):
    print(f"{i+1}. Score: {result[2]:.4f}, Layer: {result[1]}, Content: {result[0][:100]}...")


## Which hazard presents the largest risk to Oahu?
## Simple Definitions (What is passive flooding?... )
## What is the greatest risk to {location}?

Query: What is the definition of passive marine flooding?
Query vector length: 1536
Query vector (first 5): [0.01914322003722191, 0.07735373824834824, 0.022315455600619316, 0.03611467778682709, 0.00247830874286592]
Total chunks in database: 40
Sample chunk - layer_id: drainage_backflow, content: Profiles are spaced 20 meters apart along the coast. This approach was used to model the transformat...

--- Testing with threshold 1.0 ---
Results with threshold 1.0: 5
1. Score: 0.2320, Layer: passive_marine_flooding, Content: Passive marine flooding identifies areas hydrologically connected to the ocean that would be inundat...
2. Score: 0.3915, Layer: low_lying_flooding, Content: Low-lying area flooding identifies areas that are topographically below sea level rise scenarios but...
3. Score: 0.4352, Layer: passive_marine_flooding, Content: These areas experience direct marine inundation as sea levels rise, with floodwater arriving via sur...
4. Score: 0.4605, Layer: low_lying_flooding, Cont

In [35]:
# Assemble Context from Vector Search
context = ""
for result in results:
    context += f"{result[0]}\n" 
    context += f"Layer: {result[1]}\n"


In [36]:
# Assemble Create Prompt
prompt = f"""
You are a Hawaiian climate data assistant. Analyze the user's query and provide helpful response according to the context from the layer descriptions. Synthesize a response from the layer descriptions and the user's query.

USER QUERY: {user_query}
CONTEXT: {context}

RESPONSE FORMAT:
You MUST respond with valid JSON in this exact structure:
{{
  "type": "add_layer",
  "parameters": {{
    "layer_id": "layer_id",
    "reason": "Why adding the layer",
    "synthesized_response": "Synthesized response from the layer descriptions and the user's query"
  }}
}}
"""

In [61]:
client = OpenAI()

response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[{"role": "system", "content": prompt}],
    response_format={"type": "json_object"}
)

json_content = json.loads(response.choices[0].message.content or "{}")

In [63]:
json_content["parameters"]["synthesized_response"]

'Passive marine flooding is defined as the identification of areas that are hydrologically connected to the ocean and are susceptible to inundation due to rising sea levels. This is modeled using a modified bathtub approach, which incorporates Digital Elevation Models (DEMs) and Mean Higher High Water (MHHW) tidal datum to pinpoint coastal regions that fall below specific sea level heights and have open surface connections to marine waters.'