In [None]:
# Import python packages
import json
import re

import pandas as pd
import streamlit as st

# We can also use Snowpark for our analyses!
from snowflake.snowpark.context import get_active_session
# Handle returned by get_active_session(); every later cell issues its
# Cortex calls through session.sql(...).
session = get_active_session()


# SQL Query Optimization and Semantic YAML Enhancement

This tool optimizes SQL queries and suggests improvements to semantic YAML files.

**Requirements:**
- `verified_queries.csv` with columns: `user_question`, `verified_query`
- `semantic.yaml` with your current semantic model
- Snowflake account with Cortex AI access (the notebook calls `SNOWFLAKE.CORTEX.COMPLETE` with the `claude-3-5-sonnet` model)

In [None]:
# Load data
# Columns every downstream cell reads from the CSV; checked up front so a
# malformed file fails here instead of part-way through the LLM loop.
REQUIRED_COLUMNS = ('user_question', 'verified_query')

df_csv = pd.read_csv('verified_queries.csv')
missing_columns = [col for col in REQUIRED_COLUMNS if col not in df_csv.columns]
if missing_columns:
    raise ValueError(
        f"verified_queries.csv is missing required column(s): {', '.join(missing_columns)}"
    )

# Explicit encoding so the YAML is decoded the same way on every platform.
with open('semantic.yaml', 'r', encoding='utf-8') as f:
    yaml_content = f.read()

print(f"Loaded {len(df_csv)} queries and semantic YAML ({len(yaml_content)} chars)")


# Phase 1: SQL Query Optimization

Optimize SQL queries for performance and extract semantic metrics.

In [None]:
# Define optimization prompt
# Prefix of the prompt sent to the model; optimize_query() appends the raw SQL
# text directly after the trailing "Query: ".
# The doubled backslashes inside the JSON example are deliberate: in this
# non-raw string they produce a literal backslash followed by `n` / `"`, so the
# model sees a JSON string with escaped newlines and quotes rather than real
# line breaks.
OPTIMIZATION_PROMPT = """Optimize this SQL query. Remove unnecessary subqueries/CTEs if possible. Identify tables used and suggest semantic metrics.

Respond with valid JSON only:
{
  "optimized_query": "optimized SQL here",
  "tables_used": ["table1", "table2"],
  "optimization_notes": "explanation of changes",
  "suggested_metrics": [
    {
      "name": "metric_name",
      "description": "what this measures",
      "yaml_syntax": "- name: metric_name\\n  description: \\"desc\\"\\n  sql: \\"expression\\"\\n  data_type: NUMBER"
    }
  ]
}

Query: """

def optimize_query(query_text, model='claude-3-5-sonnet'):
    """Ask Cortex COMPLETE to optimize one SQL query.

    The prompt is OPTIMIZATION_PROMPT followed by `query_text`. Both the model
    name and the prompt are sent as bind parameters rather than being spliced
    into the statement, so SQL that itself contains `$$` (or any other quoting
    characters) cannot terminate the literal early or alter the statement.

    Args:
        query_text: SQL text of the query to optimize.
        model: Cortex model name passed as the first COMPLETE argument.

    Returns:
        The value of the RESPONSE column of the first returned row (the raw
        model output; callers are responsible for parsing it).
    """
    prompt = OPTIMIZATION_PROMPT + query_text
    rows = session.sql(
        "SELECT SNOWFLAKE.CORTEX.COMPLETE(?, ?) AS response",
        params=[model, prompt],
    ).collect()
    return rows[0]['RESPONSE']

print("Query optimization function ready")


In [None]:
# Process queries
def _parse_model_json(response):
    """Return the JSON object embedded in a model response.

    The model is asked for "JSON only" but may still wrap it in prose or a
    code fence, so the span from the first `{` to the last `}` is parsed.
    Raises ValueError (json.JSONDecodeError is a subclass) when no parseable
    object is present.
    """
    text = response.strip()
    start = text.find('{')
    end = text.rfind('}')
    if start == -1 or end <= start:
        raise ValueError("No JSON object found in response")
    return json.loads(text[start:end + 1])


results = []
for _, row in df_csv.iterrows():
    print(f"Processing: {row['user_question'][:50]}...")

    response = optimize_query(row['verified_query'])

    record = {
        'user_question': row['user_question'],
        'verified_query': row['verified_query'],
    }
    try:
        data = _parse_model_json(response)
    except ValueError as exc:
        # Only parse failures are tolerated here; anything else (e.g. a
        # NameError from a missing import) must surface instead of silently
        # turning every row into "Parse failed".
        print(f"  Could not parse model response: {exc}")
        record.update({
            'optimized_query': 'Parse failed',
            'tables_used': [],
            'optimization_notes': response[:200],
            'suggested_metrics': [],
        })
    else:
        # `or` also covers keys the model returned as JSON null, which
        # dict.get's default would not replace.
        record.update({
            'optimized_query': data.get('optimized_query') or '',
            'tables_used': data.get('tables_used') or [],
            'optimization_notes': data.get('optimization_notes') or '',
            'suggested_metrics': data.get('suggested_metrics') or [],
        })
    results.append(record)

df_results = pd.DataFrame(results)
print(f"Processed {len(results)} queries")


In [None]:
# Display results table
# Build a narrow preview frame: the question and notes columns are cut to a
# fixed width and suffixed with "..." so the plain-text table stays readable.
display_df = df_results[['user_question', 'tables_used', 'optimization_notes']].assign(
    user_question=lambda frame: frame['user_question'].str[:60] + '...',
    optimization_notes=lambda frame: frame['optimization_notes'].str[:80] + '...',
)

print("OPTIMIZATION RESULTS")
print("=" * 100)
print(display_df.to_string(index=False))
print("=" * 100)

# Phase 2: Semantic YAML Analysis

Analyze semantic YAML against queries to extract business context and suggest improvements.

In [None]:
# YAML Analysis
# Prompt template for analyze_yaml(). The three placeholders {query},
# {question} and {yaml} near the end are meant to be filled in per call.
# NOTE(review): the JSON example below uses single, unescaped braces, so
# passing this template to str.format() treats the whole example as
# replacement fields and raises; the placeholders have to be substituted by
# another mechanism (or the example braces doubled).
YAML_PROMPT = """Analyze this SQL query and user question against the semantic YAML. Extract business context and suggest improvements.

Respond with valid JSON only:
{
  "business_context": {
    "domain": "business domain",
    "key_concepts": ["term1", "term2"],
    "implicit_definitions": ["concept: definition"]
  },
  "custom_instructions": [
    {
      "category": "BUSINESS_DEFINITIONS",
      "instruction": "Define what X means",
      "rationale": "Why this helps"
    }
  ],
  "improvements": [
    {
      "type": "NEW_METRIC",
      "yaml_code": "- name: metric\\n  sql: expression",
      "location": "Add to metrics section",
      "reason": "Why needed"
    }
  ]
}

SQL: {query}
Question: {question}
YAML: {yaml}
"""

def analyze_yaml(query, question, yaml_content, model='claude-3-5-sonnet'):
    """Ask Cortex COMPLETE to review a query/question pair against the semantic YAML.

    YAML_PROMPT contains a literal JSON example with unescaped braces, so it
    cannot go through str.format(). Instead only the three known placeholders
    ({query}, {question}, {yaml}) are replaced, in a single pass, which also
    means placeholder-looking text inside the supplied values is left alone.
    The model name and prompt are sent as bind parameters so `$$` or quotes in
    the SQL/YAML cannot break the statement.

    Args:
        query: SQL text to analyze.
        question: Natural-language question the query answers.
        yaml_content: Full text of the semantic YAML.
        model: Cortex model name passed as the first COMPLETE argument.

    Returns:
        The value of the RESPONSE column of the first returned row (raw model
        output, not parsed).
    """
    values = {'query': query, 'question': question, 'yaml': yaml_content}
    # A callable replacement avoids re's backslash processing of the values.
    prompt = re.sub(
        r"\{(query|question|yaml)\}",
        lambda match: str(values[match.group(1)]),
        YAML_PROMPT,
    )
    rows = session.sql(
        "SELECT SNOWFLAKE.CORTEX.COMPLETE(?, ?) AS response",
        params=[model, prompt],
    ).collect()
    return rows[0]['RESPONSE']

print("YAML analysis function ready")


In [None]:
# Simple Table-Based YAML Analysis

def show_table_queries(table_name):
    """Print the questions in df_results whose `tables_used` mentions `table_name`.

    Matching is a case-insensitive substring test against each table entry;
    a comma-separated string in `tables_used` is split into entries first.
    Each hit is printed with its zero-based position among the matches,
    followed by a usage hint. Returns None.
    """
    def _mentions_table(raw_tables):
        entries = raw_tables
        if isinstance(entries, str):
            entries = [piece.strip() for piece in entries.split(',')]
        for entry in entries:
            if table_name.lower() in str(entry).lower():
                return True
        return False

    matching_questions = [
        record['user_question']
        for _, record in df_results.iterrows()
        if _mentions_table(record['tables_used'])
    ]

    if not matching_questions:
        print(f"No queries found for table: {table_name}")
        return

    print(f"Queries using table '{table_name}':")
    for position, question in enumerate(matching_questions):
        print(f"{position}. {question}")
    print(f"\nUse: analyze_table_query('{table_name}', query_index, 'yaml_file.yaml')")

def _queries_for_table(table_name):
    """Return the df_results rows whose `tables_used` mentions `table_name` (case-insensitive substring)."""
    needle = table_name.lower()
    matches = []
    for _, row in df_results.iterrows():
        tables = row['tables_used']
        if isinstance(tables, str):
            tables = [t.strip() for t in tables.split(',')]
        elif not isinstance(tables, (list, tuple, set)):
            # None / NaN (e.g. the model returned null): nothing to match.
            tables = []
        if any(needle in str(table).lower() for table in tables):
            matches.append(row)
    return matches


def _extract_json_object(response):
    """Parse the span from the first `{` to the last `}` of `response` as a JSON object."""
    text = response.strip()
    start = text.find('{')
    end = text.rfind('}') + 1
    if start < 0 or end <= start:
        raise ValueError("No JSON found")
    data = json.loads(text[start:end])
    if not isinstance(data, dict):
        raise ValueError("JSON response is not an object")
    return data


def analyze_table_query(table_name, query_index, yaml_file, model='claude-3-5-sonnet'):
    """Analyze one query that uses `table_name` against a semantic YAML file.

    Selects the `query_index`-th row (zero-based, in df_results order) among
    the rows whose `tables_used` mentions `table_name`, sends its optimized
    query, question and the YAML text to Cortex COMPLETE, and prints the
    suggested YAML improvements. Everything is reported via print(); the
    function returns None.

    Args:
        table_name: Table to filter on (case-insensitive substring match).
        query_index: Zero-based position within the matching queries, as
            listed by show_table_queries(). Negative or out-of-range values
            are rejected with a message instead of wrapping around.
        yaml_file: Path of the semantic YAML file to read.
        model: Cortex model name passed as the first COMPLETE argument.
    """
    table_queries = _queries_for_table(table_name)

    if not table_queries:
        print(f"No queries found for table: {table_name}")
        return
    if not 0 <= query_index < len(table_queries):
        print(f"Invalid index. Table '{table_name}' has {len(table_queries)} queries (0-{len(table_queries)-1})")
        return

    row = table_queries[query_index]

    # Load YAML
    with open(yaml_file, 'r', encoding='utf-8') as f:
        yaml_content = f.read()

    print(f"Table: {table_name}")
    print(f"Query: {row['user_question']}")
    print(f"YAML: {yaml_file}")
    print("-" * 50)

    # Run analysis with structured output
    prompt = f"""Analyze this SQL query and user question against the semantic YAML. Extract business context and suggest improvements.

Respond with valid JSON only:
{{
  "business_context": {{
    "domain": "business domain",
    "key_concepts": ["term1", "term2"]
  }},
  "improvements": [
    {{
      "type": "NEW_METRIC",
      "reason": "Why needed",
      "location": "Add to metrics section",
      "yaml_code": "- name: metric\\n  sql: expression"
    }}
  ]
}}

SQL: {row['optimized_query']}
Question: {row['user_question']}
YAML: {yaml_content}
"""

    # Bind parameters instead of $$-quoting: the YAML or SQL may itself
    # contain `$$`, which would end the literal early.
    result = session.sql(
        "SELECT SNOWFLAKE.CORTEX.COMPLETE(?, ?) AS response",
        params=[model, prompt],
    ).collect()
    response = result[0]['RESPONSE']

    # Parse JSON response with error handling
    try:
        data = _extract_json_object(response)
    except ValueError as e:
        print(f"Could not parse JSON: {e}")
        print("Raw response:", response[:500])
        return

    # Show improvements
    improvements = data.get('improvements') or []
    if improvements:
        print("YAML Improvements:")
        for i, imp in enumerate(improvements, 1):
            print(f"{i}. {imp.get('type', '')}: {imp.get('reason', '')}")
            if imp.get('yaml_code'):
                print(f"   Code: {imp.get('yaml_code', '')}")
            print()
    else:
        print("No improvements suggested")

# Show available tables
# Collect the distinct table names across all results. List entries are taken
# as-is; any other value is stringified and split on commas.
tables = set()
for entry in df_results['tables_used']:
    names = entry if isinstance(entry, list) else [part.strip() for part in str(entry).split(',')]
    tables.update(names)

print("Available tables:", ', '.join(sorted(tables)))
print("Usage: show_table_queries('table_name')")
print("Then: analyze_table_query('table_name', query_index, 'yaml_file.yaml')")


In [None]:
show_table_queries("customers")

In [None]:
analyze_table_query("customers", 0, "semantic.yaml")