In [None]:
# Load (or reload) the IPython extension named `dotenv`, which provides the
# `%dotenv` line magic used below.
%reload_ext dotenv

# Read environment variables from the given .env file into this kernel's
# environment. The path is relative to the notebook's working directory.
# NOTE(review): presumably this file supplies the SNOWFLAKE_* variables read
# from os.environ when the connection is created — confirm.
%dotenv ../../env/.env


In [None]:
import snowflake.connector
import os

# Open the Snowflake session shared by the rest of the notebook. Each connection
# parameter is read from the environment variable SNOWFLAKE_<PARAM>; a missing
# variable raises KeyError before any connection attempt is made.
conn = snowflake.connector.connect(
    **{
        param: os.environ[f"SNOWFLAKE_{param.upper()}"]
        for param in (
            "user",
            "role",
            "password",
            "account",
            "warehouse",
            "database",
            "schema",
        )
    },
    client_session_keep_alive=True,
)

# Display the schema of the freshly opened connection as the cell output.
conn.schema

In [None]:
# Helpers for running SQL on the Snowflake connection and loading results into pandas.
import pandas as pd
def run_sql(sql: str, ctx=conn):
    """Execute a single SQL statement on ``ctx`` and return the result of ``execute``.

    Args:
        sql: The SQL text to execute.
        ctx: Connection to run the statement on. Defaults to the module-level
            ``conn`` as it was bound when this cell was executed; re-run this
            cell after reconnecting to pick up a new default.

    Returns:
        Whatever ``ctx.cursor().execute(sql)`` returns (for the Snowflake
        connector this is expected to be the cursor itself — confirm if you
        rely on it).
    """
    # Use the connection that was passed in rather than the global ``conn``,
    # so callers that supply their own ``ctx`` actually run on it.
    return ctx.cursor().execute(sql)

import functools
import time

@functools.cache
def _sql_to_df_cached(sql_query, pre_hook, ctx):
    """Memoized worker for ``sql_to_df``: run ``pre_hook`` then ``sql_query`` on ``ctx``.

    ``pre_hook`` must already be a tuple so that every argument is hashable.
    """
    # Printed as a list so the log line looks the same as before the tuple normalization.
    print(f"RUNNING pre-hook: {list(pre_hook)}")
    for statement in pre_hook:
        # Run hooks on the same connection as the main query so that any
        # session-level setup they perform applies to it.
        hook_cur = ctx.cursor()
        try:
            hook_cur.execute(statement)
        finally:
            hook_cur.close()

    # todo: move to latest method of pandas dataframe fetching
    # may need to upgrade python: https://github.com/snowflakedb/snowflake-connector-python/issues/986#issuecomment-1115354587

    normalized = sql_query.strip().lower()
    if normalized.startswith(('select', 'with')):
        print("using arrow to fetch results...")
        cur = ctx.cursor()
        try:
            cur.execute(sql_query)
            data = cur.fetch_pandas_all()
        finally:
            # Release the cursor even when execute/fetch raises.
            cur.close()
    else:
        # Anything that is not a plain SELECT/WITH goes through pandas.
        data = pd.read_sql(
            sql_query,
            ctx,
        )

    data.columns = data.columns.str.lower()
    return data


def sql_to_df(sql_query, pre_hook=(), ctx=conn):
    """Run ``sql_query`` on ``ctx`` and return the result as a DataFrame.

    Results are memoized per ``(sql_query, pre_hook, ctx)``: repeating a call
    with the same arguments returns the previously fetched DataFrame without
    hitting the database again (and without re-running the pre-hooks). Call
    ``sql_to_df.cache_clear()`` to force fresh results.

    Gotcha: the cached DataFrame object itself is returned, so mutating it in
    place changes what later identical calls receive. Copy it first if you
    need to modify it.

    Args:
        sql_query: SQL text. Statements starting with ``select`` or ``with``
            (case-insensitive, ignoring surrounding whitespace) are fetched
            with the cursor's ``fetch_pandas_all``; everything else goes
            through ``pd.read_sql``.
        pre_hook: SQL statement(s) to execute, in order, before the main
            query. May be a list, tuple, other iterable of strings, a single
            string, or ``None``.
        ctx: Connection to use. Defaults to the module-level ``conn`` as it
            was bound when this cell was executed.

    Returns:
        A pandas DataFrame whose column names are lower-cased.
    """
    if pre_hook is None:
        hooks = ()
    elif isinstance(pre_hook, str):
        # A bare string would otherwise be iterated character by character.
        hooks = (pre_hook,)
    else:
        # Lists are unhashable and would make the memoized call raise TypeError.
        hooks = tuple(pre_hook)
    return _sql_to_df_cached(sql_query, hooks, ctx)


# Keep the cache-management helpers available under the public name.
sql_to_df.cache_clear = _sql_to_df_cached.cache_clear
sql_to_df.cache_info = _sql_to_df_cached.cache_info

In [None]:
# Per-signature comparison of runs that read from cache vs. runs that did not.
#
# For each query_sanitized_signature in query_history_enriched (restricted to
# rows with a non-null warehouse_size and data_scanned_gb > 0) the query computes:
#   - num_cache_hits: rows with data_scanned_from_cache_gb > 0
#   - cnt: total rows for the signature
#   - average / median query_cost and average execution_time_s, split into
#     "cache hit" (data_scanned_from_cache_gb > 0) and
#     "cache miss" (data_scanned_from_cache_gb = 0)
#   - absolute and percentage differences (hit minus miss, relative to miss)
#
# The HAVING clause keeps signatures with more than 4 cache hits, more than one
# non-hit run, and an average cache-miss cost above 0.5.
# NOTE(review): (cnt - num_cache_hits) also counts rows where
# data_scanned_from_cache_gb is NULL, while the *_cache_miss aggregates only
# use rows where it equals 0 — confirm NULLs cannot occur here.
# NOTE(review): the 100* factor assumes percentage_scanned_from_cache is stored
# as a fraction — confirm against the table definition.
sql = """
select
 query_sanitized_signature,
 -- date_trunc(hour, execution_start_time) as start_hour,
 count_if(data_scanned_from_cache_gb > 0) num_cache_hits,
 count(*) as cnt,
 100*avg(case when data_scanned_from_cache_gb > 0 then percentage_scanned_from_cache else null end) as avg_pct_scanned_from_cache,
 avg(case when data_scanned_from_cache_gb > 0 then query_cost else null end) as avg_query_cost_cache_hit,
 avg(case when data_scanned_from_cache_gb = 0 then query_cost else null end) as avg_query_cost_cache_miss,
 avg(case when data_scanned_from_cache_gb > 0 then execution_time_s else null end) as avg_execution_time_s_cache_hit,
 avg(case when data_scanned_from_cache_gb = 0 then execution_time_s else null end) as avg_execution_time_s_cache_miss,
 median(case when data_scanned_from_cache_gb > 0 then query_cost else null end) as median_query_cost_cache_hit,
 median(case when data_scanned_from_cache_gb = 0 then query_cost else null end) as median_query_cost_cache_miss,
 avg_query_cost_cache_hit - avg_query_cost_cache_miss as avg_query_cost_diff,
 median_query_cost_cache_hit - median_query_cost_cache_miss as median_query_cost_diff,
 
 100*div0null(avg_query_cost_cache_hit - avg_query_cost_cache_miss,avg_query_cost_cache_miss) as pct_diff_avg,
 100*div0null(median_query_cost_cache_hit - median_query_cost_cache_miss,median_query_cost_cache_miss) as pct_diff_median
from query_history_enriched
where warehouse_size is not null and data_scanned_gb > 0
group by 1
having num_cache_hits > 4 and (cnt - num_cache_hits) > 1 and avg_query_cost_cache_miss > 0.5
"""
# Run the query through the notebook's sql_to_df helper and display the
# resulting DataFrame as the cell output.
df = sql_to_df(sql)
df

In [None]:
import plotly.express as px
# One 100-bin histogram per percentage-difference column, rendered in order:
# first the mean-based difference, then the median-based one.
for pct_diff_column in ('pct_diff_avg', 'pct_diff_median'):
    fig = px.histogram(df, x=pct_diff_column, nbins=100)
    fig.show()
