In [None]:
# Enable IPython's autoreload extension in mode 2 so edited modules are re-imported before each cell runs.
%load_ext autoreload
%autoreload 2
import sys, pathlib
# Append the directory two levels above the current working directory to sys.path,
# presumably so the `optiml` package imported in the next cell resolves — TODO confirm repo layout.
sys.path.append(str(pathlib.Path.cwd().parent.parent))

In [None]:
%reload_ext dotenv

%dotenv ../../env/.env

import warnings
from pandas import Timedelta
# from optiml.utils import sf
import time
from optiml.utils.sf import logger, sql_to_df, run_sql, conn, session
import pandas as pd
warnings.filterwarnings('ignore')

try:
    %load_ext autotime
except:
    !pip install ipython-autotime
    %load_ext autotime

import plotly
plotly.offline.init_notebook_mode()

In [None]:
# To rerun this analysis:

# 1. in the *staging* app, rerun dbt in the knot account in order to pull the latest staging data [<=10min]
# 2. run `grant select on all tables in schema optiml_share.optiml to database role optiml_share_role` in the query admin console
# 3. run this notebook, connecting to KNOT_SHARE.OPTIML

# For the counterfactual analysis, additional steps are needed:

# 1. use the knot-dba notebook to copy all *tables* from the share into a target schema [in our account]
# 2. run dbt: `dbt seed && dbt run --exclude staging daily_rates`
# 3. run the cluster autosuspend simulation (smart-suspend-simulate) on that schema
# 4. rerun dbt: `dbt run -s warehouse_era_simulated+`
# 5. run this notebook connected to the target schema

# Open question: rerun dbt selecting just the views (`dbt run -s config.materialized:view`)?

In [None]:
# Set the SQL session variable that every query below reads as `$lookback_days`.
run_sql("set lookback_days=30")

# Warehouse under analysis; interpolated into each f-string query below.
wh_name = 'XOGRP_DEV_WH'
# wh_name = 'SEGMENT_LOAD_WH'
# Hourly profile rows for the warehouse, newest first, with two derived columns:
# query_minutes = (active_hours - idle_hours) * 60 and idle_minutes = idle_hours * 60.
wh_profile = f"""
select 
    *,
    (active_hours - idle_hours)*60 as query_minutes,
    idle_hours * 60 as idle_minutes
from warehouse_profile_by_hour
where warehouse_name = '{wh_name}' 
and hour_start > dateadd('days',-$lookback_days, current_timestamp())
order by hour_start desc;
"""

import itables


# wpdf: hourly warehouse profile frame used by the cost and idle-time sections below.
wpdf = sql_to_df(wh_profile)
itables.show(wpdf)


# Compare actual hourly usage with two simulated suspend strategies read from
# warehouse_profile_by_hour_sim:
#   * sim_snowflake_suspend: strategy with 'autosuspend_sec': 60, 'engine': 'sql', 'type': 'snowflake'
#   * sim_aero_managed:      strategy with 'engine': 'python', 'polling_sec': 1, 'type': 'greedy_after_one_min'
# The doubled braces in the strategy filters are f-string escapes; they render as single braces in the SQL.
# An hour is flagged `aero_presumed_on` when its actual idle minutes are closer to the Aero simulation
# than to the Snowflake one. Savings (simulated Snowflake idle minus actual idle) are accumulated over
# flagged hours only, converted to credits through warehouse_credits.credits_per_hour, and annualized
# as cumulative * (365 * 24) / hours_on. The "pessimistic" series ignores the cluster count; the
# "upperbound" series multiplies by max_cluster_number.
wh_profile_sim = f"""

with sim_snowflake_suspend as (
select 
    *,
    (active_hours - idle_hours)*60 as query_minutes,
    idle_hours * 60 as idle_minutes
from warehouse_profile_by_hour_sim
where warehouse_name = '{wh_name}' 
and hour_start > dateadd('days',-$lookback_days, current_timestamp())
and strategy = {{'autosuspend_sec': 60,   'engine': 'sql',   'type': 'snowflake' }}
),
sim_aero_managed as (
select 
    *,
    (active_hours - idle_hours)*60 as query_minutes,
    idle_hours * 60 as idle_minutes
from warehouse_profile_by_hour_sim
where warehouse_name = '{wh_name}' 
and hour_start > dateadd('days',-$lookback_days, current_timestamp())
and strategy = {{'engine': 'python',   'polling_sec': 1,   'type': 'greedy_after_one_min' }}
),

actual as (
    select 
        *,
        (active_hours - idle_hours)*60 as query_minutes,
        idle_hours * 60 as idle_minutes
    from warehouse_profile_by_hour
    where warehouse_name = '{wh_name}' 
    and hour_start > dateadd('days',-$lookback_days, current_timestamp())
)
select
    a.*,
    a.query_minutes as query_minutes_actual,
    a.idle_minutes as idle_minutes_actual,
    s.query_minutes as query_minutes_sim_snowflake,
    s.idle_minutes as idle_minutes_sim_snowflake,
    m.query_minutes as query_minutes_sim_aero,
    m.idle_minutes as idle_minutes_sim_aero,
    
   (abs(idle_minutes_sim_aero - idle_minutes_actual) < abs(idle_minutes_sim_snowflake - idle_minutes_actual)) as aero_presumed_on,
    sum( case when aero_presumed_on then  (idle_minutes_sim_snowflake - idle_minutes_actual ) else 0 end) over (order by a.hour_start asc) as cum_idle_minutes_saved,
   case when aero_presumed_on then  (idle_minutes_sim_snowflake - idle_minutes_actual ) else 0 end as idle_minutes_saved,
   idle_minutes_saved*a.max_cluster_number as cluster_idle_minutes_saved,
   sum( case when aero_presumed_on then  a.max_cluster_number * (idle_minutes_sim_snowflake - idle_minutes_actual ) else 0  end) over (order by a.hour_start asc) as cum_cluster_idle_minutes_saved,
   sum(aero_presumed_on::int) over (order by a.hour_start asc) as hours_on,
   (cum_idle_minutes_saved / 60) * wc.credits_per_hour as cum_credits_saved_pessimistic,
   (cum_cluster_idle_minutes_saved / 60) * wc.credits_per_hour as cum_credits_saved_upperbound,
   div0(cum_credits_saved_pessimistic * (365 * 24), hours_on) as annualized_credits_saved_projected_pessimistic,
   div0(cum_credits_saved_upperbound * (365 * 24), hours_on) as annualized_credits_saved_projected_upperbound
        
    
from actual a
left join sim_snowflake_suspend s
on a.hour_start = s.hour_start
left join sim_aero_managed m
on a.hour_start = m.hour_start
left join warehouse_credits wc
on a.warehouse_size = wc.size
order by hour_start desc;

"""
# wpsimdf: one row per actual hour; simulation columns are NULL where no simulated row joined.
wpsimdf = sql_to_df(wh_profile_sim)


# Completed SUSPEND_WAREHOUSE events for the warehouse inside the lookback window, newest first.
# `source` is built as "<event_reason>:<role_name>" (the literal 'null' when role_name is NULL);
# get_aero_suspend_on_epochs later keys on these labels to tell who suspended the warehouse.
wh_events = f"""
select 
    current_timestamp(), 
    *,
    event_reason || ':' || nvl(role_name, 'null') as source
from stg_warehouse_events_history 
where event_name = 'SUSPEND_WAREHOUSE'
and event_state = 'COMPLETED'
and warehouse_name='{wh_name}' 
and timestamp > dateadd('days',-$lookback_days, current_timestamp())
order by timestamp desc;
"""


# wedf: suspend-event frame (reads `timestamp` and `source` downstream).
wedf = sql_to_df(wh_events)
itables.show(wedf)


# Operational overhead: 'show warehouses' / 'alter warehouse' statements issued by users whose
# name contains 'aero' or 'optiml', with running totals of cloud-services credits and query cost.
#  * div0() replaces the plain division: Snowflake raises a division-by-zero error when
#    credits_used_cloud_services is 0, and div0 returns 0 for those rows instead.
#  * The explicit ORDER BY makes the row order match the ordering of the cumulative window sums,
#    which the cumulative-cost area plot later relies on.
# Plain (non f-) string: `$lookback_days` is resolved by the SQL session, not by Python.
op_queries = """
select
    start_time,
    query_text,
    credits_used_cloud_services,
    cloud_services_cost,
    div0(cloud_services_cost, credits_used_cloud_services) as dollars_per_credit,
    sum(credits_used_cloud_services) over (order by start_time asc) cum_credits_used_cloud_services,
    sum(query_cost) over (order by start_time asc) cum_query_cost
from query_history_enriched
where (contains(lower(user_name), 'aero') or contains(lower(user_name), 'optiml'))
and (contains(lower(query_text), 'show warehouses') or contains(lower(query_text), 'alter warehouse'))
and start_time > dateadd('days',-$lookback_days, current_timestamp())
order by start_time asc;
"""


# opdf: operational-query frame (plotted as cumulative cost further down).
opdf = sql_to_df(op_queries)
itables.show(opdf)

# Constants that used to be inlined in the SQL below.
# NOTE(review): 60 looks like the baseline auto-suspend delay in seconds that the observed lag is
# compared against, and 3 looks like a dollars-per-credit price — confirm both. The Python cost
# helper in this notebook uses a different rate (2.2); the two are not reconciled here.
BASELINE_AUTOSUSPEND_SEC = 60
DOLLARS_PER_CREDIT = 3

# Suspension-lag statistics per warehouse-hour.
#  1. eras:    query eras and warehouse eras stacked into one timeline. The warehouse branch has no
#              size/cluster information, so it supplies NULLs — listed in the same column order as
#              the query branch, because UNION matches columns by position, not by alias.
#  2. enriched: for each era, the type/end/size/clusters of the preceding era on the same warehouse.
#              suspend_lag = seconds from the end of a query era to the end of the following
#              warehouse era; time_since_last_query = gap between consecutive query eras.
#  3. suspends: one row per era with a non-null suspend_lag.
#  4. suspension_hour_stats: hourly aggregates for the selected warehouse and lookback window.
#  5. suspension_savings: idle seconds below the baseline converted to credits and dollars, plus a
#              running dollar total.
suspension_stats = f"""
with eras as (
    select
        'query' as type,
        warehouse_id,
        warehouse_name,
        warehouse_sizes,
        max_cluster_number,
        era_start,
        era_end
    from query_era
    where era_end <= (select max(era_end) from warehouse_era)

    union

    select
        'warehouse' as type,
        warehouse_id,
        warehouse_name,
        null as warehouse_sizes,
        null as max_cluster_number,
        era_start,
        era_end
    from warehouse_era
    where era_start > (select min(era_start) from query_era)
    and era_end <= (select max(era_end) from query_era)
),
enriched as (
    select
        row_number() over(order by warehouse_id, era_end) as era_id,
        *,
        lag(type) over (partition by warehouse_id order by era_end) as previous_ending_type,
        lag(era_end) over (partition by warehouse_id order by era_end) as previous_ending_time,
        lag(max_cluster_number) over (partition by warehouse_id order by era_end) as previous_max_cluster_number,
        lag(warehouse_sizes) over (partition by warehouse_id order by era_end) as previous_wh_sizes,
        case when type = 'warehouse' and previous_ending_type = 'query' then timediff(milliseconds, previous_ending_time, era_end)/1000 else null end as suspend_lag,
        case when type = 'query' and previous_ending_type = 'query' then timediff(milliseconds, previous_ending_time, era_start)/1000 else null end as time_since_last_query,
        timediff(seconds, era_start, era_end) as era_seconds
    from eras
),
suspends as (
    select
        warehouse_id,
        warehouse_name,
        previous_wh_sizes as warehouse_sizes,
        previous_max_cluster_number,
        era_end as suspend_time,
        suspend_lag
    from enriched
    where suspend_lag is not null
),
suspension_hour_stats as (
    select
        warehouse_id,
        warehouse_name,
        date_trunc('hour', suspend_time) as hour,
        array_union_agg(warehouse_sizes) as sizes,
        sizes[0]::text as size,
        max(previous_max_cluster_number) as clusters,
        count(*) as num_suspensions,
        sum(previous_max_cluster_number*({BASELINE_AUTOSUSPEND_SEC} - suspend_lag)) as max_saved_idle_seconds,
        avg(suspend_lag) suspend_lag_avg,
        median(suspend_lag) suspend_lag_median,
        min(suspend_lag) suspend_lag_min,
        percentile_cont(.99) within group(order by suspend_lag) as "99_pctile",
        max(suspend_lag) suspend_lag_max
    from suspends s
    where warehouse_name = '{wh_name}'
    and suspend_time > dateadd('days',-$lookback_days, current_timestamp())
    group by 1,2,3
),
suspension_savings as (
    select
        s.*,
        max_saved_idle_seconds * wc.credits_per_hour / 3600 as max_credit_diff,
        max_credit_diff * {DOLLARS_PER_CREDIT} as max_dollar_diff,
        sum(max_dollar_diff) over(order by hour asc) as cum_savings
    from
        suspension_hour_stats s
    left join warehouse_credits wc
    on s.size = wc.size
)
select * from suspension_savings
order by hour desc;
"""

# sdf: hourly suspension statistics, newest hour first.
sdf = sql_to_df(suspension_stats)
itables.show(sdf)

# Raw warehouse load-history rows for the warehouse inside the lookback window, newest first.
whload = f"""
select 
    *
from stg_warehouse_load_history 
where warehouse_name = '{wh_name}' 
    and start_time > dateadd('days',-$lookback_days, current_timestamp())
    order by start_time desc;
"""

load_df = sql_to_df(whload)

# Same source averaged per hour. Note that `whload` is rebound here, so the raw query text
# above is no longer reachable through that name after this point.
whload = f"""
select 
    date_trunc(hour, start_time) start_hour,
    avg(avg_running) as avg_running,
    avg(avg_queued_load) as avg_queued_load,
    avg(avg_queued_provisioning) as avg_queued_provisioning,
    avg(avg_blocked) as avg_blocked
from stg_warehouse_load_history 
where warehouse_name = '{wh_name}' 
    and start_time > dateadd('days',-$lookback_days, current_timestamp())
    group by 1
    order by start_hour desc;
"""

load_hour_df = sql_to_df(whload)

# Hourly distribution of query execution time for the warehouse inside the lookback window.
# execution_time is divided by 1000 (presumably milliseconds -> seconds; the latency plot further
# down labels these values as seconds — TODO confirm the source unit).
# `having num > 1000` keeps only hours with more than 1000 queries, so quieter hours are absent
# from exec_times_df.
exec_times = f"""

with times as (
    select
        start_time,
        execution_time/1000 as execution_time
    from 
        stg_query_history
    where start_time > dateadd('days',-$lookback_days, current_timestamp())
    and warehouse_name = '{wh_name}'
)
select 
    date_trunc(hour, start_time) as start_hour,
    count(*) as num,
    avg(execution_time) avg,
    median(execution_time) median,
    min(execution_time) min,
    max(execution_time) max,
    percentile_cont(.90) within group(order by execution_time) as "90_pctile",
    percentile_cont(.75) within group(order by execution_time) as "75_pctile",
    percentile_cont(.25) within group(order by execution_time) as "25_pctile",
    percentile_cont(.10) within group(order by execution_time) as "10_pctile"
from
   times
group by 1
having num > 1000
order by 1 desc;
"""
exec_times_df = sql_to_df(exec_times)
itables.show(exec_times_df)


In [None]:
# Preview the first rows of the hourly warehouse profile.
wpdf.head()

In [None]:
## Suspension start
# Distinct "<event_reason>:<role_name>" labels present among the suspend events.
wedf["source"].unique()


In [None]:
# Preview the first rows of the suspend-event frame.
wedf.head()

In [None]:
def get_aero_suspend_on_epochs(df, timecolumn="hour_start", events=None):
    """Split hourly rows into hours where Optiml was doing the suspending and hours where it was not.

    Suspend events are counted per hour and per ``source``. An hour counts as "on" when it has
    strictly more ``WAREHOUSE_SUSPEND:OPTIML_SVC`` events than ``WAREHOUSE_AUTOSUSPEND:null``
    events. Hours of ``df`` with no suspend event at all are dropped (inner merge).

    Args:
        df: frame with one row per hour.
        timecolumn: name of the hour column in ``df``; it is matched against the suspend-event
            timestamps floored to the hour.
        events: suspend-event frame with a datetime ``timestamp`` column and a ``source`` column.
            Defaults to the notebook-level ``wedf`` so existing two-argument calls keep working.

    Returns:
        ``(df_on, df_off)``: the merged rows for "on" hours and for "off" hours. Both carry the
        extra columns from the event counts (``timestamp``, one column per source, ``optiml_on``).
    """
    if events is None:
        events = wedf  # notebook global populated by the wh_events query

    autosuspend_col = 'WAREHOUSE_AUTOSUSPEND:null'
    optiml_col = 'WAREHOUSE_SUSPEND:OPTIML_SVC'

    # Missing hour/source combinations become 0 *before* the comparison. Comparing against NaN is
    # always False, which would label an hour suspended exclusively by Optiml as "off".
    counts = (
        events.groupby([events['timestamp'].dt.floor('h'), 'source'])
        .size()
        .unstack(fill_value=0)
    )
    # A source that never occurs in the window has no column at all; treat it as all zeros.
    for source_col in (autosuspend_col, optiml_col):
        if source_col not in counts.columns:
            counts[source_col] = 0

    optiml_on = counts.reset_index()
    optiml_on["optiml_on"] = optiml_on[autosuspend_col] < optiml_on[optiml_col]
    optiml_on = optiml_on.sort_values('timestamp', ascending=False)

    merged = df.merge(optiml_on, left_on=timecolumn, right_on='timestamp')
    df_on = merged[merged["optiml_on"]]
    df_off = merged[~merged["optiml_on"]]

    return df_on, df_off

In [None]:
%%markdown
# Cost savings estimate

In [None]:
# Split the hourly warehouse profile into Optiml-on and Optiml-off hours.
wpdf_on, wpdf_off = get_aero_suspend_on_epochs(wpdf)

In [None]:
import numpy as np
def get_cost_hourly(df, pessimistic=False, rate_per_hour=2.2):
    """Average cost of one hour of warehouse uptime across the rows of ``df``.

    Uptime per row is ``idle_minutes + query_minutes``. Unless ``pessimistic`` is set, it is
    multiplied by ``max_cluster_number``, i.e. every cluster is assumed to be up the whole time.

    Args:
        df: frame with ``idle_minutes``, ``query_minutes`` and ``max_cluster_number`` columns.
        pessimistic: when True, ignore the cluster count (single-cluster cost).
        rate_per_hour: cost of one warehouse-hour. Defaults to the previously hard-coded 2.2.
            NOTE(review): the unit (dollars vs. credits) is not stated anywhere visible — confirm.

    Returns:
        Mean hourly cost as a float (NaN for an empty frame).
    """
    uptime_minutes = df["idle_minutes"] + df["query_minutes"]
    if not pessimistic:
        uptime_minutes = uptime_minutes * df["max_cluster_number"]
    # minutes * rate / 60 == hours * rate
    return np.mean(uptime_minutes * rate_per_hour) / 60.
    

In [None]:
# Hourly cost for Optiml-off hours: cluster-weighted (max) and single-cluster (min) variants.
cost_off_max = get_cost_hourly(wpdf_off)
cost_off_min = get_cost_hourly(wpdf_off, pessimistic=True)
# Annualized
# Midpoint of the two variants, scaled by 24 * 365 hours.
cost_off = np.mean([cost_off_max, cost_off_min])*24*365

In [None]:
# Hourly cost for Optiml-on hours: cluster-weighted (max) and single-cluster (min) variants.
cost_on_max = get_cost_hourly(wpdf_on)
cost_on_min = get_cost_hourly(wpdf_on, pessimistic=True)
# Annualized
# Midpoint of the two variants, scaled by 24 * 365 hours.
cost_on = np.mean([cost_on_max, cost_on_min])*24*365

In [None]:
# Difference of the annualized off/on costs, also expressed as a percentage of the off cost.
annual_savings = (cost_off - cost_on)
pct_savings = annual_savings/cost_off*100
cost_on, cost_off, annual_savings, pct_savings

In [None]:
%%markdown
# Suspension lag

In [None]:
# Columns available in the hourly suspension statistics.
sdf.columns

In [None]:
# Split the hourly suspension statistics into Optiml-on and Optiml-off hours (hour column: "hour").
sdf_on, sdf_off = get_aero_suspend_on_epochs(sdf, "hour")

In [None]:
# Median across hours of the hourly average suspend lag, for on and off hours.
suspension_lag_on = np.median(sdf_on["suspend_lag_avg"])
suspension_lag_off = np.median(sdf_off["suspend_lag_avg"])
# Positive values mean the lag is shorter in Optiml-on hours.
change_suspension_lag = suspension_lag_off - suspension_lag_on
pct_suspension_lag = change_suspension_lag/suspension_lag_off*100
suspension_lag_off, suspension_lag_on, change_suspension_lag, pct_suspension_lag

In [None]:
%%markdown
# Idling minutes

In [None]:
# Recompute the on/off split of the hourly warehouse profile for the idle-time comparison.
wpdf_on, wpdf_off = get_aero_suspend_on_epochs(wpdf)

In [None]:
# Columns available in the hourly warehouse profile.
wpdf.columns

In [None]:
# Mean of the hourly `pct_idle` column for off and on hours; the change is off minus on,
# so a positive value means less idling in Optiml-on hours.
pct_idle_off = np.mean(wpdf_off["pct_idle"])
pct_idle_on = np.mean(wpdf_on["pct_idle"])
pct_idle_change = pct_idle_off - pct_idle_on
pct_idle_off, pct_idle_on, pct_idle_change

In [None]:
# median_hourly_idle_minutes_on = np.median(wpdf_on["idle_minutes"])
# median_hourly_idle_minutes_off = np.median(wpdf_off["idle_minutes"])
# annualized_idle_minutes_on = median_hourly_idle_minutes_on*24*365
# annualized_idle_minutes_off = median_hourly_idle_minutes_off*24*365
# annual_minutes_saved = annualized_idle_minutes_off - annualized_idle_minutes_on
# annualized_idle_minutes_on, annualized_idle_minutes_off, annual_minutes_saved

In [None]:
%%markdown
# Query latency

In [None]:
# Columns available in the hourly execution-time statistics.
exec_times_df.columns

In [None]:
# Split the hourly execution-time statistics into Optiml-on and Optiml-off hours.
exec_times_df_on, exec_times_df_off = get_aero_suspend_on_epochs(exec_times_df, 'start_hour')

In [None]:
# median_query_latency_off = np.median(exec_times_df_off["90_pctile"])
# median_query_latency_on = np.median(exec_times_df_on["90_pctile"])
# change_median_query_latency = median_query_latency_off-median_query_latency_on
# pct_change_median_query_latency = change_median_query_latency/median_query_latency_off*100
# median_query_latency_on, median_query_latency_off, change_median_query_latency, pct_change_median_query_latency

In [None]:
# Median across hours of the hourly median execution time, for off and on hours.
median_query_latency_off = np.median(exec_times_df_off["median"])
median_query_latency_on = np.median(exec_times_df_on["median"])
# Positive values mean lower latency in Optiml-on hours.
change_median_query_latency = median_query_latency_off-median_query_latency_on
pct_change_median_query_latency = change_median_query_latency/median_query_latency_off*100
median_query_latency_on, median_query_latency_off, change_median_query_latency, pct_change_median_query_latency

In [None]:
%%markdown
# Query load

In [None]:
# Columns available in the hourly load averages.
load_hour_df.columns

In [None]:
# Split the hourly load averages into Optiml-on and Optiml-off hours.
load_hour_df_on, load_hour_df_off = get_aero_suspend_on_epochs(load_hour_df, 'start_hour')

In [None]:
# Mean hourly running load for off and on hours, and the relative change (off minus on) in percent.
avg_running_load_off = np.mean(load_hour_df_off["avg_running"])
avg_running_load_on = np.mean(load_hour_df_on["avg_running"])
pct_change = (avg_running_load_off - avg_running_load_on)/avg_running_load_off*100
avg_running_load_off, avg_running_load_on, pct_change

In [None]:
# Mean hourly queued load for off and on hours, and the relative change (off minus on) in percent.
avg_queued_load_off = np.mean(load_hour_df_off["avg_queued_load"])
avg_queued_load_on = np.mean(load_hour_df_on["avg_queued_load"])
# `pct_change` is rebound here (it held the running-load change before).
# NOTE(review): if the off-hours mean queued load is 0 this division yields inf/NaN — confirm
# whether that case needs handling.
pct_change = (avg_queued_load_off - avg_queued_load_on)/avg_queued_load_off*100
avg_queued_load_off, avg_queued_load_on, pct_change

In [None]:
%%markdown
# Costs with and without Aero on

In [None]:
# Widen the `$lookback_days` session variable to 50 days for every query executed from here on.
run_sql("set lookback_days=50")

In [None]:
# Same hourly-profile query as at the top of the notebook, re-run so that it picks up the
# current value of `$lookback_days`. This rebinds both `wh_profile` and `wpdf`.
wh_profile = f"""
select 
    *,
    (active_hours - idle_hours)*60 as query_minutes,
    idle_hours * 60 as idle_minutes
from warehouse_profile_by_hour
where warehouse_name = '{wh_name}' 
and hour_start > dateadd('days',-$lookback_days, current_timestamp())
order by hour_start desc;
"""

wpdf = sql_to_df(wh_profile)
itables.show(wpdf)


In [None]:
# Columns available in the reloaded hourly warehouse profile.
wpdf.columns

In [None]:
# Order the profile chronologically (the query returns it newest-first). Rebinding instead of
# sorting in place keeps the cell idempotent and leaves the originally loaded frame object untouched.
wpdf = wpdf.sort_values("hour_start")

In [None]:
# Split the reloaded hourly profile into Optiml-on and Optiml-off hours.
wpdf_epochs_on, wpdf_epochs_off = get_aero_suspend_on_epochs(wpdf,"hour_start")

In [None]:
# Preview the first Optiml-on rows.
wpdf_epochs_on.head()

In [None]:
# Earliest hour classified as Optiml-on. Taking the minimum (instead of the first row) gives the
# same value when the frame is sorted ascending, but no longer depends on an earlier cell having
# sorted it.
if wpdf_epochs_on.empty:
    raise ValueError("No Optiml-on hours found in the lookback window; cannot locate the switch-on time.")
optiml_start = wpdf_epochs_on["hour_start"].min()

In [None]:
# Before/after split at the first Optiml-on hour. Note this rebinds `wpdf_on` / `wpdf_off`, which
# earlier held the per-hour on/off classification. `.copy()` makes both halves independent frames,
# so adding columns to them later does not write into a slice of `wpdf` (SettingWithCopyWarning).
wpdf_off = wpdf[wpdf["hour_start"] < optiml_start].copy()
wpdf_on = wpdf[wpdf["hour_start"] >= optiml_start].copy()

In [None]:
# Running total of compute dollars within each period, in the frames' current row order.
wpdf_on["cumulative_dollars_used_compute"] = wpdf_on["dollars_used_compute"].cumsum(axis=0)
wpdf_off["cumulative_dollars_used_compute"] = wpdf_off["dollars_used_compute"].cumsum(axis=0)

In [None]:
# Hours elapsed since the first row of each period, so both periods share an x-axis starting at 0.
wpdf_on["time_delta"] = (wpdf_on["hour_start"] - wpdf_on.iloc[0]["hour_start"])/ pd.Timedelta(hours=1)
wpdf_off["time_delta"] = (wpdf_off["hour_start"] - wpdf_off.iloc[0]["hour_start"])/ pd.Timedelta(hours=1)

In [None]:
import plotly.express as px

# Stack both periods; the outer index level ('Aero off' / 'Aero on') drives the line colour.
df_concat = pd.concat([wpdf_off, wpdf_on], keys=['Aero off', 'Aero on'])

# Cumulative compute dollars against hours since the start of each period. The final title is
# passed directly; the previous placeholder title was overwritten before the figure was shown.
fig = px.line(
    df_concat,
    x='time_delta',
    y='cumulative_dollars_used_compute',
    color=df_concat.index.get_level_values(0),
    title='Observational difference in cost after turning on Aero',
)
fig.update_layout(xaxis_title='Hours', yaxis_title='Dollars')

fig.show()

In [None]:
%%markdown
## ================ Previous analyses ======================

In [None]:
%%markdown
# Idle Time, Suspension Lag, Savings Estimates

In [None]:
import plotly.express as px

# Hourly suspend-lag statistics, followed by the hourly suspension count.
lag_stat_columns = ["suspend_lag_avg", 'suspend_lag_max', 'suspend_lag_min', 'suspend_lag_median']
fig = px.line(sdf, x="hour", y=lag_stat_columns)
fig.show()

fig = px.bar(sdf, x="hour", y='num_suspensions')
fig.show()

# True when every row lacks an Aero-simulation value.
NO_SIMULATION_FOUND = wpsimdf.query_minutes_sim_aero.isnull().all()
if NO_SIMULATION_FOUND:
    print("NO AERO SIMULATION FOUND: you may need to run the warehouse simulation notebook and rerun dbt...")

# One line chart per group of simulation-comparison columns, all against hour_start.
simulation_column_groups = (
    ['query_minutes_actual', 'query_minutes_sim_snowflake','query_minutes_sim_aero'],
    ['idle_minutes_actual', 'idle_minutes_sim_snowflake',  'idle_minutes_sim_aero'],
    ['cum_cluster_idle_minutes_saved'],
    ['cum_credits_saved_pessimistic', 'cum_credits_saved_upperbound'],
    ['annualized_credits_saved_projected_pessimistic', 'annualized_credits_saved_projected_upperbound'],
)
for y_columns in simulation_column_groups:
    fig = px.line(wpsimdf, x="hour_start", y=y_columns)
    fig.show()



In [None]:
%%markdown
# Latency / Performance

In [None]:
# The three queries below repeat the load-history and execution-time queries from the loader cell
# near the top of the notebook. Re-running them here refreshes load_df, load_hour_df and
# exec_times_df with whatever `$lookback_days` is currently set to in the session.

# Raw warehouse load-history rows, newest first.
whload = f"""
select 
    *
from stg_warehouse_load_history 
where warehouse_name = '{wh_name}' 
    and start_time > dateadd('days',-$lookback_days, current_timestamp())
    order by start_time desc;
"""

load_df = sql_to_df(whload)

# Load history averaged per hour (rebinds `whload`).
whload = f"""
select 
    date_trunc(hour, start_time) start_hour,
    avg(avg_running) as avg_running,
    avg(avg_queued_load) as avg_queued_load,
    avg(avg_queued_provisioning) as avg_queued_provisioning,
    avg(avg_blocked) as avg_blocked
from stg_warehouse_load_history 
where warehouse_name = '{wh_name}' 
    and start_time > dateadd('days',-$lookback_days, current_timestamp())
    group by 1
    order by start_hour desc;
"""

load_hour_df = sql_to_df(whload)

# Hourly execution-time distribution; only hours with more than 1000 queries are kept.
exec_times = f"""

with times as (
    select
        start_time,
        execution_time/1000 as execution_time
    from 
        stg_query_history
    where start_time > dateadd('days',-$lookback_days, current_timestamp())
    and warehouse_name = '{wh_name}'
)
select 
    date_trunc(hour, start_time) as start_hour,
    count(*) as num,
    avg(execution_time) avg,
    median(execution_time) median,
    min(execution_time) min,
    max(execution_time) max,
    percentile_cont(.90) within group(order by execution_time) as "90_pctile",
    percentile_cont(.75) within group(order by execution_time) as "75_pctile",
    percentile_cont(.25) within group(order by execution_time) as "25_pctile",
    percentile_cont(.10) within group(order by execution_time) as "10_pctile"
from
   times
group by 1
having num > 1000
order by 1 desc;
"""
exec_times_df = sql_to_df(exec_times)


In [None]:
# Load metrics at the raw load-history granularity.
fig = px.line(load_df, x="start_time", y=['avg_running', 'avg_queued_load', 'avg_queued_provisioning', 'avg_blocked'])
fig.show()

# The same load metrics averaged per hour.
fig = px.line(load_hour_df, x="start_hour", y=['avg_running', 'avg_queued_load', 'avg_queued_provisioning', 'avg_blocked'])
fig.show()

# Hourly execution-time percentiles, then the number of queries per hour.
fig = px.line(exec_times_df, x="start_hour", y=['median', '90_pctile', '75_pctile', '25_pctile', '10_pctile'], title='query latency (seconds)')
fig.show()
fig = px.line(exec_times_df, x="start_hour", y='num', title='number of queries')
fig.show()


# fig = px.line(wpsimdf, x="hour_start", y=['query_minutes_actual', 'query_minutes_sim_snowflake','query_minutes_sim_aero'])
# fig.show()

In [None]:
from plotly.subplots import make_subplots
import plotly.express as px


# Each subplot title sits next to the figure it labels, so titles and figures cannot drift apart
# when a panel is added, removed or reordered.
panels = [
    ('suspend lag stats',
     px.line(sdf, x="hour", y=["suspend_lag_avg", 'suspend_lag_max', 'suspend_lag_min', 'suspend_lag_median'])),
    ('dollars_used_compute', px.bar(wpdf, x='hour_start', y='dollars_used_compute')),
    ('query and idle minutes', px.line(wpdf, x='hour_start', y=['query_minutes', 'idle_minutes'])),
    ('idle minutes', px.bar(wpdf, x='hour_start', y='idle_minutes')),
    ('pct_idle', px.bar(wpdf, x='hour_start', y='pct_idle')),
    ('suspension event sources', px.scatter(wedf, x="timestamp", y="source")),
    ('cumulative operational query cost', px.area(opdf, x="start_time", y="cum_query_cost")),
]
figures = [panel for _, panel in panels]

# One row per panel, all sharing the x (time) axis.
fig = make_subplots(rows=len(panels), cols=1, shared_xaxes=True, vertical_spacing=0.05,
                    subplot_titles=[title for title, _ in panels])

# Copy every trace of each source figure into its row. `add_trace` replaces the deprecated
# `append_trace`.
for row, panel in enumerate(figures, start=1):
    for trace in panel.data:
        fig.add_trace(trace, row=row, col=1)

# Date axes with major and minor grid lines. Tick labels are enabled on every row rather than on a
# hard-coded subset of axes, so the setting keeps up with the number of panels.
fig.update_xaxes(type='date', autorange=True, showgrid=True, minor=dict(showgrid=True), showticklabels=True)
fig.update_yaxes(showgrid=True, minor=dict(showgrid=True))
fig.update_layout(height=2000)

fig.show()

# next: try just one layer of test query

In [None]:
# %%sh 
# jupyter nbconvert --to html autosuspend_monitoring.ipynb --no-input --output knot-autosuspend-analysis