In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before code runs and edits to local packages are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2
import sys, pathlib
# Put the directory two levels above the current working directory on the
# import path (presumably the repository root that holds `optiml` -- confirm).
# NOTE(review): this depends on the kernel's working directory.
sys.path.append(str(pathlib.Path.cwd().parent.parent))

In [None]:
# (Re)load the dotenv IPython extension and read environment variables from
# the env file at ../../env/.env (path is relative to the working directory).
%reload_ext dotenv

%dotenv ../../env/.env

import warnings
from pandas import Timedelta
# from optiml.utils import sf
import time
# Project helpers. `sql_to_df` is used by the cells below to run a SQL string
# and get a DataFrame-like result back (it is later sorted / column-indexed).
from optiml.utils.sf import logger, sql_to_df, run_sql, conn, session
import pandas as pd
# Suppress all Python warnings for the remainder of the session.
warnings.filterwarnings('ignore')

try:
    %load_ext autotime
except:
    !pip install ipython-autotime
    %load_ext autotime

import plotly
# Initialise plotly's offline notebook mode so figures render inline.
plotly.offline.init_notebook_mode()

In [None]:
%%markdown
# costs of inadequate pruning


In [None]:
import plotly.express as px
import plotly
plotly.offline.init_notebook_mode()

# Annualized query count and query cost from `query_history_narrowed`, grouped
# by the `filtered_select` and `no_pruning` columns.
#
# The annualization factor is 365 / (number of days spanned by
# `stg_query_history.start_time`). The day span is clamped to at least 1 so
# that a history window shorter than one day (datediff = 0) no longer fails
# with a division-by-zero error; such a window is treated as one day.
sql = """
select
    filtered_select,
    no_pruning,
    365 / (select greatest(datediff('day', min(start_time), max(start_time)), 1) from stg_query_history) * count(*) num_queries_annualized,
    365 / (select greatest(datediff('day', min(start_time), max(start_time)), 1) from stg_query_history) * sum(query_cost) query_cost_annualized
from query_history_narrowed
group by 1,2
"""

df = sql_to_df(sql)
df

In [None]:
# Both pies slice the same measure: the annualized query cost.
cost_column = 'query_cost_annualized'

# Pie 1: cost split by the `filtered_select` flag, over all rows.
fig = px.pie(
    df,
    names='filtered_select',
    values=cost_column,
    title='query cost for filtered selects vs other queries',
)
fig.show()

# Pie 2: keep only rows where `filtered_select` is True, then split their
# cost by the `no_pruning` flag.
filtered_selects = df[df.filtered_select == True]
fig = px.pie(
    filtered_selects,
    names='no_pruning',
    values=cost_column,
    title="pruning status of filtered selects",
)
fig.show()

In [None]:
%%markdown
# pattern compression
a large "query to query_pattern ratio" indicates high levels of query logic repetition

In [None]:
# One-row summary of `query_pattern`: total row count, number of distinct
# query texts (by `query_text_hash`) and number of distinct query patterns
# (by `query_pattern_hash`). The result is stored in `df` but not displayed
# here; the next cell plots it.
sql = """
select
count(*) as num_queries, 
count(distinct query_text_hash) as num_unique_queries,
count(distinct query_pattern_hash) as num_unique_query_patterns
from 
query_pattern;
"""
df = sql_to_df(sql)

In [None]:
import plotly
import plotly.express as px

plotly.offline.init_notebook_mode()

# No x/y columns are passed, so plotly express is left to interpret `df`
# itself (presumably as wide-form data, one bar per column -- confirm).
funnel_options = dict(
    barmode='group',
    title="query patterns funnel",
    orientation='h',
)
fig = px.bar(df, **funnel_options)

fig.show()

In [None]:
%%markdown
# query parsing performance
(how successfully are we parsing query patterns?)

In [None]:
# Classify every row of `filter_predicates_limit` by how parsing of its
# `filter_predicates` value went, then count rows per class:
#   - text contains 'could not be resolved'   -> 'column could not be resolved'
#   - parses as JSON and is an empty array    -> 'successful sql parse: empty'
#   - parses as JSON and is a non-empty array -> 'successful sql parse: nonempty'
#   - anything else                           -> 'other'
# The CASE branches are evaluated in order, so the 'could not be resolved'
# check wins over the JSON checks.
sql = """
with statuses as (
select 
    -- try_parse_json(filter_predicates) is not null as sql_parsed,
        try_parse_json(filter_predicates) as parsed_filter_predicates,
    case 
        when contains(filter_predicates, 'could not be resolved') then 'column could not be resolved'
        when parsed_filter_predicates is not null and array_size(parsed_filter_predicates) = 0 then 'successful sql parse: empty'
        when parsed_filter_predicates is not null and array_size(parsed_filter_predicates) > 0 then 'successful sql parse: nonempty'
        else 'other'
    end as parse_status
from filter_predicates_limit
)
select
    parse_status,
    count(*) num_query_patterns
from statuses
group by all;
"""
df = sql_to_df(sql)
# Last expression of the cell: renders the per-status counts.
df

In [None]:

# Horizontal bar chart: one bar per parse status, length = number of query
# patterns with that status.
parse_status_chart = dict(
    x='num_query_patterns',
    y='parse_status',
    orientation='h',
    title="sql parsing funnel",
)
fig = px.bar(df, **parse_status_chart)

fig.show()

In [None]:
%%markdown
# cluster key candidate analysis

In [None]:
# Count the rows of `query_cluster_key_candidate` per `is_source_table` value.
# NOTE(review): the `//` comment line inside the SQL ends with a stray double
# quote; it sits inside a comment, but confirm it is harmless for the
# SQL client in use.
# NOTE(review): `count(*)` has no alias, so the result column name is whatever
# the database assigns.
sql = """
// is the proposed key from a source table or parsed from some other derived entity"
select
    is_source_table,
    count(*)
from
    query_cluster_key_candidate
group by all;
"""
df = sql_to_df(sql)

display(df)

In [None]:
import itables


# Top 100 rows of `cluster_key_report` by total query cost, enriched with
# projected savings per (table_name, pruning_key_candidate):
#   - window_size_days: number of days spanned by `query_pattern.start_time`,
#     clamped to at least 1 so that a window shorter than one day cannot cause
#     a division-by-zero error in the per-day / annualized columns below.
#   - predicted_* columns: the measured value times 0.8 (the assumed 80%
#     reduction from pruning); the *_total variants are also annualized with
#     365 / window_size_days.
# All remaining report columns are passed through via `* exclude (...)`.
sql = """
with most_impactful_candidates as (
select

    table_name, pruning_key_candidate,

    (select greatest(datediff('day', min(start_time), max(start_time)), 1) from query_pattern) as window_size_days,
    num_queries/window_size_days as queries_per_day,

    -- potential impact
    avg_latency_sec*0.8 as predicted_latency_reduction_avg_secs,
    365/window_size_days*total_latency_hours*0.8 as predicted_latency_reduction_total_hours,
    avg_query_cost*0.8 as predicted_cost_reduction_avg,
    365/window_size_days*total_query_cost*0.8 as predicted_cost_reduction_total,

    365/window_size_days * total_query_cost as total_query_cost_annualized,

    * exclude (table_name, pruning_key_candidate)


from cluster_key_report
--qualify row_number() over(partition by table_name order by predicted_cost_reduction_total desc) = 1
)
select * from most_impactful_candidates
order by total_query_cost desc limit 100;
--select 
--    sum(predicted_cost_reduction_total) predicted_cost_reduction_total, 
--    sum(predicted_latency_reduction_total_hours) predicted_latency_reduction_total_hours
--from most_impactful_candidates;
"""
df = sql_to_df(sql)

display(df)
# itables.show(df)

In [None]:
# Show the column names of the current `df` (whatever query result was loaded last).
df.columns

In [None]:
import plotly
import plotly.express as px

# Label every candidate as "<table_name>.<pruning_key_candidate>"; this is
# the category axis of all charts below.
df['pruning_key'] = df['table_name'] + '.' + df['pruning_key_candidate']

# Order rows by predicted annualized cost reduction, smallest first
# (presumably so the largest bars end up at the top of the horizontal
# charts -- confirm against the rendered output).
df = df.sort_values(by='predicted_cost_reduction_total', ascending=True)

# One horizontal bar chart per metric, each titled with the metric name.
metrics = [
    'predicted_cost_reduction_total',
    'num_queries',
    'avg_query_cost',
    'avg_latency_sec',
    'predicted_latency_reduction_total_hours',
]
for metric in metrics:
    fig = px.bar(df, x=metric, y='pruning_key', title=metric, orientation='h', height=2000)
    fig.show()


In [None]:
%%markdown
# Total Cost/Latency Impact
## assumes:
- 80% cost and latency reduction per query with pruning (the `0.8` factor in the query below)
- picking the most impactful cluster/sort key per table

In [None]:
import itables


# Total projected annual savings, taking for each table only the candidate
# with the highest predicted annualized cost reduction (the `qualify
# row_number() ... = 1` filter) and summing across tables:
#   - window_size_days: number of days spanned by `query_pattern.start_time`,
#     clamped to at least 1 so that a window shorter than one day cannot cause
#     a division-by-zero error in the per-day / annualized columns below.
#   - predicted_* columns: the measured value times 0.8 (the assumed 80%
#     reduction from pruning); the *_total variants are also annualized with
#     365 / window_size_days.
sql = """
with most_impactful_candidates as (
select

    table_name, pruning_key_candidate,

    (select greatest(datediff('day', min(start_time), max(start_time)), 1) from query_pattern) as window_size_days,
    num_queries/window_size_days as queries_per_day,

    -- potential impact
    avg_latency_sec*0.8 as predicted_latency_reduction_avg_secs,
    365/window_size_days*total_latency_hours*0.8 as predicted_latency_reduction_total_hours,
    avg_query_cost*0.8 as predicted_cost_reduction_avg,
    365/window_size_days*total_query_cost*0.8 as predicted_cost_reduction_total,

    365/window_size_days * total_query_cost as total_query_cost_annualized,

    * exclude (table_name, pruning_key_candidate)


from cluster_key_report
qualify row_number() over(partition by table_name order by predicted_cost_reduction_total desc) = 1
)
--select * from most_impactful_candidates
--order by total_query_cost desc limit 100;
select 
    sum(predicted_cost_reduction_total) predicted_cost_reduction_total, 
    sum(predicted_latency_reduction_total_hours) predicted_latency_reduction_total_hours
from most_impactful_candidates;
"""
df = sql_to_df(sql)

display(df)
# itables.show(df)

In [None]:
# from plotly.subplots import make_subplots

# # qdf['warehouse_query'] = qdf.warehouse_name + " : " + qdf.query_id

# figures = [
# px.scatter(whe.sort_values('warehouse_name'), x="timestamp", y="event_name", color='event_name'),
# px.scatter(cqe.sort_values('warehouse_id'), x="era_start", y="warehouse_id"),
# px.timeline(cqe.sort_values('warehouse_id'), y='warehouse_id', x_start="era_start", x_end="era_end"),
# px.line(spend, x = 'start_time', y='dollars_used_compute', line_shape='hv')
#     ]

# fig = make_subplots(rows=len(figures), cols=1, shared_xaxes=True, vertical_spacing=0.05) 

# for i, figure in enumerate(figures):
#     for trace in range(len(figure["data"])):
#         fig.append_trace(figure["data"][trace], row=i+1, col=1)

# fig.update_xaxes(title_text="warehouse events", row=1, col=1)
# fig.update_xaxes(title_text="warehouse cluster era starts", row=2, col=1)
# fig.update_xaxes(title_text="query eras", row=3, col=1)

# fig.update_xaxes(showgrid=True,minor=dict(showgrid=True))
# fig.update_yaxes(showgrid=True,minor=dict(showgrid=True))
# fig.update_xaxes(autorange=True)
# fig.update_layout(
#     height=800,
# )
# fig.update_xaxes(type='date', autorange=True)

# # fig.update_layout(
# #     xaxis_range=[cqe.era_start.min(), cqe.era_end.max()]  # Specify your desired minimum and maximum range
# # )
# fig.show()

# fig = px.bar(whm[whm.warehouse_name != 'OPS'], y='warehouse_name', x = ['credits_used_compute'], orientation = 'h', title='credits x warehouse')
# fig.show()
# fig = px.bar(whm, y='warehouse_name', x = ['credits_used_compute', 'credits_used_cloud_services'], orientation = 'h')
# fig.show()

In [None]:
%%sh 
# Convert cluster_key_analysis.ipynb to HTML with the code inputs hidden
# (--no-input), writing the result under the name given to --output.
# NOTE(review): the output name is hard-coded; adjust it per report.
jupyter nbconvert --to html cluster_key_analysis.ipynb --no-input --output cluster_key_analysis_rakuten