In [None]:
%load_ext autoreload
%autoreload 2
import sys, pathlib
sys.path.append(str(pathlib.Path.cwd().parent.parent))


In [None]:
# (Re)load the python-dotenv IPython extension so the %dotenv magic is available.
%reload_ext dotenv

# Load environment variables from the env file two directories up.
# NOTE(review): presumably this has to run before the `optiml.utils.sf` import
# below so that connection settings are present at import time — confirm.
%dotenv ../../env/.env

import warnings
from pandas import Timedelta
# from optiml.utils import sf
import time
# Project helpers; only `sql_to_df` is used by the cells visible in this notebook.
from optiml.utils.sf import logger, sql_to_df, run_sql, conn, session
import pandas as pd
# Suppress every warning for the remainder of the kernel session.
warnings.filterwarnings('ignore')

try:
    %load_ext autotime
except:
    !pip install ipython-autotime
    %load_ext autotime


In [None]:
# Query comparing per-cluster inferred spend against the warehouse-level profile.
#
#   a           - cluster_profile rows with the cluster number and warehouse id
#                 extracted from the `warehouse_id` object column.
#   wh_profile  - one row per warehouse id: min/max cluster number, summed
#                 active/query hours and inferred dollars, split into the first
#                 cluster (cluster_number = 1) and all secondary clusters.
#   final       - wh_profile LEFT JOINed to warehouse_profile, adding percentage
#                 differences, efficiency ratios and idle-spend breakdowns,
#                 ordered by the absolute dollar-estimate difference.
#
# Every divisor is wrapped in nullif(..., 0): Snowflake raises a division-by-zero
# error for the whole statement if any single warehouse has a zero divisor, so
# those rows now yield NULL ratios instead of failing the query.
sql = """
with a as (
select
    warehouse_id['cluster_number']::number as cluster_number,
    warehouse_id['warehouse_id']::number as wid,
    * from cluster_profile
)
-- select * from a;
,
wh_profile as (

select 
    wid,
    max(cluster_number) max_cluster_number,
    min(cluster_number) min_cluster_number,
    any_value(warehouse_name) warehouse_name,
    -- array_agg(dollars_used_compute),
    -- max(case when cluster_number = 1 then dollars_used_compute else null end) as first_cluster_dollars,
    max(case when cluster_number = 1 then active_hours else null end) as first_cluster_active_hours,
    sum(active_hours) as active_hours,
    sum(query_hours) as query_hours,
    -- sum(idle_hours) as idle_hours,
    -- sum(dollars_used_idle) as dollars_used_idle,

    sum(case when cluster_Number = 1 then dollars_used_querying_single_cluster_inferred else 0 end) as dollars_used_querying_inferred_first_cluster,
    sum(case when cluster_Number = 1 then 0 else dollars_used_querying_single_cluster_inferred end) as dollars_used_querying_inferred_secondary_clusters,
    sum(dollars_used_querying_single_cluster_inferred) as dollars_used_querying_inferred,
    
    sum(dollars_used_single_cluster_inferred) as dollars_used_inferred

from a
group by 1
)
-- select * from wh_profile;
select 
    f.warehouse_name || '-' || f.wid as name,
    f.*,
    s.dollars_used_compute,
    s.active_hours as warehouse_active_hours,
    s.billed_hours as warehouse_billed_hours_estimated,
    s.query_hours as warehouse_query_hours,
    s.idle_hours as warehouse_idle_hours,
    
    timediff('day', s.start_hour, s.end_hour) as numdays,
    
    -- 100 * (first_cluster_dollars - s.dollars_used_compute)/s.dollars_used_compute as pct_diff
    100 * (dollars_used_inferred - s.dollars_used_compute) / nullif(s.dollars_used_compute, 0) as pct_diff_dollar_estimate,
    100 * (first_cluster_active_hours - s.active_hours) / nullif(s.active_hours, 0) as pct_diff_first_cluster_vs_warehouse,
    warehouse_query_hours / nullif(warehouse_active_hours, 0) as efficiency_warehouse,
    warehouse_query_hours / nullif(warehouse_billed_hours_estimated, 0) as efficiency_warehouse_billed,
    (f.dollars_used_querying_inferred / nullif(s.dollars_used_compute, 0))::float as efficiency_clusters,
    efficiency_warehouse - efficiency_clusters as inefficiency_clusters,
    inefficiency_clusters * s.dollars_used_compute as possible_gain,

    
    s.dollars_used_single_cluster_inferred as dollars_used_first_cluster_inferred,
    s.dollars_used_idle as dollars_used_warehouse_idle,
    s.dollars_used_compute - f.dollars_used_querying_inferred as dollars_used_clusters_idle,
    
    dollars_used_first_cluster_inferred - f.dollars_used_querying_inferred_first_cluster as dollars_used_idle_first_cluster,
    (s.dollars_used_compute - dollars_used_first_cluster_inferred) - f.dollars_used_querying_inferred_secondary_clusters as dollars_used_idle_secondary_clusters,
    
    (f.dollars_used_querying_inferred + dollars_used_idle_first_cluster + dollars_used_idle_secondary_clusters) = s.dollars_used_compute as test
from wh_profile f
left join warehouse_profile s
on f.wid = s.warehouse_id
order by abs(pct_diff_dollar_estimate) desc;

"""

In [None]:
import itables

# Fetch every row of the `warehouses` table and render it as an interactive
# table underneath a two-line text banner.
df = sql_to_df("select * from warehouses")

print("WAREHOUSE PROFILES (ACTUAL)", "---------------------------", sep="\n")
itables.show(df)


In [None]:
import itables

# Execute the profile-comparison query held in `sql` and render the result as
# an interactive table. `df` is reused by the later conversion/plotting cells.
df = sql_to_df(sql)

# NOTE(review): this banner is identical to the one printed for the raw
# `warehouses` table in the previous cell — confirm the wording is intended.
print("WAREHOUSE PROFILES (ACTUAL)", "---------------------------", sep="\n")
itables.show(df)


In [None]:
# Cast every column whose name contains 'dollars' to float so plotly treats it
# as numeric (presumably these arrive as Decimal objects — confirm).
cols = [c for c in df.columns if 'dollars' in c]
print(cols)
# astype(float) turns missing values (None, e.g. from warehouses without a match
# in the LEFT JOIN) into NaN, whereas Series.map(float) raises TypeError on None.
# An empty `cols` list is a no-op.
df = df.astype({c: float for c in cols})
df.dtypes

In [None]:
import plotly.express as px
import plotly

# Initialise plotly's notebook mode so figures can render inline.
plotly.offline.init_notebook_mode()

# Lower bound for figure height: with the 500px-per-20-rows scale alone, a frame
# with only a few rows produces an unreadably short figure and an empty frame
# produces height 0.
MIN_FIG_HEIGHT_PX = 250


def _show_hbar(frame, sort_by, x, title, **bar_kwargs):
    """Draw and display one horizontal bar chart with one bar group per `name`.

    Args:
        frame: DataFrame with a ``name`` column plus the columns referenced by
            ``sort_by`` and ``x``.
        sort_by: Column used to order the rows before plotting.
        x: Column name, or list of column names, plotted as bar lengths.
        title: Figure title.
        **bar_kwargs: Extra keyword arguments forwarded to ``px.bar``
            (e.g. ``barmode``, ``color_discrete_sequence``).

    Returns:
        The plotly figure, after it has been shown.
    """
    # Same scale as before (500px per 20 rows), floored at MIN_FIG_HEIGHT_PX.
    height = max(MIN_FIG_HEIGHT_PX, 500 * len(frame) / 20)
    figure = px.bar(
        frame.sort_values(sort_by),
        y="name",
        x=x,
        title=title,
        orientation='h',
        height=height,
        **bar_kwargs,
    )
    figure.show()
    return figure


# Warehouse-level vs. cluster-level efficiency ratios, side by side.
fig = _show_hbar(
    df,
    sort_by='dollars_used_compute',
    x=['efficiency_warehouse', 'efficiency_clusters'],
    title="efficiency estimates",
    barmode='group',
)

# Idle dollars: cluster-level estimate next to the warehouse-level figure.
fig = _show_hbar(
    df,
    sort_by='dollars_used_warehouse_idle',
    x=['dollars_used_clusters_idle', 'dollars_used_warehouse_idle'],
    title="wasteful spend",
    barmode='group',
    color_discrete_sequence=['red', 'orange'],
)

# Spend split into querying, first-cluster idle and secondary-cluster idle
# (no barmode given, so plotly's default applies).
fig = _show_hbar(
    df,
    sort_by='dollars_used_idle_secondary_clusters',
    x=['dollars_used_querying_inferred', 'dollars_used_idle_first_cluster', 'dollars_used_idle_secondary_clusters'],
    title="multicluster spend breakdowns",
    color_discrete_sequence=['green', 'orange', 'red'],
)

# Highest cluster number observed per warehouse.
fig = _show_hbar(
    df,
    sort_by='max_cluster_number',
    x='max_cluster_number',
    title="max cluster number",
)

In [None]:
%%sh
# Convert cluster_idleness.ipynb (presumably this notebook — confirm) to HTML,
# omitting the code inputs, and write it as cluster_idleness_analysis_om1.
jupyter nbconvert \
    --to html cluster_idleness.ipynb \
    --no-input \
    --output cluster_idleness_analysis_om1