In [None]:
import pathlib
import sys

# Put the parent of the current working directory on the import path
# (presumably so the local `optiml` package imported below resolves without
# being installed -- confirm against the repository layout).
# The membership check keeps the cell idempotent: re-running it does not
# append the same entry to sys.path again.
_parent_dir = str(pathlib.Path.cwd().parent)
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)

In [None]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from IPython.display import HTML, display
from plotly.subplots import make_subplots
from tabulate import tabulate

# Inject a CSS rule that forces `.container` elements to 100% width so that
# wide tables and figures are not clipped by the default page width.
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
# Enable IPython's autoreload extension. Mode 2 reloads all modules (except
# those excluded via %aimport) before each cell is executed, so edits to the
# local source files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Import optiml.queries and register it with autoreload.
%aimport optiml.queries

## Initialization

In [None]:
import os

from optiml.connection import SnowflakeConnConfig
from optiml.queries import SNFLKQuery

# Open the Snowflake connection that the query library below is built on.
connection = SnowflakeConnConfig(
    accountname='jg84276.us-central1.gcp',
    warehousename="XSMALL_WH",
).create_connection()

# Local directory (under the user's home) handed to the query library.
cache_dir = os.path.expanduser('~/data/kiva')

# Query library bound to the connection, the 'KIV' identifier and cache_dir.
qlib = SNFLKQuery(connection, 'KIV', cache_dir)

# Reporting window passed as (sdate, edate) to the qlib queries in later cells.
sdate = '2022-08-01'
edate = '2022-10-31'

# Total cost breakdown

## Visualization

## Breakdown of total cost by usage type

In [None]:
# Total cost breakdown for the reporting window, rounded to two decimals.
df = qlib.total_cost_breakdown(sdate, edate).round(2)

# Append a summary row at the next integer label. The values are assigned
# positionally, so this relies on the frame having exactly three columns.
df.loc[df.shape[0]] = ['Total', df['credits'].sum(), df['dollars'].sum()]

print(
    tabulate(
        df,
        headers='keys',
        tablefmt='rounded_outline',
        showindex=False,
    )
)


In [None]:
# Exclude the 'Total' summary row so it is not drawn as a pie slice.
# Filtering by label (instead of dropping the last row by position) keeps this
# cell idempotent: re-running it never removes a real cost category.
df = df[df['cost_category'] != 'Total']

# One row, two pie subplots: dollars on the left, credits on the right.
fig = make_subplots(
    rows=1, cols=2,
    specs=[[{"type": "pie"}, {"type": "pie"}]],
    subplot_titles=("Dollars", "Credits")
)

fig.add_trace(
    go.Pie(labels=df['cost_category'].tolist(), values=df['dollars'].tolist(), name="Dollars"),
    row=1, col=1,
)
fig.add_trace(
    go.Pie(labels=df['cost_category'].tolist(), values=df['credits'].tolist(), name='Credits'),
    row=1, col=2,
)

# Figure title is horizontally centred and anchored near the bottom edge.
fig.update_layout(
    title={
        'text': "Breakdown of total cost by usage category",
        'y': 0.1,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'bottom'})
fig.show()

## Analysis
- Compute forms the overwhelming majority of the cost
- Cloud services are, in aggregate, < 10% of compute, so they are not an added expense
- Autoclustering used to be on in 2020 but isn't anymore
- Should explore if there are any compute efficiencies to be gained by 
    - Autoclustering
    - Materialized views / caching
    - Search optimization
    - Query optimizations / reducing scans

* Largely used by compute
* In 2020 auto clustering was on but was turned off - why?
* Storage credits at 0 since storage is charged as a flat rate - are numbers consistent?
* Definitely did not use $80K worth of credits in October since total bill is $50K - so something is wrong!


## Breakdown of total cost by warehouse

In [None]:
# Cost by warehouse for the reporting window, rounded to two decimals.
df = qlib.cost_by_wh(sdate, edate).round(2)

# Append a summary row at the next integer label. The values are assigned
# positionally, so this relies on the frame having exactly three columns.
df.loc[df.shape[0]] = ['Total', df['credits'].sum(), df['dollars'].sum()]

print(
    tabulate(
        df,
        headers='keys',
        tablefmt='rounded_outline',
        showindex=False,
    )
)

In [None]:
# Exclude the 'Total' summary row so it is not drawn as a pie slice.
# Filtering by label (instead of dropping the last row by position) keeps this
# cell idempotent: re-running it never removes a real warehouse.
df = df[df['warehouse_name'] != 'Total']

# Single pie showing each warehouse's share of the dollar cost.
fig = make_subplots(
    rows=1, cols=1,
    specs=[[{"type": "pie"}]],
)

fig.add_trace(
    go.Pie(labels=df['warehouse_name'].tolist(), values=df['dollars'].tolist(), name='dollars'),
    row=1, col=1,
)

# Figure title is horizontally centred and anchored near the bottom edge.
fig.update_layout(
    title={
        'text': "Breakdown of total cost by warehouse",
        'y': 0.1,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'bottom'})
fig.show()

## Analysis
- The sum of compute across all warehouses is 1255.47 credits, which is consistent with the total cost of compute above. This is a sanity check to ensure we are not missing any compute or warehouse.

## Breakdown of cost by partner tools

In [None]:
# Credits by partner tool for the reporting window, rounded to two decimals.
df = qlib.cost_by_partner_tool_ts(sdate, edate).round(2)

# Append a summary row at the next integer label. The values are assigned
# positionally, so this relies on the frame having exactly three columns.
df.loc[df.shape[0]] = ['All Apps', 'ALL WHs', df['approximate_credits_used'].sum()]

print(
    tabulate(
        df,
        headers='keys',
        tablefmt='rounded_outline',
        showindex=False,
    )
)

In [None]:
# Aggregate credits per client application.
# The last row of `df` is the 'All Apps' summary appended for display, so it is
# dropped first; drop() returns a new frame and leaves `df` itself untouched.
# numeric_only=True is passed by keyword: the previous positional string
# 'numric_only' was a typo that only worked because any non-empty string is
# truthy.
df_client_app = (
    df.drop(len(df) - 1)
    .groupby('client_application_name')
    .sum(numeric_only=True)
    .reset_index()
)
print(tabulate(df_client_app, headers='keys', tablefmt='rounded_outline', showindex=False))

In [None]:
# Single pie showing each client application's share of the credits used.
fig = make_subplots(rows=1, cols=1, specs=[[dict(type="pie")]])

fig.add_trace(
    go.Pie(
        labels=df_client_app['client_application_name'].tolist(),
        values=df_client_app['approximate_credits_used'].tolist(),
        name='credits',
    ),
    row=1,
    col=1,
)

# Figure title is horizontally centred and anchored near the top edge.
fig.update_layout(
    title=dict(
        text="Breakdown of credits used by Client Apps",
        y=0.9,
        x=0.5,
        xanchor='center',
        yanchor='top',
    )
)
fig.show()

## Breakdown of total cost by user (to do)

## Breakdown of total cost as a time series

In [None]:
# Fetch the cloud-services cost for the reporting window (presumably a time
# series, going by the `_ts` suffix -- confirm against optiml.queries) and
# preview the first rows.
df = qlib.cost_of_cloud_services_ts(sdate, edate)
df.head()


In [None]:
# Narrower window (start of October up to `edate`) for the per-warehouse series.
# The start date lives in its own variable instead of overwriting the
# notebook-wide `sdate`, so re-running earlier cells still uses the original
# reporting window.
ts_sdate = '2022-10-01'
df = qlib.cost_by_wh_ts(ts_sdate, edate)
df.head()

In [None]:
# Narrower window (start of October up to `edate`) for the per-user series.
# The start date lives in its own variable instead of overwriting the
# notebook-wide `sdate`, so re-running earlier cells still uses the original
# reporting window.
ts_sdate = '2022-10-01'
df = qlib.cost_by_user_ts(ts_sdate, edate)
df.head(30)