In [1]:
# Make the directory two levels above the current working directory importable
# (presumably where the `optiml` package imported further down lives — confirm).
import sys, pathlib
sys.path.append(str(pathlib.Path.cwd().parent.parent))
# Uncomment to inspect the resulting module search path:
# sys.path

In [2]:
# Suppress ALL warnings for the rest of the session.
# NOTE(review): 'ignore' hides every warning, including pandas deprecation
# warnings; use 'once' instead if the intent is to show each warning one time.
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Setting up displays, table options and plotting imports
from IPython.display import display, HTML

# Stretch the notebook container to the full browser width
display(HTML("<style>.container { width:100% !important; }</style>"))
from dash import Dash,html,dcc,Input,Output
# NOTE(review): `app` is not referenced anywhere else in the visible notebook — confirm it is still needed
app = Dash(__name__)
import pandas as pd
# Raise the pandas display limits to 500 rows / 500 columns
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from tabulate import tabulate
# Shared colour sequence passed to every plotly figure in this notebook
color_scheme=["red","blue","green","orange","purple","brown","pink","gray","olive","cyan","darkviolet","goldenrod","darkgreen","chocolate","lawngreen"]

In [None]:
# from dash import Dash,html,dcc

In [None]:
# ##center allign all the figure outputs.
# from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# from IPython.display import display, HTML
# from plotly.graph_objs import *
# import numpy as np
# init_notebook_mode(connected=True)

# display(HTML("""
# <style>
# .output {
#     display: flex;
#     align-items: center;
#     text-align: center;
# }
# </style>
# """))

In [4]:
# Setting up autoreload for libs, so edits to library code are picked up
# without restarting the kernel
%load_ext autoreload
%autoreload 2
# Explicitly register the query library with autoreload
%aimport optiml.queries

In [5]:
# Initialize connection to Snowflake (account and warehouse are fixed for this analysis)
from optiml.connection import SnowflakeConnConfig
connection = SnowflakeConnConfig(accountname='jg84276.us-central1.gcp',warehousename="XSMALL_WH").create_connection()

# Initialize local environment: directory handed to the query library
# (presumably used as its on-disk cache — confirm against SNFLKQuery)
import os
cache_dir = os.path.expanduser('~/data/kiva')

# Initialize query library used by every analysis cell below
from optiml.queries import SNFLKQuery
qlib = SNFLKQuery(connection, 'KIV', cache_dir)

# Analysis window: most recent rolling month that we have data for
sdate = '2022-09-12'
edate = '2022-10-12'

print(f"The analysis is carried out for date range {sdate} to {edate}")

Connecting...
The analysis is carried our for date range 2022-09-12 to 2022-10-12


# Total cost breakdown 

## Analysis setup
<div class="alert alert-warning">

* Analysis date range: '2022-09-12' to '2022-10-12'

* Type of Snowflake account: Standard Edition

* Credit to dollar conversion: `$`2 per credit

</div>


#### STANDARD DEVIATION AND MEAN OF CREDITS

In [None]:
# Credit statistics for the analysis window (per the section heading: standard
# deviation and mean of credits). Ending the cell with the bare name uses the
# notebook's rich display instead of the plain-text `print` rendering.
df=qlib.total_breakdown_analysis(sdate,edate)
df

## Cost by usage category

In [None]:
# Load the cost breakdown time series for the analysis window.
df = qlib.total_cost_breakdown_ts(sdate, edate)

# Label missing values as 'Unassigned' everywhere except the measure columns.
# A blanket fillna('Unassigned') would also write the string into `credits` /
# `dollars` whenever they contain NaN, turning them into object columns that
# the numeric aggregations below silently drop.
label_cols = [col for col in df.columns if col not in ("credits", "dollars")]
df = df.fillna({col: 'Unassigned' for col in label_cols})
df.head()



In [None]:
# Total credits / dollars per usage category.
# `numeric_only=True` is passed by keyword: the previous positional string
# "numeric_only" only worked because any non-empty string is truthy.
df_by_usage_category = df.groupby("category_name").sum(numeric_only=True).reset_index()

# Append the grand-total row directly after the last category row, labelled by
# the summary frame's own length rather than the raw frame's length.
df_by_usage_category.loc[len(df_by_usage_category.index)] = ['Total', df_by_usage_category['credits'].sum(), df_by_usage_category['dollars'].sum()]

# Round after totalling so the 'Total' row is not a sum of rounded values.
df_by_usage_category = df_by_usage_category.round(2)
print(tabulate(df_by_usage_category, headers='keys', tablefmt='rounded_outline', showindex=False))

In [None]:
# Pie charts for total cost breakdown
# Plot from a filtered view instead of dropping the last row in place: the
# summary table keeps its 'Total' row and re-running this cell no longer
# removes one more real category each time.
usage_plot_df = df_by_usage_category[df_by_usage_category['category_name'] != 'Total']
usage_labels = usage_plot_df['category_name'].tolist()

fig = make_subplots(
    rows=1, cols=2,
    specs=[[{"type": "pie"},{"type": "pie"}]],
    subplot_titles=("Dollars", "Credits")
)

# One pie per measure: dollars in the left column, credits in the right column
for col_idx, (measure, trace_name) in enumerate([('dollars', 'Dollars'), ('credits', 'Credits')], start=1):
    fig.add_trace(
        go.Pie(labels=usage_labels, values=usage_plot_df[measure].tolist(), name=trace_name, rotation=45, marker_colors=color_scheme),
        row=1, col=col_idx,
    )

fig.update_layout(
    title={
        'text': "Breakdown of total cost by usage category",
        'y':0.1,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'bottom'})
fig.show()


## Cost by usage category timeseries

In [None]:
# Hourly credits / dollars per usage category.
# `numeric_only=True` is passed by keyword: the previous positional string
# "numeric_only" only worked because any non-empty string is truthy.
df_by_category_ts = df.groupby(['category_name','hourly_start_time']).sum(numeric_only=True).reset_index()

# Stacked area chart of hourly dollars, one band per usage category
fig = px.area(
    df_by_category_ts,
    x="hourly_start_time",
    y="dollars",
    color="category_name",
    color_discrete_sequence=color_scheme,
    title="Hourly cost (dollars) by usage category",
)
fig.show()

In [None]:
# Keep only the hourly rows whose usage category is "Compute"
is_compute_row = df_by_category_ts["category_name"] == "Compute"
df_compute = df_by_category_ts[is_compute_row]

In [None]:
# Average, peak and trough hourly Compute consumption.
# `numeric_only=True` restricts the mean to the numeric measure columns;
# a bare `.mean()` on a frame that also holds string columns raises a
# TypeError on pandas >= 2.0 (older versions only warned, and warnings are
# suppressed in this notebook).
avg_consumption = df_compute.mean(numeric_only=True)
# Full rows (including the timestamp) of the hours with the highest / lowest credits
max_consumption = df_compute.loc[df_compute['credits'].idxmax()]
min_consumption = df_compute.loc[df_compute['credits'].idxmin()]

# Print the three summaries in the same layout, separated by a blank line
consumption_report = [
    ('Avg. consumption:', avg_consumption),
    ('Max. consumption:', max_consumption),
    ('Min. consumption:', min_consumption),
]
for position, (heading, values) in enumerate(consumption_report):
    if position:
        print('')
    print(heading)
    print('-----------------')
    print(values)

### Analysis

<div class="alert alert-info">

#### Compute
* Compute consumes > 95% of the budget. This is as expected.  
* The average compute consumption is Credits: 12.69, Dollars: `$`25.38, with usage peaks typically at 9 am and 2 pm. 
* The typical compute through the night is non-zero at ~ 10 credits. 
* The nighttime compute consumption is flat until the 23rd but becomes noisy starting on the 24th.
* There is an increase of ~30% in nighttime consumption on the 9th and 10th, which subsequently reverts to the usual pattern.

#### Cloud Services
* Cloud services in aggregate consume < 10% of the budget, which is consistent with no overcharges provided this also holds for each warehouse on an hourly basis.
* The overall timeseries for credit consumption of cloud services is also < 10% of compute.

#### Storage
* Storage cost is a small fraction of the `$` budget. Storage is charged by flat `$` amount per TB starting at `$`23/TB. For the timeseries we have spread that evenly through the day.
* Storage costs are almost flat during the period.

</div>

### Actions and Recommendations

<div class="alert alert-success">

* Analyze Cloud Services credit consumption on an hourly basis at the warehouse level to make sure there are no hourly overages for any warehouse.
* Investigate noisy night time usage after Sept 23.
* Investigate increased baseline consumption that occurs on 10th and 11th Oct.
* Evaluate the system for compute bottlenecks and queue lengths for - warehouses, tasks, and queries at 9 am and 2 pm due to usage spikes.
* Based on cost we expect ~300 GB storage is being used. Verify this number for consistency.
* Set up a time varying resource monitor on the account level based on these usage patterns to flag any anomalous usage. Usage monitoring set naively:
    * Will not be proactive about cost containment 
    * Can generate false positives/negatives causing nuisance alerts/shutdowns. 
* Evaluate opportunities for savings through:
    * Dithering peak workloads to alternate times.
    * Reducing weekend consumption 
    * Dithering jobs to or reducing night time consumption
    
</div>

## Cost by user

In [None]:
# Total credits / dollars per user.
# `numeric_only=True` is passed by keyword: the previous positional string
# "numeric_only" only worked because any non-empty string is truthy.
df_by_user = df.groupby(['user_name']).sum(numeric_only=True).reset_index()

# Append the grand-total row directly after the last user row, labelled by the
# summary frame's own length rather than the raw frame's length.
df_by_user.loc[len(df_by_user.index)] = ['Total', df_by_user['credits'].sum(), df_by_user['dollars'].sum()]

# Round after totalling (as the usage-category table does) so the 'Total' row
# is not a sum of already-rounded values.
df_by_user = df_by_user.round(2)
print(tabulate(df_by_user, headers='keys', tablefmt='rounded_outline', showindex=False))

In [None]:
# Pie charts for cost by user.
# Plot from a filtered view instead of dropping the last row in place: the
# summary table keeps its 'Total' row and re-running this cell no longer
# removes one more real user each time.
user_plot_df = df_by_user[df_by_user['user_name'] != 'Total']
user_labels = user_plot_df['user_name'].tolist()

fig = make_subplots(
    rows=1, cols=2,
    specs=[[{"type": "pie"},{"type": "pie"}]],
    subplot_titles=("Dollars", "Credits")
)

# One pie per measure: dollars in the left column, credits in the right column
for col_idx, (measure, trace_name) in enumerate([('dollars', 'Dollars'), ('credits', 'Credits')], start=1):
    fig.add_trace(
        go.Pie(labels=user_labels, values=user_plot_df[measure].tolist(), name=trace_name, rotation=45, marker_colors=color_scheme),
        row=1, col=col_idx,
    )

fig.update_layout(
    title={
        'text': "Breakdown of total cost by user",
        'y':0.1,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'bottom'})
fig.show()

In [None]:
# Hourly credits / dollars per user.
# `numeric_only=True` is passed by keyword: the previous positional string
# "numeric_only" only worked because any non-empty string is truthy.
df_by_user_ts = df.groupby(['user_name','hourly_start_time']).sum(numeric_only=True).reset_index()

# Stacked area chart of hourly credits, one band per user
fig = px.area(
    df_by_user_ts,
    x="hourly_start_time",
    y="credits",
    color="user_name",
    color_discrete_sequence=color_scheme,
    title="Hourly credits by user",
)
fig.show()

In [None]:
# Preview the first rows of the cost breakdown frame that the per-user tables above were built from
df.head()

### Analysis
<div class="alert alert-info">
    
* User GREGORYW incurs ~ 30% of the costs at ~ 1385 credits.
* Users VERTX_PROD and DBT_PROD have next highest consumption at ~ 20% and ~ 894 credits.
* Users DBT_DEV and ROBS spend the fewest dollars and credits of all users.
* User SNOWFLAKE includes costs associated with storage, and Snowflake related functions - this makes total_cost_breakdown numbers self consistent with cost_by_user.
* The overall compute consumption patterns through the day are similar to total_cost_breakdown.
* There is an increase in credit usage on the 8th, 9th and 10th of October during off peak hours (~ 8 pm - 8 am) which can be attributed to the following users:
    * DBT_DEV: +100% 
    * GREGORYW: +30%
    * Unassigned: +100%

* The increased noisiness in usage starting 24th is attributable to users ROBS and Unassigned. 

</div>

### Actions and Recommendations
<div class="alert alert-success">

* Queries for following users should be tagged better - so they can be attributed to organizations/projects/R&D.:
    * GREGORYW
    * Unassigned
* User roles associated with automated jobs should be tagged and have user account level resource monitoring around them based on historical usage: 
    * VERTX_PROD
    * DBT_PROD
* Notify the admin that on 8th - 10th Oct. users GREGORYW, Unassigned and DBT_DEV had increased credit consumption during off-peak hours (~ 8 pm - 8 am).
* Set up a time varying resource monitor at account and user level based on usage patterns to flag any anomalous usage.    
    
</div>

## Cost by warehouse

In [None]:
# Returns results only for the ACCOUNTADMIN role or any other role that has been granted the MONITOR USAGE global privilege,
# so the results are consistent with Greg's usage.
# NOTE: `df` is rebound here — from this cell on it holds the per-warehouse cost frame.
df = qlib.cost_by_wh_ts(sdate, edate)
df.head()

In [None]:
# Total credits / dollars (compute and cloud services) per warehouse.
# `numeric_only=True` is passed by keyword: the previous positional string
# "numeric_only" only worked because any non-empty string is truthy.
df_by_wh = df.groupby(['warehouse_name']).sum(numeric_only=True).reset_index()

# Grand-total row; the values are positional, so this list must follow the
# summary frame's column order (warehouse name first, then the four measures).
wh_measure_cols = ['credits', 'dollars', 'cloud_services_credits', 'cloud_services_dollars']
wh_total_row = ['Total'] + [df_by_wh[col].sum() for col in wh_measure_cols]

# Append directly after the last warehouse row, labelled by the summary frame's
# own length rather than the raw frame's length.
df_by_wh.loc[len(df_by_wh.index)] = wh_total_row

# Round after totalling so the 'Total' row is not a sum of rounded values.
df_by_wh = df_by_wh.round(2)
print(tabulate(df_by_wh, headers='keys', tablefmt='rounded_outline', showindex=False))

In [None]:
# Exclude the 'Total' row for the plot.
# Plot from a filtered view instead of dropping the last row in place: the
# summary table keeps its 'Total' row and re-running this cell no longer
# removes one more real warehouse each time.
wh_plot_df = df_by_wh[df_by_wh['warehouse_name'] != 'Total']

fig = make_subplots(
    rows=1, cols=1,
    specs=[[{"type": "pie"}]],
)

# Single pie: share of dollars per warehouse
fig.add_trace(
    go.Pie(labels=wh_plot_df['warehouse_name'].tolist(), values=wh_plot_df['dollars'].tolist(), name='dollars', marker_colors=color_scheme),
    row=1, col=1,
)

fig.update_layout(
    title={
        'text': "Breakdown of total cost by warehouse",
        'y':0.1,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'})
fig.show()

In [None]:
# Hourly totals per warehouse.
# `numeric_only=True` is passed by keyword: the previous positional string
# "numeric_only" only worked because any non-empty string is truthy.
df_by_wh_ts = df.groupby(['warehouse_name','hourly_start_time']).sum(numeric_only=True).reset_index()
# df_by_wh_ts.head()

In [None]:
# TODO: Investigate why turning off cloud services only makes the daily refresh plot jump at some points
# Stacked area chart of hourly credits, one band per warehouse
fig = px.area(
    df_by_wh_ts,
    x="hourly_start_time",
    y="credits",
    color="warehouse_name",
    color_discrete_sequence=color_scheme,
)
fig.show()

### Analysis
<div class="alert alert-info">
    
* Total credit consumption for all the warehouses combined is lower compared to total credit consumption overall and total credit consumption for all users combined.
    * This is because [warehouse metering history](https://docs.snowflake.com/en/sql-reference/functions/warehouse_metering_history.html) returns results only for the ACCOUNTADMIN role or any role that has been explicitly granted the MONITOR USAGE global privilege. 
  
* PROD_WH incurs the highest share of the costs (> 50%) at ~ 3110 credits.
* DEV_WH consumes next highest ~30% at ~ 1872 credits.
* DAILY_REFRESH_WH and ML_WH consume 14.6% and 3.6% respectively.

* The increased credit consumption from 8th-10th Oct. is completely in DEV_WH where credit consumption increases by 200%. 
    * This increase is similar in magnitude to 11-2 pm everyday for this WH - investigate if it was the same process run intentionally or not.
    
* PROD_WH shows a +200% increase on 1st Oct. from 1-5 pm which shows as a spike in other views as well but was not as prominent.
    
* Fewer credits consumed by DAILY_REFRESH_WH on Monday and Tuesday - would have expected that to be for weekend.
    
* Increased noisiness starting 24th is almost entirely coming from ML_WH.
</div>

### Actions and Recommendations

<div class="alert alert-success">

* For better visibility grant MONITOR USAGE global privilege to all users. (Or look for alternatives to make warehouse credit consumption consistent with total credit consumption).
* Set WH level alerts for each WH - especially for DEV_WH and PROD_WH
* Tag jobs by function in PROD_WH and DEV_WH
* Set up a time varying resource monitor at warehouse level based on usage patterns to flag any anomalous usage.
    
</div>

## Cost by Partner Tools


In [6]:
# NOTE: `df` is rebound here — from this cell on it holds the partner-tool cost frame for the analysis window.
df=qlib.cost_by_partner_tool_ts(sdate, edate)
df.head()

Unnamed: 0,client_application_name,warehouse_name,approximate_credits_used,hourly_start_time
0,Python,DAILY_REFRESH_WH,4.237986,2022-10-09 09:00:00
1,Python,DAILY_REFRESH_WH,4.235667,2022-10-06 09:00:00
2,Python,DAILY_REFRESH_WH,4.23268,2022-10-10 09:00:00
3,Python,DAILY_REFRESH_WH,4.231897,2022-10-04 09:00:00
4,Python,DAILY_REFRESH_WH,4.228788,2022-10-08 09:00:00


In [None]:
# Total approximate credits per client application (partner tool).
# `numeric_only=True` is passed by keyword: the previous positional string
# "numeric_only" only worked because any non-empty string is truthy.
df_by_pt = df.groupby(['client_application_name']).sum(numeric_only=True).reset_index()

# Append the grand-total row directly after the last tool row, labelled by the
# summary frame's own length rather than the raw frame's length.
df_by_pt.loc[len(df_by_pt.index)] = ['Total', df_by_pt['approximate_credits_used'].sum()]

# Round after totalling so the 'Total' row is not a sum of rounded values.
df_by_pt = df_by_pt.round(2)
print(tabulate(df_by_pt, headers='keys', tablefmt='rounded_outline', showindex=False))

In [None]:
# Exclude the 'Total' row for the plot.
# Plot from a filtered view instead of dropping the last row in place: the
# summary table keeps its 'Total' row and re-running this cell no longer
# removes one more real tool each time.
pt_plot_df = df_by_pt[df_by_pt['client_application_name'] != 'Total']

fig = make_subplots(
    rows=1, cols=1,
    specs=[[{"type": "pie"}]],
)

# Single pie: share of approximate credits per client application
fig.add_trace(
    go.Pie(labels=pt_plot_df['client_application_name'].tolist(), values=pt_plot_df['approximate_credits_used'].tolist(), name='credits', marker_colors=color_scheme, rotation=45),
    row=1, col=1,
)

fig.update_layout(
    title={
        'text': "Breakdown of total cost by partner tools",
        'y':0.1,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'})
fig.show()

In [None]:
# Hourly approximate credits per client application.
# `numeric_only=True` is passed by keyword: the previous positional string
# "numeric_only" only worked because any non-empty string is truthy.
df_by_pt_ts = df.groupby(['client_application_name','hourly_start_time']).sum(numeric_only=True).reset_index()
df_by_pt_ts.head()

In [None]:
# Stacked area chart of hourly approximate credits, one band per client application
fig = px.area(
    df_by_pt_ts,
    x="hourly_start_time",
    y="approximate_credits_used",
    color="client_application_name",
    color_discrete_sequence=color_scheme,
)
fig.show()

In [None]:
## Analyze if a specific client tool is attached with a specific warehouse. For each warehouse what percentage of credits are used by which tool?

In [None]:
# Hourly approximate credits per (client application, warehouse) pair.
# `numeric_only=True` is passed by keyword: the previous positional string
# "numeric_only" only worked because any non-empty string is truthy.
df_by_pt_wh_ts = df.groupby(['client_application_name', 'warehouse_name','hourly_start_time']).sum(numeric_only=True).reset_index()
# Show a preview only; the bare frame would render up to 500 rows with the
# display limits set at the top of the notebook.
df_by_pt_wh_ts.head()

In [None]:
# Build a combined "<client application>_<warehouse>" label and store it as a new column
app_wh_label = df_by_pt_wh_ts["client_application_name"] + "_" + df_by_pt_wh_ts["warehouse_name"]
df_by_pt_wh_ts["client_application_warehouse_name"] = app_wh_label
df_by_pt_wh_ts.head()

In [None]:
# Collapse to one row per combined tool/warehouse label and hour.
# `numeric_only=True` is passed by keyword: the previous positional string
# "numeric_only" only worked because any non-empty string is truthy.
# NOTE(review): this rebinds `df_by_pt_wh_ts` without the separate
# client_application_name / warehouse_name columns, so the cell that builds
# the combined label cannot be re-run after this one.
df_by_pt_wh_ts = df_by_pt_wh_ts.groupby(["client_application_warehouse_name","hourly_start_time"]).sum(numeric_only=True).reset_index()
df_by_pt_wh_ts.head()

In [None]:
# Stacked area chart of hourly approximate credits, one band per tool/warehouse label
fig = px.area(
    df_by_pt_wh_ts,
    x="hourly_start_time",
    y="approximate_credits_used",
    color="client_application_warehouse_name",
    color_discrete_sequence=color_scheme,
)
fig.show()

### COST OF PARTNER TOOLS BY WAREHOUSES

In [8]:

# Split the partner-tool frame into one sub-frame per warehouse.
# Titles and frames are taken from the SAME groupby pass so that
# df_titles[i] always names df_warehouse[i]. Previously the titles came from
# `unique()` (order of first appearance, NaN included) while the frames came
# from `groupby` (sorted keys, NaN dropped), so a plot could be given another
# warehouse's title.
warehouse_groups = list(df.groupby('warehouse_name'))
df_titles = [wh_name for wh_name, _ in warehouse_groups]
df_warehouse = [wh_frame for _, wh_frame in warehouse_groups]


In [13]:
# One stacked area chart per warehouse, one band per client application
for wh_idx, wh_title in enumerate(df_titles):
    wh_frame = df_warehouse[wh_idx]
    fig = px.area(
        wh_frame,
        x="hourly_start_time",
        y="approximate_credits_used",
        color="client_application_name",
        color_discrete_sequence=color_scheme,
        title=wh_title,
    )
    fig.show()


In [None]:
# fig = px.area(df_warehouse[0], x="hourly_start_time", y="approximate_credits_used", color="client_application_name",color_discrete_sequence=color_scheme,title=df_titles[0])
# fig.show()

In [None]:
# fig = px.area(df_warehouse[1], x="hourly_start_time", y="approximate_credits_used", color="client_application_name",color_discrete_sequence=color_scheme,title=df_titles[1])
# fig.show()


In [None]:
# fig = px.area(df_warehouse[2], x="hourly_start_time", y="approximate_credits_used", color="client_application_name",color_discrete_sequence=color_scheme,title=df_titles[2])
# fig.show()


In [None]:
# fig = px.area(df_warehouse[2], x="hourly_start_time", y="approximate_credits_used", color="client_application_name",color_discrete_sequence=color_scheme,title=df_titles[3])
# fig.show()


### Analysis
<div class="alert alert-info">
    
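* **TODO:** The analysis and recommendations below repeat the *Cost by warehouse* section verbatim; update them with findings specific to partner tools.
* Total credit consumption for all the warehouses combined is lower compared to total credit consumption overall and total credit consumption for all users combined.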
    * This is because [warehouse metering history](https://docs.snowflake.com/en/sql-reference/functions/warehouse_metering_history.html) returns results only for the ACCOUNTADMIN role or any role that has been explicitly granted the MONITOR USAGE global privilege. 
  
* PROD_WH incurs the highest share of the costs (> 50%) at ~ 3110 credits.
* DEV_WH consumes next highest ~30% at ~ 1872 credits.
* DAILY_REFRESH_WH and ML_WH consume 14.6% and 3.6% respectively.

* The increased credit consumption from 8th-10th Oct. is completely in DEV_WH where credit consumption increases by 200%. 
    * This increase is similar in magnitude to 11-2 pm everyday for this WH - investigate if it was the same process run intentionally or not.
    
* PROD_WH shows a +200% increase on 1st Oct. from 1-5 pm which shows as a spike in other views as well but was not as prominent.
    
* Fewer credits consumed by DAILY_REFRESH_WH on Monday and Tuesday - would have expected that to be for weekend.
    
* Increased noisiness starting 24th is almost entirely coming from ML_WH.
</div>

### Actions and Recommendations

<div class="alert alert-success">

* For better visibility grant MONITOR USAGE global privilege to all users. (Or look for alternatives to make warehouse credit consumption consistent with total credit consumption).
* Set WH level alerts for each WH - especially for DEV_WH and PROD_WH
* Tag jobs by function in PROD_WH and DEV_WH
* Set up a time varying resource monitor at warehouse level based on usage patterns to flag any anomalous usage.
    
</div>

In [None]:
#TODO: Implement cost of data transfers query using https://docs.snowflake.com/en/user-guide/cost-exploring-data-transfer.html