In [None]:
# Adding system path
# Make the directory two levels above the current working directory importable
# (presumably where the optiml package lives -- confirm against the repo layout).
import sys, pathlib

_project_root = str(pathlib.Path.cwd().parent.parent)
# Only append when missing, so re-running this cell in the same kernel does not
# pile up duplicate sys.path entries.
if _project_root not in sys.path:
    sys.path.append(_project_root)

In [None]:
# Suppress all warnings for the rest of the session.
# NOTE: the 'ignore' action hides warnings entirely; use 'once' instead if each
# distinct warning should still be shown a single time.
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Setting up displays: imports first, then notebook/pandas display options
from IPython.display import display, HTML
from dash import Dash,html,dcc,Input,Output
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from tabulate import tabulate

# Stretch the notebook container to the full browser width
display(HTML("<style>.container { width:100% !important; }</style>"))

# Dash application object
app = Dash(__name__)

# Let pandas render up to 500 rows and 500 columns
for _display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(_display_option, 500)

# Colour sequence passed to the plotly figures below
color_scheme = [
    "red", "blue", "green", "orange", "purple",
    "brown", "pink", "gray", "olive", "cyan",
    "darkviolet", "goldenrod", "darkgreen", "chocolate", "lawngreen",
]

In [None]:
# from dash import Dash,html,dcc

In [None]:
# ##center allign all the figure outputs.
# from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# from IPython.display import display, HTML
# from plotly.graph_objs import *
# import numpy as np
# init_notebook_mode(connected=True)

# display(HTML("""
# <style>
# .output {
#     display: flex;
#     align-items: center;
#     text-align: center;
# }
# </style>
# """))

In [None]:
# Setting up autoreload for libs
# Load IPython's autoreload extension so edited modules are picked up without a kernel restart
%load_ext autoreload
# Mode 2: reload all modules (except those excluded via %aimport) before executing code
%autoreload 2
# Import optiml.queries and mark it for autoreload
%aimport optiml.queries

In [None]:
# Connect to Snowflake, set up the query library and fix the analysis window
import os
from optiml.connection import SnowflakeConnConfig
from optiml.queries import SNFLKQuery

# Snowflake connection
conn_config = SnowflakeConnConfig(
    accountname='jg84276.us-central1.gcp',
    warehousename="XSMALL_WH",
)
connection = conn_config.create_connection()

# Local directory handed to the query library
cache_dir = os.path.expanduser('~/data/kiva')

# Query library bound to the connection above
qlib = SNFLKQuery(connection, 'KIV', cache_dir)

# Analysis date range (start, end)
sdate, edate = '2022-09-24', '2022-10-08'

# Total cost breakdown 

## Analysis setup
<div class="alert alert-info">

* Analysis date range: '2022-09-24' to '2022-10-08'

* Type of Snowflake account: Standard Edition

* Credit to dollar conversion: `$`2 per credit

</div>


## Cost by usage category

In [None]:
# Fetch the total cost breakdown time series for the analysis window and
# replace every missing value with the label 'Unassigned'
df = qlib.total_cost_breakdown_ts(sdate, edate).fillna('Unassigned')


In [None]:
# Sum the numeric columns per usage category.
# numeric_only is passed explicitly as a keyword: the previous positional string
# "numeric_only" only worked because a non-empty string is truthy.
df_by_usage_category = df.groupby("category_name").sum(numeric_only=True).reset_index()

# Append a grand-total row. Building it by column name (instead of a positional
# list stored under an index label taken from the unrelated frame `df`) keeps the
# values aligned with the right columns regardless of column order.
total_row = pd.DataFrame([{
    'category_name': 'Total',
    'credits': df_by_usage_category['credits'].sum(),
    'dollars': df_by_usage_category['dollars'].sum(),
}])
df_by_usage_category = pd.concat([df_by_usage_category, total_row], ignore_index=True)
df_by_usage_category = df_by_usage_category.round(2)
print(tabulate(df_by_usage_category, headers='keys', tablefmt='rounded_outline', showindex=False))

In [None]:
# Pie charts for total cost breakdown
## Remove the row of totals for the plot.
## The row is filtered out by its 'Total' label rather than dropped by position,
## so re-running this cell cannot silently remove a real category.
df_by_usage_category = (
    df_by_usage_category[df_by_usage_category['category_name'] != 'Total']
    .reset_index(drop=True)
)

fig = make_subplots(
    rows=1, cols=2,
    specs=[[{"type": "pie"},{"type": "pie"}]],
    subplot_titles=("Dollars", "Credits")
)

# One pie per measure: dollars in the left subplot, credits in the right one
category_labels = df_by_usage_category['category_name'].tolist()
for subplot_col, (value_column, trace_name) in enumerate(
        [('dollars', 'Dollars'), ('credits', 'Credits')], start=1):
    fig.add_trace(
        go.Pie(
            labels=category_labels,
            values=df_by_usage_category[value_column].tolist(),
            name=trace_name,
            rotation=45,
            marker_colors=color_scheme,
        ),
        row=1, col=subplot_col,
    )

# Title is anchored at the bottom centre of the figure
fig.update_layout(
    title={
        'text': "Breakdown of total cost by usage category",
        'y':0.1,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'bottom'})
fig.show()


## Cost by usage category timeseries

In [None]:
# Sum the numeric columns per (category_name, hourly_start_time) pair.
# numeric_only is passed explicitly as a keyword; the previous positional string
# 'numeric_only' only worked because a non-empty string is truthy.
df_by_category_ts = (
    df.groupby(['category_name', 'hourly_start_time'])
    .sum(numeric_only=True)
    .reset_index()
)

# Area chart of credits over time, one coloured band per usage category
fig = px.area(
    df_by_category_ts,
    x="hourly_start_time",
    y="credits",
    color="category_name",
    color_discrete_sequence=color_scheme,
)
fig.show()

In [None]:
# Keep only the time-series rows whose usage category is "Compute"
is_compute_row = df_by_category_ts["category_name"] == "Compute"
df_compute = df_by_category_ts[is_compute_row]

In [None]:
# Summary statistics for the Compute rows.
# numeric_only=True restricts the mean to numeric columns; without it,
# pandas >= 2.0 raises TypeError on string columns such as category_name.
avg_consumption = df_compute.mean(numeric_only=True)
# Full rows at which the credits value is highest / lowest
max_consumption = df_compute.loc[df_compute['credits'].idxmax()]
min_consumption = df_compute.loc[df_compute['credits'].idxmin()]

# Print each statistic under an underlined heading, separated by a blank line
summaries = [
    ('Avg. consumption:', avg_consumption),
    ('Max. consumption:', max_consumption),
    ('Min. consumption:', min_consumption),
]
for position, (heading, values) in enumerate(summaries):
    if position:
        print('')
    print(heading)
    print('-----------------')
    print(values)

### Analysis

<div class="alert alert-info">

* Compute consumes > 95% of the budget. This is as expected. 
* Cloud services in aggregate consume < 10% of the budget, which is consistent with no overcharges for them. However, this needs to be examined at the warehouse level.
* The overall timeseries of cloud services credit consumption is < 10% of compute.
* Storage cost is a small fraction of the `$` budget. Storage is charged as a flat `$` amount per TB. For the timeseries we have spread that evenly through the day.
* The average compute consumption is Credits: 12.94, Dollars: `$`25.88, with peaks at 9 am and 2 pm. 
* The typical compute through the night is non-zero at ~ 10 credits, which might offer an opportunity for savings or for dithering peak workloads to alternate times.
* Check for compute bottlenecks and queue lengths for various warehouses, tasks, and queries at 9 am and 2 pm.

</div>

### Optiml TODO

<div class="alert alert-warning">
    
* Analyze the Cloud Services credit consumption on an hourly basis at a warehouse level to make sure no overages hourly by warehouse.
    
</div>

### Optiml Recommendation

<div class="alert alert-success">

* Set up a time varying resource monitor on the account level based on these usage patterns to flag any anomalous usage. 
* Usage monitoring set naively, i.e. without considering usage patterns, can generate false positives/negatives, causing nuisance alerts/shutdowns. 

</div>

## Cost by user

In [None]:
# Sum the numeric columns per user and round to 2 decimals.
# numeric_only is passed explicitly as a keyword; the previous positional string
# 'numeric_only' only worked because a non-empty string is truthy.
df_by_user = df.groupby(['user_name']).sum(numeric_only=True).reset_index()
df_by_user = df_by_user.round(2)

# Append a grand-total row built by column name (instead of a positional list
# stored under an index label taken from the unrelated frame `df`).
total_row = pd.DataFrame([{
    'user_name': 'Total',
    'credits': df_by_user['credits'].sum(),
    'dollars': df_by_user['dollars'].sum(),
}])
# The per-user rows are already rounded; rounding again only strips floating-point
# noise from the freshly computed totals.
df_by_user = pd.concat([df_by_user, total_row], ignore_index=True).round(2)
print(tabulate(df_by_user, headers='keys', tablefmt='rounded_outline', showindex=False))

In [None]:
# Remove the row of totals for the plot.
# The row is filtered out by its 'Total' label rather than dropped by position in
# place, so re-running this cell cannot silently remove a real user.
df_by_user = df_by_user[df_by_user['user_name'] != 'Total']

fig = make_subplots(
    rows=1, cols=2,
    specs=[[{"type": "pie"},{"type": "pie"}]],
    subplot_titles=("Dollars", "Credits")
)

# One pie per measure: dollars in the left subplot, credits in the right one
user_labels = df_by_user['user_name'].tolist()
for subplot_col, (value_column, trace_name) in enumerate(
        [('dollars', 'Dollars'), ('credits', 'Credits')], start=1):
    fig.add_trace(
        go.Pie(
            labels=user_labels,
            values=df_by_user[value_column].tolist(),
            name=trace_name,
            rotation=45,
            marker_colors=color_scheme,
        ),
        row=1, col=subplot_col,
    )

# Title is anchored at the bottom centre of the figure
fig.update_layout(
    title={
        'text': "Breakdown of total cost by user",
        'y':0.1,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'bottom'})
fig.show()

In [None]:
# Sum the numeric columns per (user_name, hourly_start_time) pair.
# numeric_only is passed explicitly as a keyword; the previous positional string
# 'numeric_only' only worked because a non-empty string is truthy.
df_by_user_ts = (
    df.groupby(['user_name', 'hourly_start_time'])
    .sum(numeric_only=True)
    .reset_index()
)

# Area chart of credits over time, one coloured band per user
fig = px.area(
    df_by_user_ts,
    x="hourly_start_time",
    y="credits",
    color="user_name",
    color_discrete_sequence=color_scheme,
)
fig.show()

In [None]:
# Preview the first five rows of the cost breakdown frame
df.head(n=5)

### Analysis

<div class="alert alert-info">
    
* User GREGORYW incurs ~ 30% of the costs at ~ 1385 credits.
* Users VERTX_PROD and DBT_PROD have the next-highest consumption at ~ 20% and ~ 894 credits.
* Users DBT_DEV and ROBS spend the least value of dollars and credits compared to other users.
* User SNOWFLAKE 

* The time series by user follows a similar trend as the time series by usage category graph with regards to usage of credits at daily time slots.
* The increase in credit usage on the 8th, 9th and 10th of October is because of usage increases by GREGORYW and DBT_DEV. The 2 p.m. credit usage spike from October 1 to October 5 can also be attributed to those users. 

</div>

### Optiml TODO

<div class="alert alert-warning">

* Given the organizational context that GREGORYW is the primary infrastructure owner, tag these queries so they can be attributed better to organizational budgets.
* Get a clearer understanding of user roles VERTX_PROD and DBT_PROD since they seem like automated jobs and put appropriate user account level resource monitoring around them based on historical usage.

 
</div>

### Optiml Recommendation

## Cost by warehouse

In [None]:
# Cost-by-warehouse time series for the analysis window.
# NOTE: results are returned only for the ACCOUNTADMIN role, or for any other role
# that has been granted the MONITOR USAGE global privilege, so the results are
# consistent with Greg's usage.
df = qlib.cost_by_wh_ts(sdate, edate)
df.head(n=5)

In [None]:
# Sum the numeric columns per warehouse and round to 2 decimals.
# numeric_only is passed explicitly as a keyword; the previous positional string
# 'numeric_only' only worked because a non-empty string is truthy.
df_by_wh = df.groupby(['warehouse_name']).sum(numeric_only=True).reset_index()
df_by_wh = df_by_wh.round(2)

# Append a grand-total row built by column name (instead of a positional list
# stored under an index label taken from the unrelated frame `df`).
measure_columns = ['credits', 'dollars', 'cloud_services_credits', 'cloud_services_dollars']
total_row = {'warehouse_name': 'Total'}
for measure in measure_columns:
    total_row[measure] = df_by_wh[measure].sum()
# The per-warehouse rows are already rounded; rounding again only strips
# floating-point noise from the freshly computed totals.
df_by_wh = pd.concat([df_by_wh, pd.DataFrame([total_row])], ignore_index=True).round(2)
print(tabulate(df_by_wh, headers='keys', tablefmt='rounded_outline', showindex=False))

In [None]:
# Remove the row of totals for the plot.
# The row is filtered out by its 'Total' label rather than dropped by position in
# place, so re-running this cell cannot silently remove a real warehouse.
df_by_wh = df_by_wh[df_by_wh['warehouse_name'] != 'Total']

fig = make_subplots(
    rows=1, cols=1,
    specs=[[{"type": "pie"}]],
)

# Single pie: dollars per warehouse
fig.add_trace(
    go.Pie(
        labels=df_by_wh['warehouse_name'].tolist(),
        values=df_by_wh['dollars'].tolist(),
        name='dollars',
        marker_colors=color_scheme,
    ),
    row=1, col=1,
)

fig.update_layout(
    title={
        'text': "Breakdown of total cost by warehouse",
        'y':0.1,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'})
fig.show()

- PROD_WH incurs the highest cost of all warehouses, with a dollar value of 1221.29, followed by DEV_WH with a dollar value of 797.61.
- DAILY_REFRESH_WH’s cost is 347.14 and ML_WH’s cost is 140.47
- CLOUD_SERVICES_ONLY does not incur any costs.



In [None]:
# Sum the numeric columns per (warehouse_name, hourly_start_time) pair.
# numeric_only is passed explicitly as a keyword; the previous positional string
# 'numeric_only' only worked because a non-empty string is truthy.
df_by_wh_ts = (
    df.groupby(['warehouse_name', 'hourly_start_time'])
    .sum(numeric_only=True)
    .reset_index()
)
df_by_wh_ts.head()

In [None]:
## TODO: Investigate why turning off "cloud services only" makes the daily refresh plot jump at some points
# Area chart of credits over time, one coloured band per warehouse
fig = px.area(
    df_by_wh_ts,
    x="hourly_start_time",
    y="credits",
    color="warehouse_name",
    color_discrete_sequence=color_scheme,
)
fig.show()

- The increase in credit usage on the 8th, 9th, and 10th of October relative to other days is due to higher credit consumption by DEV_WH.


## Cost by Partner Tools


In [None]:
# Partner-tool cost time series.
# Uses the notebook-wide analysis window (sdate/edate) instead of the previously
# hard-coded 2022-01-01 .. 2022-02-02 range, so this section covers the same
# period as the rest of the report.
# NOTE(review): confirm the old range was not chosen deliberately.
df = qlib.cost_by_partner_tool_ts(start_date=sdate, end_date=edate)
df.head()

In [None]:
# Sum the numeric columns per client application and round to 2 decimals.
# numeric_only is passed explicitly as a keyword; the previous positional string
# 'numeric_only' only worked because a non-empty string is truthy.
df_by_pt = df.groupby(['client_application_name']).sum(numeric_only=True).reset_index()
df_by_pt = df_by_pt.round(2)

# Append a grand-total row built by column name (instead of a positional list
# stored under an index label taken from the unrelated frame `df`).
total_row = pd.DataFrame([{
    'client_application_name': 'Total',
    'approximate_credits_used': df_by_pt['approximate_credits_used'].sum(),
}])
# The per-application rows are already rounded; rounding again only strips
# floating-point noise from the freshly computed total.
df_by_pt = pd.concat([df_by_pt, total_row], ignore_index=True).round(2)
print(tabulate(df_by_pt, headers='keys', tablefmt='rounded_outline', showindex=False))

In [None]:
# Remove the row of totals for the plot.
# The row is filtered out by its 'Total' label rather than dropped by position in
# place, so re-running this cell cannot silently remove a real application.
df_by_pt = df_by_pt[df_by_pt['client_application_name'] != 'Total']

fig = make_subplots(
    rows=1, cols=1,
    specs=[[{"type": "pie"}]],
)

# Single pie: approximate credits used per client application
fig.add_trace(
    go.Pie(
        labels=df_by_pt['client_application_name'].tolist(),
        values=df_by_pt['approximate_credits_used'].tolist(),
        name='credits',
        marker_colors=color_scheme,
    ),
    row=1, col=1,
)

fig.update_layout(
    title={
        'text': "Breakdown of total cost by partner tools",
        'y':0.1,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'})
fig.show()

In [None]:
# Sum the numeric columns per (client_application_name, hourly_start_time) pair.
# numeric_only is passed explicitly as a keyword; the previous positional string
# 'numeric_only' only worked because a non-empty string is truthy.
df_by_pt_ts = (
    df.groupby(['client_application_name', 'hourly_start_time'])
    .sum(numeric_only=True)
    .reset_index()
)
df_by_pt_ts.head()

In [None]:

# Area chart of approximate credits used over time, one coloured band per client application
fig = px.area(
    df_by_pt_ts,
    x="hourly_start_time",
    y="approximate_credits_used",
    color="client_application_name",
    color_discrete_sequence=color_scheme,
)
fig.show()

In [None]:
#TODO: Implement cost of data transfers query using https://docs.snowflake.com/en/user-guide/cost-exploring-data-transfer.html