In [None]:
%%javascript
// Replace `_should_scroll` on the OutputArea prototype with a function that
// always returns false, whatever line count it is given.
// NOTE(review): presumably this stops long outputs from being placed in a
// scroll box, and it needs the `IPython` front-end global to exist - confirm
// for the notebook front end in use.
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

In [None]:
# Make the directory two levels above the current working directory importable
import sys, pathlib
_two_levels_up = pathlib.Path.cwd().parent.parent
sys.path.append(str(_two_levels_up))
# sys.path

In [None]:
# Suppress all warnings: the 'ignore' action hides them completely (it is not a show-once filter)
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Setting up displays
from IPython.display import display, HTML
# Inject CSS that stretches the `.container` element to the full page width
display(HTML("<style>.container { width:100% !important; }</style>"))
from dash import Dash,html,dcc,Input,Output
# NOTE(review): `app` is not referenced anywhere else in the visible part of this notebook - confirm it is still needed
app = Dash(__name__)
import pandas as pd
# Allow up to 500 rows / 500 columns when a DataFrame is displayed
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from tabulate import tabulate
# Colour sequence shared by the charts below (passed as marker_colors / color_discrete_sequence)
color_scheme=["red","blue","green","orange","purple","brown","pink","gray","olive","cyan","darkviolet","goldenrod","darkgreen","chocolate","lawngreen"]

In [None]:
# Setting up autoreload for libs
# Loads the IPython autoreload extension, selects mode 2 and registers optiml.queries via %aimport,
# so that edits to the library are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2
%aimport optiml.queries

In [None]:
# Initialize connection to Snowflake and set analysis date
from optiml.connection import SnowflakeConnConfig
# NOTE(review): the account and warehouse names are hard-coded here - consider moving them to configuration
connection = SnowflakeConnConfig(accountname='jg84276.us-central1.gcp',warehousename="XSMALL_WH").create_connection()
# Initialize local environment
import os
# Directory handed to SNFLKQuery below (presumably a local cache for query results - confirm in optiml.queries)
cache_dir = os.path.expanduser('~/data/kiva')
# Initialize query library
from optiml.queries import SNFLKQuery
qlib = SNFLKQuery(connection, 'KIV', cache_dir)
# Initialize analysis dates
from datetime import datetime
from dateutil.relativedelta import relativedelta
# Analysis window as 'YYYY-MM-DD' strings; passed to the qlib queries in the cells below
sdate = '2022-09-12'
edate = '2022-10-12'


# Most recent rolling month that we have data for
# print(f"The analysis is carried out for date range {sdate} to {edate}")

# Total cost breakdown 

## Analysis setup
<div class="alert alert-warning">

* Analysis date range: '2022-09-12' to '2022-10-12': last rolling month in the data we collected.

* Type of Snowflake account: Standard Edition

* Credit to dollar conversion: `$`2 per credit

</div>


In [None]:
## Function for date time analysis
##TODO: Move to a library function
def get_previous_dates(sdate, edate, date_shift_months):
    """Shift an analysis window back by a whole number of calendar months.

    Args:
        sdate: Window start as a 'YYYY-MM-DD' string.
        edate: Window end as a 'YYYY-MM-DD' string.
        date_shift_months: Number of months to move both dates back.

    Returns:
        Tuple ``(prev_sdate, prev_edate)`` of 'YYYY-MM-DD' strings.

    Raises:
        ValueError: If either date string does not match 'YYYY-MM-DD'.
    """
    date_format = '%Y-%m-%d'
    shift = relativedelta(months=date_shift_months)

    def _shift_back(date_str):
        # Parse once, subtract the month offset, and format back to the input layout
        return (datetime.strptime(date_str, date_format) - shift).strftime(date_format)

    return _shift_back(sdate), _shift_back(edate)

## Cost by usage category

In [None]:
# Cost breakdown time series for the analysis window
df = qlib.total_cost_breakdown_ts(sdate, edate)
# NOTE(review): fillna is applied to every column, not only category_name - confirm the numeric columns never hold NaN
df = df.fillna('Unassigned')
# Pass numeric_only as a real boolean keyword; the string "numeric_only" only worked because it is truthy
df_by_usage_category = df.groupby("category_name").sum(numeric_only=True).reset_index()
# Append a grand-total row (relies on the column order category_name, credits, dollars)
df_by_usage_category.loc[len(df_by_usage_category.index)] = ['Total', df_by_usage_category['credits'].sum(), df_by_usage_category['dollars'].sum()]
df_by_usage_category = df_by_usage_category.round(2)
print('Credit and dollar usage by category (Current month)')
print('---------------------------------------------------')
print(tabulate(df_by_usage_category, headers='keys', tablefmt='rounded_outline', showindex=False))


In [None]:
## Get usage for previous month as a predictive sanity check
p1_sdate, p1_edate = get_previous_dates(sdate, edate, 1)
df_prev = qlib.total_cost_breakdown_ts(p1_sdate, p1_edate)
# NOTE(review): fillna is applied to every column, not only category_name - confirm the numeric columns never hold NaN
df_prev = df_prev.fillna('Unassigned')
# Pass numeric_only as a real boolean keyword; the string "numeric_only" only worked because it is truthy
df_by_usage_category_prev = df_prev.groupby("category_name").sum(numeric_only=True).reset_index()
# Append a grand-total row (relies on the column order category_name, credits, dollars)
df_by_usage_category_prev.loc[len(df_by_usage_category_prev.index)] = [
    'Total',
    df_by_usage_category_prev['credits'].sum(),
    df_by_usage_category_prev['dollars'].sum(),
]
df_by_usage_category_prev = df_by_usage_category_prev.round(2)
print('Credit and dollar usage by category (Previous month)')
print('----------------------------------------------------')
print(tabulate(df_by_usage_category_prev, headers='keys', tablefmt='rounded_outline', showindex=False))

In [None]:
# Align the two months on category_name before comparing. The two summary frames are not
# guaranteed to list the same categories in the same row order, and a positional subtraction
# would then compare different categories with each other.
current_dollars = df_by_usage_category[["category_name", "dollars"]]
previous_dollars = df_by_usage_category_prev[["category_name", "dollars"]].rename(columns={"dollars": "dollars_prev"})
# A left merge keeps the current month's rows and their order; categories absent last month get NaN
df_change = current_dollars.merge(previous_dollars, on="category_name", how="left")
df_change["percent_change"] = ((df_change["dollars"] - df_change["dollars_prev"]) / df_change["dollars_prev"] * 100).round(2)
df_change = df_change[["category_name", "percent_change"]]
print('Percentage change in dollar usage')
print('---------------------------------')
print(tabulate(df_change, headers='keys', tablefmt='rounded_outline', showindex=False))

In [None]:
# Pie charts for total cost breakdown
## Exclude the 'Total' row by label (not by position) so that re-running this cell never removes a real category
df_by_usage_category = df_by_usage_category[df_by_usage_category["category_name"] != "Total"].reset_index(drop=True)
category_labels = df_by_usage_category['category_name'].tolist()

fig = make_subplots(
    rows=1, cols=2,
    specs=[[{"type": "pie"},{"type": "pie"}]],
    subplot_titles=("Dollars", "Credits")
)

# Left pie: dollars per category; right pie: credits per category
fig.add_trace(go.Pie(labels=category_labels, values=df_by_usage_category['dollars'].tolist(), name="Dollars",
                     rotation=45, marker_colors=color_scheme), row=1, col=1)
fig.add_trace(go.Pie(labels=category_labels, values=df_by_usage_category['credits'].tolist(), name='Credits',
                     rotation=45, marker_colors=color_scheme), row=1, col=2)

fig.update_layout(
    title={
        'text': "Breakdown of total cost by usage category",
        'y':0.95,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'})
fig.show()

## Cost by usage category timeseries

In [None]:
# Build the per-category hourly series before previewing it, so that this cell also works on a
# fresh kernel (the bare name used to be referenced before any cell had defined it)
df_by_category_ts = df.groupby(['category_name','hourly_start_time']).sum(numeric_only=True).reset_index()
df_by_category_ts.head()

In [None]:
# Sum the numeric columns per (category, hour); numeric_only is passed as a real boolean keyword
# because the string 'numeric_only' only worked by being truthy
df_by_category_ts = df.groupby(['category_name','hourly_start_time']).sum(numeric_only=True).reset_index()
# Stacked area chart of dollars per hour, one band per usage category
fig = px.area(df_by_category_ts, x="hourly_start_time", y="dollars", color="category_name",color_discrete_sequence=color_scheme)
fig.update_layout(
    title={
        'text': "Timeseries of cost by usage category",
        'y':0.95,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'},
    xaxis_title="Hourly start time (UTC)",
    yaxis_title="US Dollars"
)
fig.show()

In [None]:
# Rows of the "Compute" category only, with numeric values rounded to 2 decimals
is_compute = df_by_category_ts["category_name"] == "Compute"
df_compute = df_by_category_ts.loc[is_compute].round(2)

In [None]:
# Average over the numeric columns only: df_compute also carries category_name (text) and
# hourly_start_time, and a plain mean() over those raises a TypeError on current pandas
avg_consumption = df_compute.mean(numeric_only=True).round(2)
# Rows with the highest / lowest credits; the constant category label is dropped without mutating in place
max_consumption = df_compute.loc[df_compute['credits'].idxmax()].drop("category_name")
min_consumption = df_compute.loc[df_compute['credits'].idxmin()].drop("category_name")

print('Avg. hourly consumption:')
print('-----------------')
print(avg_consumption)

print('')

print('Max. hourly consumption:')
print('-----------------')
print(max_consumption)

print('')

print('Min. hourly consumption:')
print('-----------------')
print(min_consumption)

### Analysis

<div class="alert alert-info">

#### Compute
* Compute consumes > 95% of the credits budget and > 90% of the dollar budget. This is as expected.  
* The average hourly compute consumption is Credits: 4.03, Dollars: `$`8.06, with usage peaks typically at 9 am and 2 pm with a period max of Credits: 10.09, Dollars: `$` 20.18. 
* The typical compute through the night is non-zero at ~ 3 credits. 
* The nighttime compute consumption is flat until the 23rd but becomes noisy starting on the 24th.
* There is an increase of ~30% in nighttime consumption on the 9th and 10th, which subsequently reverts to the usual pattern.
* The 2 pm spike in credit usage becomes very prominent on some days - can this be tied to any specific usage?

#### Cloud Services
* Cloud services in aggregate consume < 10% of the budget - which is consistent with no over charges if this is true for each warehouse on an hourly basis.
* The overall timeseries for credit consumption of cloud services is also < 10% of compute.

#### Storage
* Storage cost is a small fraction of the `$` budget. Storage is charged by flat `$` amount per TB starting at `$`23/TB. For the timeseries we have spread that evenly through the day.
* Storage costs are almost flat during the period.
    
#### General trends
* Month over month:
    * Total ~ +1.4%
    * Compute ~ +2%
    * Cloud Services ~ -7.5%
    * Storage ~ -4.5%
* Since Compute is the majority of the expense, even a small increase in it may have a significant impact.
* This is very basic predictive analysis - more complex analyses can be made available in the pilot    

</div>

### Actions and Recommendations

<div class="alert alert-success">

* Analyze Cloud Services credit consumption on an hourly basis at the warehouse level to make sure there are no hourly overages for any warehouse.
* Investigate noisy night time usage after Sept 23.
* Investigate increased baseline consumption that occurs on 10th and 11th Oct.
* Evaluate which queries and tasks are driving the spikes in credits and whether the spikes can be tied to individual queries.
* Evaluate the system for compute bottlenecks and queue lengths (for warehouses, tasks, and queries) at 9 am and 2 pm, when usage spikes.
* Based on cost we expect ~10 TB storage is being used. Verify this number for consistency.
* Set up a time varying resource monitor on the account level based on these usage patterns to flag any anomalous usage. Usage monitoring set naively:
    * Will not be proactive about cost containment 
    * Can generate false positives/negatives causing nuisance alerts/shutdowns. 
* Evaluate opportunities for savings through:
    * Dithering peak workloads to alternate times.
    * Reducing weekend consumption 
    * Dithering jobs to or reducing night time consumption
    
</div>

## Cost by user

In [None]:
# Rebinds `df` to the per-user cost time series for the analysis window
# (it replaces the usage-category frame held in `df` until now)
df = qlib.cost_by_user_ts(sdate, edate)
# df.head()

In [None]:
# Total credits per user; numeric_only is passed as a real boolean keyword because the string
# 'numeric_only' only worked by being truthy
df_by_user = df.groupby(['user_name']).sum(numeric_only=True).reset_index()
df_by_user = df_by_user.round(2)
# Append a grand-total row (the summary has exactly two columns: user_name, approximate_credits_used)
df_by_user.loc[len(df_by_user.index)] = ['Total', df_by_user['approximate_credits_used'].sum()]
print('Credit and dollar usage by user (Current month)')
print('-----------------------------------------------')
print(tabulate(df_by_user, headers='keys', tablefmt='rounded_outline', showindex=False))

In [None]:
# Each user's share of the grand total (taken from the 'Total' row), in percent with 3 decimals
total_credits = df_by_user.loc[df_by_user["user_name"] == "Total", "approximate_credits_used"].values[0]
df_by_user["percent_usage"] = (df_by_user["approximate_credits_used"] / total_credits * 100).round(3)
# Users below 1% are listed separately; `x` keeps the column-wise sum of their numeric values
# for the consolidated row that a later cell builds
low_usage_mask = df_by_user["percent_usage"] < 1.00
x = df_by_user.loc[low_usage_mask].sum(axis=0, numeric_only=True)
df_low_usage_users = df_by_user.loc[low_usage_mask].reset_index(drop=True)
print('Credit and dollar for low usage users (Current month)')
print('-----------------------------------------------------')
print(tabulate(df_low_usage_users, headers='keys', tablefmt='rounded_outline', showindex=False))

In [None]:
# Keep users at or above the 1% threshold. The low-usage table is built with `< 1.00`, so `>=`
# is required here: with `>` a user at exactly 1.00% would appear in neither table.
df_by_user = df_by_user.loc[df_by_user["percent_usage"] >= 1.00].reset_index(drop=True)
# One consolidated row for all low-usage users (`x` holds their summed credits and percent usage)
low_usage_row = pd.DataFrame([{
    "user_name": "Low_usage_users",
    "approximate_credits_used": x["approximate_credits_used"],
    "percent_usage": x["percent_usage"],
}])
# Place the consolidated row just before the final 'Total' row; concat replaces the former
# fractional-index insertion followed by sort_index
df_by_user = pd.concat([df_by_user.iloc[:-1], low_usage_row, df_by_user.iloc[-1:]], ignore_index=True)
print('Credit and dollar usage by user with low usage users consolidated (Current month)')
print('---------------------------------------------------------------------------------')
print(tabulate(df_by_user, headers='keys', tablefmt='rounded_outline', showindex=False))

In [None]:
# Exclude the 'Total' row by label (not by position) so that re-running this cell never removes a real user
df_by_user = df_by_user[df_by_user["user_name"] != "Total"]
fig = make_subplots(
    rows=1, cols=1,
    specs=[[{"type": "pie"}]],
    # One-element tuple: ("Credits") without the trailing comma is a plain string, while
    # make_subplots expects a sequence holding one title per subplot
    subplot_titles=("Credits",)
)

fig.add_trace(go.Pie(labels=df_by_user['user_name'].tolist(), values=df_by_user['approximate_credits_used'].tolist(),name="Credits", rotation=270,marker_colors=color_scheme),row=1,col=1)

fig.update_layout(
    title={
        'text': "Breakdown of total cost by user",
        'y':0.1,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'bottom'})
fig.show()

In [None]:
# Credits per (user, hour); numeric_only is passed as a real boolean keyword
df_by_user_ts = df.groupby(['user_name','hourly_start_time']).sum(numeric_only=True).reset_index()
# Leave out the users that were reported in the low-usage table
df_by_user_ts = df_by_user_ts[~df_by_user_ts.user_name.isin(df_low_usage_users["user_name"].values)]
# reset_index returns a new frame; the result has to be assigned or the call has no effect
df_by_user_ts = df_by_user_ts.reset_index(drop=True)
fig = px.area(df_by_user_ts, x="hourly_start_time", y="approximate_credits_used", color="user_name",color_discrete_sequence=color_scheme)
fig.update_layout(
    title={
        'text': "Timeseries of cost by user",
        'y':0.95,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'},
    xaxis_title="Hourly start time (UTC)",
    yaxis_title="Credits used (approx.)"
)
fig.show()

### Analysis
<div class="alert alert-info">

* User FIVETRAN_USER incurs ~ 36% of the costs at ~ 1086 credits.
    * FIVETRAN_USER had an 80% consumption increase on Oct. 1. This was visible as a sharp peak in total consumption increase.
* Users ML_SERVICE_DEV and DBT_PROD have next highest consumption at ~ 14.4% and ~ 430 credits.                                              
* Following users are extremely bursty in their usage and should be dithered if possible:
    * DBT_DEV
    * DBT_PROD
* FIVETRAN_USER_DEV, ML_SERVICE_DEV, VERTEX_API_PROD show a sharp increase in usage of ~ 200%, Oct 8th-10th which correlates well with increase in total usage 
* Increased noisiness starting 24th Sept. is due to correlated usage between FIVETRAN_USER and ML_SERVICE_PROD                                             
</div>

### Actions and Recommendations
<div class="alert alert-success">

* Similar trend analysis as total cost by usage can be provided for each user and especially for high usage users to capture increases in cost trends and set monitors against
* FIVETRAN_USER:
    * Has high usage which is mostly uniformly spread through the day
    * If most queries for FIVETRAN_USER are similar size/cost/credit consumption (##TODO: Optiml, Query analysis notebook) it might warrant having a separate warehouse for it
    * Create resource monitor to warn of 80% usage increase for the user on Oct. 1. Further analyze the causal driver.
* Low_usage_users should be analyzed for activity. If they are inactive they should be granted reduced privileges or removed to avoid security issues (##TODO: Optiml, User analysis notebook) 
* Tagging should be made available on queries from biggest users for better cost attribution.
* Set up a time varying resource monitor at account and user level based on usage patterns to flag any anomalous usage.    
    
</div>

## Cost by warehouse

In [None]:
# Returns results only for ACCOUNTADMIN role or any other role that has been granted MONITOR USAGE global privilege
# So results are consistent with Greg's usage
# Rebinds `df` to the per-warehouse cost time series for the analysis window
df = qlib.cost_by_wh_ts(sdate, edate)
# df.head()

In [None]:
# Totals per warehouse; numeric_only is passed as a real boolean keyword because the string
# 'numeric_only' only worked by being truthy
df_by_wh = df.groupby(['warehouse_name']).sum(numeric_only=True).reset_index()
df_by_wh = df_by_wh.round(2)
# Append the grand-total row at the next free position of the summary frame itself; the label
# used to be derived from the length of the raw `df`, unlike the other summary tables.
# Relies on the column order: warehouse_name, credits, dollars, cloud_services_credits, cloud_services_dollars
df_by_wh.loc[len(df_by_wh.index)] = [
    'Total',
    df_by_wh['credits'].sum(),
    df_by_wh['dollars'].sum(),
    df_by_wh['cloud_services_credits'].sum(),
    df_by_wh['cloud_services_dollars'].sum(),
]
print('Credit and dollar usage overall and for cloud services by warehouse (Current month)')
print('-----------------------------------------------------------------------------------')
print(tabulate(df_by_wh, headers='keys', tablefmt='rounded_outline', showindex=False))

In [None]:
# Remove the row of totals for the plot. It is selected by label rather than by position so
# that re-running this cell never removes a real warehouse.
df_by_wh = df_by_wh[df_by_wh["warehouse_name"] != "Total"]

fig = make_subplots(
    rows=1, cols=1,
    specs=[[{"type": "pie"}]],
)

fig.add_trace(go.Pie(labels=df_by_wh['warehouse_name'].tolist(), values=df_by_wh['dollars'].tolist(),name='dollars',marker_colors=color_scheme),row=1,col=1)

fig.update_layout(
    title={
        'text': "Breakdown of total cost by warehouse",
        'y':0.1,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'})
fig.show()

In [None]:
# Numeric totals per (warehouse, hour); numeric_only is passed as a real boolean keyword
# because the string 'numeric_only' only worked by being truthy
df_by_wh_ts = df.groupby(['warehouse_name','hourly_start_time']).sum(numeric_only=True).reset_index()
# df_by_wh_ts.head()

In [None]:
##TODO: Investigate why turning off cloud services only makes daily refresh plot jump in some points
# Stacked area chart of credits per hour, one band per warehouse
fig = px.area(
    df_by_wh_ts,
    x="hourly_start_time",
    y="credits",
    color="warehouse_name",
    color_discrete_sequence=color_scheme,
)
wh_ts_title = dict(text="Timeseries of cost by warehouse", y=0.95, x=0.5, xanchor='center', yanchor='top')
fig.update_layout(title=wh_ts_title, xaxis_title="Hourly start time (UTC)", yaxis_title="Credits used")
fig.show()

### Analysis
<div class="alert alert-info">    
  
* PROD_WH incurs the highest share of the costs (> 50%) at ~ 3110 credits.
* DEV_WH consumes next highest ~30% at ~ 1872 credits.
* DAILY_REFRESH_WH and ML_WH consume 14.6% and 3.6% respectively.

* The increased credit consumption from 8th-10th Oct. is completely in DEV_WH where credit consumption increases by 200%. 
    * This increase is similar in magnitude to 11-2 pm everyday for this WH - investigate if it was the same process run intentionally or not.
    
* PROD_WH shows a +200% increase on 1st Oct. from 1-5 pm which shows as a spike in other views as well but was not as prominent.
    
* Fewer credits consumed by DAILY_REFRESH_WH on Monday and Tuesday - would have expected that to be for weekend.
    
* Increased noisiness starting 24th is almost entirely coming from ML_WH. This is consistent with users we identified as generating the noisiness in cost by user.
    
</div>

### Actions and Recommendations

<div class="alert alert-success">
    
* Similar trend analysis as total cost by usage can be provided for each warehouse and especially for high usage warehouses to capture increases in cost trends and set monitors against.
* For better visibility grant MONITOR USAGE global privilege to all users. (Or look for alternatives to make warehouse credit consumption consistent with total credit consumption).
* Set WH level alerts for each WH - especially for DEV_WH and PROD_WH.
* DEV_WH and DAILY_REFRESH_WH have very bursty usage - consider dithering workloads. 
* DEV_WH is constantly using credits at night - warrants further investigation. (##TODO: Optiml, Warehouse analysis notebook) 
* Tag jobs by function in PROD_WH and DEV_WH.
* Set up a time varying resource monitor at warehouse level based on usage patterns to flag any anomalous usage.
* Investigate increased noisiness in the ML_WH starting 24th Sept.    
    
</div>

## Cost by Partner Tools


In [None]:
# Rebinds `df` to the per-partner-tool cost time series for the analysis window
df=qlib.cost_by_partner_tool_ts(sdate, edate)
# NOTE(review): the commented-out export below targets a user-specific absolute path
# df.to_csv('/home/manas/DS_data/partner_tools.csv')

In [None]:
# Total credits per client application; numeric_only is passed as a real boolean keyword
# because the string 'numeric_only' only worked by being truthy
df_by_pt = df.groupby(['client_application_name']).sum(numeric_only=True).reset_index()
df_by_pt = df_by_pt.round(2)
# Append the grand-total row at the next free position of the summary frame itself; the label
# used to be derived from the length of the raw `df`, unlike the other summary tables
df_by_pt.loc[len(df_by_pt.index)] = ['Total', df_by_pt['approximate_credits_used'].sum()]
print(tabulate(df_by_pt, headers='keys', tablefmt='rounded_outline', showindex=False))

In [None]:
# Remove the row of totals for the plot. It is selected by label rather than by position so
# that re-running this cell never removes a real client application.
df_by_pt = df_by_pt[df_by_pt["client_application_name"] != "Total"]

fig = make_subplots(
    rows=1, cols=1,
    specs=[[{"type": "pie"}]],
)

fig.add_trace(go.Pie(labels=df_by_pt['client_application_name'].tolist(), values=df_by_pt['approximate_credits_used'].tolist(),name='credits',marker_colors=color_scheme, rotation=45),row=1,col=1)

fig.update_layout(
    title={
        'text': "Breakdown of total cost by partner tools",
        'y':0.1,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'})
fig.show()

In [None]:
# Numeric totals per (client application, hour); numeric_only is passed as a real boolean
# keyword because the string 'numeric_only' only worked by being truthy
df_by_pt_ts = df.groupby(['client_application_name','hourly_start_time']).sum(numeric_only=True).reset_index()
# df_by_pt_ts.head()

In [None]:
# Stacked area chart of approximate credits per hour, one band per client application
fig = px.area(
    df_by_pt_ts,
    x="hourly_start_time",
    y="approximate_credits_used",
    color="client_application_name",
    color_discrete_sequence=color_scheme,
)
pt_ts_title = dict(text="Timeseries of cost by partner tools", y=0.95, x=0.5, xanchor='center', yanchor='top')
fig.update_layout(title=pt_ts_title, xaxis_title="Hourly start time (UTC)", yaxis_title="Credits used (approx.)")
fig.show()

In [None]:
# Take titles and frames from one and the same groupby so that position i always refers to the
# same warehouse in both lists. Grouping by the plain column name (not a one-element list)
# keeps the group keys as scalars, which are usable directly as plot titles.
wh_groups = list(df.groupby("warehouse_name"))
df_titles = [wh_name for wh_name, _ in wh_groups]
df_warehouse = [wh_frame for _, wh_frame in wh_groups]

In [None]:
# One stacked area chart per warehouse, titled with the entry of df_titles at the same position
for wh_pos, wh_frame in enumerate(df_warehouse):
    fig = px.area(
        wh_frame,
        x="hourly_start_time",
        y="approximate_credits_used",
        color="client_application_name",
        color_discrete_sequence=color_scheme,
        title=df_titles[wh_pos],
    )

    fig.show()

### Analysis
<div class="alert alert-info">
    
* Tools connecting through the JDBC driver consume the most credits: ~1477, almost 50% of the total.
* 1st Oct. usage spike is correlated with FIVETRAN_USER also using JDBC driver on PROD_WH  
* Increased usage on 9th and 10th Oct. on DEV_WH is correlated with Python, JDBC and ODBC usage as follows based on magnitude of changes:
    * Python: Correlates with username ML_SERVICE_DEV
    * JDBC & ODBC: Correlates with username FIVETRAN_USER_DEV and VERTEX_API_PROD
    
</div>

### Actions and Recommendations

<div class="alert alert-success">

* Similar trend analysis as total cost by usage can be provided for each partner tool and especially for high usage tools to capture increases in cost trends and set monitors against.
* Tag jobs associated with each partner tool better to discern which physical tool is connecting to optimize usage across entire lineage
* Set up a time varying resource monitor at warehouse level based on usage patterns to flag any anomalous usage.
    
</div>

## Cost of data transfers: 

### Further Actions

<div class="alert alert-warning">

* Implement cost of data transfers query using https://docs.snowflake.com/en/user-guide/cost-exploring-data-transfer.html
    * Will need access to:
        * DATA_TRANSFER_DAILY_HISTORY
        * DATA_TRANSFER_HISTORY
        * ORGANIZATION_USAGE ACCOUNT_USAGE
        * DATABASE_REPLICATION_USAGE_HISTORY
        * REPLICATION_USAGE_HISTORY
        * REPLICATION_GROUP_USAGE_HISTORY
        * USAGE_IN_CURRENCY_DAILY
    
</div>