In [None]:
%%javascript
// Replace the output area's _should_scroll hook with one that always answers
// "no", regardless of how many lines the output has.
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

In [None]:
# Adding system path
# Appends the directory two levels above the current working directory to
# sys.path (presumably the repository root so that `optiml` is importable — confirm).
import sys, pathlib
sys.path.append(str(pathlib.Path.cwd().parent.parent))
# sys.path

In [None]:
# Suppress all warnings for the rest of the notebook.
# NOTE(review): 'ignore' hides every warning entirely; if the intent is to show
# each warning only once, the filter action should be 'once' instead.
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Setting up displays
from IPython.display import display, HTML
# Inject CSS that stretches the notebook container to the full page width.
display(HTML("<style>.container { width:100% !important; }</style>"))
from dash import Dash,html,dcc,Input,Output
# NOTE(review): `app` is not referenced by any other visible cell — confirm it is still needed.
app = Dash(__name__)
import pandas as pd
# Let pandas render up to 500 rows and 500 columns before truncating.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from tabulate import tabulate
# Shared colour sequence passed to the pie and area charts in this notebook.
color_scheme=["red","blue","green","orange","purple","brown","pink","gray","olive","cyan","darkviolet","goldenrod","darkgreen","chocolate","lawngreen"]

In [None]:
# Initialize connection to Snowflake and set analysis date
from optiml.connection import SnowflakeConnConfig
connection = SnowflakeConnConfig(accountname='jg84276.us-central1.gcp',warehousename="XSMALL_WH").create_connection()
# Initialize local environment
import os
# Cache directory (under the current user's home) handed to CostProfile below.
cache_dir = os.path.expanduser('~/data/knot')
# Initialize query library
from optiml.backend.cost_profile import CostProfile, get_previous_dates
# NOTE(review): 'KNT' and "enterprise" are passed positionally; presumably a
# customer/schema key and the Snowflake edition — confirm against CostProfile's signature.
cqlib = CostProfile(connection, 'KNT', cache_dir, "enterprise")

# Analysis window; (sdate, edate) is passed to every cqlib query in this notebook.
# sdate = '2022-09-21'
# edate = '2022-10-21'
sdate = '2022-10-11'
edate = '2022-10-21'

In [None]:
# Setting up autoreload for libs
# Loads IPython's autoreload extension, selects mode 2, and registers
# optiml.backend.cost_profile with %aimport.
%load_ext autoreload
%autoreload 2
%aimport optiml.backend.cost_profile

# Total cost breakdown 

## Analysis setup
<div class="alert alert-warning">

* Analysis date range: '2022-10-11' to '2022-10-21': last rolling month in the data we collected.

* (Assumption) on type of Snowflake account: Enterprise Edition

* (Assumption) Credit to dollar conversion: `$`3 per credit

</div>


## Cost by usage category

In [None]:
# Credit and dollar totals per usage category for the analysis window.
df = cqlib.total_cost_breakdown_ts(sdate, edate)
# Missing values are replaced with an explicit 'Unassigned' label.
df = df.fillna('Unassigned')
# numeric_only must be a boolean; the string "numeric_only" only worked because
# any non-empty string is truthy.
df_by_usage_category = df.groupby("category_name").sum(numeric_only=True).reset_index()
# Append a grand-total row. This assumes the aggregated frame has exactly the
# columns category_name, credits, dollars (in that order).
df_by_usage_category.loc[len(df_by_usage_category.index)] = ['Total', df_by_usage_category['credits'].sum(), df_by_usage_category['dollars'].sum()]
df_by_usage_category = df_by_usage_category.round(2)
print('Credit and dollar usage by category (Current month)')
print('---------------------------------------------------')
print(tabulate(df_by_usage_category, headers='keys', tablefmt='rounded_outline', showindex=False))


In [None]:
## Get usage for previous month as a predictive sanity check
# get_previous_dates is asked for the window one period before (sdate, edate).
p1_sdate, p1_edate = get_previous_dates(sdate, edate, 1)
df_prev = cqlib.total_cost_breakdown_ts(p1_sdate, p1_edate)
# Missing values are replaced with an explicit 'Unassigned' label.
df_prev = df_prev.fillna('Unassigned')
# numeric_only must be a boolean; the string "numeric_only" only worked because
# any non-empty string is truthy.
df_by_usage_category_prev = df_prev.groupby("category_name").sum(numeric_only=True).reset_index()
# Append a grand-total row. This assumes the aggregated frame has exactly the
# columns category_name, credits, dollars (in that order).
df_by_usage_category_prev.loc[len(df_by_usage_category_prev.index)] = ['Total', df_by_usage_category_prev['credits'].sum(),
                                                                       df_by_usage_category_prev['dollars'].sum()]
df_by_usage_category_prev = df_by_usage_category_prev.round(2)
print('Credit and dollar usage by category (Previous month)')
print('----------------------------------------------------')
print(tabulate(df_by_usage_category_prev, headers='keys', tablefmt='rounded_outline', showindex=False))

In [None]:
# Percentage change in dollars per category, current period vs previous period.
# Rows are matched on category_name rather than on row position, so a category
# that appears in only one of the two periods cannot shift the comparison onto
# the wrong row; such a category gets NaN instead of a misleading number.
df_change = df_by_usage_category[["category_name", "dollars"]].merge(
    df_by_usage_category_prev[["category_name", "dollars"]],
    on="category_name",
    how="left",
    suffixes=("", "_prev"),
)
df_change["percent_change"] = ((df_change["dollars"] - df_change["dollars_prev"])/df_change["dollars_prev"]*100).round(2)
# Keep only the two columns the table is meant to show.
df_change = df_change[["category_name", "percent_change"]]
print('Percentage change in dollar usage')
print('---------------------------------')
print(tabulate(df_change, headers='keys', tablefmt='rounded_outline', showindex=False))

In [None]:
# Pie charts for total cost breakdown
## Remove the row of totals for the plot.
## The row is selected by its label instead of by position so that re-running
## this cell does not drop a real category each time.
df_by_usage_category = df_by_usage_category[df_by_usage_category["category_name"] != "Total"].reset_index(drop=True)
fig = make_subplots(
    rows=1, cols=2,
    specs=[[{"type": "pie"},{"type": "pie"}]],
    subplot_titles=("Dollars", "Credits")
)

# Left pie: dollars per category; right pie: credits per category.
fig.add_trace(go.Pie(labels=df_by_usage_category['category_name'].tolist(), values=df_by_usage_category['dollars'].tolist(),name="Dollars",
                     rotation=45, marker_colors=color_scheme),row=1,col=1)
fig.add_trace(go.Pie(labels=df_by_usage_category['category_name'].tolist(), values=df_by_usage_category['credits'].tolist(),name='Credits',
                     rotation=45, marker_colors=color_scheme),row=1,col=2)

fig.update_layout(
    title={
        'text': "Breakdown of total cost by usage category",
        'y':0.95,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'})
fig.show()

## Cost by usage category timeseries

In [None]:
# Hourly totals per usage category, drawn as a stacked area chart of dollars.
# numeric_only must be a boolean; the string "numeric_only" only worked because
# any non-empty string is truthy.
df_by_category_ts = df.groupby(['category_name','hourly_start_time']).sum(numeric_only=True).reset_index()
fig = px.area(df_by_category_ts, x="hourly_start_time", y="dollars", color="category_name",color_discrete_sequence=color_scheme)
fig.update_layout(
    title={
        'text': "Timeseries of cost by usage category",
        'y':0.95,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'},
    xaxis_title="Hourly start time (UTC)",
    yaxis_title="US Dollars"
)
fig.show()

In [None]:
# Collect the start_time column of the cost breakdown frame as a plain list.
# NOTE(review): `st` is not referenced by any other visible cell — confirm this
# scratch cell is still needed.
st = df["start_time"].to_list()

In [None]:
# NOTE(review): scratch cell disabled. `etime` is never assigned anywhere in the
# visible notebook, so evaluating it raises NameError on Restart & Run All.
# Re-enable once the variable is defined, or delete the cell.
# etime

In [None]:
# NOTE(review): scratch cell disabled. `get_queries`, `start_time` and `end_time`
# are not defined or imported anywhere in the visible notebook, so this raises
# NameError on Restart & Run All. The original loop also iterated over the
# 2-tuple (start_time, end_time) itself instead of pairing their elements;
# the intended form, once the names exist, is:
# queries = [get_queries(s, e) for s, e in zip(start_time, end_time)]

In [None]:
# Keep only the hourly rows of the "Compute" category, rounded to 2 decimals.
df_compute = df_by_category_ts[df_by_category_ts["category_name"] == "Compute"].round(2)

In [None]:
# Summary statistics of hourly Compute consumption.
# df_compute still carries the string column category_name and the timestamp
# column hourly_start_time; restricting the mean to numeric columns avoids the
# TypeError that DataFrame.mean() raises on non-numeric columns in pandas >= 2.0.
avg_consumption = df_compute.mean(numeric_only=True).round(2)
# Rows with the highest / lowest hourly credits. drop() returns a new Series, so
# the row taken from df_compute is not modified in place.
max_consumption = df_compute.loc[df_compute['credits'].idxmax()].drop("category_name")
min_consumption = df_compute.loc[df_compute['credits'].idxmin()].drop("category_name")

print('Avg. hourly consumption:')
print('-----------------')
print(avg_consumption)

print('')

print('Max. hourly consumption:')
print('-----------------')
print(max_consumption)

print('')

print('Min. hourly consumption:')
print('-----------------')
print(min_consumption)

### Analysis & recommendations

<div class="alert alert-info">

#### Compute
* Compute consumes ~75% of the credits budget and > 65% of the dollar budget. This is as expected - compared to other datasets seems a little low.
* The average hourly compute consumption is Credits: 7.93, Dollars: `$`15.86, with usage peaks typically 5-9 am with a period max of Credits: 17.88, Dollars: `$` 35.76. 
* At off-peak compute times (11 am - 4 am) credit consumption is 15-20 credits. 
* There is a decline in usage trends from beginning of the analysis period (11th Oct) which is led by cloud services consumption - an 80% decline.

#### Cloud Services
* Cloud services credit consumption is > 15% of compute. It should be lower than 10%, so there are definitely overages here and it should be explored how to reduce this. 
* Reduced usage since beginning of the pay period but overall still a significant fraction of compute which would result in overages.
    
#### Snowpipe
* Snowpipe usage is high - is there a slower cadence at which data can be made available in tables, or is it required to have it available with minutes of latency every time it's refreshed?

#### Storage
* Storage is ~8-9% of the cost.
* Next steps are to break it down by:
    * Storage bytes
    * Staging bytes
    * Failsafe bytes
    
#### General trends
* Month over month:
    * Autoclustering              31.53
    * Cloud services              34.77
    * Compute                      6.06
    * Snowpipe                   -24.56
    * Storage                      0.42
    * Total                        5.11

* Since Compute is majority of the expense a smaller increase in it may have a significant impact
* Significant increase in cloud services and autoclustering. 
    * Analyze cloud services to see what's driving the cost
    * ROI on enabling autoclustering should be analyzed
      
* This is very basic predictive analysis - more complex analyses can be made available in the pilot 
* Load is well spread out. There may be scope for dithering some workload from 11 am - 3 am
    
* Set up a time varying resource monitor on the account level based on these usage patterns to flag any anomalous usage. Usage monitoring set naively:
    * Will not be proactive about cost containment
    * Can generate false positives/negatives causing nuisance alerts/shutdowns.
* Evaluate opportunities for savings through:
    * Dithering peak workloads to off-peak
    * Reducing weekend consumption

</div>

## Cost by user

In [None]:
# Load the per-user cost timeseries for the analysis window.
# NOTE: this rebinds `df`, which previously held the usage-category breakdown.
df = cqlib.cost_by_user_ts(sdate, edate)
# df.head()

In [None]:
# Total approximate credits per user over the analysis window.
# numeric_only must be a boolean; the string "numeric_only" only worked because
# any non-empty string is truthy.
df_by_user = df.groupby(['user_name']).sum(numeric_only=True).reset_index()
df_by_user = df_by_user.round(2)
# Append a grand-total row. This assumes the aggregated frame has exactly the
# columns user_name, approximate_credits_used (in that order).
df_by_user.loc[len(df_by_user.index)] = ['Total', df_by_user['approximate_credits_used'].sum()]
print('Credit and dollar usage by user (Current month)')
print('-----------------------------------------------')
print(tabulate(df_by_user, headers='keys', tablefmt='rounded_outline', showindex=False))

In [None]:
# Express each user's credits as a percentage of the 'Total' row, then list the
# users whose share is below 1%.
total_credits = df_by_user.loc[df_by_user["user_name"] == "Total", "approximate_credits_used"].values[0]
df_by_user["percent_usage"] = (df_by_user["approximate_credits_used"] / total_credits * 100).round(3)

low_usage_mask = df_by_user["percent_usage"] < 1.00
# `x` holds the column sums over the low-usage users; the next cell reads it.
x = df_by_user.loc[low_usage_mask].sum(axis=0, numeric_only=True)
df_low_usage_users = df_by_user.loc[low_usage_mask].reset_index(drop=True)

print('Credit and dollar for low usage users (Current month)')
print('-----------------------------------------------------')
print(
    tabulate(
        df_low_usage_users,
        headers='keys',
        tablefmt='rounded_outline',
        showindex=False,
    )
)

In [None]:
# Keep users at or above 1% and fold everyone below 1% into one 'Low_usage_users' row.
# The low-usage table is built with `< 1.00`, so the complement must be `>= 1.00`;
# with a strict `>` a user at exactly 1.00% would vanish from both tables.
df_by_user = df_by_user.loc[df_by_user["percent_usage"] >= 1.00].reset_index(drop=True)
# The fractional label (last index - 0.5) sorts between the last real user and the
# trailing 'Total' row, so after sort_index the new row lands just before 'Total'.
df_by_user.loc[len(df_by_user)-1.5] = ["Low_usage_users", x["approximate_credits_used"], x["percent_usage"]]
df_by_user = df_by_user.sort_index().reset_index(drop=True)
print('Credit and dollar usage by user with low usage users consolidated (Current month)')
print('---------------------------------------------------------------------------------')
print(tabulate(df_by_user, headers='keys', tablefmt='rounded_outline', showindex=False))

In [None]:
# Pie chart of credits per user (heavy users plus the consolidated low-usage slice).
# The 'Total' row is excluded by label rather than by dropping the last row in
# place, so re-running this cell does not remove a real row each time.
df_by_user = df_by_user[df_by_user["user_name"] != "Total"]
fig = make_subplots(
    rows=1, cols=1,
    specs=[[{"type": "pie"}]],
    # Must be a sequence of titles: ("Credits") without the trailing comma is
    # just the string "Credits", not a one-element tuple.
    subplot_titles=("Credits",)
)

fig.add_trace(go.Pie(labels=df_by_user['user_name'].tolist(), values=df_by_user['approximate_credits_used'].tolist(),name="Credits", rotation=270,marker_colors=color_scheme),row=1,col=1)

fig.update_layout(
    title={
        'text': "Breakdown of total cost by user",
        'y':0.1,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'bottom'})
fig.show()

In [None]:
# Hourly credits per user, excluding the users listed in df_low_usage_users.
# numeric_only must be a boolean; the string "numeric_only" only worked because
# any non-empty string is truthy.
df_by_user_ts = df.groupby(['user_name','hourly_start_time']).sum(numeric_only=True).reset_index()
df_by_user_ts = df_by_user_ts[~df_by_user_ts.user_name.isin(df_low_usage_users["user_name"].values)]
# reset_index returns a new frame; the result was previously discarded, which
# made the call a no-op.
df_by_user_ts = df_by_user_ts.reset_index(drop=True)
fig = px.area(df_by_user_ts, x="hourly_start_time", y="approximate_credits_used", color="user_name",color_discrete_sequence=color_scheme)
fig.update_layout(
    title={
        'text': "Timeseries of cost by user",
        'y':0.95,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'},
    xaxis_title="Hourly start time (UTC)",
    yaxis_title="Credits used (approx.)"
)
fig.show()

### Analysis
<div class="alert alert-info">

* APIUSER_WP, BIPROCESS and GP_ETL_USER are the majority users consuming ~ 75% of the overall credits
* APIUSER_WP usage seems correlated with Cloud Services use and should be examined to see if that is driving Cloud Services costs
* Following users are bursty in usage and it should be explored if the peak usage can be dithered to off-peak times:
    * BI_PROCESS 
    * GP_ETL_USER
    * SVCPRDSFMYACCT
* Explore Low_usage_users - are they users that use frequently or are idle/not with the organization anymore (security threat)
* Tagging should be made available on queries from biggest users for better cost attribution
            
</div>

## Cost by warehouse

In [None]:
# Returns results only for ACCOUNTADMIN role or any other role that has been granted MONITOR USAGE global privilege
# So results are consistent with Greg's usage
# NOTE: this rebinds `df`, which previously held the per-user timeseries.
df = cqlib.cost_by_wh_ts(sdate, edate)
# df.head()

In [None]:
# Total credits/dollars (overall and cloud-services) per warehouse.
# numeric_only must be a boolean; the string "numeric_only" only worked because
# any non-empty string is truthy.
df_by_wh = df.groupby(['warehouse_name']).sum(numeric_only=True).reset_index()
df_by_wh = df_by_wh.round(2)
# Append a grand-total row at the next free position of the aggregated frame.
# The label was previously derived from the raw timeseries `df`, unlike the
# other summary tables in this notebook. The row assumes the column order
# warehouse_name, credits, dollars, cloud_services_credits, cloud_services_dollars.
df_by_wh.loc[len(df_by_wh.index)] = ['Total', df_by_wh['credits'].sum(), df_by_wh['dollars'].sum(),  df_by_wh['cloud_services_credits'].sum(), df_by_wh['cloud_services_dollars'].sum()]
print('Credit and dollar usage overall and for cloud services by warehouse (Current month)')
print('-----------------------------------------------------------------------------------')
print(tabulate(df_by_wh, headers='keys', tablefmt='rounded_outline', showindex=False))

In [None]:
# Remove the row of totals for the plot.
# The row is selected by its label rather than dropped by position in place, so
# re-running this cell does not remove a real warehouse each time.
df_by_wh = df_by_wh[df_by_wh['warehouse_name'] != 'Total']

fig = make_subplots(
    rows=1, cols=1,
    specs=[[{"type": "pie"}]],
)

# Single pie: dollars per warehouse.
fig.add_trace(go.Pie(labels=df_by_wh['warehouse_name'].tolist(), values=df_by_wh['dollars'].tolist(),name='dollars',marker_colors=color_scheme),row=1,col=1)

fig.update_layout(
    title={
        'text': "Breakdown of total cost by warehouse",
        'y':0.1,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'})
fig.show()

In [None]:
# Hourly totals per warehouse.
# numeric_only must be a boolean; the string "numeric_only" only worked because
# any non-empty string is truthy.
df_by_wh_ts = df.groupby(['warehouse_name','hourly_start_time']).sum(numeric_only=True).reset_index()
# df_by_wh_ts.head()

In [None]:
##TODO: Investigate why turning off cloud services only makes daily refresh plot jump in some points
# Stacked area chart of hourly credits, one band per warehouse.
wh_ts_title = {
    'text': "Timeseries of cost by warehouse",
    'y': 0.95,
    'x': 0.5,
    'xanchor': 'center',
    'yanchor': 'top',
}
fig = px.area(
    df_by_wh_ts,
    x="hourly_start_time",
    y="credits",
    color="warehouse_name",
    color_discrete_sequence=color_scheme,
)
fig.update_layout(
    title=wh_ts_title,
    xaxis_title="Hourly start time (UTC)",
    yaxis_title="Credits used",
)
fig.show()

### Analysis
<div class="alert alert-info">    
  
* LOAD_WH incurs highest and ~60% of the costs at ~ 2800 credits
    * Usage for this WH is extremely bursty so there might be opportunity to move queries to this WH between 11 am and 3 am
    * Maybe opportunity to scale the WH size or number of clusters at different times to account for bursty usage

* LOCAL_LOAD_WH is highly correlated with API_USER_WP and high CLOUD SERVICES usage - can be used to track causal drivers of high CLOUD SERVICES costs
    * Bursty usage and has high so there might be scope for dithering usage
    
* SEGMENT_WH is the 3rd largest consumer of credits but is running almost continuously. Unless the events are needed continuously it might be a better strategy to time the events and dither them to off peak times for other WH

* If better cost attribution is the aim then consider tagging jobs by function in LOAD_WH, LOCAL_LOAD_WH

* Set up a time varying resource monitor at warehouse level based on usage patterns to flag any anomalous usage.
    
* Similar trend analysis as total cost by usage can be provided for each user and especially for high usage users to capture increases in cost trends and set monitors against    
    
* Team is keeping track of warehouse resizing in comments - we can make that available through tracking changes in warehouse events history 
    
</div>

## Cost by Partner Tools


In [None]:
# Load the per-partner-tool cost timeseries for the analysis window.
# NOTE: this rebinds `df`, which previously held the per-warehouse timeseries.
df=cqlib.cost_by_partner_tool_ts(sdate, edate)
# df.to_csv('/home/manas/DS_data/partner_tools.csv')

In [None]:
# Total approximate credits per client application (partner tool).
# numeric_only must be a boolean; the string "numeric_only" only worked because
# any non-empty string is truthy.
df_by_pt = df.groupby(['client_application_name']).sum(numeric_only=True).reset_index()
df_by_pt = df_by_pt.round(2)
# Append a grand-total row at the next free position of the aggregated frame.
# The label was previously derived from the raw timeseries `df`. The row
# assumes exactly the columns client_application_name, approximate_credits_used.
df_by_pt.loc[len(df_by_pt.index)] = ['Total', df_by_pt['approximate_credits_used'].sum()]
print(tabulate(df_by_pt, headers='keys', tablefmt='rounded_outline', showindex=False))

In [None]:
# Remove the row of totals for the plot.
# The row is selected by its label rather than dropped by position in place, so
# re-running this cell does not remove a real partner tool each time.
df_by_pt = df_by_pt[df_by_pt['client_application_name'] != 'Total']

fig = make_subplots(
    rows=1, cols=1,
    specs=[[{"type": "pie"}]],
)

# Single pie: approximate credits per client application.
fig.add_trace(go.Pie(labels=df_by_pt['client_application_name'].tolist(), values=df_by_pt['approximate_credits_used'].tolist(),name='credits',marker_colors=color_scheme, rotation=45),row=1,col=1)

fig.update_layout(
    title={
        'text': "Breakdown of total cost by partner tools",
        'y':0.1,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'})
fig.show()

In [None]:
# Hourly totals per client application (partner tool).
# numeric_only must be a boolean; the string "numeric_only" only worked because
# any non-empty string is truthy.
df_by_pt_ts = df.groupby(['client_application_name','hourly_start_time']).sum(numeric_only=True).reset_index()
# df_by_pt_ts.head()

In [None]:
# Stacked area chart of hourly approximate credits, one band per client application.
pt_ts_title = {
    'text': "Timeseries of cost by partner tools",
    'y': 0.95,
    'x': 0.5,
    'xanchor': 'center',
    'yanchor': 'top',
}
fig = px.area(
    df_by_pt_ts,
    x="hourly_start_time",
    y="approximate_credits_used",
    color="client_application_name",
    color_discrete_sequence=color_scheme,
)
fig.update_layout(
    title=pt_ts_title,
    xaxis_title="Hourly start time (UTC)",
    yaxis_title="Credits used (approx.)",
)
fig.show()

In [None]:
# Split the partner-tool timeseries into one frame per warehouse.
# Titles and frames are taken from the same groupby pass so that df_titles[i]
# always labels df_warehouse[i]. Building the titles separately from
# sorted(unique()) could misalign them, because groupby skips missing
# warehouse names while unique() keeps them.
wh_groups = list(df.groupby('warehouse_name'))
df_titles = [wh_name for wh_name, _ in wh_groups]
df_warehouse = [wh_frame for _, wh_frame in wh_groups]

In [None]:
# One stacked area chart per warehouse, split by client application and titled
# with the matching entry of df_titles.
axis_labels = {
    "xaxis_title": "Hourly start time (UTC)",
    "yaxis_title": "Credits used (approx.)",
}
for i, wh_frame in enumerate(df_warehouse):
    fig = px.area(
        wh_frame,
        x="hourly_start_time",
        y="approximate_credits_used",
        color="client_application_name",
        color_discrete_sequence=color_scheme,
        title=df_titles[i],
    )
    fig.update_layout(**axis_labels)
    fig.show()

### Analysis
<div class="alert alert-info">
    
* Tools connecting through JDBC driver consume most of the credits ~50%
* Go consumes a surprisingly large amount given that it's a data system - would have expected Python - but presumably caused by Segment related events which might be getting used in real time for marketing/funnel
* The increased usage in credits at the beginning of the period is almost entirely caused by Javascript on LOCAL_LOAD_WH    
* Similar trend analysis as total cost by usage can be provided for each partner tool and especially for high usage users to capture increases in cost trends and set monitors against.
* Tag jobs associated with each partner tool better to discern which physical tool is connecting to optimize usage across entire lineage
* Set up a time varying resource monitor at warehouse level based on usage patterns to flag any anomalous usage.
    
</div>

## Cost of data transfers: 

### Further Actions

<div class="alert alert-warning">

* Implement cost of data transfers query using https://docs.snowflake.com/en/user-guide/cost-exploring-data-transfer.html
    * Will need access to:
        * DATA_TRANSFER_DAILY_HISTORY
        * DATA_TRANSFER_HISTORY
        * ORGANIZATION_USAGE ACCOUNT_USAGE
        * DATABASE_REPLICATION_USAGE_HISTORY
        * REPLICATION_USAGE_HISTORY
        * REPLICATION_GROUP_USAGE_HISTORY
        * USAGE_IN_CURRENCY_DAILY
    
</div>