In [None]:
%%javascript
// Replace the output area's scroll check with one that always returns false.
// NOTE(review): presumably this stops long cell outputs from being collapsed into
// a scroll box in the classic notebook UI; confirm on the front end in use.
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

In [None]:
# Make the directory two levels above the current working directory importable,
# so that project packages can be imported from this notebook.
import os
import pathlib
import sys

project_root = pathlib.Path.cwd().parent.parent
sys.path.append(str(project_root))

In [None]:
# Suppress all warnings for the rest of the session.
# NOTE: 'ignore' hides warnings completely; use 'once' instead if each distinct
# warning should still be shown a single time.
import warnings
warnings.filterwarnings('ignore')

In [None]:
## Setup connection to DWH
# Credentials are read from environment variables named <CUSTOMER>_USERNAME,
# <CUSTOMER>_PASSWORD, <CUSTOMER>_ACCOUNT and <CUSTOMER>_WAREHOUSE.
# customer = 'KIVA'
# schema = 'KIVA_PROD.OPTIML'
customer = 'OPTIML' # Use this for testing
schema = 'KIV.ACCOUNT_USAGE' # Use this for testing

# `username` keeps the variable *name*; `user` holds the value read from it.
username = f'{customer}_USERNAME'
user = os.getenv(username)

# os.getenv returns None for any variable that is not set.
password = os.getenv(f'{customer}_PASSWORD')
account = os.getenv(f'{customer}_ACCOUNT')
warehouse = os.getenv(f'{customer}_WAREHOUSE')

In [None]:
## Setup pandas display limits and plotting/table libraries
import pandas as pd

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from tabulate import tabulate

# Allow up to 500 rows and 500 columns when a frame is displayed.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 500)

# Colour sequence shared by every chart in this notebook.
color_scheme = [
    "red", "blue", "green", "orange", "purple",
    "brown", "pink", "gray", "olive", "cyan",
    "darkviolet", "goldenrod", "darkgreen", "chocolate", "lawngreen",
]

In [None]:
# Initialize the Snowflake connection, the query libraries and the analysis window
import datetime

from optiml.connection import SnowflakeConnConfig
from optiml.backend.cost_profile import CostProfile, get_previous_dates
from optiml.backend.warehouse_profile import WarehouseProfile

snowflake_config = SnowflakeConnConfig(username=user, password=password, accountname=account)
connection = snowflake_config.create_connection()

# Query libraries, both bound to the same connection and schema
cqlib = CostProfile(connection, schema)
wqlib = WarehouseProfile(connection, schema)

# Analysis window. The dates are pinned; for a rolling window ending yesterday
# use the two commented lines instead.
# edate = datetime.date.today() - datetime.timedelta(days=1)
# sdate = edate - datetime.timedelta(days=6)
sdate = datetime.date(2022, 9, 29)
edate = datetime.date(2022, 10, 4)

print('Customer:', customer)
print('Schema:', schema)
print(sdate, edate)

In [None]:
# Setting up autoreload for libs, so that edits to the project modules are picked
# up without restarting the kernel.
# NOTE(review): mode 2 is documented by IPython as reloading all modules before
# each execution; the %aimport line additionally registers cost_profile explicitly.
%load_ext autoreload
%autoreload 2
%aimport optiml.backend.cost_profile

# Total cost breakdown 

## Cost by usage category

In [None]:
## Cost by usage category for the analysis window, compared with the previous period

def _category_totals(cost_ts):
    """Aggregate a cost timeseries per category and append a 'Total' row.

    Sums the numeric columns per ``category_name``, appends a 'Total' row holding
    the column sums of ``credits`` and ``dollars``, and rounds to 2 decimals.
    Returns a frame with ``category_name`` as a regular column.
    """
    by_category = cost_ts.groupby("category_name").sum(numeric_only=True).reset_index()
    # Name-keyed row, so the values cannot land in the wrong column.
    total_row = pd.DataFrame([{
        "category_name": "Total",
        "credits": by_category["credits"].sum(),
        "dollars": by_category["dollars"].sum(),
    }])
    return pd.concat([by_category, total_row], ignore_index=True).round(2)


## Get data
df = cqlib.total_cost_breakdown_ts(sdate, edate)
df = df.fillna('Unassigned')
## Get usage for past week
df_by_usage_category = _category_totals(df).set_index('category_name')

## Get usage for previous week as a predictive sanity check
p1_sdate, p1_edate = get_previous_dates(sdate, edate, 1)
df_prev = cqlib.total_cost_breakdown_ts(p1_sdate, p1_edate)
df_prev = df_prev.fillna('Unassigned')
df_by_usage_category_prev = (
    _category_totals(df_prev)
    .set_index('category_name')
    .rename(columns={"credits": "credits_previous_week", "dollars": "dollars_previous_week"})
)

## Align both periods on category name (previous-period columns first)
df_by_usage_category = (
    pd.concat([df_by_usage_category_prev, df_by_usage_category], axis=1)
    .rename_axis('category_name')
    .reset_index()
)
# The concat takes the union of both indexes, so when the two periods do not share
# the same categories 'Total' can end up in the middle. A stable sort on
# "is this the Total row" moves it to the end and keeps every other row in place.
df_by_usage_category = df_by_usage_category.sort_values(
    'category_name', key=lambda names: names == 'Total', kind='mergesort'
).reset_index(drop=True)

## Get percentage change since previous week
df_by_usage_category["pct_change_dollars"] = (
    (df_by_usage_category["dollars"] - df_by_usage_category["dollars_previous_week"])
    / df_by_usage_category["dollars_previous_week"] * 100
).round(2)


In [None]:
## Print this week's and the previous week's usage per category with the percentage change
category_table = tabulate(
    df_by_usage_category,
    headers='keys',
    tablefmt='rounded_outline',
    showindex=False,
)
print(
    'Category: Credit and dollar consumption trends',
    '----------------------------------------------',
    category_table,
    sep='\n',
)

In [None]:
## Pie charts for total cost breakdown
# Remove the 'Total' summary row by name rather than by position: this stays correct
# if 'Total' is not the last row, and re-running the cell does not remove a further
# category each time.
df_by_usage_category = df_by_usage_category.loc[
    df_by_usage_category['category_name'] != 'Total'
].reset_index(drop=True)

fig = make_subplots(
    rows=1, cols=2,
    specs=[[{"type": "pie"}, {"type": "pie"}]],
    subplot_titles=("Dollars", "Credits")
)

# One donut per measure, left to right, sharing labels and colours.
category_labels = df_by_usage_category['category_name'].tolist()
for subplot_col, (value_column, trace_name) in enumerate(
        [('dollars', 'Dollars'), ('credits', 'Credits')], start=1):
    fig.add_trace(
        go.Pie(labels=category_labels,
               values=df_by_usage_category[value_column].tolist(),
               name=trace_name,
               rotation=45, marker_colors=color_scheme, hole=0.4),
        row=1, col=subplot_col)

fig.update_layout(
    title={
        'text': "Total cost by usage category",
        'y':0.95,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'})
fig.show()

In [None]:
## Timeseries
## TODO: Use this to make recommendations for resource monitors - 
## 1) Total for a week 
## 2) For each day based on historic patterns
# numeric_only is passed as a real boolean keyword; the original passed the string
# 'numeric_only' positionally, which only worked because a non-empty string is truthy.
df_by_category_ts = (
    df.groupby(['category_name', 'hourly_start_time'])
    .sum(numeric_only=True)
    .reset_index()
)
fig = px.area(
    df_by_category_ts,
    x="hourly_start_time",
    y="dollars",
    color="category_name",
    color_discrete_sequence=color_scheme,
    markers=True,
)
fig.update_layout(
    title={
        'text': "Timeseries of cost by usage category",
        'y':0.95,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'},
    xaxis_title="Hourly start time (UTC)",
    yaxis_title="US Dollars"
)
fig.show()

## Cost by user

In [None]:
## Get usage for the past week
df = cqlib.cost_by_user_ts(sdate, edate)
# numeric_only must be the boolean keyword, not the string 'numeric_only'.
df_by_user = df.groupby(['user_name']).sum(numeric_only=True).round(2)

## Get usage for previous week as a predictive sanity check
df_prev = cqlib.cost_by_user_ts(p1_sdate, p1_edate)
df_by_user_prev = (
    df_prev.groupby(['user_name'])
    .sum(numeric_only=True)
    .round(2)
    .rename(columns={"approximate_credits": "approximate_credits_previous_week"})
)

## Align both periods on user name (previous-period column first)
df_by_user = (
    pd.concat([df_by_user_prev, df_by_user], axis=1)
    .rename_axis('user_name')
    .reset_index()
)

# Append the totals as a name-keyed row so the values cannot be assigned to the
# wrong column if the column order ever changes. sum() skips missing values.
total_row = pd.DataFrame([{
    'user_name': 'Total',
    'approximate_credits_previous_week': df_by_user['approximate_credits_previous_week'].sum(),
    'approximate_credits': df_by_user['approximate_credits'].sum(),
}])
df_by_user = pd.concat([df_by_user, total_row], ignore_index=True)

# Users present in only one of the two periods get 0 credits for the other one.
df_by_user = df_by_user.fillna({'user_name': 'Unassigned',
                                'approximate_credits_previous_week': 0,
                                'approximate_credits': 0})

## Get percentage change since previous week
# NOTE: a previous-period value of 0 yields inf (or NaN for 0/0) here.
df_by_user["pct_change_credits"] = (
    (df_by_user["approximate_credits"] - df_by_user["approximate_credits_previous_week"])
    / df_by_user["approximate_credits_previous_week"] * 100
).round(2)

print('Users: Credit consumption trends')
print('--------------------------------')
print(tabulate(df_by_user, headers='keys', tablefmt='rounded_outline', showindex=False))

In [None]:
## Share of total credits per user, and the list of users below 1%
# .copy() gives an independent frame, so adding a column below does not write into
# a slice of the previous frame (SettingWithCopyWarning, hidden by the warning filter).
df_by_user = df_by_user[["user_name", "approximate_credits"]].copy()

# Credits of the 'Total' row, selected with a single .loc call (no chained indexing).
total_credits = df_by_user.loc[df_by_user["user_name"] == "Total", "approximate_credits"].iloc[0]
df_by_user["percent_usage"] = (df_by_user["approximate_credits"] / total_credits * 100).round(3)

is_low_usage = df_by_user["percent_usage"] < 1.00
# Summed credits / percentage of all low-usage users (despite the name, this is a Series of sums).
idx_low_usage_users = df_by_user.loc[is_low_usage].sum(axis=0, numeric_only=True)
df_low_usage_users = df_by_user.loc[is_low_usage].reset_index(drop=True)
print('List of low usage users (<1% of credits) with usage (Current month)')
print('-------------------------------------------------------------------')
print(tabulate(df_low_usage_users, headers='keys', tablefmt='rounded_outline', showindex=False))

In [None]:
## Group low usage users together
# Keep users at or above 1% (the low-usage list is "< 1.00", so ">=" is its exact
# complement; a strict ">" lost users sitting at exactly 1.00%). The summary rows
# are excluded by name, which also makes the cell safe to re-run.
is_summary_row = df_by_user["user_name"].isin(["Total", "Low_usage_users"])
df_by_user = df_by_user.loc[(df_by_user["percent_usage"] >= 1.00) & ~is_summary_row]

# Append a single row holding the combined usage of all low-usage users.
low_usage_row = pd.DataFrame([{
    "user_name": "Low_usage_users",
    "approximate_credits": idx_low_usage_users["approximate_credits"],
    "percent_usage": idx_low_usage_users["percent_usage"],
}])
df_by_user = pd.concat([df_by_user, low_usage_row], ignore_index=True)

## Plot pie
fig = make_subplots(
    rows=1, cols=1,
    specs=[[{"type": "pie"}]],
    # Trailing comma is required: ("Credits") is just the string "Credits",
    # not a one-element sequence of titles.
    subplot_titles=("Credits",)
)

fig.add_trace(
    go.Pie(labels=df_by_user['user_name'].tolist(),
           values=df_by_user['approximate_credits'].tolist(),
           name="Credits", rotation=320,
           marker_colors=color_scheme, hole=0.4),
    row=1, col=1)


fig.update_layout(
    title={
        'text': "Total cost by user",
        'y':0.1,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'bottom'})
fig.show()

In [None]:
## Plot timeseries
df_by_user_ts = (
    df.groupby(['user_name', 'hourly_start_time'])
    .sum(numeric_only=True)
    .reset_index()
)
# Leave out the low usage users; the re-indexed frame is assigned back
# (the original called reset_index and discarded the result).
is_low_usage_user = df_by_user_ts["user_name"].isin(df_low_usage_users["user_name"].values)
df_by_user_ts = df_by_user_ts[~is_low_usage_user].reset_index(drop=True)

fig = px.area(
    df_by_user_ts,
    x="hourly_start_time",
    y="approximate_credits",
    color="user_name",
    color_discrete_sequence=color_scheme,
    markers=True,
)
fig.update_layout(
    title={
        'text': "Timeseries of cost by user (except low usage users)",
        'y':0.95,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'},
    xaxis_title="Hourly start time (UTC)",
    yaxis_title="Credits used (approx.)"
)
fig.show()

## Cost by warehouse

In [None]:
## Get usage for the past week
df = cqlib.cost_by_wh_ts(sdate, edate)
# numeric_only must be the boolean keyword, not the string 'numeric_only'.
df_by_wh = df.groupby(['warehouse_name']).sum(numeric_only=True).round(2)

## Get usage for previous week as a predictive sanity check
df_prev = cqlib.cost_by_wh_ts(p1_sdate, p1_edate)
df_by_wh_prev = (
    df_prev.groupby(['warehouse_name'])
    .sum(numeric_only=True)
    .round(2)
    .rename(columns={"credits": "credits_previous_week",
                     "cloud_services_credits": "cloud_services_credits_previous_week"})
)

## Align both periods on warehouse name (previous-period columns come first)
df_by_wh = (
    pd.concat([df_by_wh_prev, df_by_wh], axis=1)
    .rename_axis('warehouse_name')
    .reset_index()
)

# Build the Total row keyed by column name. The original assigned a positional list
# ordered current-then-previous onto columns ordered previous-then-current, which
# swapped the two periods in the Total row.
credit_columns = ['credits_previous_week', 'cloud_services_credits_previous_week',
                  'credits', 'cloud_services_credits']
total_row = {'warehouse_name': 'Total'}
total_row.update({column: df_by_wh[column].sum() for column in credit_columns})
df_by_wh = pd.concat([df_by_wh, pd.DataFrame([total_row])], ignore_index=True)

## Get percentage change since previous week
df_by_wh["pct_change_credits"] = (
    (df_by_wh["credits"] - df_by_wh["credits_previous_week"])
    / df_by_wh["credits_previous_week"] * 100
).round(2)

# Missing values (warehouse present in only one period) become 0. The key for the
# current cloud-services column was misspelled 'cloud_service_credits' before.
df_by_wh = df_by_wh.fillna({'warehouse_name': 'Unassigned',
                            'credits_previous_week': 0,
                            'cloud_services_credits_previous_week': 0,
                            'credits': 0,
                            'cloud_services_credits': 0,
                            'pct_change_credits': 0})

df_by_wh_print = df_by_wh[["warehouse_name","credits_previous_week","credits","pct_change_credits"]]


print('Warehouses: Credit consumption trends')
print('-------------------------------------')
print(tabulate(df_by_wh_print, headers='keys', tablefmt='rounded_outline', showindex=False))

In [None]:
# Remove the row of totals for the plot. Filtering by name (instead of dropping the
# last row in place) means re-running the cell does not remove a warehouse.
df_by_wh = df_by_wh.loc[df_by_wh['warehouse_name'] != 'Total']

fig = make_subplots(
    rows=1, cols=1,
    specs=[[{"type": "pie"}]],
)

fig.add_trace(
    go.Pie(labels=df_by_wh['warehouse_name'].tolist(),
           values=df_by_wh['credits'].tolist(),
           name='credits',
           marker_colors=color_scheme, hole=0.4),
    row=1, col=1)

fig.update_layout(
    title={
        'text': "Total cost by warehouse",
        'y':0.1,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'})
fig.show()

In [None]:
## Plot timeseries
# numeric_only is passed as a real boolean keyword rather than the string 'numeric_only'.
df_by_wh_ts = (
    df.groupby(['warehouse_name', 'hourly_start_time'])
    .sum(numeric_only=True)
    .reset_index()
)
fig = px.area(
    df_by_wh_ts,
    x="hourly_start_time",
    y="credits",
    color="warehouse_name",
    color_discrete_sequence=color_scheme,
    markers=True,
)
fig.update_layout(
    title={
        'text': "Timeseries of cost by warehouse",
        'y':0.95,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'},
    xaxis_title="Hourly start time (UTC)",
    yaxis_title="Credits used"
)
fig.show()

## Cost by partner tools


In [None]:
## Get usage for the past week
df = cqlib.cost_by_partner_tool_ts(sdate, edate)
# numeric_only must be the boolean keyword, not the string 'numeric_only'.
df_by_pt = df.groupby(['client_application_name']).sum(numeric_only=True).round(2)

## Get usage for previous week as a predictive sanity check
df_prev = cqlib.cost_by_partner_tool_ts(p1_sdate, p1_edate)
df_by_pt_prev = (
    df_prev.groupby(['client_application_name'])
    .sum(numeric_only=True)
    .round(2)
    .rename(columns={"approximate_credits": "approximate_credits_previous_week"})
)

## Align both periods on application name (previous-period column first)
df_by_pt = (
    pd.concat([df_by_pt_prev, df_by_pt], axis=1)
    .rename_axis('client_application_name')
    .reset_index()
)

# Applications seen in only one of the two periods get 0 credits for the other one.
df_by_pt = df_by_pt.fillna({'client_application_name': 'Unassigned',
                            'approximate_credits_previous_week': 0,
                            'approximate_credits': 0})

# Append the Total row AFTER aligning the two periods, so it is always the last row
# (the next cell removes it before plotting). Appending it to each period before the
# concat could leave it in the middle when the application sets differ.
total_row = pd.DataFrame([{
    'client_application_name': 'Total',
    'approximate_credits_previous_week': round(df_by_pt['approximate_credits_previous_week'].sum(), 2),
    'approximate_credits': round(df_by_pt['approximate_credits'].sum(), 2),
}])
df_by_pt = pd.concat([df_by_pt, total_row], ignore_index=True)

## Get percentage change since previous week
# NOTE: a previous-period value of 0 yields inf (or NaN for 0/0) here.
df_by_pt["pct_change_credits"] = (
    (df_by_pt["approximate_credits"] - df_by_pt["approximate_credits_previous_week"])
    / df_by_pt["approximate_credits_previous_week"] * 100
).round(2)

print('Client Application: Credit consumption trends')
print('---------------------------------------------')
print(tabulate(df_by_pt, headers='keys', tablefmt='rounded_outline', showindex=False))

In [None]:
# Remove the row of totals for the plot. Filtering by name (instead of dropping the
# last row in place) works wherever the 'Total' row sits and is safe to re-run.
df_by_pt = df_by_pt.loc[df_by_pt['client_application_name'] != 'Total']

fig = make_subplots(
    rows=1, cols=1,
    specs=[[{"type": "pie"}]],
)

fig.add_trace(
    go.Pie(labels=df_by_pt['client_application_name'].tolist(),
           values=df_by_pt['approximate_credits'].tolist(),
           name='credits', marker_colors=color_scheme,
           rotation=45, hole=0.4),
    row=1, col=1)

fig.update_layout(
    title={
        'text': "Breakdown of total cost by partner tools",
        'y':0.1,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'})
fig.show()

In [None]:
# Hourly credits per client application, used by the area chart below.
# numeric_only is passed as a real boolean keyword rather than the string 'numeric_only'.
df_by_pt_ts = (
    df.groupby(['client_application_name', 'hourly_start_time'])
    .sum(numeric_only=True)
    .reset_index()
)

In [None]:
# Stacked area chart of hourly approximate credits, one series per client application.
title_layout = {
    'text': "Timeseries of cost by partner tools",
    'y': 0.95,
    'x': 0.5,
    'xanchor': 'center',
    'yanchor': 'top',
}
fig = px.area(
    df_by_pt_ts,
    x="hourly_start_time",
    y="approximate_credits",
    color="client_application_name",
    color_discrete_sequence=color_scheme,
    markers=True,
)
fig.update_layout(
    title=title_layout,
    xaxis_title="Hourly start time (UTC)",
    yaxis_title="Credits used (approx.)",
)
fig.show()

## Resource monitoring

### Generate resource monitors for the data warehouse for this week (based on usage from the preceding 30 days)

In [None]:
# NOTE: this import rebinds the name `datetime` from the module (imported earlier
# in the notebook) to the class of the same name.
from datetime import datetime, timedelta

# Number of days of history used to derive the resource monitor values.
TRAINING_LENGTH = 30

# The training window ends the day before the analysis window starts and begins
# TRAINING_LENGTH days before that.
training_end = sdate - timedelta(days=1)
training_start = training_end - timedelta(days=TRAINING_LENGTH)
df_compute = cqlib.cost_of_compute_ts(training_start, training_end)

In [None]:
# Derive per-warehouse resource monitor values from the training data and render
# one query template per warehouse.
df_by_wh, resource_monitor = cqlib.get_resource_monitor_values(df_compute)

# Settings shared by every generated monitor.
start_timestamp_ltz = "YYYY-MM-DD HH:MM:SS PST"  # placeholder, to be filled in by the reader
periodicity = "DAILY"
percentage_of_monitor = 100
action = "NOTIFY"

# Iterate over the rows themselves (first row per warehouse). The original enumerated
# the unique names but read rows with .loc[position], which is only correct when the
# frame has a default 0..n-1 index and exactly one row per warehouse.
resource_monitor_queries = []
for _, monitor_row in resource_monitor.drop_duplicates(subset="warehouse_name").iterrows():
    warehouse_name = monitor_row["warehouse_name"]
    resource_monitor_name = warehouse_name + '_RESOURCE_MONITOR'
    credit_quota = monitor_row["credits_three_sigma_plus"]
    resource_monitor_queries.append(
        cqlib.generate_resource_monitor_sql(resource_monitor_name,
                                            credit_quota,
                                            periodicity,
                                            start_timestamp_ltz,
                                            percentage_of_monitor,
                                            action,
                                            warehouse_name))

print("Query templates for you to generate resource monitor for this week")
for res_mon_q in resource_monitor_queries:
    print(res_mon_q)

### Runbook

<div class="alert alert-info">

* If the resource monitor did not send a notification last week and the newly proposed resource monitor is within 10% of the previous monitor, don't update the resource monitor.

* If the resource monitor sent a notification last week due to unexplained usage that cannot be attributed to legitimate use, continue to debug and don't change the resource monitor.
    
* If the resource monitor sent a notification last week that can be attributed to legitimate use, update the resource monitor.
    
* If the new resource monitor values are more than 10% above last week's values, update the resource monitor.
    
</div>