In [None]:
# Adding system path
# Put the parent of the current working directory on the import path
# (presumably where the local `optiml` package lives — confirm).
import sys, pathlib

parent_dir = str(pathlib.Path.cwd().parent)
# Only append when missing, so re-running this cell does not keep growing sys.path.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)
# sys.path

In [None]:
# Set to show warnings only once
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Display setup: inject a CSS rule that stretches the notebook container
# to the full width of the browser window.
from IPython.display import display, HTML

display(
    HTML("<style>.container { width:100% !important; }</style>")
)

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from tabulate import tabulate
# Discrete colour sequence (15 named colours) passed to the Plotly charts in this notebook.
color_scheme = [
    "red", "blue", "green", "orange", "purple",
    "brown", "pink", "gray", "olive", "cyan",
    "darkviolet", "goldenrod", "darkgreen", "chocolate", "lawngreen",
]

In [None]:
# ## Center-align all the figure outputs.
# from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# from IPython.display import display, HTML
# from plotly.graph_objs import *
# import numpy as np
# init_notebook_mode(connected=True)

# display(HTML("""
# <style>
# .output {
#     display: flex;
#     align-items: center;
#     text-align: center;
# }
# </style>
# """))

In [None]:
# Setting up autoreload for libs
# Load IPython's autoreload extension so edited modules can be re-imported
# without restarting the kernel.
%load_ext autoreload
# Mode 2: reload all modules (except those excluded with %aimport) before
# executing the code in each cell.
%autoreload 2
# Import optiml.queries and mark it for autoreload.
# NOTE(review): under mode 2 the marking looks redundant — confirm whether mode 1 was intended.
%aimport optiml.queries

In [None]:
# Initialize connection to Snowflake and set analysis date
from optiml.connection import SnowflakeConnConfig
connection = SnowflakeConnConfig(accountname='jg84276.us-central1.gcp',warehousename="XSMALL_WH").create_connection()
# Initialize local environment
# cache_dir is handed to the query library below; '~' is expanded to the current user's home.
import os
cache_dir = os.path.expanduser('~/data/kiva')
# Initialize query library
from optiml.queries import SNFLKQuery
qlib = SNFLKQuery(connection, 'KIV', cache_dir)
# Analysis window passed to every qlib query in this notebook.
# NOTE(review): the setup notes in the markdown state the range as 10/1/22 to 10/12/22,
# while edate here is 2022-10-31 — confirm which end date is intended.
sdate = '2022-10-01'
edate = '2022-10-31'
print(f"The analysis is carried out for date range {sdate} to {edate}")

# Total cost breakdown 

### SETUP FOR ANALYSIS
- Date range: 10/1/22 to 10/12/22

- Type of account with Snowflake - Standard Edition powered by GCP

- Credit to dollar conversion - $2 per credit


## Cost by usage category

In [None]:
# Fetch the total-cost breakdown time series for the analysis window and
# replace every missing value with the label 'Unassigned'.
df = qlib.total_cost_breakdown_ts(sdate, edate).fillna('Unassigned')
df.head(5)

In [None]:
# Sum the numeric columns per usage category.
# numeric_only must be passed as a keyword boolean; the string "numeric_only"
# only worked because a non-empty string is truthy.
df_by_usage_category = df.groupby("category_name").sum(numeric_only=True).reset_index()
# Append a grand-total row right after the last category. The label is derived from
# the summary frame itself, not from the much longer raw frame `df`.
# The 3-value row assumes the summary has exactly the columns
# category_name, credits, dollars.
df_by_usage_category.loc[len(df_by_usage_category.index)] = ['Total', df_by_usage_category['credits'].sum(), df_by_usage_category['dollars'].sum()]
# Round after totalling so the total is computed from unrounded values.
df_by_usage_category = df_by_usage_category.round(2)
print(tabulate(df_by_usage_category, headers='keys', tablefmt='rounded_outline', showindex=False))

In [None]:
# Pie charts for total cost breakdown
## Remove the row of totals for the plot.
# Filtering by label instead of dropping the last positional row keeps this cell
# idempotent: re-running it no longer removes a real category.
df_by_usage_category = df_by_usage_category[df_by_usage_category['category_name'] != 'Total'].reset_index(drop=True)

# One row, two pies: dollars on the left, credits on the right.
fig = make_subplots(
    rows=1, cols=2,
    specs=[[{"type": "pie"},{"type": "pie"}]],
    subplot_titles=("Dollars", "Credits")
)

fig.add_trace(go.Pie(labels=df_by_usage_category['category_name'].tolist(), values=df_by_usage_category['dollars'].tolist(),name="Dollars", rotation=45,marker_colors=color_scheme),row=1,col=1)
fig.add_trace(go.Pie(labels=df_by_usage_category['category_name'].tolist(), values=df_by_usage_category['credits'].tolist(),name='Credits', rotation=45,marker_colors=color_scheme),row=1,col=2)

# y=0.1 with a bottom anchor places the figure title underneath the pies.
fig.update_layout(
    title={
        'text': "Breakdown of total cost by usage category",
        'y':0.1,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'bottom'})
fig.show()


Cost of using compute is 98.8% of total cost and the cost of using cloud services is 1.2% of total cost.

## Cost by usage category timeseries

In [None]:
# Credits per usage category and time window (start_time/end_time), drawn as a stacked area chart.
# numeric_only is passed as a keyword boolean; the positional string "numeric_only"
# only worked because a non-empty string is truthy.
df_by_category_ts = df.groupby(['category_name','start_time','end_time']).sum(numeric_only=True).reset_index()
fig = px.area(df_by_category_ts, x="start_time", y="credits", color="category_name",color_discrete_sequence=color_scheme)
fig.show()

### COMPUTE:
Average credits used for the Compute category reach a high at 9 a.m. and a low at 7 p.m.

> Hourly credit consumption:
- Max: 27.04 (at 9:00 a.m.)
- Min: 9.18 (at 7:00 p.m.)

Credits are consumed throughout the night from 8:00 p.m. to 8:00 a.m.
> Average hourly credit consumption between 8:00 pm - 8:00 am:
- Mean: 9.85
- Standard deviation: 0.48

On the 8th, 9th and 10th of October, the credits consumed at night increased, while consumption at other times of day remained the same as on previous days.
- Mean: 12.66
- Standard deviation: 0.419
- Increase in mean from the previous nights : 2.81

From October 1 - October 5, there is a peak in credit consumption at 2 pm.
- Average credit consumption: 29.5
- Increase over other days at 2 p.m.: 12.8

### CLOUD SERVICES
- The credits for cloud services follow a similar trend as compute with regards to daily timings.
- Cloud services credits are only charged if the daily consumption of cloud services resources exceeds 10% of the daily warehouse usage.



## Cost by user

In [None]:
# Sum the numeric columns per user.
# numeric_only is passed as a keyword boolean; the positional string "numeric_only"
# only worked because a non-empty string is truthy.
df_by_user = df.groupby(['user_name']).sum(numeric_only=True).reset_index()
# Append a grand-total row right after the last user. The label is derived from
# the summary frame itself, not from the raw frame `df`.
# The 3-value row assumes the summary has exactly the columns user_name, credits, dollars.
df_by_user.loc[len(df_by_user.index)] = ['Total', df_by_user['credits'].sum(), df_by_user['dollars'].sum()]
# Round after totalling (as the usage-category table does) so the total is not
# a sum of already-rounded values.
df_by_user = df_by_user.round(2)
print(tabulate(df_by_user, headers='keys', tablefmt='rounded_outline', showindex=False))

In [None]:
# Pie charts for cost breakdown by user.
# df_by_user still carries the 'Total' row appended for the printed table; plotting it
# would add a slice equal to the sum of all users. Work on a filtered copy so the
# table frame stays untouched and this cell can be re-run safely.
df_by_user_plot = df_by_user[df_by_user['user_name'] != 'Total']

# One row, two pies: dollars on the left, credits on the right.
fig = make_subplots(
    rows=1, cols=2,
    specs=[[{"type": "pie"},{"type": "pie"}]],
    subplot_titles=("Dollars", "Credits")
)

fig.add_trace(go.Pie(labels=df_by_user_plot['user_name'].tolist(), values=df_by_user_plot['dollars'].tolist(),name="Dollars", rotation=45,marker_colors=color_scheme),row=1,col=1)
fig.add_trace(go.Pie(labels=df_by_user_plot['user_name'].tolist(), values=df_by_user_plot['credits'].tolist(),name='Credits', rotation=45,marker_colors=color_scheme),row=1,col=2)

# y=0.1 with a bottom anchor places the figure title underneath the pies.
fig.update_layout(
    title={
        'text': "Breakdown of total cost by user",
        'y':0.1,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'bottom'})
fig.show()

- User GREGORYW incurs the maximum value of dollars and credits at 2418 and 1209, respectively.
- Users VERTX_PROD and DBT_PROD incur almost the same cost, at about 1497 dollars and 748 credits each.
- Users DBT_DEV and ROBS spend the fewest dollars and credits of all users.


In [None]:
# Credits per user and time window (start_time/end_time), drawn as a stacked area chart.
# numeric_only is passed as a keyword boolean; the positional string "numeric_only"
# only worked because a non-empty string is truthy.
df_by_user_ts = df.groupby(['user_name','start_time','end_time']).sum(numeric_only=True).reset_index()
fig = px.area(df_by_user_ts, x="start_time", y="credits", color="user_name",color_discrete_sequence=color_scheme)
fig.show()

- The time series by user follows a trend similar to the time series by usage category with regard to credit usage across daily time slots.
- The increase in credit usage on the 8th, 9th and 10th of October is due to increased usage by GREGORYW and DBT_DEV. The 2 p.m. spike in credit usage from October 1 to October 5 can also be attributed to those users.




## Cost by warehouse

In [None]:
# Rebind `df` to the per-warehouse cost time series for the same analysis window;
# the cells that follow in this section read from this frame.
df = qlib.cost_by_wh_ts(sdate, edate)
df.head(5)

In [None]:
# Sum the numeric columns per warehouse and preview the result.
# numeric_only is passed as a keyword boolean; the positional string "numeric_only"
# only worked because a non-empty string is truthy.
df_by_wh = df.groupby(['warehouse_name']).sum(numeric_only=True).reset_index()
# Append the grand-total row directly after the last warehouse; the label comes from
# the summary frame itself rather than from the raw frame `df`.
# The 4-value row assumes the columns warehouse_name, credits, dollars, cloud_services_dollars.
df_by_wh.loc[len(df_by_wh.index)] = ['Total', df_by_wh['credits'].sum(), df_by_wh['dollars'].sum(), df_by_wh['cloud_services_dollars'].sum()]
df_by_wh.head(10)

In [None]:
# Rebuild the per-warehouse summary from `df` (so this cell does not depend on the
# preview cell) and print it with a grand-total row.
# numeric_only is passed as a keyword boolean; the positional string "numeric_only"
# only worked because a non-empty string is truthy.
df_by_wh = df.groupby(['warehouse_name']).sum(numeric_only=True).reset_index()
# The 4-value row assumes the columns warehouse_name, credits, dollars, cloud_services_dollars.
df_by_wh.loc[len(df_by_wh.index)] = ['Total', df_by_wh['credits'].sum(), df_by_wh['dollars'].sum(), df_by_wh['cloud_services_dollars'].sum()]

print(tabulate(df_by_wh, headers='keys', tablefmt='rounded_outline', showindex=False))

In [None]:
## Remove the row of totals for the plot.
# Filtering by label instead of dropping the last positional row keeps this cell
# idempotent: re-running it no longer removes a real warehouse.
df_by_wh = df_by_wh[df_by_wh['warehouse_name'] != 'Total']

# Single pie showing the dollar share of each warehouse.
fig = make_subplots(
    rows=1, cols=1,
    specs=[[{"type": "pie"}]],
)

fig.add_trace(go.Pie(labels=df_by_wh['warehouse_name'].tolist(), values=df_by_wh['dollars'].tolist(),name='dollars',marker_colors=color_scheme),row=1,col=1)

# y=0.1 places the figure title near the bottom of the figure.
fig.update_layout(
    title={
        'text': "Breakdown of total cost by warehouse",
        'y':0.1,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'})
fig.show()

- PROD_WH incurs the highest cost of all warehouses, with a dollar value of 1221.29, followed by DEV_WH with a dollar value of 797.61.
- DAILY_REFRESH_WH’s cost is 347.14 and ML_WH’s cost is 140.47
- CLOUD_SERVICES_ONLY does not incur any costs.



In [None]:
# Credits per warehouse and time window (start_time/end_time), drawn as a stacked area chart.
# Stored under its own name so the per-user time series (df_by_user_ts) is not overwritten.
# numeric_only is passed as a keyword boolean; the positional string "numeric_only"
# only worked because a non-empty string is truthy.
df_by_wh_ts = df.groupby(['warehouse_name','start_time','end_time']).sum(numeric_only=True).reset_index()
fig = px.area(df_by_wh_ts, x="start_time", y="credits", color="warehouse_name",color_discrete_sequence=color_scheme)
fig.show()

- The increase in credit usage on the 8th, 9th, and 10th of October relative to other days is due to higher credit consumption by DEV_WH.
