In [76]:
import pathlib
import sys

# Make the parent of the current working directory importable so that
# project packages living next to the notebooks folder can be imported.
parent_dir = pathlib.Path.cwd().parent
sys.path.append(str(parent_dir))

In [77]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from IPython.display import display, HTML
from plotly.subplots import make_subplots
from tabulate import tabulate

# Stretch the notebook container to the full width of the browser window.
full_width_css = "<style>.container { width:100% !important; }</style>"
display(HTML(full_width_css))

In [78]:
%load_ext autoreload
%autoreload 2
%aimport optiml.queries

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Initialization

In [79]:
# Initialize the Snowflake connection.
# The password is read from the environment instead of being hardcoded so that
# it never ends up in the notebook or in version control. Set SNOWFLAKE_PASSWORD
# before starting the kernel (a KeyError is raised if it is missing).
# NOTE(review): the password that was previously committed here should be rotated.
import os
from optiml.connection import SnowflakeConnConfig
connection = SnowflakeConnConfig(
    accountname=os.environ.get("SNOWFLAKE_ACCOUNT", "jg84276.us-central1.gcp"),
    username=os.environ.get("SNOWFLAKE_USER", "nikhilu"),
    password=os.environ["SNOWFLAKE_PASSWORD"],
    warehousename=os.environ.get("SNOWFLAKE_WAREHOUSE", "XSMALL_WH"),
).create_connection()
# Initialize local environment
cache_dir = os.path.expanduser('~/data/kiva')
# Initialize query library
from optiml.queries import SNFLKQuery
qlib = SNFLKQuery(connection, 'KIV', cache_dir)
# Reporting window used by the cost-breakdown queries below
sdate = '2022-10-01'
edate = '2022-10-31'
# Color sequence shared by all charts so that they look consistent
color_scheme=["red","blue","green","orange","purple","brown","pink","gray","olive","cyan","darkviolet","goldenrod","darkgreen","chocolate","lawngreen"]

# Total cost breakdown 

## Cost by usage category

In [80]:
## Center-align all the figure outputs.
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from IPython.display import display, HTML
from plotly.graph_objs import *
import numpy as np

# Enable plotly rendering inside the notebook.
init_notebook_mode(connected=True)

# Style sheet that turns every output area into a centered flex container.
center_outputs_css = """
<style>
.output {
    display: flex;
    align-items: center;
    text-align: center;
}
</style>
"""
display(HTML(center_outputs_css))

In [81]:
# Cost breakdown records for the reporting window, rounded to two decimals.
cost_breakdown = qlib.total_cost_breakdown_ts(sdate, edate)
df = cost_breakdown.round(2)
df.head()


In a future version of pandas all arguments of concat except for the argument 'objs' will be keyword-only.



Unnamed: 0,user_name,credits,dollars,start_time,end_time,category_name
0,GREGORYW,0.0,0.0,2022-10-12 02:00:00,2022-10-12 03:00:00,Compute
1,GREGORYW,0.0,0.0,2022-10-11 00:00:00,2022-10-11 01:00:00,Compute
2,DBT_PROD,0.0,0.0,2022-10-07 23:00:00,2022-10-08 00:00:00,Compute
3,VERTEX_PROD,0.0,0.0,2022-10-12 06:00:00,2022-10-12 07:00:00,Compute
4,VERTEX_PROD,0.0,0.0,2022-10-11 00:00:00,2022-10-11 01:00:00,Compute


In [82]:
# Total credits and dollars per usage category.
# The two measures are selected explicitly so the frame always has exactly the
# three columns that the totals row below provides (previously the string
# "numeric_only" was passed positionally where a boolean is expected).
df_by_usage_category = df.groupby("category_name")[["credits", "dollars"]].sum().reset_index()
# Append the grand total as the last row, labelled with the next position of
# this frame (not of the much longer raw `df`).
df_by_usage_category.loc[len(df_by_usage_category.index)] = ['Total', df['credits'].sum(), df['dollars'].sum()]
print(tabulate(df_by_usage_category, headers='keys', tablefmt='rounded_outline', showindex=False))

╭─────────────────┬───────────┬───────────╮
│ category_name   │   credits │   dollars │
├─────────────────┼───────────┼───────────┤
│ Cloud services  │     46    │     92.34 │
│ Compute         │   3799.14 │   7597.82 │
│ Storage         │      0    │     76.66 │
│ Total           │   3845.14 │   7766.82 │
╰─────────────────┴───────────┴───────────╯


In [83]:
## Remove the row of totals for the plot.
# Filtering on the label (instead of dropping the last row by position, in
# place) keeps this cell idempotent: re-running it never removes a real category.
df_by_usage_category = df_by_usage_category[df_by_usage_category['category_name'] != 'Total'].reset_index(drop=True)


In [84]:
# Two side-by-side pies (dollars on the left, credits on the right) showing
# how the total cost splits across usage categories.
category_labels = df_by_usage_category['category_name'].tolist()
pie_specs = [[{"type": "pie"}, {"type": "pie"}]]
fig = make_subplots(rows=1, cols=2, specs=pie_specs, subplot_titles=("Dollars", "Credits"))

for position, (trace_name, measure) in enumerate([("Dollars", "dollars"), ("Credits", "credits")], start=1):
    pie = go.Pie(
        labels=category_labels,
        values=df_by_usage_category[measure].tolist(),
        name=trace_name,
        rotation=45,
        marker_colors=color_scheme,
    )
    fig.add_trace(pie, row=1, col=position)

# Title is anchored at the bottom center of the figure.
title_settings = {
    'text': "Breakdown of total cost by usage category",
    'y': 0.1,
    'x': 0.5,
    'xanchor': 'center',
    'yanchor': 'bottom',
}
fig.update_layout(title=title_settings)
fig.show()

## Cost by usage category timeseries

In [85]:
# Credits per usage category for each time bucket, drawn as a stacked area chart.
# numeric_only is passed by keyword as a real boolean (previously the string
# 'numeric_only' was passed positionally and only worked because it is truthy).
df_by_category_ts = df.groupby(['category_name','start_time','end_time']).sum(numeric_only=True).reset_index()
fig = px.area(df_by_category_ts, x="start_time", y="credits", color="category_name",
              color_discrete_sequence=color_scheme,
              title="Credits by usage category over time")
fig.show()

## Cost by user

In [None]:
# Total credits and dollars per user over the reporting window.
# numeric_only is passed by keyword as a real boolean (previously the string
# 'numeric_only' was passed positionally and only worked because it is truthy).
df_by_user = df.groupby(['user_name']).sum(numeric_only=True).reset_index()
print(tabulate(df_by_user, headers='keys', tablefmt='rounded_outline', showindex=False))

In [None]:
# Two side-by-side pies (dollars on the left, credits on the right) showing
# how the total cost splits across users.
user_labels = df_by_user['user_name'].tolist()
pie_specs = [[{"type": "pie"}, {"type": "pie"}]]
fig = make_subplots(rows=1, cols=2, specs=pie_specs, subplot_titles=("Dollars", "Credits"))

for position, (trace_name, measure) in enumerate([("Dollars", "dollars"), ("Credits", "credits")], start=1):
    pie = go.Pie(
        labels=user_labels,
        values=df_by_user[measure].tolist(),
        name=trace_name,
        rotation=45,
        marker_colors=color_scheme,
    )
    fig.add_trace(pie, row=1, col=position)

# Title is anchored at the bottom center of the figure.
title_settings = {
    'text': "Breakdown of total cost by user",
    'y': 0.1,
    'x': 0.5,
    'xanchor': 'center',
    'yanchor': 'bottom',
}
fig.update_layout(title=title_settings)
fig.show()

In [None]:
# Credits per user for each time bucket, drawn as a stacked area chart.
# numeric_only is passed by keyword as a real boolean (previously the string
# 'numeric_only' was passed positionally and only worked because it is truthy).
df_by_user_ts = df.groupby(['user_name','start_time','end_time']).sum(numeric_only=True).reset_index()
fig = px.area(df_by_user_ts, x="start_time", y="credits", color="user_name",
              color_discrete_sequence=color_scheme,
              title="Credits by user over time")
fig.show()

## Cost by warehouse

In [None]:
# Per-warehouse cost records for the reporting window, rounded to two decimals.
# No totals row is appended to this frame here.
# NOTE(review): this rebinds `df`, which held the total cost breakdown until now.
cost_by_wh = qlib.cost_by_wh_ts(sdate, edate)
df = cost_by_wh.round(2)
df.head()

In [None]:
# Aggregate dollars per warehouse for the pie chart.
# The previous version dropped the last row of `df` "to remove the totals row",
# but this notebook does not append a totals row to the warehouse frame, so a
# real record was discarded (and one more on every re-run of the cell).
# `df` is now left untouched.
# NOTE(review): confirm that qlib.cost_by_wh_ts does not itself return a totals row.
df_wh_dollars = df.groupby('warehouse_name', as_index=False)['dollars'].sum()

fig = make_subplots(
    rows=1, cols=1,
    specs=[[{"type": "pie"}]],
)

fig.add_trace(go.Pie(labels=df_wh_dollars['warehouse_name'].tolist(), values=df_wh_dollars['dollars'].tolist(),name='dollars',marker_colors=color_scheme),row=1,col=1)

fig.update_layout(
    title={
        'text': "Breakdown of total cost by warehouse",
        'y':0.1,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'})
fig.show()

In [None]:
# Credits per warehouse for each time bucket, drawn as a stacked area chart.
# numeric_only is passed by keyword as a real boolean (previously the string
# 'numeric_only' was passed positionally and only worked because it is truthy).
df_by_warehouse_ts = df.groupby(['warehouse_name','start_time','end_time']).sum(numeric_only=True).reset_index()
fig = px.area(df_by_warehouse_ts, x="start_time", y="credits", color="warehouse_name",
              color_discrete_sequence=color_scheme,
              title="Credits by warehouse over time")
fig.show()


## Analysis
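- The sum of compute across all warehouses is 1255.47 credits, which is consistent with the total compute cost above. This is a sanity check to ensure we are not missing any compute or warehouse. (**TODO:** the usage-category table above currently reports 3799.14 compute credits; re-verify this figure against the current date range.)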

## Breakdown of cost by partner tools

In [None]:
# Approximate credits used per client application (partner tool).
df = qlib.cost_by_partner_tool_ts(sdate, edate).round(2)
# NOTE(review): the last row returned by the query is dropped before aggregating,
# presumably because it is a totals row; confirm against qlib.cost_by_partner_tool_ts.
# The measure is selected explicitly so the frame always has exactly the two
# columns that the totals row below provides (previously the string
# 'numeric_only' was passed positionally where a boolean is expected).
df = df.drop(len(df)-1).groupby('client_application_name')[['approximate_credits_used']].sum().reset_index()
df.loc[len(df.index)] = ['Total', df['approximate_credits_used'].sum()]
print(tabulate(df, headers='keys', tablefmt='rounded_outline', showindex=False))

In [None]:
# Exclude the totals row for the plot.
# The chart previously referenced `df_client_app`, which was never defined
# (NameError); it is now derived from `df` here. Filtering on the label instead
# of dropping the last row by position keeps the cell idempotent on re-run.
df_client_app = df[df['client_application_name'] != 'Total']
fig = make_subplots(
    rows=1, cols=1,
    specs=[[{"type": "pie"}]],
)

fig.add_trace(go.Pie(labels=df_client_app['client_application_name'].tolist(), values=df_client_app['approximate_credits_used'].tolist(),name='credits'),row=1,col=1)

fig.update_layout(
    title={
        'text': "Breakdown of credits used by Client Apps",
        'y':0.9,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'})
fig.show()

## Warehouse configuration

In [None]:
# Fetch the warehouse configuration and print it as a text table.
df = qlib.warehouse_config()
warehouse_table = tabulate(df, headers='keys', tablefmt='rounded_outline', showindex=False)
print(warehouse_table)

In [None]:
# Exploratory: list the columns of the warehouse configuration frame.
df.columns

In [None]:
# Exploratory: attribute access on the configuration frame.
# NOTE(review): presumably reads a column called `name`; raises AttributeError if no such column exists.
df.name

## Analysis
- Auto-suspend times here vary from 60 min to 3600 min. These seem very large, assuming auto-resume does not take very long. (**TODO:** Snowflake's `AUTO_SUSPEND` setting is expressed in seconds; confirm the unit of these values.)
- No resource monitors are being used; this should be corrected.

## Cost of queries

In [None]:
# Ask the query library for 10 expensive queries in the reporting window.
n_queries = 10
df = qlib.n_expensive_queries(sdate, edate, n_queries)
df

In [None]:
# Check whether the rows labelled 0 and 1 hold the same query text.
df['query_text'][0] == df['query_text'][1]

In [None]:
# Show the query text of the row labelled 0.
df['query_text'][0]

In [None]:
# Show the query text of the row labelled 1.
df['query_text'][1]

In [None]:
# List the distinct warehouse names present in the warehouse cost data.
# NOTE(review): called without sdate/edate, unlike the earlier cost_by_wh_ts
# call in this notebook; confirm the method has suitable defaults.
df = qlib.cost_by_wh_ts()
df.warehouse_name.unique()

In [None]:
# Preview the result of the idle-users query.
# NOTE(review): this window is hardcoded and differs from sdate/edate; confirm it is intended.
df = qlib.idle_users(
    start_date="2022-01-01",
    end_date="2022-02-02",
)
df.head()

In [None]:
# Preview the result of the users-with-full-table-scans query.
# NOTE(review): this window is hardcoded and differs from sdate/edate; confirm it is intended.
df = qlib.users_full_table_scans(
    start_date="2022-01-01",
    end_date="2022-02-02",
)
df.head()

In [None]:
# Preview the result of the heavy-users query.
# NOTE(review): this window is hardcoded and differs from sdate/edate; confirm it is intended.
df = qlib.heavy_users(
    start_date="2022-01-01",
    end_date="2022-02-02",
)
df.head()

In [None]:
# Preview the result of the users-never-logged-in query.
# NOTE(review): this window is hardcoded and differs from sdate/edate; confirm it is intended.
df = qlib.users_never_logged_in(
    start_date="2022-01-01",
    end_date="2022-02-02",
)
df.head()

In [None]:
# NOTE(review): this cell repeats the users_never_logged_in call with identical
# arguments and looks like an accidental duplicate; consider removing it.
df=qlib.users_never_logged_in(start_date="2022-01-01", end_date="2022-02-02")
df.head()

In [None]:
# Preview the result of the idle-roles query.
# NOTE(review): this window is hardcoded and differs from sdate/edate; confirm it is intended.
df = qlib.idle_roles(
    start_date="2022-01-01",
    end_date="2022-02-02",
)
df.head()


In [None]:
# Preview the result of the failed-tasks query.
# NOTE(review): this window is hardcoded and differs from sdate/edate; confirm it is intended.
df = qlib.failed_tasks(
    start_date="2022-01-01",
    end_date="2022-02-02",
)
df.head()

In [None]:
# Preview the result of the long-running-tasks query.
# NOTE(review): this window is hardcoded and differs from sdate/edate; confirm it is intended.
df = qlib.long_running_tasks(
    start_date="2022-01-01",
    end_date="2022-02-02",
)
df.head()