In [2]:
import pathlib
import sys

# Put the parent of the current working directory on the import path
# (presumably so the local `optiml` package can be imported -- confirm).
sys.path.append(str(pathlib.Path.cwd().parent))

In [3]:
from IPython.display import display, HTML
# Inject a CSS rule that forces the notebook's `.container` element to 100% width.
display(HTML("<style>.container { width:100% !important; }</style>"))
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from tabulate import tabulate


In [4]:
# Enable the IPython autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and library edits do not need a kernel restart.
%load_ext autoreload
%autoreload 2
# Import optiml.queries and register it with the autoreload extension.
%aimport optiml.queries

## Initialization (Done)

In [5]:
import os

from optiml.connection import SnowflakeConnConfig
from optiml.queries import SNFLKQuery

# Open the Snowflake connection used by the queries in this notebook.
conn_config = SnowflakeConnConfig(
    accountname='jg84276.us-central1.gcp',
    warehousename="XSMALL_WH",
)
connection = conn_config.create_connection()

# Local directory (under the user's home) handed to the query library as `cache_dir`.
cache_dir = os.path.expanduser('~/data/kiva')

# Query library bound to the connection, the 'KIV' identifier and the cache directory.
qlib = SNFLKQuery(connection, 'KIV', cache_dir)

# Reporting window passed to the cost queries below.
sdate = '2022-10-01'
edate = '2022-10-31'

# Colour sequence available to the plotly charts.
color_scheme = [
    "red", "blue", "green", "orange", "purple",
    "brown", "pink", "gray", "olive", "cyan",
    "darkviolet", "goldenrod", "darkgreen", "chocolate", "lawngreen",
]

/home/manas/.snowsql/config
Connecting...


# Total cost breakdown 

## Visualization

## Breakdown of total cost by usage type (Done)

In [None]:
# Cost per usage category for the reporting window, rounded to 2 decimals.
df = qlib.total_cost_breakdown(sdate, edate).round(2)

# Append a summary row: a 'Total' label followed by the credits and dollars sums.
totals_row = ['Total', df['credits'].sum(), df['dollars'].sum()]
df.loc[len(df.index)] = totals_row

print(tabulate(df, headers='keys', tablefmt='rounded_outline', showindex=False))


In [None]:
# Plot from a view that excludes the trailing 'Total' row. The result is bound
# to a new name instead of rebinding `df`, which keeps this cell idempotent:
# re-running it no longer strips one more (real) row from `df` each time.
df_plot = df.iloc[:-1]

# Two side-by-side pies: the same categories measured in dollars and in credits.
fig = make_subplots(
    rows=1, cols=2,
    specs=[[{"type": "pie"}, {"type": "pie"}]],
    subplot_titles=("Dollars", "Credits")
)

category_labels = df_plot['cost_category'].tolist()
fig.add_trace(
    go.Pie(labels=category_labels, values=df_plot['dollars'].tolist(), name="Dollars"),
    row=1, col=1,
)
fig.add_trace(
    go.Pie(labels=category_labels, values=df_plot['credits'].tolist(), name='Credits'),
    row=1, col=2,
)

# Figure title is centred horizontally and anchored near the bottom (y=0.1).
fig.update_layout(
    title={
        'text': "Breakdown of total cost by usage category",
        'y':0.1,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'bottom'})
fig.show()

## Analysis
- Compute forms the overwhelming majority of the cost
- Cloud services are, in aggregate, < 10% of compute, so they are not an added expense
- Autoclustering used to be on in 2020 but isn't anymore
- Should explore if there are any compute efficiencies to be gained by 
    - Autoclustering
    - Materialized views / caching
    - Search optimization
    - Query optimizations / reducing scans

* Credits are largely used by compute
* In 2020 auto clustering was on, but it was later turned off - why?
* Storage credits are at 0 since storage is charged at a flat rate - are the numbers consistent?
* We definitely did not use $80K worth of credits in October, since the total bill is $50K - so something is wrong!


## Breakdown of total cost as a time series

In [None]:
# Fetch the total cost breakdown time series for the reporting window.
df = qlib.total_cost_breakdown_ts(sdate, edate)

In [None]:
# Preview the first rows of the frame loaded above.
df.head()

In [None]:
# Show only the rows whose category is 'Storage'.
is_storage = df['category_name'] == 'Storage'
df[is_storage]

In [None]:
# Total the numeric columns per user and time bucket.
# `numeric_only` is passed as a keyword boolean; the original passed the string
# 'numeric_only' positionally, which only worked because a non-empty string
# is truthy.
df_by_user = (
    df.groupby(['user_name', 'start_time', 'end_time'])
    .sum(numeric_only=True)
    .reset_index()
)
df_by_user.head()

In [None]:
# Total the numeric columns per cost category and time bucket, then plot the
# credits over time as an area chart coloured by category.
# `numeric_only` is passed as a keyword boolean; the original passed the string
# 'numeric_only' positionally, which only worked because a non-empty string
# is truthy.
df_by_category = (
    df.groupby(['category_name', 'start_time', 'end_time'])
    .sum(numeric_only=True)
    .reset_index()
)
fig = px.area(
    df_by_category,
    x="start_time",
    y="credits",
    color="category_name",
    color_discrete_sequence=color_scheme,
)
fig.show()

## Breakdown of total cost by warehouse

In [None]:
# Cost per warehouse for the reporting window, rounded to 2 decimals.
df = qlib.cost_by_wh(sdate, edate).round(2)

# Append a summary row: a 'Total' label followed by the credits and dollars sums.
totals_row = ['Total', df['credits'].sum(), df['dollars'].sum()]
df.loc[len(df.index)] = totals_row

print(tabulate(df, headers='keys', tablefmt='rounded_outline', showindex=False))

In [None]:
# Plot from a view that excludes the trailing 'Total' row. The result is bound
# to a new name instead of rebinding `df`, which keeps this cell idempotent:
# re-running it no longer strips one more (real) row from `df` each time.
df_plot = df.iloc[:-1]

fig = make_subplots(
    rows=1, cols=1,
    specs=[[{"type": "pie"}]],
)

# Single pie: dollars spent per warehouse.
fig.add_trace(
    go.Pie(
        labels=df_plot['warehouse_name'].tolist(),
        values=df_plot['dollars'].tolist(),
        name='dollars',
    ),
    row=1, col=1,
)

fig.update_layout(
    title={
        'text': "Breakdown of total cost by warehouse",
        'y':0.1,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'})
fig.show()

## Analysis
- The sum of compute across all warehouses is 1255.47 credits, which is consistent with the total cost of compute above. This is a sanity check to ensure we are not missing any compute or warehouse.

## Breakdown of cost by partner tools

In [None]:
# Credits attributed to each client application over the reporting window.
df = qlib.cost_by_partner_tool_ts(sdate, edate).round(2)

# NOTE(review): the last row of the raw result is discarded before aggregating.
# No totals row has been appended at this point, so this may be dropping real
# data -- confirm whether that is intended.
# `numeric_only` is passed as a keyword boolean; the original passed the string
# 'numeric_only' positionally, which only worked because a non-empty string
# is truthy.
df = (
    df.drop(len(df) - 1)
    .groupby('client_application_name')
    .sum(numeric_only=True)
    .reset_index()
)

# Append a summary row: a 'Total' label followed by the summed credits.
totals_row = ['Total', df['approximate_credits_used'].sum()]
df.loc[len(df.index)] = totals_row
print(tabulate(df, headers='keys', tablefmt='rounded_outline', showindex=False))

In [None]:
# Plot from a view that excludes the trailing 'Total' row. The original cell
# referenced `df_client_app`, a name that is never defined in this notebook
# (NameError on a fresh kernel); the frame built by the previous cell is used
# instead. Binding to a new name rather than rebinding `df` also keeps the
# cell idempotent on re-run.
df_plot = df.iloc[:-1]

fig = make_subplots(
    rows=1, cols=1,
    specs=[[{"type": "pie"}]],
)

# Single pie: approximate credits used per client application.
fig.add_trace(
    go.Pie(
        labels=df_plot['client_application_name'].tolist(),
        values=df_plot['approximate_credits_used'].tolist(),
        name='credits',
    ),
    row=1, col=1,
)

fig.update_layout(
    title={
        'text': "Breakdown of credits used by Client Apps",
        'y':0.9,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'})
fig.show()

## Breakdown of total cost by user (to do)

In [None]:
# Fetch the per-user cost time series for the reporting window and display it.
# NOTE(review): this displays the whole frame; consider `df.head()` if it is large.
df = qlib.cost_by_user_ts(sdate, edate)
df

## Warehouse configuration

In [None]:
# Fetch the warehouse configuration and render it as a text table.
df = qlib.warehouse_config()
config_table = tabulate(df, headers='keys', tablefmt='rounded_outline', showindex=False)
print(config_table)

In [None]:
# List the column labels of the warehouse configuration frame.
df.columns

In [None]:
# Select the 'name' column explicitly. Attribute-style access (`df.name`)
# silently depends on no DataFrame attribute of that name existing, and
# `name` is already an attribute on Series, so bracket indexing is unambiguous.
df['name']

## Analysis
- Auto-suspend times here vary from 60 min to 3600 min - these seem very large, assuming auto-resume does not take very long
- No resource monitors are being used - this should be corrected

## Cost of queries

In [None]:
# Fetch the expensive queries for the reporting window and display them.
# NOTE(review): the third argument (10) is presumably the number of queries to return -- confirm.
df = qlib.n_expensive_queries(sdate, edate, 10)
df

In [None]:
# Check whether the rows labelled 0 and 1 carry exactly the same query text.
df.loc[0, 'query_text'] == df.loc[1, 'query_text']

In [None]:
# Full query text of the row labelled 0.
df.loc[0, 'query_text']

In [None]:
# Full query text of the row labelled 1.
df.loc[1, 'query_text']

In [None]:
# NOTE(review): unlike the other cost queries in this notebook, no
# sdate/edate window is passed here -- confirm the defaults are what is wanted.
df = qlib.cost_by_wh_ts()

# Distinct warehouse names present in the result.
df['warehouse_name'].unique()

In [None]:
# NOTE(review): the window is hard-coded here rather than taken from
# sdate/edate -- confirm this is intended.
df = qlib.idle_users(
    start_date="2022-01-01",
    end_date="2022-02-02",
)
df.head()

In [12]:
# NOTE(review): the window is hard-coded here rather than taken from
# sdate/edate -- confirm this is intended.
df = qlib.users_full_table_scans(
    start_date="2022-01-01",
    end_date="2022-02-02",
)
df.head()

Unnamed: 0,user_name,count_of_queries


In [14]:
# NOTE(review): the window is hard-coded here rather than taken from
# sdate/edate -- confirm this is intended.
df = qlib.heavy_users(
    start_date="2022-01-01",
    end_date="2022-02-02",
)
df.head()

Unnamed: 0,user_name,warehouse_name,avg_pct_scanned


In [15]:
# NOTE(review): the window is hard-coded here rather than taken from
# sdate/edate -- confirm this is intended.
df = qlib.users_never_logged_in(
    start_date="2022-01-01",
    end_date="2022-02-02",
)
df.head()

Unnamed: 0,name,created_on,deleted_on,login_name,display_name,first_name,last_name,email,must_change_password,has_password,...,ext_authn_duo,ext_authn_uid,bypass_mfa_until,last_success_login,expires_at,locked_until_time,has_rsa_public_key,password_last_set_time,owner,default_secondary_role
0,TEST_EXERCISE_GG,2022-06-15 18:18:31.725000-07:00,2022-06-15 18:18:39.557000-07:00,TEST_EXERCISE_GG,TEST_EXERCISE_GG,,,,False,True,...,"""false""",,NaT,NaT,NaT,NaT,False,2022-06-15 18:18:32.368000-07:00,,
1,TEST_EXERCISE_PLASJ,2021-10-05 22:20:53.138000-07:00,2021-11-01 15:59:26.781000-07:00,TEST_EXERCISE_PLASJ,TEST_EXERCISE_PLASJ,,,,False,True,...,"""false""",,NaT,NaT,NaT,2021-11-01 16:59:26.632000-07:00,False,2021-10-05 22:20:53.168000-07:00,,
2,SPENCERM_DEV,2022-08-08 16:48:23.119000-07:00,NaT,SPENCERM_DEV,SPENCERM_DEV,,,,False,False,...,"""false""",,NaT,NaT,NaT,NaT,False,NaT,SECURITYADMIN,
3,YARIELI_DEV,2022-01-10 20:08:17.338000-08:00,NaT,YARIELI_DEV,YARIELI_DEV,,,,True,True,...,"""false""",,NaT,NaT,NaT,NaT,False,2022-01-10 20:08:17.360000-08:00,SECURITYADMIN,
4,TEST_EXERCISE_DV,2022-10-10 21:35:16.747000-07:00,NaT,TEST_EXERCISE_DV,TEST_EXERCISE_DV,,,,False,True,...,"""false""",,NaT,NaT,2022-10-17 21:35:16.916000-07:00,NaT,False,2022-10-10 21:35:16.747000-07:00,SECURITYADMIN,


In [16]:
# NOTE(review): this cell repeats the preceding users_never_logged_in call
# with identical arguments -- it looks redundant and can probably be removed.
df = qlib.users_never_logged_in(
    start_date="2022-01-01",
    end_date="2022-02-02",
)
df.head()

Unnamed: 0,name,created_on,deleted_on,login_name,display_name,first_name,last_name,email,must_change_password,has_password,...,ext_authn_duo,ext_authn_uid,bypass_mfa_until,last_success_login,expires_at,locked_until_time,has_rsa_public_key,password_last_set_time,owner,default_secondary_role
0,TEST_EXERCISE_GG,2022-06-15 18:18:31.725000-07:00,2022-06-15 18:18:39.557000-07:00,TEST_EXERCISE_GG,TEST_EXERCISE_GG,,,,False,True,...,"""false""",,NaT,NaT,NaT,NaT,False,2022-06-15 18:18:32.368000-07:00,,
1,TEST_EXERCISE_PLASJ,2021-10-05 22:20:53.138000-07:00,2021-11-01 15:59:26.781000-07:00,TEST_EXERCISE_PLASJ,TEST_EXERCISE_PLASJ,,,,False,True,...,"""false""",,NaT,NaT,NaT,2021-11-01 16:59:26.632000-07:00,False,2021-10-05 22:20:53.168000-07:00,,
2,SPENCERM_DEV,2022-08-08 16:48:23.119000-07:00,NaT,SPENCERM_DEV,SPENCERM_DEV,,,,False,False,...,"""false""",,NaT,NaT,NaT,NaT,False,NaT,SECURITYADMIN,
3,YARIELI_DEV,2022-01-10 20:08:17.338000-08:00,NaT,YARIELI_DEV,YARIELI_DEV,,,,True,True,...,"""false""",,NaT,NaT,NaT,NaT,False,2022-01-10 20:08:17.360000-08:00,SECURITYADMIN,
4,TEST_EXERCISE_DV,2022-10-10 21:35:16.747000-07:00,NaT,TEST_EXERCISE_DV,TEST_EXERCISE_DV,,,,False,True,...,"""false""",,NaT,NaT,2022-10-17 21:35:16.916000-07:00,NaT,False,2022-10-10 21:35:16.747000-07:00,SECURITYADMIN,


In [21]:
# NOTE(review): the window is hard-coded here rather than taken from
# sdate/edate -- confirm this is intended.
df = qlib.idle_roles(
    start_date="2022-01-01",
    end_date="2022-02-02",
)
df.head()


Unnamed: 0,created_on,deleted_on,name,comment,owner
0,2022-11-14 09:28:09.112000-08:00,NaT,KIV_ADMIN,,ACCOUNTADMIN
1,2022-09-26 13:24:02.852000-07:00,NaT,USERADMIN,,
2,2022-09-26 13:24:02.772000-07:00,NaT,PUBLIC,Public role is automatically available to ever...,
3,2022-09-26 13:24:02.839000-07:00,NaT,SYSADMIN,System administrator can create and manage dat...,
4,2022-09-26 13:24:02.795000-07:00,NaT,ACCOUNTADMIN,Account administrator can manage all aspects o...,


In [35]:
# NOTE(review): the window is hard-coded here rather than taken from
# sdate/edate -- confirm this is intended.
df = qlib.failed_tasks(
    start_date="2022-01-01",
    end_date="2022-02-02",
)
df.head()

Unnamed: 0,name,query_text,condition_text,schema_name,task_schema_id,database_name,task_database_id,scheduled_time,completed_time,state,return_value,query_id,query_start_time,error_code,error_message,graph_version,run_id,root_task_id,scheduled_from


In [9]:
# NOTE(review): the window is hard-coded here rather than taken from
# sdate/edate -- confirm this is intended.
df = qlib.long_running_tasks(
    start_date="2022-01-01",
    end_date="2022-02-02",
)
df.head()

Unnamed: 0,duration_seconds,name,query_text,condition_text,schema_name,task_schema_id,database_name,task_database_id,scheduled_time,completed_time,state,return_value,query_id,query_start_time,error_code,error_message,graph_version,run_id,root_task_id,scheduled_from
