In [12]:
# Make the directory two levels above the current working directory importable,
# so that project packages located there can be imported by later cells.
import sys, pathlib
project_root = pathlib.Path.cwd().parent.parent
sys.path.append(str(project_root))

In [13]:
# Set to show warnings only once
import warnings
warnings.filterwarnings('ignore')

In [14]:
# Notebook display configuration: imports first, then settings.
from IPython.display import display, HTML
import pandas as pd

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from tabulate import tabulate

# Let the notebook container use the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))

# Allow wide/long DataFrames to render without truncation (up to 500 rows/columns).
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500

# Named colors available for figures.
color_scheme = [
    "red", "blue", "green", "orange", "purple",
    "brown", "pink", "gray", "olive", "cyan",
    "darkviolet", "goldenrod", "darkgreen", "chocolate", "lawngreen",
]

In [15]:
from dash import Dash,html,dcc

In [16]:
# ## Center-align all the figure outputs.
# from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# from IPython.display import display, HTML
# from plotly.graph_objs import *
# import numpy as np
# init_notebook_mode(connected=True)

# display(HTML("""
# <style>
# .output {
#     display: flex;
#     align-items: center;
#     text-align: center;
# }
# </style>
# """))

In [17]:
# Setting up autoreload for libs
# Load IPython's autoreload extension. Re-running this cell in a live kernel
# prints "The autoreload extension is already loaded" (see the output below).
%load_ext autoreload
# Mode 2: per the IPython docs, reload all modules before executing each cell.
%autoreload 2
# Explicitly register optiml.queries with autoreload.
# NOTE(review): presumably redundant under mode 2 -- confirm before removing.
%aimport optiml.queries

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [18]:
# Initialize connection to Snowflake and set analysis date
# NOTE(review): the account identifier and warehouse name are hardcoded here;
# consider reading them from environment variables or a config file.
from optiml.connection import SnowflakeConnConfig
connection = SnowflakeConnConfig(accountname='jg84276.us-central1.gcp',warehousename="XSMALL_WH").create_connection()
# Initialize local environment: cache directory under the user's home
import os
cache_dir = os.path.expanduser('~/data/kiva')
# Initialize query library
from optiml.queries import SNFLKQuery
qlib = SNFLKQuery(connection, 'KIV', cache_dir)
# Date range (start/end) passed to every qlib query in the cells below
sdate = '2022-10-01'
edate = '2022-10-31'
print(f"The analysis is carried out for date range {sdate} to {edate}")

Connecting...
The analysis is carried our for date range 2022-10-01 to 2022-10-31


# Query Analysis

## Most expensive queries

In [19]:
# Load the most expensive queries for the analysis window and preview them.
# The third argument (5) is presumably the number of queries requested.
df = qlib.n_expensive_queries(sdate, edate, 5)
df.head()

Unnamed: 0,query_id,query_type,query_text,user_name,nodes,role_name,database_name,schema_name,warehouse_name,warehouse_size,bytes_spilled_to_local_storage,bytes_spilled_to_remote_storage,partitions_scanned,partitions_total,compilation_time_sec,execution_time_min,credits,cluster_number,execution_status
0,01a0b506-0403-24a1-0000-08d150c21fbe,SELECT,with x as ( SELECT  vertex_fact_loan_displa...,GREGORYW,2.0,VERTEX_ADMIN_ROLE_PROD,KIVA_PROD,PUBLIC,PROD_WH,SMALL,0.0,0.0,1557.0,1621.0,1.49,670.02,22.33,1.0,SUCCESS
1,01a00769-0502-e6ee-0000-08d14f6b44e6,SELECT,with interaction_grid as (  ...,ROBS,2.0,SYSADMIN,KIVA_PROD,PUBLIC,PROD_WH,SMALL,5066634000000.0,4823005000000.0,371.0,2143.0,5.31,357.48,11.92,1.0,FAIL
2,01a750e9-0604-e3d8-0000-08d15d569cba,UPDATE,update iterable.event t set  ip = null  ...,DBT_PROD,4.0,VERTEX_ADMIN_ROLE_PROD,KIVA_PROD,DBT_MATERIALIZED,DAILY_REFRESH_WH,MEDIUM,0.0,0.0,9322.0,14788.0,0.32,37.87,2.52,1.0,SUCCESS
3,01a6a289-0604-c05e-0000-08d15be6ec1a,UPDATE,update iterable.event t set  ip = null  ...,DBT_PROD,4.0,VERTEX_ADMIN_ROLE_PROD,KIVA_PROD,DBT_MATERIALIZED,DAILY_REFRESH_WH,MEDIUM,0.0,0.0,8927.0,13854.0,0.49,37.52,2.5,1.0,SUCCESS
4,01a718a9-0604-d7b2-0000-08d15cddcba2,UPDATE,update iterable.event t set  ip = null  ...,DBT_PROD,4.0,VERTEX_ADMIN_ROLE_PROD,KIVA_PROD,DBT_MATERIALIZED,DAILY_REFRESH_WH,MEDIUM,0.0,0.0,9174.0,14548.0,0.51,37.35,2.49,1.0,SUCCESS


In [21]:
# Plot the credits of each query in `df`, one point per query ID.
trace1 = go.Scatter(
    x=df['query_id'],
    y=df['credits'],
    mode='lines+markers',
    name="Credits",
    marker=dict(color='crimson'),
)
data = [trace1]

# Figure title and axis labels.
layout = go.Layout(
    title=dict(text='Most expensive queries'),
    xaxis=dict(title="Query ID"),
    yaxis=dict(title="Credits"),
)

fig = go.Figure(data=data, layout=layout)
fig.show()

### Query text

#### Query 1

In [None]:
# Full text of the first query in `df`.
print(df["query_text"].iloc[0])

#### Query 2

In [None]:
# Full text of the second query in `df`.
print(df["query_text"].iloc[1])

#### Query 3

In [None]:
# Full text of the third query in `df`.
print(df["query_text"].iloc[2])

## Queries that spill to storage

In [None]:
# Load the queries that spilled to storage during the analysis window and preview them.
# The third argument (5) is presumably the number of queries requested.
df = qlib.n_queries_spill_to_storage(sdate, edate, 5)
df.head()

In [25]:
# Plot remote and local spill volume for each query in `df`, one point per query ID.
trace1 = go.Scatter(
    x=df['query_id'],
    y=df['bytes_spilled_to_remote_storage'],
    mode='lines+markers',
    name="Bytes Spilled Remote",
    marker=dict(color='crimson'),
)
trace2 = go.Scatter(
    x=df['query_id'],
    y=df['bytes_spilled_to_local_storage'],
    mode='lines+markers',
    name="Bytes Spilled Local",
    marker=dict(color='purple'),
)
data = [trace1, trace2]

# Figure title and axis labels; both traces share the left-hand y axis.
layout = go.Layout(
    title=dict(text='Queries that spilled the most to storage'),
    xaxis=dict(title="Query ID"),
    yaxis=dict(title="Bytes spilled", side='left'),
)

fig = go.Figure(data=data, layout=layout)
fig.show()

### Query text

#### Query 1

In [None]:
# Full text of the first query in `df`.
print(df["query_text"].iloc[0])

#### Query 2

In [None]:
# Full text of the second query in `df`.
print(df["query_text"].iloc[1])

#### Query 3

In [None]:
# Full text of the third query in `df`.
print(df["query_text"].iloc[2])

## Queries that scanned the most data

In [None]:
# Load the queries that scanned the most data during the analysis window and preview them.
# The third argument (5) is presumably the number of queries requested.
df = qlib.n_queries_scanned_most_data(sdate, edate, 5)
df.head()

In [31]:
## Queries that scanned most data
# Plot the number of partitions scanned by each query in `df`, one point per query ID.

trace1  = go.Scatter(
        mode='lines+markers',
        x = df['query_id'],
        y = df['partitions_scanned'],
        name="Partitions Scanned",
        marker_color='crimson'
    )

data = [trace1]

layout = go.Layout(
    title_text='Queries that scanned the most partitions',
    yaxis=dict(
        # range = [0, 100],
        side = 'left',
        # The trace plots df['partitions_scanned'], so label the axis accordingly
        # (it previously read "Bytes spilled", copied from the spill chart).
        title="Partitions scanned"

    ),
    xaxis=dict(
        title="Query ID"

    )

)
fig = go.Figure(data=data, layout=layout)
fig.show()

### Query text

#### Query 1

In [None]:
# Full text of the first query in `df`.
print(df["query_text"].iloc[0])

#### Query 2

In [None]:
# Full text of the second query in `df`.
print(df["query_text"].iloc[1])

#### Query 3

In [None]:
# Full text of the third query in `df`.
print(df["query_text"].iloc[2])

## Most cached queries

In [29]:
# Load the most cached queries for the analysis window and preview them.
# The third argument (5) is presumably the number of queries requested.
df = qlib.n_most_cached_queries(sdate, edate, 5)
df.head()

Unnamed: 0,query_id,query_type,query_text,user_name,role_name,database_name,schema_name,warehouse_name,warehouse_size,bytes_spilled_to_local_storage,bytes_spilled_to_remote_storage,partitions_scanned,partitions_total,compilation_time_sec,execution_time_min,cluster_number,execution_status,bytes_scanned,percent_scanned_from_cache
0,01a76e5c-0604-eb4f-0000-08d15d962cba,SELECT,"select coalesce(abs(sum(yu.total)), 0.0)::...",VERTEX_API_PROD,VERTEX_API_PROD_ROLE,KIVA_PROD,,PROD_WH,Small,0.0,0.0,34.0,34.0,0.08,0.0,1.0,SUCCESS,139290112.0,100.0
1,01a76d26-0604-eb4f-0000-08d15d938dc2,SELECT,"SELECT p.sector_id , count(di...",VERTEX_API_PROD,VERTEX_API_PROD_ROLE,KIVA_PROD,,PROD_WH,Small,0.0,0.0,96.0,96.0,0.06,0.01,1.0,SUCCESS,579883520.0,100.0
2,01a76d23-0604-eb4f-0000-08d15d938c82,UPDATE,"UPDATE ""SALESFORCE"".""CASE_TAG"" SET ""IS_DELETED...",FIVETRAN_USER_DEV,FIVETRAN_ROLE_DEV,KIVA_DEV,,DEV_WH,Small,0.0,0.0,1.0,1.0,0.3,0.0,1.0,SUCCESS,10240.0,100.0
3,01a76f5c-0604-e414-0000-08d15d98155a,SELECT,"SELECT  vertex_dim_partner.PARTNER_ID AS ""v...",LOOKER_PROD,LOOKER_PROD_ROLE,KIVA_PROD,VERTEX_MATERIALIZED,PROD_WH,Small,0.0,0.0,34.0,34.0,0.19,0.01,1.0,SUCCESS,28445696.0,100.0
4,01a76f1b-0604-eb4f-0000-08d15d978686,SELECT,SELECT top_country FROM dbt_materialized.dbt_d...,VERTEX_API_PROD,VERTEX_API_PROD_ROLE,KIVA_PROD,,PROD_WH,Small,0.0,0.0,24.0,24.0,0.05,0.0,1.0,SUCCESS,16543744.0,100.0


In [32]:
# Plot the percentage of data scanned from cache for each query in `df`,
# one point per query ID.
trace1 = go.Scatter(
    x=df['query_id'],
    y=df['percent_scanned_from_cache'],
    mode='lines+markers',
    name="Percent Scanned From Cache",
    marker=dict(color='crimson'),
)
data = [trace1]

# Figure title and axis labels.
layout = go.Layout(
    title=dict(text='Queries that scanned the most percent from cache'),
    xaxis=dict(title="Query ID"),
    yaxis=dict(title="Percent scanned from cache", side='left'),
)

fig = go.Figure(data=data, layout=layout)
fig.show()

#### Query 1

In [None]:
# Full text of the first query in `df`. This cell sits under the "Query 1"
# heading; it previously printed row 2, duplicating the "Query 3" cell.
print(df.iloc[0]["query_text"])

#### Query 2

In [None]:
# Full text of the second query in `df`.
print(df["query_text"].iloc[1])

#### Query 3

In [None]:
# Full text of the third query in `df`.
print(df["query_text"].iloc[2])

## Most executed 'select' queries -- update this for select statements

In [36]:
# Load the most executed SELECT queries for the analysis window.
# The third argument (10) is presumably the number of queries requested;
# head() previews only the first five rows.
df = qlib.n_most_executed_select_queries(sdate, edate, 10)
df.head()

Unnamed: 0,query_text,query_type,number_of_times_executed,sum(bytes_spilled_to_local_storage),sum(bytes_spilled_to_remote_storage),execution_seconds,execution_minutes,execution_hours,sum(partitions_scanned),sum(partitions_total),max(cluster_number)
0,select latest_timestamp from dbt_materialized....,SELECT,116028.0,0.0,0.0,5050.408,84.173467,1.402891,0.0,0.0,
1,SELECT CURRENT_SESSION(),SELECT,19770.0,0.0,0.0,1018.68,16.978,0.282967,0.0,0.0,
2,SELECT CURRENT_REGION(),SELECT,13815.0,0.0,0.0,730.535,12.175583,0.202926,0.0,0.0,
3,SELECT MAX(sequence_number) FROM snowplow.event,SELECT,13197.0,0.0,0.0,40151.255,669.187583,11.153126,21358978.0,21834253.0,1.0
4,SELECT FLOOR((EXTRACT(EPOCH FROM CURRENT_TIMES...,SELECT,13197.0,0.0,0.0,2157.082,35.951367,0.599189,0.0,0.0,1.0


In [37]:
## Queries that were executed the most number of times
# Plot the execution count of each row in `df`. This result set is aggregated by
# query text and has no query_id column, so the DataFrame row index is the x value.

trace1  = go.Scatter(
        mode='lines+markers',
        x = df.index,
        y = df['number_of_times_executed'],
        name="Number of times executed",
        marker_color='crimson'
    )

data = [trace1]

layout = go.Layout(
    title_text='Queries that were executed the most number of times',
    yaxis=dict(
        # range = [0, 100],
        side = 'left',
        title="Number of times executed"

    ),
    xaxis=dict(
        # x is the row index of df, not a query ID, so label it as such.
        title="Query index",
        # One tick per row so the axis shows whole numbers only.
        tickmode='linear',
        dtick=1

    )
)
fig = go.Figure(data=data, layout=layout)
fig.show()

#### Query 1

In [None]:

# Full text of the first query in `df`.
print(df["query_text"].iloc[0])

#### Query 2

In [None]:
# Full text of the second query in `df`.
print(df["query_text"].iloc[1])

#### Query 3

In [None]:
# Full text of the third query in `df`. This cell sits under the "Query 3"
# heading; it previously printed row 3 (the fourth query), unlike every other
# section, where Query 3 is row 2.
print(df.iloc[2]["query_text"])