In [None]:
# Make the directory two levels above the current working directory importable
import pathlib
import sys

sys.path.append(str(pathlib.Path.cwd().parent.parent))
# sys.path

In [None]:
# Suppress all warnings for the rest of the session.
# NOTE: 'ignore' never prints matching warnings; the 'once' action would be
# needed to show each warning a single time.
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Notebook display setup: third-party imports, full-width page layout,
# pandas display limits, and a shared list of colour names.
from IPython.display import display, HTML
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from tabulate import tabulate

# Stretch the notebook container to the full browser width
display(HTML("<style>.container { width:100% !important; }</style>"))

# from dash import Dash,html,dcc,Input,Output
# app = Dash(__name__)

# Let pandas render up to 500 rows and 500 columns before truncating
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)

# Colour names, presumably intended for multi-series plots
color_scheme = [
    "red", "blue", "green", "orange", "purple",
    "brown", "pink", "gray", "olive", "cyan",
    "darkviolet", "goldenrod", "darkgreen", "chocolate", "lawngreen",
]

In [None]:
# from dash import Dash,html,dcc

In [None]:
# ##center align all the figure outputs.
# from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# from IPython.display import display, HTML
# from plotly.graph_objs import *
# import numpy as np
# init_notebook_mode(connected=True)

# display(HTML("""
# <style>
# .output {
#     display: flex;
#     align-items: center;
#     text-align: center;
# }
# </style>
# """))

In [None]:
# Setting up autoreload for libs
# Load IPython's autoreload extension so edited modules are picked up
# without restarting the kernel.
%load_ext autoreload
# Mode 2: reload all modules (except those excluded via %aimport) before
# executing each cell.
%autoreload 2
# Import optiml.queries and mark it for autoreload.
%aimport optiml.queries

In [None]:
# Initialize the Snowflake connection, the local cache directory, the query
# library, and the analysis date range used by the cells below.
import os

from optiml.connection import SnowflakeConnConfig
from optiml.queries import SNFLKQuery

# NOTE(review): the account and warehouse names are hard-coded; consider
# reading them from configuration or the environment.
connection = SnowflakeConnConfig(
    accountname='jg84276.us-central1.gcp',
    warehousename="XSMALL_WH",
).create_connection()

# Local directory passed to the query library as its third argument
cache_dir = os.path.expanduser('~/data/kiva')

# Query library used by every analysis section
qlib = SNFLKQuery(connection, 'KIV', cache_dir)

# Analysis window (start and end date strings passed to each qlib call)
sdate = '2022-10-01'
edate = '2022-10-31'
print(f"The analysis is carried out for date range {sdate} to {edate}")

# Query Analysis

## Most expensive queries

In [None]:
# Load the result of n_expensive_queries for the analysis window (last
# argument 5, presumably the number of rows requested) and preview it.
# df.to_csv('/home/manas/DS_data/most_expensive.csv')
df = qlib.n_expensive_queries(sdate, edate, 5)
df.head()

In [None]:
## Most expensive queries
# Line+marker chart of the `credits` column against `query_id`.

trace1 = go.Scatter(
    x=df['query_id'],
    y=df['credits'],
    mode='lines+markers',
    name="Credits",
    marker_color='crimson',
)

# Disabled overlay: warehouse size as a bar series on a secondary y-axis.
# trace2 = go.Bar(
#         x = df['query_id'],
#         y = df['warehouse_size'],
#         name="warehouse size",
#         yaxis='y2',
#         marker_color ='green',
#         marker_line_width=1.5,
#         marker_line_color='rgb(8,48,107)',
#         opacity=0.5
#     )

# data = [trace1, trace2]
data = [trace1]

layout = go.Layout(
    title_text='Most expensive queries',
    xaxis={"title": "Query ID"},
    # range = [0, 100] is intentionally left unset on the y-axis
    yaxis={"title": "Credits"},
)
fig = go.Figure(data=data, layout=layout)
fig.show()

* Cost of running a query depends on the time to execute the query and the warehouse it is run on.
* Best practice to optimize would be to either reduce the execution time of the query or use efficient virtual warehouse practices.
* Establishing resource monitors to estimate credit usage would be beneficial.

### Query text

#### Query 1

In [None]:
# Print the query text of the first row of df
print(df["query_text"].iloc[0])

### ANALYSIS

* Has an execution time of 670 mins.
* SELECT statement
* Multiple JOIN statements in query.
* Most partitions are scanned.
* Executed using a SMALL warehouse.
* Scans 6.3 GB of data
* Scans 91% of data from cache


#### Query 2

In [None]:
# Print the query text of the second row of df
print(df["query_text"].iloc[1])

### ANALYSIS
* Execution time of 357 mins.
* FAILED query.
* Large number of bytes are spilled to both local (4719 GB) and remote (4491 GB) storage.
* Multiple JOIN statements in query.
* Most partitions are scanned.
* Executed using SMALL warehouse.
* Scans 83% of the data from cache.



#### Query 3

In [None]:
# Print the query text of the third row of df
print(df["query_text"].iloc[2])

### ANALYSIS
* Same query ran multiple times.
* UPDATE query
* Has execution time of 37 mins.
* Execution time remains constant.
* 146 GB of data scanned.
* Scans 63% of total partitions.
* Only 10% of the data is scanned from cache.

### OPTIML RECOMMENDATIONS

* PROD_WH can be scaled out as a multi-cluster warehouse, as a large amount of data is being processed with several users using it.
* Defining clustering keys will help with reducing number of partitions in the table being scanned.
* Scaling up is also required for PROD_WH because bytes are being spilled to storage, causing the query to fail.
* The UPDATE query reads little data from cache, which suggests that either auto-suspension happens too quickly or the cache needs to be built before running this query.
* Optimize the SELECT query by reducing the number of JOINs and SELECT clauses.

## Queries that spill to storage

In [None]:
# Load the result of n_queries_spill_to_storage for the analysis window
# (last argument 5, presumably the number of rows requested) and preview it.
df = qlib.n_queries_spill_to_storage(sdate, edate, 5)
df.head()




In [None]:
## Queries that spilled the most to storage
# Two line+marker series against `query_id`: bytes spilled to remote storage
# and bytes spilled to local storage.

trace1 = go.Scatter(
    x=df['query_id'],
    y=df['bytes_spilled_to_remote_storage'],
    mode='lines+markers',
    name="Bytes Spilled Remote",
    marker_color='crimson',
)
trace2 = go.Scatter(
    x=df['query_id'],
    y=df['bytes_spilled_to_local_storage'],
    mode='lines+markers',
    name="Bytes Spilled Local",
    marker_color='purple',
)
data = [trace1, trace2]

layout = go.Layout(
    title_text='Queries that spilled the most to storage',
    xaxis={"title": "Query ID"},
    # range = [0, 100] is intentionally left unset on the y-axis
    yaxis={"side": "left", "title": "Bytes spilled"},
)
fig = go.Figure(data=data, layout=layout)
fig.show()

### Query text

#### Query 1

In [None]:
# Print the query text of the first row of df
print(df["query_text"].iloc[0])

#### Query 2

In [None]:
# Print the query text of the second row of df
print(df["query_text"].iloc[1])

#### Query 3

In [None]:
# Print the query text of the third row of df
print(df["query_text"].iloc[2])

### ANALYSIS

<div class="alert alert-info">

Queries 1, 2 and 3 each fail after an execution time of 29 minutes.
* Query 1 (query_id: 01a78b77-0604-f44d-0000-08d15ddd870e) - 300 GB spilling to local storage and 200 GB to remote storage.
* Query 2 (query_id: 01a78b55-0604-f44e-0000-08d15ddd55da) - 170 GB spilling to local storage and 90 GB to remote storage.
* Query 3 (query_id: 01a78b55-0604-f44e-0000-08d15ddd55d6) - 245 GB spilling to local storage and 78 GB to remote storage.

Most partitions are scanned in the table for these 3 queries.
There are also multiple join and select statements for each query.
</div>




### Optiml TODO



<div class="alert alert-warning">
    
* Check if best virtual warehouse practices can be applied to most queries in this warehouse
</div>

### Optiml Recommendation




<div class="alert alert-success">

* Utilize a larger warehouse - Warehouse sizes of X-Small and Small are used. Hundreds of GB are spilled due to insufficient resources.

* Optimize the query - Reducing the number of JOIN, SELECT and UNION statements would improve the performance.

* Reduce the data that is being processed (e.g. redundant columns used in computation).

* Split processing into multiple steps.

</div>

## Queries that scanned the most data

In [None]:
# Load the result of n_queries_scanned_most_data for the analysis window
# (last argument 5, presumably the number of rows requested) and preview it.
df = qlib.n_queries_scanned_most_data(sdate, edate, 5)
df.head()

In [None]:
## Queries that scanned the most partitions
# Line+marker chart of the `partitions_scanned` column against `query_id`.

trace1 = go.Scatter(
    mode='lines+markers',
    x=df['query_id'],
    y=df['partitions_scanned'],
    name="Partitions Scanned",
    marker_color='crimson',
)

data = [trace1]

layout = go.Layout(
    title_text='Queries that scanned the most partitions',
    yaxis=dict(
        # range = [0, 100],
        side='left',
        # The y values are partition counts, so label the axis accordingly
        # (it was previously titled "Bytes spilled" after a copy-paste).
        title="Partitions scanned",
    ),
    xaxis=dict(
        title="Query ID",
    ),
)
fig = go.Figure(data=data, layout=layout)
fig.show()

### Query text

#### Query 1

In [None]:
# Print the query text of the first row of df
print(df["query_text"].iloc[0])

### ANALYSIS

* Inadequate pruning is observed. 99% of the partitions on the table are scanned.
* Execution time is 3.5 minutes.
* 2 GB is spilled onto local storage

#### Query 2

In [None]:
# Print the query text of the second row of df
print(df["query_text"].iloc[1])

### ANALYSIS

* Inadequate pruning is observed. 99% of the partitions on the table are scanned.
* Execution time is 7.7 minutes.
* 16.9 GB is spilled onto local storage.

#### Query 3

In [None]:
# Print the query text of the third row of df
print(df["query_text"].iloc[2])

### ANALYSIS

* Inadequate pruning is observed. 99% of the partitions on the table are scanned.
* Execution time is 3.5 minutes.
* 2 GB is spilled onto local storage.

### OPTIML Recommendations

* Queries are the same.
* Cluster keys can be defined to reduce partitions scanned. 
* Choose cluster key that appears frequently in a WHERE clause
* Scaling up the warehouse would reduce bytes spilling to local storage.
* Query can be optimized by reducing number of JOIN statements, eliminating redundant SELECT statements and using DISTINCT clauses.

## Most cached queries

In [None]:
# Load the result of n_most_cached_queries for the analysis window
# (last argument 5, presumably the number of rows requested) and preview it.
df = qlib.n_most_cached_queries(sdate, edate, 5)
df.head()


In [None]:
## Queries that scanned the most from cache
# Line+marker chart of `percent_scanned_from_cache` against `query_id`.

trace1 = go.Scatter(
    x=df['query_id'],
    y=df['percent_scanned_from_cache'],
    mode='lines+markers',
    name="Percent Scanned From Cache",
    marker_color='crimson',
)
data = [trace1]

layout = go.Layout(
    title_text='Queries that scanned the most percent from cache',
    xaxis={"title": "Query ID"},
    # range = [0, 100] is intentionally left unset on the y-axis
    yaxis={"side": "left", "title": "Percent Scanned From Cache"},
)
fig = go.Figure(data=data, layout=layout)
fig.show()

#### Query 1

In [None]:
# Print the query text of the first row of df. This cell sits under the
# "Query 1" heading; it previously printed row 2, duplicating "Query 3".
print(df.iloc[0]["query_text"])

#### Query 2

In [None]:
# Print the query text of the second row of df
print(df["query_text"].iloc[1])

#### Query 3

In [None]:
# Print the query text of the third row of df
print(df["query_text"].iloc[2])

## Most executed 'select' queries -- update this for select statements

In [None]:
# Load the result of n_most_executed_select_queries for the analysis window
# (last argument 10, presumably the number of rows requested) and preview it.
# df.to_csv("/home/manas/DS_data/most_executed_select.csv")
df = qlib.n_most_executed_select_queries(sdate, edate, 10)
df.head()



In [None]:
## Queries that were executed the most number of times
# Line+marker chart of `number_of_times_executed` against the frame's index.
# NOTE(review): x is df.index, yet the x-axis is titled "Query ID" - confirm
# whether the index actually holds query IDs or the label should change.

trace1 = go.Scatter(
    x=df.index,
    y=df['number_of_times_executed'],
    mode='lines+markers',
    name="Number of times executed",
    marker_color='crimson',
)
data = [trace1]

layout = go.Layout(
    title_text='Queries that were executed the most number of times',
    xaxis={"title": "Query ID"},
    # range = [0, 100] is intentionally left unset on the y-axis
    yaxis={"side": "left", "title": "Number of times executed"},
)
fig = go.Figure(data=data, layout=layout)
fig.show()

#### Query 1

In [None]:

# Print the query text of the first row of df
print(df["query_text"].iloc[0])

#### Query 2

In [None]:
# Print the query text of the second row of df
print(df["query_text"].iloc[1])

#### Query 3

In [None]:
# Print the query text of the row at position 3 (the fourth row) of df.
# NOTE(review): this cell is headed "Query 3" but skips position 2 - confirm
# whether that is deliberate or should be position 2.
print(df["query_text"].iloc[3])

### ANALYSIS
* Query 1 was executed 116,028 times in 30 days.
* Query 2 was executed 19,770 times in 30 days.
* Query 3 was executed 13,815 times in 30 days.

### OPTIML Recommendations
* Can convert SELECT Queries as Materialized Views

## MOST EXECUTED QUERIES


In [None]:
# Load the result of caching_warehouse for the analysis window (last
# argument 5) and preview it.
# NOTE(review): the surrounding section is titled "MOST EXECUTED QUERIES",
# but the call is caching_warehouse - confirm which one is intended.
df = qlib.caching_warehouse(sdate, edate, 5)
df.head()

## LONGEST RUNNING QUERIES

In [None]:
# Load the result of longest_running_queries for the analysis window
# (last argument 5, presumably the number of rows requested) and preview it.
df = qlib.longest_running_queries(sdate, edate, 5)
df.head()

#### QUERY 1

In [None]:
# Print the query text of the first row of df
print(df["query_text"].iloc[0])

#### QUERY 2

In [None]:
# Print the query text of the second row of df
print(df["query_text"].iloc[1])

#### QUERY 3

In [None]:
# Print the query text of the third row of df
print(df["query_text"].iloc[2])

### ANALYSIS

* Queries are the same.
* Updating event query.
* Execution time for query is approximately 135 minutes.
* 65% of total partitions of table are scanned.
* No spillage onto local or remote storage.
* No Query overloading.


### OPTIML to do

* Need to run update query to see if performance tuning can be done.

### OPTIML Recommendations
* Cluster keys can be defined to reduce execution time. WHERE clause present in query can be utilized.
* Warehouse scaling up won't help with performance.


# Notes

* For most expensive queries i.e. consuming most credits steps are:
  * Look at the amount of data the queries are operating on to see if it passes the sniff test
  * Identify causes of credit consumption, e.g. using INSERT instead of COPY INTO