In [None]:
# Adding system path
# Make the directory two levels above the current working directory importable.
# NOTE(review): presumably this is the repository root that contains the `optiml` package — confirm.
import sys, pathlib
_project_root = str(pathlib.Path.cwd().parent.parent)
# Guard against duplicates so that re-running this cell does not keep growing sys.path.
if _project_root not in sys.path:
    sys.path.append(_project_root)
# sys.path

In [None]:
# Suppress all warnings: the 'ignore' action hides every warning from this point on.
# NOTE(review): the original comment on this cell said "show warnings only once"; that would be
# warnings.filterwarnings('once') — confirm which behaviour is actually wanted.
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Setting up displays
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

import math

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from tabulate import tabulate
color_scheme=["red","blue","green","orange","purple","brown","pink","gray","olive","cyan","darkviolet","goldenrod","darkgreen","chocolate","lawngreen"]

In [None]:
# Setting up autoreload for libs
%load_ext autoreload
%autoreload 2
%aimport optiml.queries

In [None]:
# Initialize connection to Snowflake and set analysis date
# NOTE(review): account name and warehouse are hard-coded here; consider moving them to configuration.
from optiml.connection import SnowflakeConnConfig
connection = SnowflakeConnConfig(accountname='jg84276.us-central1.gcp',warehousename="XSMALL_WH").create_connection()
# Initialize local environment
import os
cache_dir = os.path.expanduser('~/data/kiva')
# Initialize query library
# NOTE(review): 'KIV' and cache_dir are passed straight to SNFLKQuery; presumably a customer/database
# prefix and a local cache location for query results — confirm against optiml.queries.
from optiml.queries import SNFLKQuery
qlib = SNFLKQuery(connection, 'KIV', cache_dir)
# Analysis window (start and end date strings) passed to every qlib query below.
sdate = '2022-09-12'
edate = '2022-10-12'
print(f"The analysis is carried out for date range {sdate} to {edate}")

In [None]:
import hashlib

# Query Analysis

## Analysis setup
<div class="alert alert-warning">

* Analysis date range: '2022-09-12' to '2022-10-12': last rolling month in the data we collected.

* Type of Snowflake account: Standard Edition

* Credit to dollar conversion: `$`2 per credit

</div>

### Function for query text hashing

In [None]:
def get_query_hash(txt):
    """Return the MD5 hash object of a query text.

    The digest is used only as a fingerprint for grouping identical query
    texts, not for anything security-sensitive.

    Args:
        txt: The query text, either as ``bytes`` (the original contract) or
            as ``str``, which is encoded with the default UTF-8 codec first.

    Returns:
        The ``hashlib`` MD5 hash object; call ``.hexdigest()`` on it to get
        the hexadecimal fingerprint.
    """
    # hashlib.md5 only accepts bytes-like input; encode str so callers need not.
    if isinstance(txt, str):
        txt = txt.encode()
    return hashlib.md5(txt)

def get_unique_query(df):
    """Collapse a per-execution query frame to one row per distinct query text.

    Queries are considered identical when their ``query_text`` matches exactly
    (same MD5 hex digest).

    Args:
        df: DataFrame with at least the columns ``query_text``, ``credits``,
            ``execution_status``, ``execution_success`` and ``execution_fail``.
            Side effect: a ``query_hash`` column is added to (or overwritten
            in) this frame.

    Returns:
        A new DataFrame with one row per ``query_hash``, sorted by
        ``credits_total`` in descending order (index not reset). Columns:

        * the non-aggregated columns of the first row seen for each hash
          (``execution_status`` is dropped);
        * ``credits_individual_max``: the ``credits`` value of that first row.
          NOTE(review): this is the per-query maximum only if ``df`` arrives
          sorted by ``credits`` descending — confirm against the caller;
        * ``credits_total``, ``execution_success``, ``execution_fail``: sums
          over all rows sharing the hash.
    """
    # Fingerprint every query text; identical text yields an identical hash.
    df["query_hash"] = [get_query_hash(q.encode()).hexdigest() for q in df["query_text"]]

    # Per-hash totals of cost and success/failure counts.
    totals_per_hash = df.groupby('query_hash', as_index=False)[['credits', 'execution_success', 'execution_fail']].sum()

    # Representative (first) row per hash, without the columns that are replaced by
    # the aggregated versions or that are meaningless after aggregation.
    first_row_per_hash = (
        df.drop_duplicates(subset=['query_hash'])
        .reset_index(drop=True)
        .drop(columns=["execution_status", "execution_success", "execution_fail"])
    )

    # Only `credits` exists on both sides, so only it receives the suffixes.
    merged = pd.merge(first_row_per_hash, totals_per_hash, on=["query_hash"], suffixes=('_individual_max', '_total'))
    return merged.sort_values(by='credits_total', ascending=False)

## Most expensive queries

In [None]:
# Load the expensive-queries result for the analysis window into `df`.
# NOTE(review): the third argument (200) is presumably the number of rows to return — confirm against SNFLKQuery.
df = qlib.n_expensive_queries(sdate,edate,200)

In [None]:
# Turn the status string into 0/1 indicator columns so that successes and
# failures can be summed per query later on.
df["execution_success"] = df["execution_status"].eq("SUCCESS").astype(int)
df["execution_fail"] = df["execution_status"].eq("FAIL").astype(int)
df.head()

In [None]:
##TODO: Temporal distribution of expensive queries by warehouse - DEV_WH 8-10: 954a79a28187b69c99ca3de1d8a106ce
# Aggregate executions by query-text hash. get_unique_query returns the rows sorted by
# credits_total (descending) and also adds a "query_hash" column to `df` itself;
# reset_index gives the result a clean 0..n-1 index in that sorted order.
df_unique = get_unique_query(df).reset_index(drop=True)

In [None]:
## Most expensive queries (individual executions, not aggregated by query text)

# Only the first 50 rows of the per-execution frame `df` are plotted.
trace1  = go.Scatter(
        mode='lines+markers',
        x = df['query_id'][0:50],
        y = df['credits'][0:50],
        name="Credits",
        line=dict(color='black'),
        marker=dict(
            size=10,
            # Colour each point by whether THAT execution failed. The flag must come from
            # the same rows of `df` as x/y; `df_unique` is aggregated and ordered differently.
            color=(df["execution_fail"][0:50] > 0).astype(int),
            # Pin the colour range so 0 is always green and 1 always red, even when
            # every plotted point has the same flag value.
            cmin=0,
            cmax=1,
            colorscale=[[0, 'green'], [1, 'red']]
        )
    )


data = [trace1]

layout = go.Layout(
    title_text='Naive Most expensive queries',
    yaxis=dict(
        title="Credits"
    ),
    xaxis=dict(
        title="Query ID"
    )
)
fig = go.Figure(data=data, layout=layout)
fig.show()

In [None]:
## Most expensive queries, aggregated by query hash

# Stacked bars on the right-hand axis: how often each query succeeded / failed.
trace1 = go.Bar(
    name="Execution success count",
    x=df_unique['query_hash'],
    y=df_unique['execution_success'],
    yaxis='y2',
    marker={'color': 'green'},
    opacity=0.5,
)
trace2 = go.Bar(
    name="Execution fail count",
    x=df_unique['query_hash'],
    y=df_unique['execution_fail'],
    yaxis='y2',
    marker={'color': 'red'},
    opacity=0.5,
)

# Line on the left-hand axis: credits summed over all executions of each query.
trace3 = go.Scatter(
    name="Credits",
    mode='lines+markers',
    x=df_unique['query_hash'],
    y=df_unique['credits_total'],
    line={'color': 'black'},
)

data = [trace1, trace2, trace3]

# Both y-axes start at 0 and leave 10 units of headroom above the largest value.
layout = go.Layout(
    title_text='Most expensive queries',
    barmode="stack",
    xaxis={'title': "Query Hash"},
    yaxis={
        'title': "Total Credits",
        'showgrid': False,
        'range': [0, math.ceil(max(df_unique["credits_total"])) + 10],
    },
    yaxis2={
        'title': "Number of times ran",
        'overlaying': "y",
        'side': "right",
        'showgrid': False,
        'range': [0, math.ceil(max(df_unique["execution_success"] + df_unique["execution_fail"])) + 10],
    },
    legend={
        'yanchor': "top",
        'y': 0.99,
        'xanchor': "right",
        'x': 0.99,
    },
)
fig = go.Figure(data=data, layout=layout)
fig.show()

In [None]:
df_unique.head(20)

### Analysis

<div class="alert alert-info">
    
* Naively optimizing the most expensive query is not an effective strategy. Some queries:
    * Fail to run to completion generating no ROI but use resources
    * Run more frequently than others so optimizing them will lead to better ROI
    * Queries that fail to run or cumulatively consume more credits should be optimized first
    
* Queries aggregated by query hash (i.e. query text matching exactly) show:
    * Query IDs: 01a78358-0604-ec7d-0000-08d15dc9f6fe, 01a78350-0604-ec7d-0000-08d15dc9f016  might be responsible for increase in compute consumption for DEV_WH on 8th Oct. (alternatively could be WH scaling/size change as well)
    * Query Hash: 6ef334ed61427031d52acf53a055ab57 cumulatively consumes the most credits (~28) and runs to completion 12 times
    * Query Hash: 49539ab6df6b8a1df712dbbd70efe3e3 fails each of the 3 times it runs and costs ~3 credits
    * Multiple queries that fail once during the analysis period

</div>

### Actions and Recommendations

<div class="alert alert-success">

* High spillage -> consider a larger instance
* Large number of partitions scanned:
    * Consider enabling Search Optimization for selective queries
    * Consider enabling Automatic Clustering with thoughtfully chosen cluster keys
* SCD2 for update queries (https://community.snowflake.com/s/article/Building-a-Type-2-Slowly-Changing-Dimension-in-Snowflake-Using-Streams-and-Tasks-Part-1)
    
</div>

### Query text

#### Query 1

In [None]:
print(df_unique.iloc[0]["query_text"])

#### Query 2

In [None]:
print(df_unique.iloc[1]["query_text"])

#### Query 3

In [None]:
print(df_unique.iloc[2]["query_text"])

#### Query 12

In [None]:
print(df_unique.iloc[12]["query_text"])

## Queries that spill to storage

In [None]:
# Load the spill-to-storage query result for the analysis window (this rebinds `df`).
# NOTE(review): 5 is presumably the number of rows to return — confirm against SNFLKQuery.
df=qlib.n_queries_spill_to_storage(sdate,edate,5)
df.head()

In [None]:
## Queries that spilled the most to storage

# One line per spill destination, both plotted against the query id.
trace1 = go.Scatter(
    name="Bytes Spilled Remote",
    mode='lines+markers',
    x=df['query_id'],
    y=df['bytes_spilled_to_remote_storage'],
    marker={'color': 'crimson'},
)
trace2 = go.Scatter(
    name="Bytes Spilled Local",
    mode='lines+markers',
    x=df['query_id'],
    y=df['bytes_spilled_to_local_storage'],
    marker={'color': 'purple'},
)

data = [trace1, trace2]

layout = go.Layout(
    title_text='Queries that spilled the most to storage',
    xaxis={'title': "Query ID"},
    yaxis={'side': 'left', 'title': "Bytes spilled"},
)
fig = go.Figure(data=data, layout=layout)
fig.show()

### Query text

#### Query 1

In [None]:
print(df.iloc[0]["query_text"])

#### Query 2

In [None]:
print(df.iloc[1]["query_text"])

#### Query 3

In [None]:
print(df.iloc[2]["query_text"])

### ANALYSIS

<div class="alert alert-info">
##TODO: @Saravana sum local and remote query spillage and plot a graph against warehouse to see which warehouse should be upgraded.
    
 Queries 1, 2 and 3 each fail after running for 29 minutes.
* Query 1 (query_id: 01a78b77-0604-f44d-0000-08d15ddd870e) - 300 GB spilled to local storage and 200 GB to remote storage.
* Query 2 (query_id: 01a78b55-0604-f44e-0000-08d15ddd55da) - 170 GB spilled to local storage and 90 GB to remote storage.
* Query 3 (query_id: 01a78b55-0604-f44e-0000-08d15ddd55d6) - 245 GB spilled to local storage and 78 GB to remote storage.

Most partitions are scanned in the table for these 3 queries.
There are also multiple join and select statements for each query.
</div>




### Optiml TODO



<div class="alert alert-warning">
    
* Check if best virtual warehouse practices can be applied to most queries in this warehouse
</div>

### Optiml Recommendation




<div class="alert alert-success">

* Utilize a larger warehouse - warehouse sizes of X-Small and Small are used, and hundreds of GB are spilled due to insufficient resources.

* Optimize the query - Reducing the number of JOINS, SELECT and UNION statements would improve the performance.

* Reduce the data that is being processed (e.g. redundant columns used in computation).

* Split processing into multiple steps.

</div>

## Queries that scanned the most data

In [None]:
# Load the most-data-scanned query result for the analysis window (this rebinds `df`).
# NOTE(review): 5 is presumably the number of rows to return — confirm against SNFLKQuery.
df=qlib.n_queries_scanned_most_data(sdate,edate,5)
df.head() 

In [None]:
## Queries that scanned most data

# Partitions scanned per query, plotted against the query id.
trace1  = go.Scatter(
        mode='lines+markers',
        x = df['query_id'],
        y = df['partitions_scanned'],
        name="Partitions Scanned",
        marker_color='crimson'
    )

data = [trace1]

layout = go.Layout(
    title_text='Queries that scanned the most partitions',
    yaxis=dict(
        # range = [0, 100],
        side = 'left',
        # The plotted quantity is `partitions_scanned`, so the axis is labelled accordingly
        # (it previously carried the "Bytes spilled" label copied from the spill chart).
        title="Partitions scanned"

    ),
    xaxis=dict(
        title="Query ID"

    )

)
fig = go.Figure(data=data, layout=layout)
fig.show()

### Query text

#### Query 1

In [None]:
print(df.iloc[0]["query_text"])

### ANALYSIS

* Inadequate pruning is observed. 99% of the partitions on the table are scanned.
* Execution time is 3.5 minutes.
* 2 GB is spilled onto local storage

#### Query 2

In [None]:
print(df.iloc[1]["query_text"])

### ANALYSIS

* Inadequate pruning is observed. 99% of the partitions on the table are scanned.
* Execution time is 7.7 minutes.
* 16.9 GB is spilled onto local storage.

#### Query 3

In [None]:
print(df.iloc[2]["query_text"])

### ANALYSIS

* Inadequate pruning is observed. 99% of the partitions on the table are scanned.
* Execution time is 3.5 minutes.
* 2 GB is spilled onto local storage.

### OPTIML Recommendations

##TODO: @saravana Look at the wording on the recommendations and update them appropriately - create cluster keys on time stamps
* Queries are the same.
* Cluster keys can be defined to reduce partitions scanned. 
* Choose cluster key that appears frequently in a WHERE clause
* Scaling up the warehouse would reduce bytes spilled to local storage
* Query can be optimized by reducing number of JOIN statements, eliminating redundant SELECT statements and using DISTINCT clauses.

## Most cached queries

In [None]:
# Load the most-cached query result for the analysis window (this rebinds `df`).
# NOTE(review): 5 is presumably the number of rows to return — confirm against SNFLKQuery.
df=qlib.n_most_cached_queries(sdate,edate,5)
df.head() 


In [None]:
## Queries that scanned the most from cache

# Share of each query's scan that was served from cache, plotted against the query id.
trace1 = go.Scatter(
    name="Percent Scanned From Cache",
    mode='lines+markers',
    x=df['query_id'],
    y=df['percent_scanned_from_cache'],
    marker={'color': 'crimson'},
)

data = [trace1]

layout = go.Layout(
    title_text='Queries that scanned the most percent from cache',
    xaxis={'title': "Query ID"},
    yaxis={'side': 'left', 'title': "Percent Scanned From Cache"},
)
fig = go.Figure(data=data, layout=layout)
fig.show()

#### Query 1

In [None]:
# "Query 1" is the first row of the result; the cell previously printed row 2,
# which duplicated the output shown under "Query 3".
print(df.iloc[0]["query_text"])

#### Query 2

In [None]:
print(df.iloc[1]["query_text"])

#### Query 3

In [None]:
print(df.iloc[2]["query_text"])

### Recommendations:

##TODO: @saravana to word recommendations appropriately and also recommend further analysis if any.

1) We need high reliability for the most cached queries, since they are the most used
2) The same holds for the most executed queries

## Most executed 'select' queries -- update this for select statements

In [None]:
# Load the most-executed SELECT query result for the analysis window (this rebinds `df`).
# NOTE(review): 10 is presumably the number of rows to return — confirm against SNFLKQuery.
df=qlib.n_most_executed_select_queries(sdate,edate,10)
# df.to_csv("/home/manas/DS_data/most_executed_select.csv")
df.head() 

##TODO: @saravana to put query filter rules so that this particular section surfaces the right relevant queries for most executed

In [None]:
## Queries that were executed the most number of times

# Execution count per query. The x values are the DataFrame's row index, not query ids,
# so the x-axis is labelled as a row index.
trace1  = go.Scatter(
        mode='lines+markers',
        x = df.index,
        y = df['number_of_times_executed'],
        name="Number of times executed",
        marker_color='crimson'
    )

data = [trace1]

layout = go.Layout(
    title_text='Queries that were executed the most number of times',
    yaxis=dict(
        # range = [0, 100],
        side = 'left',
        title="Number of times executed"

    ),
    xaxis=dict(
        title="Query (row index in result)"

    )
)
fig = go.Figure(data=data, layout=layout)
fig.show()

#### Query 1

In [None]:

print(df.iloc[0]["query_text"])

#### Query 2

In [None]:
print(df.iloc[1]["query_text"])

#### Query 3

In [None]:
# NOTE(review): this cell sits under the "Query 3" heading but prints row position 3 (the fourth row);
# the other "Query 3" cells in this notebook use iloc[2] — confirm which row is intended.
print(df.iloc[3]["query_text"])

### ANALYSIS
* Query 1 was executed 116028 times in 30 days.
* Query 2 was executed 19770 times in 30 days.
* Query 3 was executed 13815 times in 30 days.

### OPTIML Recommendations
* Frequently executed SELECT queries can be converted to materialized views

## MOST EXECUTED QUERIES


In [None]:
# Load the result of qlib.caching_warehouse for the analysis window (this rebinds `df`).
# NOTE(review): the section heading says "MOST EXECUTED QUERIES" but the call is caching_warehouse —
# confirm which of the two is intended. 5 is presumably the number of rows to return.
df=qlib.caching_warehouse(sdate,edate,5)
df.head()

## LONGEST RUNNING QUERIES

In [None]:
# Load the longest-running query result for the analysis window (this rebinds `df`).
# NOTE(review): 5 is presumably the number of rows to return — confirm against SNFLKQuery.
df=qlib.longest_running_queries(sdate,edate,5)
df.head()

#### QUERY 1

In [None]:
print(df.iloc[0]["query_text"])

#### QUERY 2

In [None]:
print(df.iloc[1]["query_text"])

#### QUERY 3

In [None]:
print(df.iloc[2]["query_text"])

### ANALYSIS

##TODO: @Saravana make a recommendation on how we recommend them to use SCD type 2 architecture

* Queries are the same.
* Updating event query.
* Execution time for query is approximately 135 minutes.
* 65% of total partitions of table are scanned.
* No spillage onto local or remote storage.
* No Query overloading.


### OPTIML to do

* Need to run update query to see if performance tuning can be done.

### OPTIML Recommendations
* Cluster keys can be defined to reduce execution time. WHERE clause present in query can be utilized.
* Warehouse scaling up won't help with performance.


# Notes

* For most expensive queries i.e. consuming most credits steps are:
  * Look at the amount of data the queries are operating on to see if it passes the sniff test
  * Identify causes of credit consumption e.g. using INSERT instead of COPY INTO