In [None]:
%%javascript
// Replace the output area's `_should_scroll` check with a function that
// always returns false, regardless of the `lines` argument.
// NOTE(review): presumably this stops long outputs from being collapsed into
// a scrollable box in the classic notebook UI — confirm for the frontend in use.
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

In [None]:
# Make the directory two levels above the current working directory importable
# (presumably the repository root that holds the `optiml` package — confirm).
import os
import pathlib
import sys

parent_dir = str(pathlib.Path.cwd().parent.parent)
# Guard against duplicates so re-running this cell does not keep growing sys.path.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

In [None]:
# Suppress all warnings: the 'ignore' action hides them entirely (it does not
# show each warning once — that would be the 'once' action).
import warnings
warnings.filterwarnings('ignore')

In [None]:
## Setup connection to DWH
# Production settings (kept for reference):
# customer = 'KIVA'
# schema = 'KIVA_PROD.OPTIML'
customer = 'OPTIML'  # Use this for testing
schema = 'KIV.ACCOUNT_USAGE'  # Use this for testing

# Credentials are read from environment variables named <CUSTOMER>_USERNAME,
# <CUSTOMER>_PASSWORD and <CUSTOMER>_ACCOUNT; each value is None when unset.
username = f'{customer}_USERNAME'
user = os.getenv(username)
password = os.getenv(f'{customer}_PASSWORD')
account = os.getenv(f'{customer}_ACCOUNT')

In [None]:
## Setup pandas
import pandas as pd

# Allow up to 500 rows and 500 columns before pandas truncates a displayed frame.
for _display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(_display_option, 500)

import numpy as np

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from tabulate import tabulate
# Colour sequence handed to plots (used as the pie chart's marker colours).
color_scheme = [
    "red", "blue", "green", "orange", "purple",
    "brown", "pink", "gray", "olive", "cyan",
    "darkviolet", "goldenrod", "darkgreen", "chocolate", "lawngreen",
]

In [None]:
# Project imports: Snowflake connection config and the query/cost profile libraries.
from optiml.connection import SnowflakeConnConfig
from optiml.backend.query_profile import QueryProfile
from optiml.backend.cost_profile import CostProfile, get_previous_dates

# Open the Snowflake connection with the credentials read from the environment.
# Alternative (fixed account / warehouse):
# connection = SnowflakeConnConfig(accountname='jg84276.us-central1.gcp',warehousename="XSMALL_WH").create_connection()
connection = SnowflakeConnConfig(
    username=user,
    password=password,
    accountname=account,
).create_connection()

# Query libraries, both bound to the same connection and schema.
qqlib = QueryProfile(connection, schema)
cqlib = CostProfile(connection, schema)

# Analysis window (fixed dates; the rolling alternative is kept below for reference).
import datetime
# edate = datetime.date.today() - datetime.timedelta(days=1)
# sdate = edate - datetime.timedelta(days=6)
sdate = datetime.date(2022, 10, 5)
edate = datetime.date(2022, 10, 12)
# Echo the run configuration so the rendered notebook records what was analysed.
print('Customer:', customer)
print('Schema:', schema)
print(f'{sdate} {edate}')

In [None]:
# Setting up autoreload for libs
# Load IPython's autoreload extension and switch it to mode 2, so edits to
# imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Import the query-profile module through autoreload's %aimport magic.
%aimport optiml.backend.query_profile

# Query Analysis

## Failures

### Total by day

In [None]:
# Fetch success/failure statistics for the analysis window. The cells below
# read the columns day, warehouse_name, user_name, n_success, n_fail,
# credits_success and credits_fail from this frame.
df = qqlib.queries_stats_by_execution_status(sdate,edate)
# df.head()

In [None]:
# Daily totals of query counts and credits, split by success / failure.
df_by_day = (
    df.groupby(['day'])
    .agg(dict.fromkeys(['n_success', 'n_fail', 'credits_success', 'credits_fail'], 'sum'))
    .reset_index()
)


In [None]:
# Bars: number of failed queries per day (left axis).
trace1 = go.Bar(
    name="Execution fail count",
    x=df_by_day['day'],
    y=df_by_day['n_fail'],
)

# Line: credits of failed queries per day, drawn against the right-hand axis (y2).
trace2 = go.Scatter(
    name="Credits",
    x=df_by_day['day'],
    y=df_by_day['credits_fail'],
    mode='lines+markers',
    yaxis='y2',
)

data = [trace1, trace2]

layout = go.Layout(
    title_text='Query fails and credits per day',
    barmode="stack",
    xaxis={"title": "Date (UTC)"},
    yaxis={"title": "Count number", "showgrid": False},
    # Secondary axis overlays the primary one and sits on the right.
    yaxis2={"title": "Credits", "overlaying": "y", "side": "right", "showgrid": False},
    legend={"yanchor": "top", "y": 0.99, "xanchor": "left", "x": 0.99},
)
fig = go.Figure(data=data, layout=layout)
fig.show()

In [None]:
# Summary of query counts and credits over the whole analysis window.
# Totals are kept unrounded and only rounded when derived or printed, so the
# percentage and per-query figures are not distorted by early rounding.
total_success = df_by_day['n_success'].sum()
total_fail = df_by_day['n_fail'].sum()
credits_success = df_by_day['credits_success'].sum()
credits_fail = df_by_day['credits_fail'].sum()


def _safe_ratio(numerator, denominator, scale=1, ndigits=2):
    """Return round(numerator / denominator * scale, ndigits), or NaN when the denominator is zero."""
    if not denominator:
        return float('nan')
    return round(numerator / denominator * scale, ndigits)


pct_fail = _safe_ratio(total_fail, total_fail + total_success, scale=100)
pct_credits_fail = _safe_ratio(credits_fail, credits_fail + credits_success, scale=100)

print('Summary Stats: Credits and counts')
print('---------------------------------')
print('Number of queries that ran to success: ', total_success)
print('Number of queries that ran to failure: ', total_fail)
print('% failed queries: ', pct_fail)
print('Credits used by queries that ran to success: ', round(credits_success, 2))
print('Credits used by queries that ran to failure: ', round(credits_fail, 2))
print('% credits due to failed queries: ', pct_credits_fail)
print('Credits per successful query: ', _safe_ratio(credits_success, total_success, ndigits=4))
print('Credits per failed query: ', _safe_ratio(credits_fail, total_fail, ndigits=4))

### By warehouse by day

In [None]:
# Totals of query counts and credits per warehouse and day.
df_by_wh = (
    df.groupby(["warehouse_name", "day"])
    .agg(dict.fromkeys(['n_success', 'n_fail', 'credits_success', 'credits_fail'], 'sum'))
    .reset_index()
)

In [None]:
# Failed-query counts per day, one colour per warehouse.
fig = px.bar(
    df_by_wh,
    x="day",
    y="n_fail",
    color="warehouse_name",
    title="Number failed by warehouse",
)
fig.show()

In [None]:
# Credits of failed queries per day, one colour per warehouse.
fig = px.bar(
    df_by_wh,
    x="day",
    y="credits_fail",
    color="warehouse_name",
    title="Credits failed by warehouse",
)
fig.show()

### By user by day

In [None]:
# Totals of query counts and credits per user and day.
df_by_user = (
    df.groupby(["user_name", "day"])
    .agg(dict.fromkeys(['n_success', 'n_fail', 'credits_success', 'credits_fail'], 'sum'))
    .reset_index()
)

In [None]:
# Failed-query counts per day, one colour per user.
fig = px.bar(
    df_by_user,
    x="day",
    y="n_fail",
    color="user_name",
    title="Number failed by user",
)
fig.show()

In [None]:
# Credits of failed queries per day, one colour per user.
fig = px.bar(
    df_by_user,
    x="day",
    y="credits_fail",
    color="user_name",
    title="Credits failed by user",
)
fig.show()

### Top 10 most expensive failing queries of the past week

In [None]:
# Fetch the queries in the analysis window, passing 'FAIL' as the execution
# status (presumably one row per failed execution — TODO confirm against
# QueryProfile.queries_by_execution_status).
df_expensive_queries_failed = qqlib.queries_by_execution_status(sdate,edate,'FAIL')

In [None]:
# Unique failed queries with their metrics, ordered by 'credits'; the index is
# flattened back into regular columns for positional access below.
df_unique_fail = qqlib.get_unique_failed_queries_with_metrics_ordered(
    df_expensive_queries_failed, 'credits'
).reset_index()

In [None]:
print('Query details for top 10 expensive failing queries')
print('--------------------------------------------------')
# Report at most the first ten rows of the ordered frame.
for n in range(min(10, len(df_unique_fail))):
    row = df_unique_fail.iloc[n]
    print('Query id:', row["query_id"])
    print('User name:', row["user_name"])
    print('Warehouse name:', row["warehouse_name"])
    print('Total credits:', row["credits"])
    # NOTE(review): this is the length of the query_id cell — it is a run count
    # only if that cell holds a collection of ids (not a single id string); confirm.
    print('Number of times the query ran:', len(row["query_id"]))
    print('Query text snippet:', row["query_text"][0:75],'...')
    print('---------------------------------------------------------------------------------------------------')

## Expensive queries

In [None]:
# Metric used to rank the inefficient queries. Alternatives are listed below;
# those marked "division by 0" were noted by the author as hitting a division
# by zero when selected.
# metric = 'bytes_scanned'
# metric = 'percentage_scanned_from_cache'
# metric = 'bytes_spilled_to_local_storage'
# metric = 'bytes_spilled_to_remote_storage'   (division by 0)
# metric = 'percentage_partitions_scanned'     (division by 0)
# metric = 'partitions_total'
# metric = 'compilation_time_sec'              (division by 0)
# metric = 'execution_time_sec'
# metric = 'queued_provisioning_time_sec'      (division by 0)
# metric = 'queued_repair_time_sec'            (division by 0)
# metric = 'queued_overload_time_sec'          (division by 0)
# metric = 'list_external_files_time_sec'      (division by 0)
# metric = 'total_time_elapsed_sec'
metric = 'credits'
# Note: this rebinds `df`, replacing the execution-status frame used in the
# "Failures" section above. The third argument (10) is presumably the number
# of queries to return — TODO confirm against QueryProfile.n_inefficient_queries.
df = qqlib.n_inefficient_queries(sdate,edate,10,metric=metric)
# df.head()

In [None]:
# Count how often each status occurs inside every row's execution_status cell.
status_cells = df["execution_status"].values
df['n_success'] = [statuses.count('SUCCESS') for statuses in status_cells]
df['n_fail'] = [statuses.count('FAIL') for statuses in status_cells]

In [None]:
# Column drawn on the secondary axis: the metric the queries were ranked by
# when the frame exposes it, otherwise credits. The trace name and the axis
# title both use this value so the label always matches the plotted data.
line_column = metric if metric in df.columns else 'credits'

# The x values are the frame's row index (one position per query), not dates.
query_positions = list(df.index)

# Stacked bars: successful vs failed executions per query (left axis).
trace1 = go.Bar(
    x=query_positions,
    y=df['n_success'],
    name="Execution success count",
)

trace2 = go.Bar(
    x=query_positions,
    y=df['n_fail'],
    name="Execution fail count",
)

# Line: the selected metric per query, drawn against the right-hand axis (y2).
trace3 = go.Scatter(
    mode='markers+lines',
    x=query_positions,
    y=df[line_column],
    name=line_column,
    yaxis='y2',
    line=dict(color='black'),
)

data = [trace1, trace2, trace3]

layout = go.Layout(
    title_text='Query count',
    yaxis=dict(
        title="Count number",
        showgrid=False,
    ),
    yaxis2=dict(
        title=line_column,
        overlaying="y",
        side="right",
        showgrid=False,
    ),
    xaxis=dict(
        # Was "Date (UTC)", copied from the per-day chart; the axis shows row positions.
        title="Query (row index)"
    ),
    legend=dict(
        yanchor="top",
        y=0.99,
        xanchor="right",
        x=1.01
    ),
    barmode="stack"
)
fig = go.Figure(data=data, layout=layout)
# Start both y axes at zero so bar heights and the line are comparable.
fig.update_yaxes(rangemode="tozero")
fig.show()

In [None]:
print('Query details for top 10 expensive queries')
print('-------------------------------------------------')
# Report at most the first ten rows of the ranked frame.
for n in range(min(10, len(df))):
    row = df.iloc[n]
    print('Query id:', row["query_id"])
    print('User name:', row["user_name"])
    print('Warehouse name:', row["warehouse_name"])
    print('Total credits:', row["credits"])
    print('N Success:', row["n_success"])
    print('N Fail:', row["n_fail"])
    print('Query text snippet:', row["query_text"][0:75],'...')
    print('---------------------------------------------------------------------------------------------------')

## Top 10 most executed long-running SELECT queries

In [None]:
# Fetch the most executed SELECT queries for the analysis window. n=20 is
# requested here, while the report cell below prints at most the first 10 rows.
df_select = qqlib.n_most_executed_select_queries(sdate,edate, n=20)
# df_select.head()

In [None]:
print('Query details for top 10 expensive select queries (group size > 10, avg. run time > 30 sec)')
print('-------------------------------------------------------------------------------------------')
# Report at most the first ten rows of the frame.
for n in range(min(10, len(df_select))):
    row = df_select.iloc[n]
    print('Query id:', row["query_id"])
    print('User name:', row["user_name"])
    print('Warehouse name:', row["warehouse_name"])
    print('Query credits:', row["credits"])
    print('Query text snippet:', row["query_text"][0:75],'...')
    print('---------------------------------------------------------------------------------------------------')

## Queries doing full table scans

In [None]:
# Fetch the queries flagged as full table scans in the analysis window. The
# report cell below reads query_id, user_name, warehouse_name,
# partitions_scanned, partitions_total and query_text from this frame.
df_full_scan = qqlib.queries_full_table_scan(sdate, edate)

In [None]:
# Header previously read "top doing full table scans" (words missing); the
# underline is derived from the header so the two cannot drift apart.
header = 'Query details for top 10 queries doing full table scans'
print(header)
print('-' * len(header))
# Report at most the first ten rows of the frame.
for n in range(min(10, len(df_full_scan))):
    row = df_full_scan.iloc[n]
    partitions_scanned = row["partitions_scanned"]
    partitions_total = row["partitions_total"]
    # Guard the percentage against a zero partition total.
    if partitions_total:
        pct_scanned = round(partitions_scanned / partitions_total * 100, 2)
    else:
        pct_scanned = 'n/a'
    print('Query id:', row["query_id"])
    print('User name:', row["user_name"])
    print('Warehouse name:', row["warehouse_name"])
    print('Partitions scanned:', partitions_scanned)
    print('% partitions scanned:', pct_scanned)
    print('Query text snippet:', row["query_text"][0:75],'...')
    print('---------------------------------------------------------------------------------------------------')

In [None]:
# Query-type breakdown for the analysis window. This rebinds `df` again; the
# cells below read the n_query_type and query_type columns from it.
df = qqlib.unique_queries_by_type(sdate,edate)

In [None]:
# Share of each query type in the total query count, as a percentage.
df["percent_usage"] = df["n_query_type"].div(sum(df["n_query_type"])).mul(100)

In [None]:
# Query types that make up strictly less than 1% of all queries.
df_low_usage_queries = df.loc[df["percent_usage"].lt(1.00)]

In [None]:
# df_low_usage_queries

In [None]:
# Keep the query types at or above 1% usage. The low-usage frame is selected
# with `< 1.00`, so this side must be inclusive (`>=`); a strict `>` dropped
# types sitting at exactly 1% from both frames.
df = df[df["percent_usage"] >= 1.00].reset_index(drop=True)

In [None]:
# Collapse all low-usage query types into a single synthetic row.
df2 = pd.DataFrame({"n_query_type": sum(df_low_usage_queries["n_query_type"]),
                    "query_type": "low_usage_queries",
                    "percent_usage": sum(df_low_usage_queries["percent_usage"])
                    }, index=[0])
# DataFrame.append was removed in pandas 2.0, so pd.concat is used instead.
# Any existing "low_usage_queries" row is dropped first so that re-running
# this cell does not add the summary row twice.
df = pd.concat(
    [df[df["query_type"] != "low_usage_queries"], df2],
    ignore_index=True,
)

In [None]:
## Plot pie
fig = make_subplots(
    rows=1, cols=1,
    specs=[[{"type": "pie"}]],
    subplot_titles=("Credits")
)

fig.add_trace(go.Pie(labels=df['query_type'].tolist(), \
                     values=df['n_query_type'].tolist(), \
                     name="Credits", rotation=320, \
                     marker_colors=color_scheme, hole=0.4),row=1,col=1)


fig.update_layout(
    title={
        'text': "Queries by type",
        'y':0.1,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'bottom'})
fig.show()