In [None]:
%%javascript
// Replace the output area's `_should_scroll` hook with one that always
// answers false, whatever line count it is handed.
// NOTE(review): presumably this keeps long outputs fully expanded instead of
// placing them in a scroll box - confirm against the notebook front-end in use.
IPython.OutputArea.prototype._should_scroll = function (_lines) {
    return false;
};

In [None]:
# Adding system path: make the directory two levels above the current working
# directory importable (presumably so the local `optiml` package resolves).
import os
import pathlib
import sys

repo_root = str(pathlib.Path.cwd().parent.parent)
# Re-running this cell used to append the same entry again each time; only add
# the path when it is not already present so the cell is idempotent.
if repo_root not in sys.path:
    sys.path.append(repo_root)
# sys.path

In [None]:
# Suppress all warnings for the rest of the session.
# NOTE(review): the 'ignore' action hides every warning; switch the action to
# 'once' if the intent is to show each warning a single time.
import warnings
warnings.filterwarnings('ignore')

In [None]:
## Setup connection to DWH
# Alternative customer settings (disabled):
# customer = 'KIVA'
# schema = 'KIVA_PROD.OPTIML'
customer = 'OPTIML'  # Use this for testing
schema = 'KIV.ACCOUNT_USAGE'  # Use this for testing

# Credentials are read from environment variables named after the customer
# (<customer>_USERNAME / _PASSWORD / _ACCOUNT). os.getenv returns None for any
# variable that is not set.
username = f'{customer}_USERNAME'
user = os.getenv(username)
password = os.getenv(f'{customer}_PASSWORD')
account = os.getenv(f'{customer}_ACCOUNT')

In [None]:
## Setup pandas
import pandas as pd

# Allow up to 500 rows and 500 columns when a frame is displayed.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500

import numpy as np

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from tabulate import tabulate
# Named colours available for charts, in a fixed order.
color_scheme = [
    "red",
    "blue",
    "green",
    "orange",
    "purple",
    "brown",
    "pink",
    "gray",
    "olive",
    "cyan",
    "darkviolet",
    "goldenrod",
    "darkgreen",
    "chocolate",
    "lawngreen",
]

In [None]:
# Initialize connection to Snowflake
from optiml.connection import SnowflakeConnConfig
from optiml.backend.query_profile import QueryProfile
from optiml.backend.cost_profile import CostProfile, get_previous_dates

# Build the connection from the credentials read from the environment.
connection = SnowflakeConnConfig(
    username=user,
    password=password,
    accountname=account,
).create_connection()

# Initialize query library: both profiles share the connection and schema.
qqlib = QueryProfile(connection, schema)
cqlib = CostProfile(connection, schema)

# Initialize dates: start and end of the analysis window, as date strings.
import datetime

sdate = '2022-10-05'
edate = '2022-10-12'

# Echo the run configuration so the cell output records what was analysed.
print(f'Customer: {customer}')
print(f'Schema: {schema}')
print(sdate, edate)

In [None]:
# Setting up autoreload for libs
# Load IPython's autoreload extension.
%load_ext autoreload
# Mode 2: reload all modules before executing code (per the IPython autoreload
# documentation), so edits to library files are picked up without a restart.
%autoreload 2
# Import optiml.backend.query_profile through autoreload's %aimport.
# NOTE(review): under mode 2 every module is already reloaded, so this line may
# be redundant - confirm.
%aimport optiml.backend.query_profile

# Query Analysis

## Queries executed per day - successes, failures, credits

In [None]:
# Fetch the query execution-status counts between sdate and edate, then
# preview the first rows.
df = qqlib.queries_by_execution_status(sdate, edate)
df.head()

In [None]:
# Collapse to one row per day, totalling the success and failure counts.
df_count = (
    df.groupby('day')[['n_success', 'n_fail']]
    .sum()
    .reset_index()
)
df_count.head()

In [None]:
# Fetch credit usage between sdate and edate; the chart that follows reads the
# 'date' and 'credits' columns of this frame.
df_credits = cqlib.credits_by_day(sdate, edate)

In [None]:
# Stacked bars of successful / failed query counts per day on the left axis,
# with credits drawn as a black line on a secondary right-hand axis.
trace1 = go.Bar(x=df_count['day'], y=df_count['n_success'], name="Execution success count")
trace2 = go.Bar(x=df_count['day'], y=df_count['n_fail'], name="Execution fail count")
trace3 = go.Scatter(
    x=df_credits['date'],
    y=df_credits['credits'],
    name="Credits",
    mode='lines+markers',
    line={'color': 'black'},
    yaxis='y2',  # bind to the secondary axis declared in the layout below
)
data = [trace1, trace2, trace3]

layout = go.Layout(
    title_text='Query success, fail, credits per day',
    barmode="stack",
    xaxis={'title': "Date (UTC)"},
    yaxis={'title': "Count number", 'showgrid': False},
    # The second y-axis overlays the first and is drawn on the right.
    yaxis2={
        'title': "Credits",
        'overlaying': "y",
        'side': "right",
        'showgrid': False,
    },
    legend={'yanchor': "top", 'y': 0.99, 'xanchor': "left", 'x': 0.99},
)

fig = go.Figure(data=data, layout=layout)
fig.show()

## Query execution by user per day

In [None]:
# Fetch the execution-status counts used by the per-user breakdown.
# NOTE(review): this repeats the call made in the per-day section and `df` is
# not reassigned in between, so the re-query looks redundant - confirm.
df = qqlib.queries_by_execution_status(sdate,edate)

In [None]:
# Success / failure totals for every (user, day) pair, plus the list of
# distinct user names found in the data.
df_by_user = (
    df.groupby(['user_name', 'day'])
    .agg({'n_success': 'sum', 'n_fail': 'sum'})
    .reset_index()
)
user_list = list(df_by_user["user_name"].unique())
# user_list

In [None]:
# Select the user to inspect by assigning one of the names below to `user`.
# The annotation next to a name is the failure note recorded for it:
#   ANALYTICS_EXERCISE_USER   significant failures
#   BAMBOO                    significant failures
#   DBT_DEV
#   DBT_PROD
#   FIVETRAN_USER
#   FIVETRAN_USER_DEV
#   GREGORYW                  some failures
#   GREGORYW_DEV
#   KIVA_API_SNAPSHOT_PROD    all failures
#   LOOKER_DEV_ADMIN          significant failures
#   LOOKER_PROD               some failures
#   LOOKER_PROD_ADMIN         some failures
#   LOOKER_RAW_DEV            significant failures
#   LOOKER_RAW_PROD           significant failures
#   MAXH_DEV                  some failures
#   ML_SERVICE_DEV
#   ML_SERVICE_PROD
#   PATRICKL
#   PATT                      significant failures
#   ROBS                      significant failures
#   TEST_EXERCISE_AS          some failures
#   TEST_EXERCISE_CM          some failures
#   TEST_EXERCISE_JAR         some failures
#   TEST_EXERCISE_LAZ         some failures
#   TEST_EXERCISE_RDN         some failures
#   VERTEX_API_DEV
#   VERTEX_API_DEV_JENKINS
#   VERTEX_API_PROD
#   WORKSHEETS_APP_USER
#
# NOTE(review): `user` is also the name the connection cell passes as the login
# username; re-running that cell after this one would use this value instead.
user = 'KIVA_API_SNAPSHOT_PROD'  # all failures

# Keep only the selected user's rows and renumber them from zero.
df_user = df_by_user[df_by_user["user_name"] == user].reset_index(drop=True)

In [None]:
# Stacked daily success / failure query counts for the currently selected user
# (rows come from `df_user`, the name from `user`).
trace1 = go.Bar(
    x=df_user['day'],
    y=df_user['n_success'],
    name="Execution success count",
)

trace2 = go.Bar(
    x=df_user['day'],
    y=df_user['n_fail'],
    name="Execution fail count",
)

data = [trace1, trace2]

# Only one y-axis is declared: both traces use the default axis, so the
# secondary "Number of times ran" axis that used to be configured here had no
# trace bound to it and has been dropped.
layout = go.Layout(
    # Name the user in the title so the figure is unambiguous on its own.
    title_text=f'Query count per day for user {user}',
    yaxis=dict(
        title="Count number",
        showgrid=False,
    ),
    xaxis=dict(
        title="Date (UTC)"
    ),
    legend=dict(
        yanchor="top",
        y=0.99,
        xanchor="left",
        x=0.99
    ),
    barmode="stack"
)
fig = go.Figure(data=data, layout=layout)
fig.show()

In [None]:
## all failures for a user between start and end date
df = qqlib.get_queries(
    start_date=sdate,
    end_date=edate,
    user=user,
    es="FAIL",
    n=1500,
)

# Reduce to unique queries, order by execution_status (descending) and move the
# previous index into a column.
df_unique = (
    qqlib.get_unique_queries(df)
    .sort_values('execution_status', ascending=False)
    .reset_index()
)
df_unique.head()

## Query execution by warehouse per day

In [None]:
# `df` was rebound to the failed-query listing in the per-user section, so
# fetch the execution-status counts again for the per-warehouse breakdown.
df = qqlib.queries_by_execution_status(sdate,edate)

In [None]:
# Success / failure totals for every (warehouse, day) pair, plus the list of
# distinct warehouse names found in the data.
df_by_wh = (
    df.groupby(['warehouse_name', 'day'])
    .agg({'n_success': 'sum', 'n_fail': 'sum'})
    .reset_index()
)
wh_list = list(df_by_wh["warehouse_name"].unique())
# wh_list

In [None]:
# Select the warehouse to inspect by assigning one of these names to `wh`:
#   DAILY_REFRESH_WH
#   DEV_WH
#   ML_WH
#   PROD_WH
#   Unassigned
wh = 'Unassigned'

# Keep only the selected warehouse's rows and renumber them from zero.
df_wh = df_by_wh[df_by_wh["warehouse_name"] == wh].reset_index(drop=True)

In [None]:
# Stacked daily success / failure query counts for the currently selected
# warehouse (rows come from `df_wh`, the name from `wh`).
trace1 = go.Bar(
    x=df_wh['day'],
    y=df_wh['n_success'],
    name="Execution success count",
)

trace2 = go.Bar(
    x=df_wh['day'],
    y=df_wh['n_fail'],
    name="Execution fail count",
)

data = [trace1, trace2]

# Only one y-axis is declared: both traces use the default axis, so the
# secondary "Number of times ran" axis that used to be configured here had no
# trace bound to it and has been dropped.
layout = go.Layout(
    # Name the warehouse in the title so the figure is unambiguous on its own.
    title_text=f'Query count per day for warehouse {wh}',
    yaxis=dict(
        title="Count number",
        showgrid=False,
    ),
    xaxis=dict(
        title="Date (UTC)"
    ),
    legend=dict(
        yanchor="top",
        y=0.99,
        xanchor="left",
        x=0.99
    ),
    barmode="stack"
)
fig = go.Figure(data=data, layout=layout)
fig.show()

## Most expensive queries

In [None]:
# Ask the query library for 50 entries ranked on `metric` (presumably the most
# inefficient queries - confirm against n_inefficient_queries_v2) and print a
# compact table of them.
metric = 'credits'
df = qqlib.n_inefficient_queries_v2(sdate, edate, 50, metric=metric)

print('----------------------------------------------')
print(
    tabulate(
        df[["query_hash", "warehouse_name", "user_name", "credits"]],
        headers='keys',
        tablefmt='rounded_outline',
        showindex=False,
    )
)

### Most expensive queries by user

In [None]:
# Same rows as the table above, ordered by user name and then credits (both
# descending), so each user's queries sit together.
df_user = df.sort_values(
    by=['user_name', 'credits'],
    ascending=False,
)

print('----------------------------------------------')
print(
    tabulate(
        df_user[["user_name", "query_hash", "credits"]],
        headers='keys',
        tablefmt='rounded_outline',
        showindex=False,
    )
)

### Most expensive queries by warehouse

In [None]:
# Same rows again, ordered by warehouse name and then credits (both
# descending), so each warehouse's queries sit together.
df_warehouse = df.sort_values(
    by=['warehouse_name', 'credits'],
    ascending=False,
)

print('----------------------------------------------')
print(
    tabulate(
        df_warehouse[["warehouse_name", "query_hash", "credits"]],
        headers='keys',
        tablefmt='rounded_outline',
        showindex=False,
    )
)

## Most executed 'select' queries (>5s execution time)

In [None]:
# Ask the query library for 50 of the most executed select queries in the
# window and print their execution statistics.
df = qqlib.n_most_executed_select_queries_v2(sdate, edate, 50)

print('----------------------------------------------')
print(
    tabulate(
        df[
            [
                "query_hash",
                "warehouse_name",
                "user_name",
                "number_of_queries",
                "execution_seconds",
                "average_execution_seconds",
            ]
        ],
        headers='keys',
        tablefmt='rounded_outline',
        showindex=False,
    )
)


### Query hash to query text mapping

In [None]:
# Look-up table from each query hash in `df` to its query text.
print('----------------------------------------------')
print(
    tabulate(
        df[["query_hash", "query_text"]],
        headers='keys',
        tablefmt='rounded_outline',
        showindex=False,
    )
)