In [None]:
%%javascript
// Replace the output area's _should_scroll hook with one that always returns false.
// NOTE(review): presumably this keeps long outputs (e.g. the plotly figures below) from being
// placed in a scrollable box; it relies on the global `IPython` object of the classic
// Notebook front end - confirm it has any effect in the front end you use.
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

In [None]:
# Make the directory two levels above the current working directory importable
# (the `optiml` package imported further down is expected to be found through this entry).
import os
import pathlib
import sys

two_levels_up = pathlib.Path.cwd().parent.parent
sys.path.append(str(two_levels_up))

In [None]:
# Set to show warnings only once
import warnings
warnings.filterwarnings('ignore')

In [None]:
## Setup connection to DWH
# Credentials are read from environment variables named <CUSTOMER>_USERNAME, <CUSTOMER>_PASSWORD,
# <CUSTOMER>_ACCOUNT, <CUSTOMER>_WAREHOUSE and <CUSTOMER>_ROLENAME; nothing secret lives in the notebook.
# customer = 'KIVA'
# schema = 'KIVA_PROD.OPTIML'
customer = 'RAKUTEN' # Use this for testing
schema = 'SNOWFLAKE.ACCOUNT_USAGE' # Use this for testing

# Environment variable *names*, kept separate from the values read below so that no
# identifier switches meaning from "name of the variable" to "secret value" halfway through the cell.
env_var_names = {
    'user': customer + '_USERNAME',
    'password': customer + '_PASSWORD',
    'account': customer + '_ACCOUNT',
    'warehouse': customer + '_WAREHOUSE',
    'rolename': customer + '_ROLENAME',
}
# `username` has always held the variable name (the login itself is `user`); kept for compatibility.
username = env_var_names['user']

user = os.getenv(env_var_names['user'])
password = os.getenv(env_var_names['password'])
account = os.getenv(env_var_names['account'])
warehouse = os.getenv(env_var_names['warehouse'])
rolename = os.getenv(env_var_names['rolename'])

# os.getenv returns None for unset variables; report them here (names only, never values)
# instead of letting the connection step fail later with a less obvious error.
missing_env_vars = [name for name in env_var_names.values() if os.getenv(name) is None]
if missing_env_vars:
    print('Missing environment variables:', ', '.join(missing_env_vars))

In [None]:
## Setup pandas
import pandas as pd

# Let pandas render up to 500 rows and 500 columns before truncating the display.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 500)

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from tabulate import tabulate
# Named colours available to the plots, in a fixed order.
color_scheme = [
    "red", "blue", "green", "orange", "purple",
    "brown", "pink", "gray", "olive", "cyan",
    "darkviolet", "goldenrod", "darkgreen", "chocolate", "lawngreen",
]

In [None]:
# Open the Snowflake connection and build the query-library helpers on top of it
from optiml.connection import SnowflakeConnConfig
from optiml.backend.cost_profile import CostProfile
from optiml.backend.warehouse_profile import WarehouseProfile
from optiml.backend.query_profile import QueryProfile

# Credentials come from the environment-variable cell above.
conn_config = SnowflakeConnConfig(
    username=user,
    password=password,
    accountname=account,
    rolename=rolename,
    warehousename=warehouse,
)
connection = conn_config.create_connection()

# One query library per topic; all of them share the same connection and schema.
cqlib = CostProfile(connection, schema)
wqlib = WarehouseProfile(connection, schema)
qqlib = QueryProfile(connection, schema)

# Initialize dates
import datetime

# Analysis window as datetime.date objects: sdate is the start, edate the end.
# Earlier alternatives, kept for reference:
#   rolling week: edate = datetime.date.today() - datetime.timedelta(days=1)
#                 sdate = edate - datetime.timedelta(days=6)
#   fixed window: sdate = 2022-09-29, edate = 2022-10-04
#   string form:  sdate = str(sdate), edate = str(edate)
sdate, edate = (
    datetime.datetime.strptime(day, '%Y-%m-%d').date()
    for day in ('2023-03-23', '2023-04-07')
)

# Echo the run configuration: customer, schema, then the analysis window (start date, end date).
for label, value in (('Customer:', customer), ('Schema:', schema)):
    print(label, value)
print(sdate, edate)

In [None]:
# Setting up autoreload for libs
# Loads IPython's autoreload extension so that edits to library modules are picked up
# without restarting the kernel; optiml.backend.warehouse_profile is registered explicitly.
# NOTE(review): this cell sits after the cell that already imports the optiml modules;
# consider moving it ahead of those imports so the setup is in place before they load.
%load_ext autoreload
%autoreload 2
%aimport optiml.backend.warehouse_profile

## Optimally provision the warehouse:
- Correct size
- Set the right auto suspend time
- Set the right scaling policy
- Set the right suspension time for query

## What are the things to look at:
- Time series plots for warehouse of:
  - running load 
  - queued load 
  - credit consumption
  - % of time the warehouse was on during an hour
  - Number of clusters running at any point in time
- Queries having max resource utilization for the wh:
  - compilation time
  - execution time
  - ...
  - spillovers

In [None]:
## Scaling policy for DEV_WH and dither queries to other places when WH query load is low
## Analyze which queries are triggering a queue in PROD_WH.
## Analyze if there is opportunity to dither queries between ML_WH, DAILY_REFRESH_WH and DEV_WH
## Is there a specific user who is triggering a queue in PROD_WH, DEV_WH?
## Is there a specific user whose queries are going to DEV_WH during quiet times?

In [None]:
import itertools

# Flatten the result of wh_names() into a plain list: `.values` is iterated row by row
# and the rows are chained together.
# NOTE(review): presumably each row carries a single warehouse name - confirm against WarehouseProfile.wh_names.
wh_names_lists = wqlib.wh_names(sdate, edate).values
wh_names = list(itertools.chain(*wh_names_lists))

# Hand-picked warehouses to analyse (the name suggests the highest credit consumers).
max_consumption_wh_names = [
    "ATSCALE_WH", "SALES_BU_WH", "PLATFORM_DE_WH",
    "ACQUI_MARKETING_BU_WH", "PLATFORM_DE_WH_L", "RET_MARKETING_BU_WH",
    "ACQUI_MARKETING_BI_WH", "PLATFORM_DE_WH_SOC", "MEMBER_PROFILE_DE_WH_L",
    "SHOPSTYLE_DE_CUBE_WH",
]
# The original cell bound the same list to `wh_name` through a chained assignment;
# the alias is kept so any cell that relies on it keeps working.
wh_name = max_consumption_wh_names

wh_names_set = set(wh_names)
max_consumption_wh_names_set = set(max_consumption_wh_names)

# Keep only the hand-picked warehouses that exist in the analysis window, in the order of the
# hand-picked list. Building this from a set intersection gave an arbitrary order that can differ
# between kernel sessions, so wh_names_analysis[i] did not reliably name the same warehouse.
wh_names_analysis = [name for name in max_consumption_wh_names if name in wh_names_set]
wh_names_analysis

## Warehouse profiling

### Warehouse efficiency

In [None]:
# Granularity passed to the load/efficiency query.
delta = 'hour'

# Warehouse under analysis. To profile another one, assign one of these names instead
# (as a plain string - a trailing comma would turn the value into a tuple):
#   'SHOPSTYLE_DE_CUBE_WH', 'MEMBER_PROFILE_DE_WH_L', 'RET_MARKETING_BU_WH', 'ATSCALE_WH',
#   'PLATFORM_DE_WH_L', 'PLATFORM_DE_WH_SOC', 'SALES_BU_WH',
#   'ACQUI_MARKETING_BI_WH', 'ACQUI_MARKETING_BU_WH'
warehouse_name = 'PLATFORM_DE_WH'

# Sizing notes recorded per entry of wh_names_analysis (select with warehouse_name = wh_names_analysis[i]).
# NOTE(review): which warehouse each index refers to depends on the order of wh_names_analysis;
# re-check these notes against the current list before acting on them.
#   [0] Candidate - could easily drop a size without moving any queries around
#   [1] Candidate - I would say this is appropriately provisioned given ratios of 3 buckets
#   [2] Candidate - could easily move 1 query and drop the WH size by 3x (2x will do)
#   [3] Candidate - could easily move 1 query and drop the WH size by 3x (2x will do)
#   [4] Candidate - could easily drop a size without moving any queries around
#   [5] Candidate - could easily drop a size without moving any queries around
#   [6] Maybe Candidate - could easily drop a size without moving any queries around, compilation time seems high for queries in this WH
#   [7] Candidate - could easily move 1 query and drop the WH size by 3x (2x will do)
#   [8] Maybe Candidate - could easily drop a size without moving any queries around
#   [9] May be correctly provisioned - move smaller queries elsewhere and use for only larger queries, shut down aggressively

# Load and efficiency figures for the chosen warehouse over [sdate, edate]; show the first 10 rows.
df = wqlib.wh_load_and_efficiency(
    start_date=sdate,
    end_date=edate,
    warehouse_name=warehouse_name,
    delta=delta,
)
df.head(10)

In [None]:
# Put the series on a regular 1-hour grid. Hours missing from the query result are added as rows;
# no interpolation takes place - the new cells are filled with constants (see _fill_missing_hour).
def _fill_missing_hour(col):
    """Fill NaNs with 0 in bool/int/unsigned/float/complex columns, with warehouse_name elsewhere."""
    if col.dtype.kind in 'biufc':
        return col.fillna(0)
    return col.fillna(warehouse_name)

# Index by the hourly timestamp, expand to every hour in range, fill, then restore the column.
df = df.set_index('hourly_start_time').resample('1H').asfreq()
df = df.apply(_fill_missing_hour)
df = df.reset_index()

In [None]:
print(warehouse_name)

# Stacked bars (queued + running load) on the primary axis, credits as a line on the secondary axis.
hours = df["hourly_start_time"]
trace1 = go.Bar(
    x=hours,
    y=df["avg_queued_load"],
    name='Average Queued Load',
    marker=dict(color='rgb(222,0,0)'),
)
trace2 = go.Bar(
    x=hours,
    y=df["avg_running_load"],
    name='Average Running load',
    marker=dict(color='rgb(0,0,255)'),
)
trace3 = go.Scatter(x=hours, y=df['avg_credits'], name='Average Credits', mode='lines+markers')
# NOTE(review): trace4 is built but never added to the figure - confirm whether the
# efficiency line should be plotted or this trace can go.
trace4 = go.Scatter(x=hours, y=df['avg_efficiency'], name='Average Efficiency', mode='lines+markers')

f = make_subplots(specs=[[{"secondary_y": True}]])
for load_bar in (trace1, trace2):
    f.add_trace(load_bar, secondary_y=False)
f.add_trace(trace3, secondary_y=True)

f.update_layout(
    barmode='stack',
    xaxis_title="Hourly start time (UTC)",
    xaxis=dict(showgrid=False),
    yaxis=dict(showgrid=False),
)
f.update_yaxes(title_text="Query Load", secondary_y=False)
# The credits axis starts at zero.
f.update_yaxes(title_text="Credits", rangemode="tozero", secondary_y=True)

f.show()

In [None]:
# Preview the first rows of df as left by the cells above.
df.head()

In [None]:
import pytz

la_tz = pytz.timezone('America/Los_Angeles')

# Parse the timestamp column and express it in Los Angeles time.
# NOTE(review): tz_convert only works on tz-aware values - confirm the query returns
# tz-aware timestamps (a tz-naive column would need tz_localize first).
df['hourly_start_time'] = pd.to_datetime(df['hourly_start_time']).dt.tz_convert(la_tz)

# Date of interest: midnight of 2023-03-29 in Los Angeles time.
day_of_interest_naive = pd.to_datetime('2023-03-29')
day_of_interest = la_tz.localize(day_of_interest_naive)

In [None]:
# Split the hourly rows at the day of interest; the day itself belongs to the "after" group.
hourly_start = df['hourly_start_time']
df_before = df[hourly_start < day_of_interest]
df_after = df[hourly_start >= day_of_interest]

# Mean and standard deviation of avg_credits on each side of the split.
credits_before = df_before['avg_credits']
credits_after = df_after['avg_credits']
mean_before, std_before = credits_before.mean(), credits_before.std()
mean_after, std_after = credits_after.mean(), credits_after.std()

print('Stats before', day_of_interest.date())
print('Mean: ', round(mean_before,2), ', Std: ', round(std_before,2))
print('Stats after', day_of_interest.date())
print('Mean: ', round(mean_after,2), ', Std: ', round(std_after,2))
# Relative drop of the mean, as a percentage of the "before" mean.
print('% reduction in credit consumption:', round((mean_before-mean_after)/mean_before*100,2))

In [None]:
## Queries using most credits
# Per-query resource utilization for the selected warehouse over [sdate, edate].
# From here on `df` refers to this query-level frame, no longer to the hourly load frame.
df = qqlib.warehouse_resource_utilization(
    start_date=sdate,
    end_date=edate,
    warehouse_name=warehouse_name,
)
df.head()

In [None]:
# Convert the timing columns to minutes and rank queries by total elapsed time.
# The 1/1000/60 factor assumes the query returns these columns in milliseconds.
scale_to_min = 1./1000./60.
scaling_factors = {"total_elapsed_time": scale_to_min,
                   "compilation_time": scale_to_min,
                   "execution_time": scale_to_min,
                   "queued_provisioning_time": scale_to_min,
                   "queued_repair_time": scale_to_min,
                   "queued_overload_time": scale_to_min,
                   "transaction_blocked_time": scale_to_min,
                   "list_external_files_time": scale_to_min}
time_columns = list(scaling_factors)

# "active_time" is created only by this cell, so its presence means the frame has already been
# converted. Skipping in that case keeps a re-run of the cell from dividing the same columns a
# second time. Re-run the query cell first to get fresh, unscaled data.
if "active_time" not in df.columns:
    # Assign whole columns rather than writing through df.loc[:, cols]: the scaled values are
    # floats, and column replacement lets the dtype change cleanly when the originals are integers.
    df[time_columns] = df[time_columns].multiply(pd.Series(scaling_factors), axis=1)
    # Time spent compiling plus executing, i.e. excluding the queue/blocked components.
    df["active_time"] = df["execution_time"] + df["compilation_time"]

df = df.sort_values("total_elapsed_time", ascending=False)
df.head()


In [None]:
## Trimodal query groups
# Thresholds in minutes that split queries into short / medium / long.
time_elapsed_breakpoint_1, time_elapsed_breakpoint_2 = 1, 60  ## 1 min, 60 min
# Column the thresholds are applied to, and the bin count used for the histograms.
metric = "active_time"
nbins = 100

In [None]:
## Short queries <= 1 min, medium 1-60 min, long > 60 min (both breakpoints fall in the lower bucket)
metric_minutes = df[metric]
is_short = metric_minutes.le(time_elapsed_breakpoint_1)
is_medium = metric_minutes.gt(time_elapsed_breakpoint_1) & metric_minutes.le(time_elapsed_breakpoint_2)
is_long = metric_minutes.gt(time_elapsed_breakpoint_2)

df_short = df[is_short]
df_medium = df[is_medium]
df_long = df[is_long]

# Bucket sizes.
print(f"Short queries < {time_elapsed_breakpoint_1} min: {len(df_short)}")
print(f"Medium queries > {time_elapsed_breakpoint_1} min and < {time_elapsed_breakpoint_2} min: {len(df_medium)}")
print(f"Long queries > {time_elapsed_breakpoint_2} min: {len(df_long)}")

In [None]:
# metric = "total_elapsed_time"
def _metric_histogram(frame):
    """Return a histogram of `metric` over `frame` with nbins bins and the x axis labelled in minutes."""
    histogram = px.histogram(frame, x=metric, nbins=nbins)
    histogram.update_layout(xaxis_title=metric + ' (min)')
    return histogram

# One histogram for all queries, then one per duration bucket.
fig = _metric_histogram(df)
fig_short = _metric_histogram(df_short)
fig_medium = _metric_histogram(df_medium)
fig_long = _metric_histogram(df_long)

for histogram in (fig, fig_short, fig_medium, fig_long):
    histogram.show()

In [None]:
# Longest-running bucket, most active first, as bars stacked by time component.
# Rebind instead of sorting in place: df_long was obtained by boolean slicing, and an in-place
# sort on such a frame can trigger pandas' SettingWithCopyWarning.
df_long = df_long.sort_values("active_time", ascending=False)

# Components stacked in each bar, one bar per query.
time_components = [
    "compilation_time",
    "execution_time",
    "queued_provisioning_time",
    "queued_repair_time",
    "queued_overload_time",
    "transaction_blocked_time",
    "list_external_files_time",
]
fig = px.bar(df_long, x="query_id", y=time_components, title="Longest running queries")
fig.update_layout(yaxis_title='Total Elapsed Time (min)')
fig.show()

In [None]:
# Medium bucket, most active first, as bars stacked by time component.
# Rebind instead of sorting in place: df_medium was obtained by boolean slicing, and an in-place
# sort on such a frame can trigger pandas' SettingWithCopyWarning.
df_medium = df_medium.sort_values("active_time", ascending=False)

# Components stacked in each bar, one bar per query.
time_components = [
    "compilation_time",
    "execution_time",
    "queued_provisioning_time",
    "queued_repair_time",
    "queued_overload_time",
    "transaction_blocked_time",
    "list_external_files_time",
]
# Only the first 400 rows (the most active queries after the sort) are plotted.
fig = px.bar(df_medium.head(400), x="query_id", y=time_components, title="Queries running medium time")
fig.update_layout(yaxis_title='Total Elapsed Time (min)')
fig.show()

In [None]:
# Display the long-running query frame (the number of rows rendered is capped by pandas' display.max_rows option).
df_long