In [None]:
# Add the parent of the current working directory to the import path
# (presumably where the `optiml` package imported below lives - confirm).
import sys, pathlib

_parent_dir = str(pathlib.Path.cwd().parent)
# Guard the append so re-running this cell in the same kernel does not
# accumulate duplicate entries in sys.path.
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)

In [None]:
# Suppress all warnings for the rest of the session. Note that 'ignore' hides
# them entirely; the 'once' action would show each warning a single time.
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Setting up displays
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
from dash import Dash,html,dcc,Input,Output
app = Dash(__name__)
import pandas as pd
pd.set_option('display.max_rows', 500)

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from tabulate import tabulate
color_scheme=["red","blue","green","orange","purple","brown","pink","gray","olive","cyan","darkviolet","goldenrod","darkgreen","chocolate","lawngreen"]

In [None]:
from dash import Dash,html,dcc

In [None]:
# ## Center-align all the figure outputs.
# from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# from IPython.display import display, HTML
# from plotly.graph_objs import *
# import numpy as np
# init_notebook_mode(connected=True)

# display(HTML("""
# <style>
# .output {
#     display: flex;
#     align-items: center;
#     text-align: center;
# }
# </style>
# """))

In [None]:
# Setting up autoreload for libs
%load_ext autoreload
%autoreload 2
%aimport optiml.queries

In [None]:
# Initialize the Snowflake connection, the local cache directory, the query
# library, and the analysis date range used by the cells below.
import os

from optiml.connection import SnowflakeConnConfig
from optiml.queries import SNFLKQuery

# Initialize connection to Snowflake
connection = SnowflakeConnConfig(accountname='jg84276.us-central1.gcp',warehousename="XSMALL_WH").create_connection()

# Initialize local environment: directory handed to the query library
# (named as a cache location here; how it is used is up to SNFLKQuery).
cache_dir = os.path.expanduser('~/data/kiva')

# Initialize query library ('KIV' is passed through as-is; its meaning is
# defined by SNFLKQuery and is not visible in this notebook).
qlib = SNFLKQuery(connection, 'KIV', cache_dir)

# Analysis window passed to the qlib queries below.
sdate = '2022-10-01'
edate = '2022-10-31'
print(f"The analysis is carried out for date range {sdate} to {edate}")

# Total cost breakdown 

### SETUP FOR ANALYSIS
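- Date range: 10/1/22 to 10/12/22 (note: the setup cell above sets `edate` to 2022-10-31; confirm which range the figures below reflect)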

- Type of account with Snowflake - Standard Edition powered by GCP

- Credit to dollar conversion - $2 per credit


## Cost by usage category

In [None]:
# Fetch the total cost breakdown for the analysis window and replace missing
# values with the label 'Unassigned' (presumably so rows with a missing
# category/user are not dropped by the group-bys below - confirm).
df = qlib.total_cost_breakdown_ts(sdate, edate).fillna('Unassigned')


In [None]:
# Sum the numeric columns per usage category. numeric_only is passed as an
# explicit boolean keyword rather than a positional string.
df_by_usage_category = df.groupby("category_name").sum(numeric_only=True).reset_index()
# Append a grand-total row at the next free position of THIS frame.
# The three-item row assumes the aggregated columns are exactly
# category_name, credits, dollars.
df_by_usage_category.loc[len(df_by_usage_category.index)] = ['Total', df_by_usage_category['credits'].sum(), df_by_usage_category['dollars'].sum()]
df_by_usage_category = df_by_usage_category.round(2)
print(tabulate(df_by_usage_category, headers='keys', tablefmt='rounded_outline', showindex=False))

In [None]:
# Pie charts for total cost breakdown
# Exclude the 'Total' row by label rather than by position, so that re-running
# this cell never strips a real category from the frame.
df_by_usage_category = df_by_usage_category[df_by_usage_category['category_name'] != 'Total'].reset_index(drop=True)

# One pie per unit: dollars on the left, credits on the right.
fig = make_subplots(
    rows=1, cols=2,
    specs=[[{"type": "pie"},{"type": "pie"}]],
    subplot_titles=("Dollars", "Credits")
)

fig.add_trace(go.Pie(labels=df_by_usage_category['category_name'].tolist(), values=df_by_usage_category['dollars'].tolist(),name="Dollars", rotation=45,marker_colors=color_scheme),row=1,col=1)
fig.add_trace(go.Pie(labels=df_by_usage_category['category_name'].tolist(), values=df_by_usage_category['credits'].tolist(),name='Credits', rotation=45,marker_colors=color_scheme),row=1,col=2)

# Title is centered horizontally and anchored near the bottom of the figure.
fig.update_layout(
    title={
        'text': "Breakdown of total cost by usage category",
        'y':0.1,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'bottom'})
fig.show()


## Cost by usage category timeseries

In [None]:
# Sum the numeric columns per (category, hour); numeric_only is passed as an
# explicit boolean keyword rather than a positional string.
df_by_category_ts = df.groupby(['category_name','hourly_start_time']).sum(numeric_only=True).reset_index()
# Stacked area chart of credits over time, one band per usage category.
fig = px.area(df_by_category_ts, x="hourly_start_time", y="credits", color="category_name",
              color_discrete_sequence=color_scheme,
              title="Credits over time by usage category")
fig.show()

### COMPUTE:
 Average credits used for the Compute category reach a high at 9 a.m. and then a low at 7 p.m.
 >Hourly credit consumption -
- Max: 27.04		
- Time: 9:00 am
- Min: 9.18 
- Time: 7:00 pm

Credits are consumed throughout the night from 8:00 p.m. to 8:00 a.m.
> Average hourly credit consumption between 8:00 pm - 8:00 am:
- Mean: 9.85
- Standard deviation: 0.48

On the 8th, 9th, and 10th of October, the credits consumed at night increased, while at other times of day consumption remained the same as on previous days.
- Mean: 12.66
- Standard deviation: 0.419
- Increase in mean from the previous nights : 2.81

From October 1 - October 5, there is a peak in credit consumption at 2 pm.
- Average credit consumption: 29.5
- Increase from other days at 2 p.m : 12.8 

### CLOUD SERVICES
- The credits for cloud services follow a similar trend as compute with regards to daily timings.
- Credits for cloud services are only charged if the daily consumption of cloud services resources exceeds 10% of the daily warehouse usage.



## Cost by user

In [None]:
# Sum the numeric columns per user. numeric_only is passed as an explicit
# boolean keyword rather than a positional string.
df_by_user = df.groupby(['user_name']).sum(numeric_only=True).reset_index()
# Append the grand total at the next free row of THIS frame, then round, so the
# total is computed from unrounded values (same order as the usage-category table).
# The three-item row assumes the columns are exactly user_name, credits, dollars.
df_by_user.loc[len(df_by_user.index)] = ['Total', df_by_user['credits'].sum(), df_by_user['dollars'].sum()]
df_by_user = df_by_user.round(2)
print(tabulate(df_by_user, headers='keys', tablefmt='rounded_outline', showindex=False))

In [None]:
# Exclude the 'Total' row by label rather than dropping the last row in place,
# so that re-running this cell never removes a real user from the frame.
df_by_user = df_by_user[df_by_user['user_name'] != 'Total']

# One pie per unit: dollars on the left, credits on the right.
fig = make_subplots(
    rows=1, cols=2,
    specs=[[{"type": "pie"},{"type": "pie"}]],
    subplot_titles=("Dollars", "Credits")
)

fig.add_trace(go.Pie(labels=df_by_user['user_name'].tolist(), values=df_by_user['dollars'].tolist(),name="Dollars", rotation=45,marker_colors=color_scheme),row=1,col=1)
fig.add_trace(go.Pie(labels=df_by_user['user_name'].tolist(), values=df_by_user['credits'].tolist(),name='Credits', rotation=45,marker_colors=color_scheme),row=1,col=2)

# Title is centered horizontally and anchored near the bottom of the figure.
fig.update_layout(
    title={
        'text': "Breakdown of total cost by user",
        'y':0.1,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'bottom'})
fig.show()

- User GREGORYW incurs the maximum value of dollars and credits at 2418 and 1209, respectively.
- Users VERTX_PROD and DBT_PROD incur almost the same values of dollars and credits at 1497 and 748, respectively. 
- Users DBT_DEV and ROBS spend the least value of dollars and credits compared to other users.


In [None]:
# Sum the numeric columns per (user, hour); numeric_only is passed as an
# explicit boolean keyword rather than a positional string.
df_by_user_ts = df.groupby(['user_name','hourly_start_time']).sum(numeric_only=True).reset_index()
# Stacked area chart of credits over time, one band per user.
fig = px.area(df_by_user_ts, x="hourly_start_time", y="credits", color="user_name",
              color_discrete_sequence=color_scheme,
              title="Credits over time by user")
fig.show()

- The time series by user follows a similar trend as the time series by usage category graph with regards to usage of credits at daily time slots.
- The increase in credit usage on the 8th, 9th, and 10th of October is due to increased usage by GREGORYW and DBT_DEV. The 2 p.m. credit usage spike from October 1 - October 5 can also be attributed to those users.




## Cost by warehouse

In [None]:
# Returns results only for the ACCOUNTADMIN role or any other role that has been granted the MONITOR USAGE global privilege,
# so results are consistent with Greg's usage.
# NOTE(review): this rebinds `df`, replacing the total-cost-breakdown frame
# used by the sections above; re-running earlier cells afterwards will see
# this warehouse frame instead.
df = qlib.cost_by_wh_ts(sdate, edate)
df.head()

In [None]:
# Sum the numeric columns per warehouse. numeric_only is passed as an explicit
# boolean keyword rather than a positional string.
df_by_wh = df.groupby(['warehouse_name']).sum(numeric_only=True).reset_index()
# Append the grand total at the next free row of THIS frame, then round, so the
# totals are computed from unrounded values. The five-item row assumes the
# columns are exactly warehouse_name, credits, dollars, cloud_services_credits,
# cloud_services_dollars.
df_by_wh.loc[len(df_by_wh.index)] = ['Total', df_by_wh['credits'].sum(), df_by_wh['dollars'].sum(),  df_by_wh['cloud_services_credits'].sum(), df_by_wh['cloud_services_dollars'].sum()]
df_by_wh = df_by_wh.round(2)
print(tabulate(df_by_wh, headers='keys', tablefmt='rounded_outline', showindex=False))

In [None]:
# Remove the row of totals for the plot. Filtering by label (instead of
# dropping the last row in place) keeps this cell safe to re-run.
df_by_wh = df_by_wh[df_by_wh['warehouse_name'] != 'Total']

fig = make_subplots(
    rows=1, cols=1,
    specs=[[{"type": "pie"}]],
)

# Single pie: dollars per warehouse.
fig.add_trace(go.Pie(labels=df_by_wh['warehouse_name'].tolist(), values=df_by_wh['dollars'].tolist(),name='dollars',marker_colors=color_scheme),row=1,col=1)

fig.update_layout(
    title={
        'text': "Breakdown of total cost by warehouse",
        'y':0.1,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'})
fig.show()

- PROD_WH incurs the highest cost of all warehouses, with a dollar value of 1221.29, followed by DEV_WH with a dollar value of 797.61.
- DAILY_REFRESH_WH’s cost is 347.14 and ML_WH’s cost is 140.47
- CLOUD_SERVICES_ONLY does not incur any costs.



In [None]:
# Sum the numeric columns per (warehouse, hour); numeric_only is passed as an
# explicit boolean keyword rather than a positional string.
df_by_wh_ts = df.groupby(['warehouse_name','hourly_start_time']).sum(numeric_only=True).reset_index()
df_by_wh_ts.head()

In [None]:
##TODO: Investigate why turning off cloud services only makes daily refresh plot jump in some points
# Stacked area chart of credits over time, one band per warehouse.
fig = px.area(df_by_wh_ts, x="hourly_start_time", y="credits", color="warehouse_name",
              color_discrete_sequence=color_scheme,
              title="Credits over time by warehouse")
fig.show()

- The credit usage increase on the 8th, 9th, and 10th of October from other days is because of more credit consumption by DEV_WH.


## Cost by Partner Tools


In [None]:
# Partner-tool cost time series; rebinds `df` for the cells of this section.
# NOTE(review): the window is hard-coded to 2022-01-01 .. 2022-02-02 instead of
# the notebook-wide sdate/edate - confirm that this is intentional.
df = qlib.cost_by_partner_tool_ts(end_date="2022-02-02", start_date="2022-01-01")
df.head()

In [None]:
# Sum the numeric columns per client application. numeric_only is passed as an
# explicit boolean keyword rather than a positional string.
df_by_pt = df.groupby(['client_application_name']).sum(numeric_only=True).reset_index()
# Append the grand total at the next free row of THIS frame, then round, so the
# total is computed from unrounded values. The two-item row assumes the columns
# are exactly client_application_name and approximate_credits_used.
df_by_pt.loc[len(df_by_pt.index)] = ['Total', df_by_pt['approximate_credits_used'].sum()]
df_by_pt = df_by_pt.round(2)
print(tabulate(df_by_pt, headers='keys', tablefmt='rounded_outline', showindex=False))

In [None]:
# Remove the row of totals for the plot. Filtering by label (instead of
# dropping the last row in place) keeps this cell safe to re-run.
df_by_pt = df_by_pt[df_by_pt['client_application_name'] != 'Total']

fig = make_subplots(
    rows=1, cols=1,
    specs=[[{"type": "pie"}]],
)

# Single pie: approximate credits per client application.
fig.add_trace(go.Pie(labels=df_by_pt['client_application_name'].tolist(), values=df_by_pt['approximate_credits_used'].tolist(),name='credits',marker_colors=color_scheme),row=1,col=1)

fig.update_layout(
    title={
        'text': "Breakdown of total cost by partner tools",
        'y':0.1,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'})
fig.show()

In [None]:
# Sum the numeric columns per (client application, hour); numeric_only is
# passed as an explicit boolean keyword rather than a positional string.
df_by_pt_ts = df.groupby(['client_application_name','hourly_start_time']).sum(numeric_only=True).reset_index()
df_by_pt_ts.head()

In [None]:

# Stacked area chart of approximate credits over time, one band per client application.
fig = px.area(df_by_pt_ts, x="hourly_start_time", y="approximate_credits_used", color="client_application_name",
              color_discrete_sequence=color_scheme,
              title="Approximate credits over time by partner tool")
fig.show()

## QUERY ANALYSIS

### Most expensive queries

In [None]:
# Most expensive queries in the analysis window (5 is presumably the number of
# queries to return - confirm against SNFLKQuery); rebinds `df` for this section.
df = qlib.n_expensive_queries(sdate, edate, 5)
df.head()

In [None]:
## Most expensive queries: credits plotted against query id.
# TODO (carried over): optionally overlay df['warehouse_size'] as a bar trace
# on a secondary y-axis.

trace1 = go.Scatter(
    mode='lines+markers',
    x=df['query_id'],
    y=df['credits'],
    name="Credits",
    marker_color='crimson',
)

data = [trace1]

# Axis titles added so the figure can be read on its own.
layout = go.Layout(
    title_text='Most expensive queries',
    xaxis=dict(title=dict(text='Query ID')),
    yaxis=dict(title=dict(text='Credits'), side='left'),
)
fig = go.Figure(data=data, layout=layout)
fig.show()

### Query text

#### Query 1

In [None]:
# Query text of the first row of the current `df`.
print(df["query_text"].iloc[0])

#### Query 2

In [None]:
# Query text of the second row of the current `df`.
print(df["query_text"].iloc[1])

#### Query 3

In [None]:
# Query text of the third row of the current `df`.
print(df["query_text"].iloc[2])

### Queries that spill to storage

In [None]:
# Queries that spilled to storage in the analysis window (5 is presumably the
# number of queries to return - confirm); rebinds `df` for this section.
df = qlib.n_queries_spill_to_storage(sdate, edate, 5)
df.head()

In [None]:
## Queries that spilled to storage: remote and local bytes per query id.

trace1 = go.Scatter(
    mode='lines+markers',
    x=df['query_id'],
    y=df['bytes_spilled_to_remote_storage'],
    name="Bytes Spilled Remote",
    marker_color='crimson',
)

trace2 = go.Scatter(
    mode='lines+markers',
    x=df['query_id'],
    y=df['bytes_spilled_to_local_storage'],
    name="Bytes Spilled Local",
    marker_color='purple',
)

data = [trace1, trace2]

# Both traces share the default y-axis, so no secondary axis is configured.
# Axis titles added so the figure can be read on its own.
layout = go.Layout(
    title_text='Queries that spilled the most to storage',
    xaxis=dict(title=dict(text='Query ID')),
    yaxis=dict(title=dict(text='Bytes spilled'), side='left'),
)
fig = go.Figure(data=data, layout=layout)
fig.show()

### Query text

#### Query 1

In [None]:
# Query text of the first row of the current `df`.
print(df["query_text"].iloc[0])

#### Query 2

In [None]:
# Query text of the second row of the current `df`.
print(df["query_text"].iloc[1])

#### Query 3

In [None]:
# Query text of the third row of the current `df`.
print(df["query_text"].iloc[2])

### Queries that scanned the most data

In [None]:
# Queries that scanned the most data in the analysis window (5 is presumably
# the number of queries to return - confirm); rebinds `df` for this section.
df = qlib.n_queries_scanned_most_data(sdate, edate, 5)
df.head()

In [None]:
## Queries that scanned most data: partitions scanned per query id.

trace1 = go.Scatter(
    mode='lines+markers',
    x=df['query_id'],
    y=df['partitions_scanned'],
    name="Partitions Scanned",
    marker_color='crimson',
)

data = [trace1]

# Axis titles added so the figure can be read on its own.
layout = go.Layout(
    title_text='Queries that scanned the most partitions',
    xaxis=dict(title=dict(text='Query ID')),
    yaxis=dict(title=dict(text='Partitions scanned'), side='left'),
)
fig = go.Figure(data=data, layout=layout)
fig.show()

### Query text

#### Query 1

In [None]:
# Query text of the first row of the current `df`.
print(df["query_text"].iloc[0])

#### Query 2

In [None]:
# Query text of the second row of the current `df`.
print(df["query_text"].iloc[1])

#### Query 3

In [None]:
# Query text of the third row of the current `df`.
print(df["query_text"].iloc[2])

### Most cached queries

In [None]:
# Most cached queries in the analysis window (5 is presumably the number of
# queries to return - confirm); rebinds `df` for this section.
df = qlib.n_most_cached_queries(sdate, edate, 5)
df.head()

In [None]:
## Queries that scanned the most from cache: cache percentage per query id.

trace1 = go.Scatter(
    mode='lines+markers',
    x=df['query_id'],
    y=df['percent_scanned_from_cache'],
    name="Percent Scanned From Cache",
    marker_color='crimson',
)

data = [trace1]

# Axis titles added so the figure can be read on its own.
layout = go.Layout(
    title_text='Queries that scanned the most percent from cache',
    xaxis=dict(title=dict(text='Query ID')),
    yaxis=dict(title=dict(text='Percent scanned from cache'), side='left'),
)
fig = go.Figure(data=data, layout=layout)
fig.show()

#### Query 1

In [None]:
# "Query 1" is the first row of the current `df` (row 0, matching the other
# sections; row 2 is already printed under "Query 3").
print(df.iloc[0]["query_text"])

#### Query 2

In [None]:
# Query text of the second row of the current `df`.
print(df["query_text"].iloc[1])

#### Query 3

In [None]:
# Query text of the third row of the current `df`.
print(df["query_text"].iloc[2])

### Most executed queries

In [None]:
# Most executed queries in the analysis window (5 is presumably the number of
# queries to return - confirm); rebinds `df` for this section.
df = qlib.n_most_executed_queries(sdate, edate, 5)
df.head()

In [None]:
## Most executed queries: execution count against the row index of `df`.

trace1 = go.Scatter(
    mode='lines+markers',
    x=df.index,
    y=df['number_of_times_executed'],
    name="Number of times executed",
    marker_color='crimson',
)

data = [trace1]

# dtick=1 places x ticks one unit apart so no fractional positions are shown
# (assumes df has an integer index - confirm).
layout = go.Layout(
    title_text='Queries that were executed the most number of times',
    xaxis=dict(title=dict(text='Row index of query'), dtick=1),
    yaxis=dict(title=dict(text='Number of times executed'), side='left'),
)
fig = go.Figure(data=data, layout=layout)
fig.show()

#### Query 1

In [None]:

# Query text of the first row of the current `df`.
print(df["query_text"].iloc[0])

#### Query 2

In [None]:
# Query text of the second row of the current `df`.
print(df["query_text"].iloc[1])

#### Query 3

In [None]:
# "Query 3" is the third row of the current `df` (row 2, matching the other
# sections; the original index 3 skipped a row).
print(df.iloc[2]["query_text"])

## Analysis
* The 4th through 20th most expensive queries are all the same query.

In [None]:
# Top users by estimated credits; rebinds `df` for the dashboard cells below.
# NOTE(review): the window is hard-coded to 2022-01-01 .. 2022-02-02 instead of
# the notebook-wide sdate/edate - confirm that this is intentional.
df = qlib.top_users(end_date="2022-02-02", start_date="2022-01-01")
df.head()

In [None]:
# Grouped bar chart of estimated credits per user, one color per user.
fig = px.bar(
    df,
    x=df['user_name'],
    y=df["estimated_credits"],
    color=df["user_name"],
    barmode="group",
)

# Dash layout (heading, blurb, graph) wrapping the same figure; children are
# passed positionally.
app.layout = html.Div([
    html.H1('Hello Dash'),
    html.Div('''
        Dash: A web application framework for your data.
    '''),
    dcc.Graph(id='example-graph', figure=fig),
])
fig.show()

In [None]:
# Expensive queries for the analysis window, without an explicit count
# argument (the query library's own default applies).
df_expensiveq = qlib.n_expensive_queries(sdate, edate)


In [None]:
## Relative performance cost per expensive query, colored by user; the hover
## label shows the warehouse size.
fig = px.scatter(
    df_expensiveq,
    x=df_expensiveq["query_id"],
    y=df_expensiveq["relative_performance_cost"],
    color=df_expensiveq["user_name"],
    hover_name=df_expensiveq["warehouse_size"],
    size_max=30,
)
# Use one fixed marker size for every point.
fig.update_traces(marker=dict(size=20))

# Fresh Dash app whose layout holds only this figure.
app = Dash(__name__)
app.layout = html.Div([dcc.Graph(id='life-exp-vs-gdp', figure=fig)])

fig.show()

In [None]:
# Interactive Dash app: two dropdown + radio control groups (x and y axis),
# a graph, and a slider. The callback defined after this layout redraws the graph.
# NOTE(review): this layout reads df['Indicator Name'] and df['Year'], and the
# callback also reads 'Value' and 'Country Name'. The `df` bound earlier in this
# notebook comes from qlib.top_users(...) and is only shown being used with
# 'user_name' and 'estimated_credits' - confirm these columns exist, otherwise
# this cell raises KeyError when run.
app = Dash(__name__)



app.layout = html.Div([
    html.Div([

        # Left control group: x-axis selector and its linear/log toggle.
        html.Div([
            dcc.Dropdown(
                df['user_name'].unique(),
                # NOTE(review): initial value 'User_name' is a literal string, not
                # necessarily one of the dropdown options above - verify.
                'User_name',
                id='xaxis-column'
            ),
            dcc.RadioItems(
                ['Linear', 'Log'],
                'Linear',
                id='xaxis-type',
                inline=True
            )
        ], style={'width': '48%', 'display': 'inline-block'}),

        # Right control group: y-axis selector and its linear/log toggle.
        html.Div([
            dcc.Dropdown(
                df['Indicator Name'].unique(),
                'Life expectancy at birth, total (years)',
                id='yaxis-column'
            ),
            dcc.RadioItems(
                ['Linear', 'Log'],
                'Linear',
                id='yaxis-type',
                inline=True
            )
        ], style={'width': '48%', 'float': 'right', 'display': 'inline-block'})
    ]),

    # Graph whose figure is supplied by the callback.
    dcc.Graph(id='indicator-graphic'),

    # Slider from the smallest to the largest 'Year', starting at the largest,
    # with one mark per distinct year.
    dcc.Slider(
        df['Year'].min(),
        df['Year'].max(),
        step=None,
        id='year--slider',
        value=df['Year'].max(),
        marks={str(year): str(year) for year in df['Year'].unique()},

    )
])


@app.callback(
    Output('indicator-graphic', 'figure'),
    Input('xaxis-column', 'value'),
    Input('yaxis-column', 'value'),
    Input('xaxis-type', 'value'),
    Input('yaxis-type', 'value'),
    Input('year--slider', 'value'))
def update_graph(xaxis_column_name, yaxis_column_name,
                 xaxis_type, yaxis_type,
                 year_value):
    """Build the scatter figure for the current control values.

    Args:
        xaxis_column_name: value of the 'xaxis-column' dropdown; matched
            against df['Indicator Name'] to pick the x values.
        yaxis_column_name: value of the 'yaxis-column' dropdown; matched
            against df['Indicator Name'] to pick the y values and hover names.
        xaxis_type: 'Linear' gives a linear x-axis; anything else gives log.
        yaxis_type: 'Linear' gives a linear y-axis; anything else gives log.
        year_value: value of the year slider; rows are filtered to this 'Year'.

    Returns:
        The plotly figure assigned to the 'indicator-graphic' graph.
    """
    # Restrict the notebook-level `df` to the selected year, then pick the
    # rows of each selected indicator once.
    rows_for_year = df[df['Year'] == year_value]
    x_rows = rows_for_year[rows_for_year['Indicator Name'] == xaxis_column_name]
    y_rows = rows_for_year[rows_for_year['Indicator Name'] == yaxis_column_name]

    scatter = px.scatter(x=x_rows['Value'],
                         y=y_rows['Value'],
                         hover_name=y_rows['Country Name'])

    scatter.update_layout(margin=dict(l=40, b=40, t=10, r=0), hovermode='closest')

    # Map each radio selection onto a plotly axis type.
    x_scale = 'linear' if xaxis_type == 'Linear' else 'log'
    y_scale = 'linear' if yaxis_type == 'Linear' else 'log'
    scatter.update_xaxes(title=xaxis_column_name, type=x_scale)
    scatter.update_yaxes(title=yaxis_column_name, type=y_scale)

    return scatter