In [None]:
import os
import sys
# Add the directory two levels up to the import path.
# NOTE(review): presumably so the `mikesnowflake` imports below resolve when the
# notebook is run from its own folder — confirm against the repository layout.
sys.path.append(os.path.join('..', '..'))

import logging
# Raise the root logger to INFO so INFO-level log records are emitted.
logging.getLogger().setLevel(logging.INFO)


import datetime
import pandas as pd
import numpy as np
import multiprocessing as mp
import scipy.stats

from mikesnowflake.analysis.snowFlakeAnalysis import SnowFlakeAnalysis
from mikesnowflake.access.colorAccess import ColorAccess

In [None]:
# snowflake and credentials
# NOTE(review): every value in this cell is an empty string — presumably scrubbed
# placeholders to be filled in locally before running. Do not commit real secrets.
# Assigning '' here overwrites any GOOGLE_APPLICATION_CREDENTIALS already set in the environment.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = ''

# user / password are passed to SnowFlakeAnalysis in the next cell
user = ''
password = ''

In [None]:
# Date window handed to SnowFlakeAnalysis.
START_DATE = datetime.datetime(2018, 12, 1)
END_DATE = datetime.datetime(2020, 6, 26)
# Shared analysis object used by every cell below.
# NOTE(review): excludeEtl=True presumably filters ETL activity out of the query
# history — confirm against the SnowFlakeAnalysis implementation.
SFA = SnowFlakeAnalysis(START_DATE, END_DATE, user, password, excludeEtl=True)

In [None]:
def getQueryTypeHistory(tableNames, processes=8, sfa=SFA):
    """Collect the query-type history of several tables using a process pool.

    Args:
        tableNames (str or iterable of str): table name(s) to process. A single
            string is treated as one table name, not as a sequence of characters.
        processes (int): number of worker processes in the pool.
        sfa: object exposing ``getQueryTypeHistory(tableName)``; defaults to the
            notebook-level ``SFA``. ``multiprocessing`` pickles the callable
            given to ``Pool.map``, so this object presumably has to be picklable.

    Returns:
        dict: maps each table name to the value returned by
            ``sfa.getQueryTypeHistory`` for that table.
    """
    # Normalize BEFORE reporting: otherwise len() of a lone string counts its
    # characters and the progress message shows the wrong number of tables.
    if isinstance(tableNames, str):
        tableNames = [tableNames]
    else:
        # Materialize once so len(), pool.map() and zip() all see the same items.
        tableNames = list(tableNames)

    startTime = datetime.datetime.today()
    print('processing %s tables with %s processes' % (len(tableNames), processes))
    with mp.Pool(processes) as pool:
        # pool.map preserves input order, so results line up with tableNames.
        out = pool.map(sfa.getQueryTypeHistory, tableNames)
    endTime = datetime.datetime.today()
    print('done: %s' % (endTime - startTime))
    return dict(zip(tableNames, out))

In [None]:
# Build the {table name -> query-type history} mapping for every table in
# SFA.snowFlakeTables, using 8 worker processes.
qtHistory = getQueryTypeHistory(SFA.snowFlakeTables, processes=8)

In [None]:
# One row per snowflake table, one (initially empty) column per query-type group.
windsor = pd.DataFrame(index=SFA.snowFlakeTables, columns=list(SFA.queryTypes.keys()))

# Per-table metadata columns.
windsor['is_view'] = [name in SFA.snowFlakeViews for name in windsor.index]
windsor['used_by_view'] = [name in SFA.viewGraph.nodes() for name in windsor.index]
windsor['is_rollup'] = [name in SFA.rollupGraph.nodes() for name in windsor.index]
windsor['degree'] = [SFA.tableDegrees.loc[name] for name in windsor.index]
windsor['is_gcs'] = [name in SFA.gcsTables for name in windsor.index]

# Winsorized mean of MONTHLY hit totals. Monthly buckets are preferred over daily
# ones because heavy month-end queries would be clipped by a daily winsorization.
for tableName in windsor.index:
    commandHistory = qtHistory[tableName].pivot(index='query_date', columns='query_type', values='hits')
    for k, v in SFA.queryTypes.items():
        # query types of this table that belong to group k
        cols = [c for c in commandHistory.columns if c in v]
        monthlyHits = commandHistory[cols].sum(axis=1).resample('M').sum()
        # clip the lowest and highest 2.5% of monthly totals before averaging
        clippedHits = scipy.stats.mstats.winsorize(monthlyHits, limits=[0.025, 0.025])
        windsor.loc[tableName, k] = clippedHits.mean()

# Move 'degree' to the front, keeping the remaining columns in their order.
cols = ['degree'] + [c for c in windsor.columns if c != 'degree']
windsor = windsor[cols]
windsor.sort_values(by=['degree', 'select'], ascending=False).style

In [None]:
# Restrict the summary to the tables that the yaml info lists under the
# 'TrafficReport' feed, ordered by degree and then by 'select', both descending.
isTrafficFeed = SFA.yamlInfo['feed_name'] == 'TrafficReport'
trafficTables = SFA.yamlInfo.loc[isTrafficFeed, 'table_name'].unique()
trafficReport = windsor.reindex(trafficTables).sort_values(by=['degree', 'select'], ascending=False)
trafficReport