In [1]:
import numpy as np 
import pandas as pd
import json 
from typing import List
from datetime import datetime
import os

import snowflake
from snowflake.snowpark.session import Session
from snowflake.snowpark.functions import sproc, col, lit, count

In [2]:
#Build session object: 
accountname = 'xe85544.east-us-2.azure'
username = 'kx'

connection_parameters = {
    "account": accountname,
    "user": username,
    "password": password,
    "role": "ACCOUNTADMIN",
    "database": "KX",
    "schema": "BRUNO",
    "warehouse": "KX"
}

session = Session.builder.configs(connection_parameters).create()

### Creating the Ticker List: 

In this section, we will output a list of lists that contain tickers that will be fed into a Sproc. 

This algo looks to fill buckets based on a row max input.

In [5]:
trades_df = session.table('trades')
quotes_df = session.table('quotes')

trades_agg = trades_df.select(col('"Symbol"')).group_by('"Symbol"')\
            .agg(count('"Symbol"').alias('Record_Count')).to_pandas()

quotes_agg = quotes_df.select(col('"Symbol"')).group_by('"Symbol"')\
            .agg(count('"Symbol"').alias('Record_Count')).to_pandas()

records_df = trades_agg.merge(quotes_agg, how = 'inner', left_on = 'Symbol', right_on = 'Symbol')

records_df['SUM'] = records_df['RECORD_COUNT_x'] + records_df['RECORD_COUNT_y']
records_df.sort_values(by='SUM', inplace=True)
records_df.head(5)

Unnamed: 0,Symbol,RECORD_COUNT_x,RECORD_COUNT_y,SUM
2988,CHNG,1,2,3
5360,MTEST,2,6,8
3041,RSXJ,7,6,13
1922,RSX,8,6,14
5770,PTEST,7,10,17


In [6]:
def group_stocks(df, ROW_MAX):
    groups = []
    group_total = 0
    group_symbols = set()
    for i, row in df.iterrows():
        symbol, value = row['Symbol'], row['SUM']
        if group_total + value <= ROW_MAX:
            group_total += value
            group_symbols.add(symbol)
        else:
            groups.append(list(group_symbols))
            group_total = value
            group_symbols = {symbol}
    groups.append(list(group_symbols))
    return groups

In [7]:
ticker_buckets = group_stocks(records_df, 10_000_000)

In [8]:
len(ticker_buckets)

173

In [12]:
ticker_buckets[0]

['PCTT U',
 'CHG',
 'MBAC U',
 'KEY PRL',
 'BANF P',
 'CLRC',
 'IACC',
 'OKYO',
 'KIM PRM',
 'GRU',
 'PFIE',
 'LMST',
 'GFOR',
 'DTRT',
 'BTTX',
 'GNFT',
 'NCPL W',
 'JFBR W',
 'AUBN',
 'RILY G',
 'MNSB',
 'NBST',
 'IBDN',
 'PWFL',
 'OXUS',
 'DISA W',
 'ACR PRC',
 'PACX',
 'MFUL',
 'DMAT',
 'AGRX',
 'MLAI',
 'GLBS',
 'NS PRA',
 'IMPP P',
 'GRNQ',
 'ADFI',
 'BMAC WS',
 'CRBP',
 'PETV W',
 'LMFA',
 'DCTH',
 'MIO',
 'MLVF',
 'ALSA W',
 'FRC PRI',
 'LOAN',
 'PSB PRY',
 'LHC WS',
 'ML WS',
 'IHY',
 'RACB',
 'ANPC',
 'HCTI',
 'FXCO R',
 'MHF',
 'ACRO WS',
 'GRNB',
 'AHT PRF',
 'FPAC',
 'QLI',
 'IHIT',
 'CMRE PRE',
 'BBAI WS',
 'LUCY',
 'MAAX',
 'CURI W',
 'MJXL',
 'AKU',
 'CREG',
 'ORTX',
 'CLGN',
 'FTPA U',
 'GSRM',
 'ESHY',
 'ONBP P',
 'BEAT W',
 'ENVB',
 'ATAX',
 'MBTC R',
 'HYMC W',
 'FPEI',
 'MFA PRB',
 'BNOX',
 'OXAC W',
 'GPJA',
 'LOCC',
 'CLRM W',
 'FGF',
 'OFED',
 'FHS',
 'JG',
 'DLNG',
 'ADRA WS',
 'PGSS U',
 'RXRA',
 'CFFS',
 'EPHY W',
 'ICAP',
 'GHAC',
 'AIMA U',
 'ECCC',
 'BWSN'

### Create and run Sproc using multi-cluster warehouses: 

Steps: 
1. create a list of list of tickers to pass into a sproc
2. create an SPROC that has an input list as an argument, that contains the respective tickers to filter by, and calls the main logic

In [10]:
license = 'Y8NTo4WkoyVFpNkz0oikQnrkIJbISSogiCwXxiUY/idrTFbSW9NHlidzlg1GDZYsoyeWTCzSvfkOliez/uOGvRem6fmbR9JkDEaTVpZjvVs7NltmlqSmGFvtNg2WNsuINhgs2UcXR/5Gh5vLLKMH/Nnx/DOIMyayRmMGJvEGU4i5hrJG'

In [11]:
%%time

@sproc(packages=['snowflake-snowpark-python', 'numpy', 'pandas', 'pyarrow'],
       imports=['@kx/pykx-package.zip', '@kx/01_interval_returns.q_'],
       name='ms_test_sproc', 
       is_permanent = True, 
       stage_location= '@KX', 
       replace = True)
def ms_test_sproc(session: Session, tickers: List[str]) -> None:
    """Takes in a list of tickers that we will be filtering on"""
    import base64, json, os, shutil, sys, time, zipfile
   
    ## CUSTOM SETUP OF PyKX (won't be needed when PyKX is available on Anaconda)
    tstart = time.perf_counter()
    if not os.path.exists('/tmp/k4.lic'):
        import_dir = sys._xoptions.get("snowflake_import_directory")
        lib = 'pykx-package.zip'
        lib_path = os.path.join(import_dir, lib)
        load_path = '/tmp/pykx-package/'

        if os.path.exists('/tmp/pykx-package'):
            shutil.rmtree('/tmp/pykx-package')
            time.sleep(0)
        with zipfile.ZipFile(lib_path, 'r') as zip_ref:
            zip_ref.extractall('/tmp/pykx-package')

        sys.path.insert(0, '/tmp/pykx-package')
        
        shutil.copyfile(os.path.join(import_dir, '01_interval_returns.q_'), '/tmp/01_interval_returns.q_')

        with open('/tmp/k4.lic', 'wb') as f:
            f.write(base64.b64decode(license))
    tend = time.perf_counter()
    tpykxinstall = tend - tstart
    ## END

    tstart = time.perf_counter()
    os.environ['QHOME'] = ''
    os.environ['QLIC'] = '/tmp'

    os.environ['PYKX_LOAD_PYARROW_UNSAFE'] = '1'
    os.environ['PYKX_NOQCE'] = '1'
    os.environ['SKIP_UNDERQ'] = '1'
    os.environ['IGNORE_QHOME'] = '1'
    os.environ['QARGS'] = '--licensed'
    
    import pykx as kx
    tend = time.perf_counter()
    tpykximport = tend - tstart
    
    tstart = time.perf_counter()
    select_columns = ['"TTime"', '"Symbol"', '"Trade Volume"', '"Trade Price"']
    trades = session.table('trades').select(select_columns)\
                    .filter(col('"Symbol"').isin(tickers))\
                    .order_by('"TTime"').to_pandas()                 # 1
    trades.columns = ['time', 'symbol', 'volume', 'price']
    tend = time.perf_counter()
    tloadtrades = tend - tstart
    
    tstart = time.perf_counter()
    kx.q['trades'] = trades
    del trades
    tend = time.perf_counter()
    ttoqtrades = tend - tstart

    tstart = time.perf_counter()
    select_columns = ['"TTime"', '"Symbol"', '"Bid_Size"', '"Bid_Price"', '"Offer_Size"', '"Offer_Price"']
    quotes = session.table('quotes').select(select_columns)\
                    .filter(col('"Symbol"').isin(tickers))\
                    .order_by('"TTime"').to_pandas()                 # 2
    quotes.columns = ['time', 'symbol', 'ask_volume', 'ask_price', 'bid_volume', 'bid_price']
    tend = time.perf_counter()
    tloadquotes = tend - tstart
    
    tstart = time.perf_counter()
    kx.q['quotes'] = quotes
    del quotes
    tend = time.perf_counter()
    ttoqquotes = tend - tstart

    tstart = time.perf_counter()
    kx.q('\l /tmp/01_interval_returns.q_')  # Run a locked q script to apply the business logic             # 3
    tend = time.perf_counter()
    tqalgo = tend - tstart
    
    tstart = time.perf_counter()
    output = kx.q('t').pd()
    tend = time.perf_counter()
    ttopd = tend - tstart
    
    tstart = time.perf_counter()
    session.write_pandas(output, 'MS_OUTPUT', overwrite=False, auto_create_table=True, chunk_size=1000000, compression='snappy')        # 4
    tend = time.perf_counter()
    tout = tend - tstart
    
    return None

CPU times: user 141 ms, sys: 15.9 ms, total: 157 ms
Wall time: 32.7 s


In [13]:
import time
from joblib import Parallel, delayed
start = datetime.now()

#instantiate queries
query_ids = []

def execute_sproc(job):
    
    conn = snowflake.connector.connect(
        account="xe85544.east-us-2.azure",
        user="kx",
        password = 'Snowflake1',
        role="ACCOUNTADMIN",  # optional
        warehouse="KX",
        database="KX",
        schema="BRUNO",
    )
    cur = conn.cursor()
    cur.execute_async(f'CALL MS_TEST_SPROC({job})')
    return cur.sfqid

results = Parallel(n_jobs=-1)(delayed(execute_sproc)(job) for job in ticker_buckets)

query_ids.extend(results)

end = datetime.now()
print(end-start)

0:00:09.244673


In [14]:
conn = snowflake.connector.connect(
    account="xe85544.east-us-2.azure",
    user="kx",
    password = 'Snowflake1',
    role="ACCOUNTADMIN",  # optional
    warehouse="kx",  # medium snowpark-optimized
    database="KX",
    schema="BRUNO",
)
cur = conn.cursor()

t = cur.execute(f'''
SELECT
    MAX(END_TIME) AS end_time
FROM
    SNOWFLAKE.ACCOUNT_USAGE.QUERY_HISTORY
WHERE QUERY_ID in {tuple(query_ids)}
''').fetch_pandas_all()

In [15]:
t['END_TIME'].values[0] - pd.Timestamp(start) - pd.Timedelta(hours=4) #timezone adjust

Timedelta('0 days 00:01:37.837763')