### Step 2: merge all .csv files into a one-year dataframe

loop through the lob_caps directory, forming one time-sorted dataframe, with all CAPS files. These files captured sample bid and ask capitalization, and respective bid and ask volumes.

In [71]:
!pip3 install matplotlib
!pip3 install altair



In [72]:
import altair as alt
import pandas as pd
import os
import numpy as np

In [73]:
!mv $(find . -type d -name "lob_caps" -exec grep -q MATCH {} \; -print0 | xargs -0 echo) backup_match/

grep: ./lob_caps: Is a directory
mv: missing destination file operand after 'backup_match/'
Try 'mv --help' for more information.


In [74]:
#https://stackoverflow.com/a/21232849 model 
def getCAPSByDateAndType(type):  #returns a dict, date + df caps for that date, then extended date and time
                                # print("for type, ", type)  ./lob_caps/
    ret = []
    for root, dirs, files in os.walk("./lob_caps"): #core/gh-code/grus-code/ver2-pctChangeDriven/lob_caps
        for filename in files:
            if type in filename:
                # print("CAPS file, ", filename) #mac, do find . -name ._\* -delete
                ret.append(filename)
    return ret

csvFileList = getCAPSByDateAndType("CAPS") #iterate this array to dip into each csv, later on
li = []                         #form the endFrame / global data frame around this array
for filename in csvFileList:
    csv = "lob_caps/" + filename
    # print(csv)
    df = pd.read_csv(csv, index_col=None, header=0)
    li.append(df)

capsFrame = pd.concat(li, axis=0, ignore_index=True) #end frame contains all data
capsFrame.sort_values(by=['time'], ascending=True)   #sorted by time into one time series
print("for new df: ", capsFrame.shape[0])
start = capsFrame["time"].min()
end = capsFrame["time"].max()
print("start: ", start, " end: ", end)
print(capsFrame.columns)

for new df:  298660
start:  1660221600292.0  end:  1693078943553.0
Index(['bc', 'ac', 'tbv', 'tav', 'time', 'mp', 'minBid'], dtype='object')


## schema for capitalization data

loads the csv files, as acquired from coinbase

In [75]:
capsFrame.head(2) #shows the basic data collection via coinbase, these are aggregated values, collected several x a minute

Unnamed: 0,bc,ac,tbv,tav,time,mp,minBid
0,2828853.84,10260926.86,541682.81,221178.78,1672934000000.0,11.76,11.71
1,2826677.43,10256711.31,541497.46,220827.12,1672934000000.0,11.76,11.71


### imputation

In [76]:
# impute missing values with last non-null value
capsFrame['bc'] = capsFrame['bc'].fillna(method='ffill')
capsFrame['ac'] = capsFrame['ac'].fillna(method='ffill')
capsFrame['tbv'] = capsFrame['tbv'].fillna(method='ffill')
capsFrame['tav'] = capsFrame['tav'].fillna(method='ffill')
capsFrame['mp'] = capsFrame['mp'].fillna(method='ffill')
capsFrame['minBid'] = capsFrame['minBid'].fillna(method='ffill')


## Discover precursor and surge episodes

the goal of the data prep is to discover periods of continuous, positive momentum. These are **surges**. 

The periods preceding surges are, for the sake of the experiment, **precursors**. They are detected as periods of discontinuous positive momentum, or negative momentum. 

A ten-row window is used to calculate positive or negative momentum. A percent **change** is calculated for the ten row subsample.

## regularization of critical features
get percent change as basis for comprehending LOB

create new columns which depict the momentum of one row versus the next, in terms of price , capitalization and volume

In [77]:
# Load your time series data into a pandas dataframe
caps_df = capsFrame   
lookback_period = 10 # in rows
caps_df['change'] = caps_df['mp'].pct_change(periods=lookback_period)
caps_df['bc_change'] = caps_df['bc'].pct_change(periods=lookback_period)
caps_df['ac_change'] = caps_df['ac'].pct_change(periods=lookback_period)
caps_df['tav_change'] = caps_df['tav'].pct_change(periods=lookback_period)
caps_df['tbv_change'] = caps_df['tbv'].pct_change(periods=lookback_period)
## key components: bc_change, ac_change, tav_change, tbv_change, change
# caps_df.sample
print(caps_df.shape[0], caps_df.columns)# Calculate the returns of your asset over a fixed lookback period

298660 Index(['bc', 'ac', 'tbv', 'tav', 'time', 'mp', 'minBid', 'change', 'bc_change',
       'ac_change', 'tav_change', 'tbv_change'],
      dtype='object')


###  establish benchmarks for percent change

the mean of change represents the average rate of change between LOB samples. This is used to determine whether the change between rows is significant or not. 

In [78]:
#for period, average or mean change metric. this changes with window size
meanChange = round(caps_df['change'].mean(),8)
meanChange

0.0005284

## data mining: sequence discovery
define precursors from surges, prepare the data with this sequence: 

precursor -> surge

prepare to cluster every precursor, by the sequential, resultant surge. Do not assume causality, but rather preoccurance.

use the threshold, mean change as tool to separate precursor from surges, where surges represent periods of positive momentum above threshold.

This step defines the data schema for the remainder of the process, where key statistics are defined for precursors and surges.

In [79]:
# identify units of 10 rows where the percent change is greater or less than the threshold
### key components: bc_change, ac_change, tav_change, tbv_change, change
threshold = meanChange
surges = []
precursors = []
for i in range(0,len(caps_df),10):
    if caps_df.iloc[i:i+10]['change'].mean() >= threshold:
        surges.append({'time': caps_df.iloc[i]['time'],
                       's_MP': caps_df.iloc[i]['mp'],
                       'change': caps_df.iloc[i:i+10]['change'].mean(),
                       'type':'surge'})  #['bc', 'ac', 'tbv', 'tav', 'time', 'mp', 'minBid', 'change']
    else:
        precursors.append({'time': caps_df.iloc[i]['time'],
                           'p_MP': caps_df.iloc[i]['mp'],
                           'change': caps_df.iloc[i:i+10]['change'].mean(),
                            'type':'precursor',
                            'precursor_buy_cap_pct_change':caps_df.iloc[i]['bc_change'], 
                            'precursor_ask_cap_pct_change':caps_df.iloc[i]['ac_change'],
                            'precursor_bid_vol_pct_change':caps_df.iloc[i]['tbv_change'],
                            'precursor_ask_vol_pct_change':caps_df.iloc[i]['tav_change']
                            })  

In [80]:
# for item in surges:
#     print(item)

In [81]:
# for item in precursors:
#     print(item)

## prepprocess: merge precursors and surges into time series

a dataframe of sequences, **sequence_df** is created by concatenating both buckets, and sorting by time. This will create a time series of surge and precursor periods, as defined by: 

* 10 window percent change values
* contiguity: these precursor and surges are next to each other and thus have a length or duration of momentum.

In [82]:
surges_df = pd.DataFrame(surges)
precursors_df = pd.DataFrame(precursors)
sequence_df = pd.concat([surges_df, precursors_df]).sort_values(by=['time'], ascending=True)

In [83]:
sequence_df.index

Int64Index([ 9696, 18333, 18334, 18335, 18336, 18337, 18338,  9697, 18339,
             9698,
            ...
            10476,  5635, 10477, 10478, 10479, 10480, 10481,  5636,  5637,
             5638],
           dtype='int64', length=29866)

### view the aligned, continuous time series of precursors and surges

view the final abstraction: sets of precursor periods, next to surges, in a linear time series. Precursors effectively precede surges on a linear time series.

In [84]:
# for index, row in sequence_df.iterrows():
#     print(row['surge'], row['precursor'])
sequence_df['type'].head(40)

9696         surge
18333    precursor
18334    precursor
18335    precursor
18336    precursor
18337    precursor
18338    precursor
9697         surge
18339    precursor
9698         surge
9699         surge
18340    precursor
18341    precursor
18342    precursor
9700         surge
9701         surge
18343    precursor
18344    precursor
9702         surge
18345    precursor
18346    precursor
18347    precursor
18348    precursor
18349    precursor
18350    precursor
18351    precursor
9703         surge
9704         surge
18352    precursor
18353    precursor
9705         surge
9706         surge
9707         surge
9708         surge
18354    precursor
18355    precursor
18356    precursor
18357    precursor
9709         surge
18358    precursor
Name: type, dtype: object

In [85]:
# sequence_df.head(45)

## visualize proof of algorithmic accuracy

this chart will plot the price time series, with an area of precursor and surge, as proof of our algorithmic accuracy.

In [86]:
subset = sequence_df[:4999]
line = alt.Chart(subset).mark_line(color='green').encode(
    x='time',
    y='s_MP'
)

s_bar = alt.Chart(subset).mark_bar().encode(
    x='time',
    y='s_MP',
    color='type:N'
)

p_bar = alt.Chart(subset).mark_bar().encode(
    x='time',
    y='p_MP',
    color='type:N'
)

chart = (s_bar + p_bar + line).properties(width=600, height=500)
chart.title = 'Data Mining Accuracy, Surge vs Precursor Sequence'
subtitle = 'Precursors are contiguous periods where percentage rate of growth is less than threshold'
chart.properties(title=alt.TitleParams(text=[chart.title, subtitle], baseline='bottom', orient='top', anchor='start', fontSize=14))
chart.interactive()

In [87]:
sequence_df.columns

Index(['time', 's_MP', 'change', 'type', 'p_MP',
       'precursor_buy_cap_pct_change', 'precursor_ask_cap_pct_change',
       'precursor_bid_vol_pct_change', 'precursor_ask_vol_pct_change'],
      dtype='object')

### data mining 2: information gain, create new features

Perform information gain on grouped precursors and surges

define the **sum change**, or total change per continuous episode (precursor or surge). 

define the **length** of each episode. 

define the height of the surge, how high did the continuous positive momentum reach?

define the size (area) of the surge, as a triangular area (height times length), as **surge_area**

Create one line to describe a precursor or search and it's related order book statistics

In [88]:
#### imputation

In [89]:
# # impute missing values with last non-null value DONE PRIOR, NOW AT START
sequence_df['s_MP'] = sequence_df['s_MP'].fillna(method='ffill')
sequence_df['p_MP'] = sequence_df['p_MP'].fillna(method='ffill')
sequence_df['precursor_buy_cap_pct_change'] = sequence_df['precursor_buy_cap_pct_change'].fillna(method='ffill')
sequence_df['precursor_ask_cap_pct_change'] = sequence_df['precursor_ask_cap_pct_change'].fillna(method='ffill')
sequence_df['precursor_bid_vol_pct_change'] = sequence_df['precursor_bid_vol_pct_change'].fillna(method='ffill')
sequence_df['precursor_ask_vol_pct_change'] = sequence_df['precursor_ask_vol_pct_change'].fillna(method='ffill')

In [90]:


sequence_df['group'] = (sequence_df['type'] != sequence_df['type'].shift(1)).cumsum()
sequence_df['length'] = sequence_df.groupby(['type', 'group'])['group'].transform('count')

print(sequence_df.shape[0])
sequence_df['sum_change'] = sequence_df.groupby(['type', 'group'])['change'].transform('sum')

sequence_df['max_surge_mp'] = sequence_df.groupby(['type', 'group'])['s_MP'].transform('max')
sequence_df['min_surge_mp'] = sequence_df.groupby(['type', 'group'])['s_MP'].transform('min')

sequence_df['max_precursor_mp'] = sequence_df.groupby(['type', 'group'])['p_MP'].transform('max')
sequence_df['min_precursor_mp'] = sequence_df.groupby(['type', 'group'])['p_MP'].transform('min')

sequence_df['area']  = sequence_df.apply(lambda row: row['length'] * row['sum_change'], axis=1)

sequence_df.loc[sequence_df['type'] == 'surge', 'surge_area'] = sequence_df.loc[sequence_df['type'] == 'surge', 'area']




29866


In [91]:
# define a custom function to calculate the percentage by which max_surge_mp exceeds max_precursor_mp
'''for a pandas dataframe wth attributes ['group', 'time', 's_MP', 'change', 'type', 'length', 'sum_change',
       'max_surge_mp', 'min_surge_mp', 'area', 'surge_area', 'group', 'time',
       'change', 'type', 'p_MP', 'precursor_buy_cap_pct_change',
       'precursor_ask_cap_pct_change', 'precursor_bid_vol_pct_change',
       'precursor_ask_vol_pct_change', 'length', 'sum_change',
       'max_precursor_mp', 'min_precursor_mp', 'area'] 
       group by type, group then create  
       a new column 'surge_targets_met_pct' which equals the percentage 
       by which the max_surge_mp exceeds the max_precursor_mp'''

sequence_df['surge_targets_met_pct']  = sequence_df.apply(lambda group: ((group['max_precursor_mp']-group['max_surge_mp'])/group['max_surge_mp']  ) *100, axis=1)

sequence_df.columns
print(sequence_df.shape[0])

29866


In [92]:
sequence_df.head(30)

Unnamed: 0,time,s_MP,change,type,p_MP,precursor_buy_cap_pct_change,precursor_ask_cap_pct_change,precursor_bid_vol_pct_change,precursor_ask_vol_pct_change,group,length,sum_change,max_surge_mp,min_surge_mp,max_precursor_mp,min_precursor_mp,area,surge_area,surge_targets_met_pct
9696,1660222000000.0,30.0,0.505364,surge,,,,,,1,1,0.5053641,30.0,30.0,,,0.505364,0.505364,
18333,1660222000000.0,30.0,-0.000533,precursor,29.98,-0.000618,-1.7e-05,-0.00021,-0.002384,2,6,-0.005009518,30.0,30.0,29.98,29.85,-0.030057,,-0.066667
18334,1660222000000.0,30.0,-0.001067,precursor,29.98,0.004469,-1.8e-05,0.001356,-0.00232,2,6,-0.005009518,30.0,30.0,29.98,29.85,-0.030057,,-0.066667
18335,1660222000000.0,30.0,-0.001035,precursor,29.93,-0.002967,-2e-06,-0.000885,-0.000246,2,6,-0.005009518,30.0,30.0,29.98,29.85,-0.030057,,-0.066667
18336,1660222000000.0,30.0,-0.001136,precursor,29.88,-0.000648,8e-06,-0.000158,0.00114,2,6,-0.005009518,30.0,30.0,29.98,29.85,-0.030057,,-0.066667
18337,1660222000000.0,30.0,-6.7e-05,precursor,29.88,0.000163,2e-06,6.2e-05,0.000263,2,6,-0.005009518,30.0,30.0,29.98,29.85,-0.030057,,-0.066667
18338,1660222000000.0,30.0,-0.001171,precursor,29.85,-0.000443,2.3e-05,-0.000184,0.003196,2,6,-0.005009518,30.0,30.0,29.98,29.85,-0.030057,,-0.066667
9697,1660222000000.0,29.86,0.00067,surge,29.85,-0.000443,2.3e-05,-0.000184,0.003196,3,1,0.0006702414,29.86,29.86,29.85,29.85,0.00067,0.00067,-0.03349
18339,1660222000000.0,29.86,0.000135,precursor,29.89,-0.002358,-2e-06,-0.000818,-0.000333,4,1,0.000134564,29.86,29.86,29.89,29.89,0.000135,,0.100469
9698,1660222000000.0,29.88,0.001273,surge,29.89,-0.002358,-2e-06,-0.000818,-0.000333,5,2,0.001807711,29.9,29.88,29.89,29.89,0.003615,0.003615,-0.033445


## data mining 3: form final sequences by statistical weight

Critical group by unique identifier

In [93]:
unique_df = sequence_df.groupby('group').first().reset_index()
# print(unique_df)

#### Merge even and odd Rows to form the final sequences

Even rows contain surge, and odd rows contain precursors. **When you merge them, you form a sequence of precursor, and surge.**

Each row will contain a continuous **precursor->surge** sequence.

In [94]:
even_df = unique_df.iloc[::2].reset_index(drop=True)
odd_df = unique_df.iloc[1::2].reset_index(drop=True)

merged_df = pd.concat([even_df, odd_df], axis=1)

# print(merged_df)

In [95]:
nan_cols = merged_df.dropna(axis=1, how='all')
nan_cols.head()

Unnamed: 0,group,time,s_MP,change,type,p_MP,precursor_buy_cap_pct_change,precursor_ask_cap_pct_change,precursor_bid_vol_pct_change,precursor_ask_vol_pct_change,...,precursor_bid_vol_pct_change.1,precursor_ask_vol_pct_change.1,length,sum_change,max_surge_mp,min_surge_mp,max_precursor_mp,min_precursor_mp,area,surge_targets_met_pct
0,1,1660222000000.0,30.0,0.505364,surge,,,,,,...,-0.00021,-0.002384,6.0,-0.005009518,30.0,30.0,29.98,29.85,-0.030057,-0.066667
1,3,1660222000000.0,29.86,0.00067,surge,29.85,-0.000443,2.3e-05,-0.000184,0.003196,...,-0.000818,-0.000333,1.0,0.000134564,29.86,29.86,29.89,29.89,0.000135,0.100469
2,5,1660222000000.0,29.88,0.001273,surge,29.89,-0.002358,-2e-06,-0.000818,-0.000333,...,0.001573,-0.004835,3.0,-0.003410602,29.9,29.9,29.94,29.87,-0.010232,0.133779
3,7,1660222000000.0,29.8,0.000873,surge,29.87,0.000349,3.1e-05,0.000189,0.004485,...,-0.000398,6.2e-05,2.0,8.855895e-07,29.88,29.88,29.87,29.85,2e-06,-0.033467
4,9,1660223000000.0,29.9,0.001305,surge,29.85,0.000473,1.5e-05,0.00012,0.001861,...,0.000334,0.001094,7.0,-0.01377036,29.9,29.9,29.95,29.52,-0.096393,0.167224


In [96]:
nan_cols.columns

Index(['group', 'time', 's_MP', 'change', 'type', 'p_MP',
       'precursor_buy_cap_pct_change', 'precursor_ask_cap_pct_change',
       'precursor_bid_vol_pct_change', 'precursor_ask_vol_pct_change',
       'length', 'sum_change', 'max_surge_mp', 'min_surge_mp',
       'max_precursor_mp', 'min_precursor_mp', 'area', 'surge_area',
       'surge_targets_met_pct', 'group', 'time', 's_MP', 'change', 'type',
       'p_MP', 'precursor_buy_cap_pct_change', 'precursor_ask_cap_pct_change',
       'precursor_bid_vol_pct_change', 'precursor_ask_vol_pct_change',
       'length', 'sum_change', 'max_surge_mp', 'min_surge_mp',
       'max_precursor_mp', 'min_precursor_mp', 'area',
       'surge_targets_met_pct'],
      dtype='object')

### Write to CSV: step one, pipeline
Label to use is surge_targets_met_pct

In [97]:
nan_cols.to_csv('pipeline1.csv', index=False)