### Step 2: merge all .csv files into a one-year dataframe

loop through the lob_caps directory, forming one time-sorted dataframe, with all CAPS files. These files captured sample bid and ask capitalization, and respective bid and ask volumes.

In [1]:
!pip3 install matplotlib
!pip3 install altair



In [2]:
import altair as alt
import pandas as pd
import os
import numpy as np

In [3]:
!mv $(find . -type d -name "lob_caps" -exec grep -q MATCH {} \; -print0 | xargs -0 echo) backup_match/

mv: missing destination file operand after 'backup_match/'
Try 'mv --help' for more information.


In [6]:
#https://stackoverflow.com/a/21232849 model 
def getCAPSByDateAndType(type):  #returns a dict, date + df caps for that date, then extended date and time
                                # print("for type, ", type)
    ret = []
    for root, dirs, files in os.walk("./lob_caps/"):
        for filename in files:
            if type in filename:
#                 print("CAPS file, ", filename) #mac, do find . -name ._\* -delete
                ret.append(filename)
    return ret

csvFileList = getCAPSByDateAndType("CAPS") #iterate this array to dip into each csv, later on
li = []                         #form the endFrame / global data frame around this array
for filename in csvFileList:
    csv = "lob_caps/" + filename
    # print(csv)
    df = pd.read_csv(csv, index_col=None, header=0)
    li.append(df)

capsFrame = pd.concat(li, axis=0, ignore_index=True) #end frame contains all data
capsFrame.sort_values(by=['time'], ascending=True)   #sorted by time into one time series
print("for new df: ", capsFrame.shape[0])
start = capsFrame["time"].min()
end = capsFrame["time"].max()
print("start: ", start, " end: ", end)
print(capsFrame.columns)

for new df:  298660
start:  1660221600292.0  end:  1693078943553.0
Index(['bc', 'ac', 'tbv', 'tav', 'time', 'mp', 'minBid'], dtype='object')


In [None]:
# def getSKEWByDateAndType(type):  #returns a dict, date + df caps for that date, then extended date and time
#                                 # print("for type, ", type)
#     ret = []
#     for root, dirs, files in os.walk("./lob_caps/"):
#         for filename in files:
#             if type in filename:
# #                 print("CAPS file, ", filename) #mac, do find . -name ._\* -delete
#                 ret.append(filename)
#     return ret

# csvFileList = getSKEWByDateAndType("MEANSHIFT") #iterate this array to dip into each csv, later on
# li = []                         #form the endFrame / global data frame around this array
# for filename in csvFileList:
#     csv = "lob_caps/" + filename
#     # print(csv)
#     df = pd.read_csv(csv, index_col=None, header=0)
#     li.append(df)

# skewFrame = pd.concat(li, axis=0, ignore_index=True) #end frame contains all data
# skewFrame.sort_values(by=['timeStamp'], ascending=True)   #sorted by time into one time series
# skewFrame.rename(columns={'timeStamp': 'time'}, inplace=True)
# print("for new df: ", skewFrame.shape[0])
# start = skewFrame["time"].min()
# end = skewFrame["time"].max()
# print("start: ", start, " end: ", end)
# print(skewFrame.columns)

In [None]:
# Merging those two data frames Will not take place based on a Shared key of time
# You'll need to look up the SKU value for every row based on approximation
# merged_df = pd.merge(capsFrame, skewFrame, on='time')

In [None]:
# skewFrame.head(10)

In [None]:
# capsFrame.loc[capsFrame['time'].sub(skewFrame['time'].values[0]).abs().idxmin(), 'mean'] = skewFrame['mean'].values[0]
# capsFrame.loc[capsFrame['time'].sub(skewFrame['time'].values[0]).abs().idxmin(), 'skew'] = skewFrame['skew'].values[0]
# capsFrame.loc[capsFrame['time'].sub(skewFrame['time'].values[0]).abs().lt(pd.Timedelta(minutes=2)).idxmax(), 'mean'] = skewFrame['mean'].values[0]
# 
# Could not get this operation to work try it again using a range of values once the precursor and surge are defined


## schema for capitalization data

loads the csv files, as acquired from coinbase

In [7]:
capsFrame.head(2) #shows the basic data collection via coinbase, these are aggregated values, collected several x a minute

Unnamed: 0,bc,ac,tbv,tav,time,mp,minBid
0,2828853.84,10260926.86,541682.81,221178.78,1672934000000.0,11.76,11.71
1,2826677.43,10256711.31,541497.46,220827.12,1672934000000.0,11.76,11.71


## Discover precursor and surge episodes

the goal of the data prep is to discover periods of continuous, positive momentum. These are **surges**. 

The periods preceding surges are, for the sake of the experiment, **precursors**. They are detected as periods of discontinuous positive momentum, or negative momentum. 

A ten-row window is used to calculate positive or negative momentum. A percent **change** is calculated for the ten row subsample.

## get percent change as basis for comprehending LOB

create new columns which depict the momentum of one row versus the next, in terms of price , capitalization and volume

In [8]:
# Load your time series data into a pandas dataframe
caps_df = capsFrame   
lookback_period = 10 # in rows
caps_df['change'] = caps_df['mp'].pct_change(periods=lookback_period)
caps_df['bc_change'] = caps_df['bc'].pct_change(periods=lookback_period)

caps_df['ac_change'] = caps_df['ac'].pct_change(periods=lookback_period)

caps_df['tav_change'] = caps_df['tav'].pct_change(periods=lookback_period)

caps_df['tbv_change'] = caps_df['tbv'].pct_change(periods=lookback_period)

# caps_df.sample
print(caps_df.shape[0], caps_df.columns)# Calculate the returns of your asset over a fixed lookback period

298660 Index(['bc', 'ac', 'tbv', 'tav', 'time', 'mp', 'minBid', 'change', 'bc_change',
       'ac_change', 'tav_change', 'tbv_change'],
      dtype='object')


###  establish benchmarks for percent change

the mean of change represents the average rate of change between LOB samples. This is used to determine whether the change between rows is significant or not. 

In [9]:
#for period, average or mean change metric. this changes with window size
meanChange = round(caps_df['change'].mean(),8)
meanChange

0.0005284

## define precursors from surges

use the threshold, mean change as tool to separate precursor from surges, where surges represent periods of positive momentum above threshold.

This step defines the data schema for the remainder of the process, where key statistics are defined for precursors and surges.

In [10]:
# identify units of 10 rows where the percent change is greater or less than the threshold
threshold = meanChange
surges = []
precursors = []
for i in range(0,len(caps_df),10):
    if caps_df.iloc[i:i+10]['change'].mean() >= threshold:
        surges.append({'time': caps_df.iloc[i]['time'],
                       's_MP': caps_df.iloc[i]['mp'],
                       'change': caps_df.iloc[i:i+10]['change'].mean(),
                       'type':'surge'})  #['bc', 'ac', 'tbv', 'tav', 'time', 'mp', 'minBid', 'change']
    else:
        precursors.append({'time': caps_df.iloc[i]['time'],
                           'p_MP': caps_df.iloc[i]['mp'],
                           'change': caps_df.iloc[i:i+10]['change'].mean(),
                            'type':'precursor',
                            'p_buyCap':caps_df.iloc[i]['bc'], 
                            'p_askCap':caps_df.iloc[i]['ac'],
                            'p_totalBidVol':caps_df.iloc[i]['tbv'],
                            'p_totalAskVol':caps_df.iloc[i]['tav']
                            })  

In [None]:
# for item in surges:
#     print(item)

In [None]:
# for item in precursors:
#     print(item)

## merge precursors and surges into time series

a dataframe of sequences, **sequence_df** is created by concatenating both buckets, and sorting by time. This will create a time series of surge and precursor periods, as defined by: 

* 10 window percent change values
* contiguity: these precursor and surges are next to each other and thus have a length or duration of momentum.

In [None]:
surges_df = pd.DataFrame(surges)
precursors_df = pd.DataFrame(precursors)
sequence_df = pd.concat([surges_df, precursors_df]).sort_values(by=['time'], ascending=True)

In [None]:
sequence_df.index

### view the aligned, continuous time series of precursors and surges

view the final abstraction: sets of precursor periods, next to surges, in a linear time series. Precursors effectively precede surges on a linear time series.

In [None]:
# for index, row in sequence_df.iterrows():
#     print(row['surge'], row['precursor'])
sequence_df['type'].head(40)

In [None]:
# sequence_df.head(45)

## visualize proof of algorithmic accuracy

this chart will plot the price time series, with an area of precursor and surge, as proof of our algorithmic accuracy.

In [None]:
subset = sequence_df[:4999]
line = alt.Chart(subset).mark_line(color='green').encode(
    x='time',
    y='s_MP'
)

s_bar = alt.Chart(subset).mark_bar().encode(
    x='time',
    y='s_MP',
    color='type:N'
)

p_bar = alt.Chart(subset).mark_bar().encode(
    x='time',
    y='p_MP',
    color='type:N'
)

chart = (line + s_bar + p_bar).properties(width=600, height=500)
chart.title = 'Sequential order of precursor and surges for April 7th 2023'
subtitle = 'Precursors are contiguous periods where percentage rate of growth is less than threshold'
chart.properties(title=alt.TitleParams(text=[chart.title, subtitle], baseline='bottom', orient='top', anchor='start', fontSize=14))
chart.interactive()

In [None]:
sequence_df.columns

### Perform information gain on grouped precursors and surges

define the **sum change**, or total change per continuous episode (precursor or surge). 

define the **length** of each episode. 

define the height of the surge, how high did the continuous positive momentum reach?

define the size (area) of the surge, as a triangular area (height times length), as **surge_area**

Create one line to describe a precursor or search and it's related order book statistics

In [None]:

sequence_df['group'] = (sequence_df['type'] != sequence_df['type'].shift(1)).cumsum()
sequence_df['length'] = sequence_df.groupby(['type', 'group'])['group'].transform('count')
# sequence_df['identifier'] = sequence_df.groupby(['type', 'group'])['time'].transform('min') #prep the label early, if surge?
print(sequence_df.shape[0])
sequence_df['sum_change'] = sequence_df.groupby(['type', 'group'])['change'].transform('sum')
print(sequence_df.shape[0])

sequence_df['area']  = sequence_df.apply(lambda row: row['length'] * row['sum_change'], axis=1)
sequence_df.loc[sequence_df['type'] == 'surge', 'surge_area'] = sequence_df.loc[sequence_df['type'] == 'surge', 'area']
sequence_df.columns
# sequence_df['area'] = sequence_df.groupby(['type', 'group'])['sum_change'].multiply(sequence_df['length'])
# sequence_df.loc[sequence_df['type'] == 'surge', 'surge_area'] =  sequence_df['length'] * sequence_df['sum_change']


# sequence_df['sum'] = sequence_df.groupby(['surge', 'group'])['change'].transform('sum')

# sequence_df['end_time'] = sequence_df.groupby(['surge', 'group'])['time'].transform('max')
# sequence_df['type'] = sequence_df['surge']

# sequence_df['buyCapSum'] = sequence_df.groupby(['surge', 'group'])['buyCap'].transform('avg')
# sequence_df['askCapSum'] = sequence_df.groupby(['surge', 'group'])['askCap'].transform('avg')

# calculate the area for the surge

# sequence_df = sequence_df.drop('next_value', axis=1)
# sequence_df.loc[sequence_df['bucket'] == 'surge', 'surge_length'] =  sequence_df['length']
# sequence_df.drop('length', axis=1, inplace=True)
# df = df.loc[:,~df.columns.duplicated()]

#unique_df = sequence_df.groupby('identifier').first().reset_index()

# unique_df.loc[unique_df['surge'] == '1', 'surge_length'] = unique_df['length']
# unique_df.loc[unique_df['surge'] == '0', 'length'] = 0

In [None]:
sequence_df.head(20)

## Critical group by unique identifier

In [None]:
unique_df = sequence_df.groupby('group').first().reset_index()
print(unique_df)

### Merge even and odd Rows to form the final sequences

Even rows contain surge, and odd rows contain precursors. **When you merge them, you form a sequence of precursor, and surge.**

Each row will contain a continuous **precursor->surge** sequence.

In [None]:
even_df = unique_df.iloc[::2].reset_index(drop=True)
odd_df = unique_df.iloc[1::2].reset_index(drop=True)

merged_df = pd.concat([even_df, odd_df], axis=1)

print(merged_df)

In [None]:
nan_cols = merged_df.dropna(axis=1, how='all')
nan_cols.head()

In [None]:
nan_cols.columns

### Write to CSV: step one, pipeline
Label to use is surge_area

In [None]:
# nan_cols = nan_cols.rename(columns={'group': 'group_1', 'time': 'time_1', 'change': 'change_1', 'type': 'type_1', 'length': 'length_1', 'sum_change': 'sum_change_1', 'area': 'area_1'})
# writeable_df = nan_cols['group', 'time', 's_MP', 'change', 'type', 'length', 'sum_change','area', \
#                         'surge_area', 'group', 'time', 'change', 'type', 'p_MP',
#                        'p_buyCap', 'p_askCap', 'p_totalBidVol', 'p_totalAskVol', 'length','sum_change', 'area']

# writeable_df.columns = ['group', 'time', 's_MP', 'change', 'type', 'length', 'sum_change',
#        's_area', 'surge_area', 'p_group', 'p_time', 'p_change', 'p_type', 'p_MP',
#        'p_buyCap', 'p_askCap', 'p_totalBidVol', 'p_totalAskVol', 'p_length',
#        'p_sum_change', 'p_area']
nan_cols.to_csv('pipeline1.csv', index=False)
# df.to_csv('filename.csv', index=False)
# writeable_df.to_csv('pipeline1.csv', index=False)


In [None]:
'''The repeating elements in the list are:
- group
- time
- change
- type
- length
- sum_change
- area '''