### Step 2: merge all .csv files into a one-year dataframe

loop through the lob_caps directory, forming one time-sorted dataframe, with all CAPS files. These files captured sample bid and ask capitalization, and respective bid and ask volumes.

In [1]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
!pip3 install matplotlib
!pip3 install altair
!pip3 install sklearn

Collecting sklearn
  Downloading sklearn-0.0.post10.tar.gz (3.6 kB)
Building wheels for collected packages: sklearn
  Building wheel for sklearn (setup.py) ... [?25ldone
[?25h  Created wheel for sklearn: filename=sklearn-0.0.post10-py3-none-any.whl size=2370 sha256=c9ad354ff354a9ae8db6b1086b2bf78da7cee4779714e56f8fdf48ebacc494ae
  Stored in directory: /home/jovyan/.cache/pip/wheels/d4/d0/c9/b0d7844dfdfa484eb999955105f3610d20c27fb9593d7c3299
Successfully built sklearn
Installing collected packages: sklearn
Successfully installed sklearn-0.0.post10


In [3]:
import altair as alt
import pandas as pd
import os
import numpy as np

In [4]:
# !mv $(find . -type d -name "lob_caps" -exec grep -q MATCH {} \; -print0 | xargs -0 echo) backup_match/

In [5]:
#https://stackoverflow.com/a/21232849 model 
def getCAPSByDateAndType(type):  #returns a dict, date + df caps for that date, then extended date and time
                                # print("for type, ", type)  ./lob_caps/
    ret = []
    for root, dirs, files in os.walk("../lob_caps"): #core/gh-code/grus-code/ver2-pctChangeDriven/lob_caps
        for filename in files:
            if type in filename:
                # print("CAPS file, ", filename) #mac, do find . -name ._\* -delete
                ret.append(filename)
    return ret

csvFileList = getCAPSByDateAndType("CAPS") #iterate this array to dip into each csv, later on
li = []                         #form the endFrame / global data frame around this array
for filename in csvFileList:
    csv = "../lob_caps/" + filename
    # print(csv)
    df = pd.read_csv(csv, index_col=None, header=0)
    li.append(df)

capsFrame = pd.concat(li, axis=0, ignore_index=True) #end frame contains all data
capsFrame.sort_values(by=['time'], ascending=True)   #sorted by time into one time series
print("for new df: ", capsFrame.shape[0])
start = capsFrame["time"].min()
end = capsFrame["time"].max()
print("start: ", start, " end: ", end)
print(capsFrame.columns)

for new df:  298660
start:  1660221600292.0  end:  1693078943553.0
Index(['bc', 'ac', 'tbv', 'tav', 'time', 'mp', 'minBid'], dtype='object')


In [6]:
capsFrame

Unnamed: 0,bc,ac,tbv,tav,time,mp,minBid
0,5290023.77,1.768014e+08,753988.75,239368.00,1.670113e+12,13.54,13.49
1,5299776.75,1.767976e+08,754750.47,239087.60,1.670113e+12,13.54,13.49
2,5301033.77,1.767938e+08,754843.30,238809.10,1.670113e+12,13.54,13.49
3,5319912.24,1.767687e+08,756239.22,237061.73,1.670113e+12,13.54,13.49
4,5312650.47,1.767650e+08,755700.91,236784.48,1.670113e+12,13.54,13.49
...,...,...,...,...,...,...,...
298655,2380889.22,1.007667e+07,372179.55,224957.48,1.682874e+12,17.72,17.22
298656,2382365.18,1.007683e+07,372264.11,224967.31,1.682874e+12,17.72,17.22
298657,2356074.25,1.005477e+07,370778.29,223724.25,1.682874e+12,17.72,17.22
298658,2358096.38,1.007875e+07,371036.78,225185.04,1.682875e+12,17.63,17.22


## schema for capitalization data

loads the csv files, as acquired from coinbase

In [7]:
capsFrame.head(2) #shows the basic data collection via coinbase, these are aggregated values, collected several x a minute

Unnamed: 0,bc,ac,tbv,tav,time,mp,minBid
0,5290023.77,176801400.0,753988.75,239368.0,1670113000000.0,13.54,13.49
1,5299776.75,176797600.0,754750.47,239087.6,1670113000000.0,13.54,13.49


### imputation

In [8]:
# impute missing values with last non-null value
capsFrame['bc'] = capsFrame['bc'].fillna(method='ffill')
capsFrame['ac'] = capsFrame['ac'].fillna(method='ffill')
capsFrame['tbv'] = capsFrame['tbv'].fillna(method='ffill')
capsFrame['tav'] = capsFrame['tav'].fillna(method='ffill')
capsFrame['mp'] = capsFrame['mp'].fillna(method='ffill')
capsFrame['minBid'] = capsFrame['minBid'].fillna(method='ffill')


In [9]:
capsFrame.to_csv("global_data.csv")

## Discover precursor and surge episodes

the goal of the data prep is to discover periods of continuous, positive momentum. These are **surges**. 

The periods preceding surges are, for the sake of the experiment, **precursors**. They are detected as periods of discontinuous positive momentum, or negative momentum. 

A ten-row window is used to calculate positive or negative momentum. A percent **change** is calculated for the ten row subsample.

## regularization of critical features
get percent change as basis for comprehending LOB

create new columns which depict the momentum of one row versus the next, in terms of price , capitalization and volume

In [10]:
# Load your time series data into a pandas dataframe
# consider cahnging this approach because it doesnt actually check in between values

caps_df = capsFrame   
lookback_period = 10 # in rows
caps_df['change'] = caps_df['mp'].pct_change(periods=lookback_period)
caps_df['bc_change'] = caps_df['bc'].pct_change(periods=lookback_period)
caps_df['ac_change'] = caps_df['ac'].pct_change(periods=lookback_period)
caps_df['tav_change'] = caps_df['tav'].pct_change(periods=lookback_period)
caps_df['tbv_change'] = caps_df['tbv'].pct_change(periods=lookback_period)
## key components: bc_change, ac_change, tav_change, tbv_change, change
# caps_df.sample
print(caps_df.shape[0], caps_df.columns)# Calculate the returns of your asset over a fixed lookback period

298660 Index(['bc', 'ac', 'tbv', 'tav', 'time', 'mp', 'minBid', 'change', 'bc_change',
       'ac_change', 'tav_change', 'tbv_change'],
      dtype='object')


###  establish benchmarks for percent change

the mean of change represents the average rate of change between LOB samples. This is used to determine whether the change between rows is significant or not. 

In [11]:
#for period, average or mean change metric. this changes with window size
meanChange = round(caps_df['change'].mean(),8)
meanChange

0.00050491

## data mining: sequence discovery
define precursors from surges, prepare the data with this sequence: 

precursor -> surge

prepare to cluster every precursor, by the sequential, resultant surge. Do not assume causality, but rather preoccurance.

use the threshold, mean change as tool to separate precursor from surges, where surges represent periods of positive momentum above threshold.

This step defines the data schema for the remainder of the process, where key statistics are defined for precursors and surges.

In [12]:
# identify units of 10 rows where the percent change is greater or less than the threshold
### key components: bc_change, ac_change, tav_change, tbv_change, change
threshold = meanChange
surges = []
precursors = []
for i in range(0,len(caps_df),10):
    if caps_df.iloc[i:i+10]['change'].mean() >= threshold:
        surges.append({'time': caps_df.iloc[i]['time'],
                       's_MP': caps_df.iloc[i]['mp'],
                       'change': caps_df.iloc[i:i+10]['change'].mean(),
                       'type':'surge'})  #['bc', 'ac', 'tbv', 'tav', 'time', 'mp', 'minBid', 'change']
    else:
        precursors.append({'time': caps_df.iloc[i]['time'],
                           'p_MP': caps_df.iloc[i]['mp'],
                           'change': caps_df.iloc[i:i+10]['change'].mean(),
                            'type':'precursor',
                            'precursor_buy_cap_pct_change':caps_df.iloc[i]['bc_change'], 
                            'precursor_ask_cap_pct_change':caps_df.iloc[i]['ac_change'],
                            'precursor_bid_vol_pct_change':caps_df.iloc[i]['tbv_change'],
                            'precursor_ask_vol_pct_change':caps_df.iloc[i]['tav_change']
                            })  

In [13]:
#for item in surges[:2]:
    #print(item)

In [14]:
#for item in precursors:
    #print(item)

## prepprocess: merge precursors and surges into time series

a dataframe of sequences, **sequence_df** is created by concatenating both buckets, and sorting by time. This will create a time series of surge and precursor periods, as defined by: 

* 10 window percent change values
* contiguity: these precursor and surges are next to each other and thus have a length or duration of momentum.

In [15]:
surges_df = pd.DataFrame(surges)
precursors_df = pd.DataFrame(precursors)
sequence_df = pd.concat([surges_df, precursors_df]).sort_values(by=['time'], ascending=True)

In [16]:
sequence_df.index

Int64Index([2713, 5021, 5022, 5023, 5024, 5025, 5026, 2714, 5027, 2715,
            ...
            3928, 3929, 3930, 3931, 3932, 2145, 3933, 3934, 2146, 2147],
           dtype='int64', length=29866)

### view the aligned, continuous time series of precursors and surges

view the final abstraction: sets of precursor periods, next to surges, in a linear time series. Precursors effectively precede surges on a linear time series.

In [17]:
# for index, row in sequence_df.iterrows():
#     print(row['surge'], row['precursor'])
sequence_df['type'].head(40)

2713        surge
5021    precursor
5022    precursor
5023    precursor
5024    precursor
5025    precursor
5026    precursor
2714        surge
5027    precursor
2715        surge
2716        surge
5028    precursor
5029    precursor
5030    precursor
2717        surge
2718        surge
5031    precursor
5032    precursor
2719        surge
5033    precursor
5034    precursor
5035    precursor
5036    precursor
5037    precursor
5038    precursor
5039    precursor
2720        surge
2721        surge
5040    precursor
5041    precursor
2722        surge
2723        surge
2724        surge
2725        surge
5042    precursor
5043    precursor
5044    precursor
5045    precursor
2726        surge
5046    precursor
Name: type, dtype: object

In [18]:
# sequence_df.head(45)

## visualize proof of algorithmic accuracy

this chart will plot the price time series, with an area of precursor and surge, as proof of our algorithmic accuracy.

In [19]:
subset = sequence_df[:4999]
line = alt.Chart(subset).mark_line(color='green').encode(
    x='time',
    y='s_MP'
)

s_bar = alt.Chart(subset).mark_bar().encode(
    x='time',
    y='s_MP',
    color='type:N'
)

p_bar = alt.Chart(subset).mark_bar().encode(
    x='time',
    y='p_MP',
    color='type:N'
)

chart = (s_bar + p_bar + line).properties(width=600, height=500)
chart.title = 'Data Mining Accuracy, Surge vs Precursor Sequence'
subtitle = 'Precursors are contiguous periods where percentage rate of growth is less than threshold'
chart.properties(title=alt.TitleParams(text=[chart.title, subtitle], baseline='bottom', orient='top', anchor='start', fontSize=14))
chart.interactive()

In [20]:
sequence_df.columns

Index(['time', 's_MP', 'change', 'type', 'p_MP',
       'precursor_buy_cap_pct_change', 'precursor_ask_cap_pct_change',
       'precursor_bid_vol_pct_change', 'precursor_ask_vol_pct_change'],
      dtype='object')

### data mining 2: information gain, create new features

Perform information gain on grouped precursors and surges

define the **sum change**, or total change per continuous episode (precursor or surge). 

define the **length** of each episode. 

define the height of the surge, how high did the continuous positive momentum reach?

define the size (area) of the surge, as a triangular area (height times length), as **surge_area**

Create one line to describe a precursor or search and it's related order book statistics

In [21]:
sequence_df['group'] = (sequence_df['type'] != sequence_df['type'].shift(1)).cumsum()
columns_to_transform = [
    'precursor_buy_cap_pct_change',
    'precursor_ask_cap_pct_change',
    'precursor_bid_vol_pct_change',
    'precursor_ask_vol_pct_change'
]

for col in columns_to_transform:
    sequence_df[col] = sequence_df.groupby('group')[col].transform(lambda x: x.sum() if not x.isna().all() else np.nan)

In [22]:
#### imputation

In [23]:
# # impute missing values with last non-null value DONE PRIOR, NOW AT START
sequence_df['s_MP'] = sequence_df['s_MP'].fillna(method='ffill')
sequence_df['p_MP'] = sequence_df['p_MP'].fillna(method='ffill')
sequence_df['precursor_buy_cap_pct_change'] = sequence_df['precursor_buy_cap_pct_change'].fillna(method='ffill')
sequence_df['precursor_ask_cap_pct_change'] = sequence_df['precursor_ask_cap_pct_change'].fillna(method='ffill')
sequence_df['precursor_bid_vol_pct_change'] = sequence_df['precursor_bid_vol_pct_change'].fillna(method='ffill')
sequence_df['precursor_ask_vol_pct_change'] = sequence_df['precursor_ask_vol_pct_change'].fillna(method='ffill')

In [24]:
#sequence_df['group'] = (sequence_df['type'] != sequence_df['type'].shift(1)).cumsum()

In [25]:
sequence_df['length'] = sequence_df.groupby(['type', 'group'])['group'].transform('count')

print(sequence_df.shape[0])
sequence_df['sum_change'] = sequence_df.groupby(['type', 'group'])['change'].transform('sum')

sequence_df['max_surge_mp'] = sequence_df.groupby(['type', 'group'])['s_MP'].transform('max')
sequence_df['min_surge_mp'] = sequence_df.groupby(['type', 'group'])['s_MP'].transform('min')

sequence_df['max_precursor_mp'] = sequence_df.groupby(['type', 'group'])['p_MP'].transform('max')
sequence_df['min_precursor_mp'] = sequence_df.groupby(['type', 'group'])['p_MP'].transform('min')

sequence_df['area']  = sequence_df.apply(lambda row: row['length'] * row['sum_change'], axis=1)

sequence_df.loc[sequence_df['type'] == 'surge', 'surge_area'] = sequence_df.loc[sequence_df['type'] == 'surge', 'area']

sequence_df['surge_targets_met_pct']  = sequence_df.apply(lambda group: ((group['max_precursor_mp']-group['max_surge_mp'])/group['max_surge_mp']  ) *100, axis=1)

29866


In [26]:
# define a custom function to calculate the percentage by which max_surge_mp exceeds max_precursor_mp
'''for a pandas dataframe wth attributes ['group', 'time', 's_MP', 'change', 'type', 'length', 'sum_change',
       'max_surge_mp', 'min_surge_mp', 'area', 'surge_area', 'group', 'time',
       'change', 'type', 'p_MP', 'precursor_buy_cap_pct_change',
       'precursor_ask_cap_pct_change', 'precursor_bid_vol_pct_change',
       'precursor_ask_vol_pct_change', 'length', 'sum_change',
       'max_precursor_mp', 'min_precursor_mp', 'area'] 
       group by type, group then create  
       a new column 'surge_targets_met_pct' which equals the percentage 
       by which the max_surge_mp exceeds the max_precursor_mp'''


sequence_df.columns
print(sequence_df.shape[0])

29866


In [27]:
sequence_df.head(30)

Unnamed: 0,time,s_MP,change,type,p_MP,precursor_buy_cap_pct_change,precursor_ask_cap_pct_change,precursor_bid_vol_pct_change,precursor_ask_vol_pct_change,group,length,sum_change,max_surge_mp,min_surge_mp,max_precursor_mp,min_precursor_mp,area,surge_area,surge_targets_met_pct
2713,1660222000000.0,30.0,0.246621,surge,,,,,,1,1,0.2466213,30.0,30.0,,,0.246621,0.246621,
5021,1660222000000.0,30.0,-0.000533,precursor,29.98,-4.4e-05,-4e-06,-1.7e-05,-0.000351,2,6,-0.005009518,30.0,30.0,29.98,29.85,-0.030057,,-0.066667
5022,1660222000000.0,30.0,-0.001067,precursor,29.98,-4.4e-05,-4e-06,-1.7e-05,-0.000351,2,6,-0.005009518,30.0,30.0,29.98,29.85,-0.030057,,-0.066667
5023,1660222000000.0,30.0,-0.001035,precursor,29.93,-4.4e-05,-4e-06,-1.7e-05,-0.000351,2,6,-0.005009518,30.0,30.0,29.98,29.85,-0.030057,,-0.066667
5024,1660222000000.0,30.0,-0.001136,precursor,29.88,-4.4e-05,-4e-06,-1.7e-05,-0.000351,2,6,-0.005009518,30.0,30.0,29.98,29.85,-0.030057,,-0.066667
5025,1660222000000.0,30.0,-6.7e-05,precursor,29.88,-4.4e-05,-4e-06,-1.7e-05,-0.000351,2,6,-0.005009518,30.0,30.0,29.98,29.85,-0.030057,,-0.066667
5026,1660222000000.0,30.0,-0.001171,precursor,29.85,-4.4e-05,-4e-06,-1.7e-05,-0.000351,2,6,-0.005009518,30.0,30.0,29.98,29.85,-0.030057,,-0.066667
2714,1660222000000.0,29.86,0.00067,surge,29.85,-4.4e-05,-4e-06,-1.7e-05,-0.000351,3,1,0.0006702414,29.86,29.86,29.85,29.85,0.00067,0.00067,-0.03349
5027,1660222000000.0,29.86,0.000135,precursor,29.89,-0.002358,-2e-06,-0.000818,-0.000333,4,1,0.000134564,29.86,29.86,29.89,29.89,0.000135,,0.100469
2715,1660222000000.0,29.88,0.001273,surge,29.89,-0.002358,-2e-06,-0.000818,-0.000333,5,2,0.001807711,29.9,29.88,29.89,29.89,0.003615,0.003615,-0.033445


## data mining 3: form final sequences by statistical weight

Critical group by unique identifier

In [28]:
unique_df = sequence_df.groupby('group').first().reset_index()
# print(unique_df)

In [29]:
unique_df.head(20)

Unnamed: 0,group,time,s_MP,change,type,p_MP,precursor_buy_cap_pct_change,precursor_ask_cap_pct_change,precursor_bid_vol_pct_change,precursor_ask_vol_pct_change,length,sum_change,max_surge_mp,min_surge_mp,max_precursor_mp,min_precursor_mp,area,surge_area,surge_targets_met_pct
0,1,1660222000000.0,30.0,0.246621,surge,,,,,,1,0.2466213,30.0,30.0,,,0.246621,0.246621,
1,2,1660222000000.0,30.0,-0.000533,precursor,29.98,-4.4e-05,-4e-06,-1.7e-05,-0.000351,6,-0.005009518,30.0,30.0,29.98,29.85,-0.030057,,-0.066667
2,3,1660222000000.0,29.86,0.00067,surge,29.85,-4.4e-05,-4e-06,-1.7e-05,-0.000351,1,0.0006702414,29.86,29.86,29.85,29.85,0.00067,0.00067,-0.03349
3,4,1660222000000.0,29.86,0.000135,precursor,29.89,-0.002358,-2e-06,-0.000818,-0.000333,1,0.000134564,29.86,29.86,29.89,29.89,0.000135,,0.100469
4,5,1660222000000.0,29.88,0.001273,surge,29.89,-0.002358,-2e-06,-0.000818,-0.000333,2,0.001807711,29.9,29.88,29.89,29.89,0.003615,0.003615,-0.033445
5,6,1660222000000.0,29.9,-0.000233,precursor,29.94,-0.001354,-1.7e-05,-0.000446,-0.001967,3,-0.003410602,29.9,29.9,29.94,29.87,-0.010232,,0.133779
6,7,1660222000000.0,29.8,0.000873,surge,29.87,-0.001354,-1.7e-05,-0.000446,-0.001967,2,0.002114419,29.88,29.8,29.87,29.87,0.004229,0.004229,-0.033467
7,8,1660222000000.0,29.88,-0.000502,precursor,29.87,-0.000514,1.7e-05,-0.000278,0.001923,2,8.855895e-07,29.88,29.88,29.87,29.85,2e-06,,-0.033467
8,9,1660223000000.0,29.9,0.001305,surge,29.85,-0.000514,1.7e-05,-0.000278,0.001923,1,0.001305053,29.9,29.9,29.85,29.85,0.001305,0.001305,-0.167224
9,10,1660223000000.0,29.9,-0.000768,precursor,29.95,0.014278,3.4e-05,0.006498,0.008866,7,-0.01377036,29.9,29.9,29.95,29.52,-0.096393,,0.167224


#### Merge even and odd Rows to form the final sequences

Even rows contain surge, and odd rows contain precursors. **When you merge them, you form a sequence of precursor, and surge.**

Each row will contain a continuous **precursor->surge** sequence.

In [30]:
# needs to start with a precursor removes the first surge
unique_df = unique_df.iloc[1:]
even_df = unique_df.iloc[::2].reset_index(drop=True)
odd_df = unique_df.iloc[1::2].reset_index(drop=True)

merged_df = pd.concat([even_df, odd_df], axis=1)

# print(merged_df)

In [31]:
merged_df[:10]

Unnamed: 0,group,time,s_MP,change,type,p_MP,precursor_buy_cap_pct_change,precursor_ask_cap_pct_change,precursor_bid_vol_pct_change,precursor_ask_vol_pct_change,...,precursor_ask_vol_pct_change.1,length,sum_change,max_surge_mp,min_surge_mp,max_precursor_mp,min_precursor_mp,area,surge_area,surge_targets_met_pct
0,2,1660222000000.0,30.0,-0.000533,precursor,29.98,-4.4e-05,-4e-06,-1.7e-05,-0.000351,...,-0.000351,1,0.00067,29.86,29.86,29.85,29.85,0.00067,0.00067,-0.03349
1,4,1660222000000.0,29.86,0.000135,precursor,29.89,-0.002358,-2e-06,-0.000818,-0.000333,...,-0.000333,2,0.001808,29.9,29.88,29.89,29.89,0.003615,0.003615,-0.033445
2,6,1660222000000.0,29.9,-0.000233,precursor,29.94,-0.001354,-1.7e-05,-0.000446,-0.001967,...,-0.001967,2,0.002114,29.88,29.8,29.87,29.87,0.004229,0.004229,-0.033467
3,8,1660222000000.0,29.88,-0.000502,precursor,29.87,-0.000514,1.7e-05,-0.000278,0.001923,...,0.001923,1,0.001305,29.9,29.9,29.85,29.85,0.001305,0.001305,-0.167224
4,10,1660223000000.0,29.9,-0.000768,precursor,29.95,0.014278,3.4e-05,0.006498,0.008866,...,0.008866,2,0.004639,29.63,29.53,29.52,29.52,0.009278,0.009278,-0.371245
5,12,1660225000000.0,29.63,-0.000303,precursor,29.66,0.002581,1.9e-05,0.000901,0.002203,...,0.002203,4,0.008537,29.85,29.7,29.58,29.58,0.034148,0.034148,-0.904523
6,14,1660227000000.0,29.85,-0.000904,precursor,29.9,-0.012366,-0.000218,-0.002351,-0.019411,...,-0.019411,1,0.004761,29.08,29.08,29.4,29.4,0.004761,0.004761,1.100413
7,16,1660230000000.0,29.08,-0.002143,precursor,29.27,0.016804,7.7e-05,0.005661,0.014315,...,0.014315,1,0.000592,29.12,29.12,29.1,29.1,0.000592,0.000592,-0.068681
8,18,1660233000000.0,29.12,-0.006358,precursor,29.0,-0.015299,2e-05,-0.004036,0.006264,...,0.006264,2,0.003328,28.92,28.88,28.82,28.82,0.006656,0.006656,-0.345781
9,20,1660236000000.0,28.92,-0.001795,precursor,28.96,-0.000879,-1.8e-05,-0.000493,-0.003028,...,-0.003028,2,0.00488,29.02,28.92,28.96,28.96,0.00976,0.00976,-0.206754


In [32]:
nan_cols = merged_df.dropna(axis=1, how='all')
nan_cols.head()

Unnamed: 0,group,time,s_MP,change,type,p_MP,precursor_buy_cap_pct_change,precursor_ask_cap_pct_change,precursor_bid_vol_pct_change,precursor_ask_vol_pct_change,...,precursor_ask_vol_pct_change.1,length,sum_change,max_surge_mp,min_surge_mp,max_precursor_mp,min_precursor_mp,area,surge_area,surge_targets_met_pct
0,2,1660222000000.0,30.0,-0.000533,precursor,29.98,-4.4e-05,-4e-06,-1.7e-05,-0.000351,...,-0.000351,1,0.00067,29.86,29.86,29.85,29.85,0.00067,0.00067,-0.03349
1,4,1660222000000.0,29.86,0.000135,precursor,29.89,-0.002358,-2e-06,-0.000818,-0.000333,...,-0.000333,2,0.001808,29.9,29.88,29.89,29.89,0.003615,0.003615,-0.033445
2,6,1660222000000.0,29.9,-0.000233,precursor,29.94,-0.001354,-1.7e-05,-0.000446,-0.001967,...,-0.001967,2,0.002114,29.88,29.8,29.87,29.87,0.004229,0.004229,-0.033467
3,8,1660222000000.0,29.88,-0.000502,precursor,29.87,-0.000514,1.7e-05,-0.000278,0.001923,...,0.001923,1,0.001305,29.9,29.9,29.85,29.85,0.001305,0.001305,-0.167224
4,10,1660223000000.0,29.9,-0.000768,precursor,29.95,0.014278,3.4e-05,0.006498,0.008866,...,0.008866,2,0.004639,29.63,29.53,29.52,29.52,0.009278,0.009278,-0.371245


In [33]:
nan_cols.columns

Index(['group', 'time', 's_MP', 'change', 'type', 'p_MP',
       'precursor_buy_cap_pct_change', 'precursor_ask_cap_pct_change',
       'precursor_bid_vol_pct_change', 'precursor_ask_vol_pct_change',
       'length', 'sum_change', 'max_surge_mp', 'min_surge_mp',
       'max_precursor_mp', 'min_precursor_mp', 'area', 'surge_targets_met_pct',
       'group', 'time', 's_MP', 'change', 'type', 'p_MP',
       'precursor_buy_cap_pct_change', 'precursor_ask_cap_pct_change',
       'precursor_bid_vol_pct_change', 'precursor_ask_vol_pct_change',
       'length', 'sum_change', 'max_surge_mp', 'min_surge_mp',
       'max_precursor_mp', 'min_precursor_mp', 'area', 'surge_area',
       'surge_targets_met_pct'],
      dtype='object')

## Binning Process

In [None]:
bins = [
    m2_pipeline['surge_targets_met_pct'].min() -1,  # Min value  # -4 to 0 divided into three equal parts
    -4/3,# Second bin edge for negative values
    -4/6,
    -4/12,
    0,
    0.125,
    0.25, 0.5, 0.75, 1,  # Four bins between 0 and 1
    2,  # One bin between 1 and 2
    m2_pipeline['surge_targets_met_pct'].max() + 1]
bin_labels = list(range(1, len(bins)))

for i in range(1, len(bins)):
    print(f'Bin {bin_labels[i-1]}: {bins[i-1]:.2f} - {bins[i]:.2f}')

In [None]:
m2_pipeline['label'] = pd.cut(m2_pipeline['surge_targets_met_pct'], bins=bins, labels=bin_labels)

### Write to CSV: step one, pipeline
Label to use is surge_targets_met_pct

In [34]:
nan_cols.to_csv('binned_pipeline.csv', index=False)