## Environment Data Breakdown

In [3]:
# Notebook setup: make the project importable, pick the data files, and build the pipeline.
from pathlib import Path
import os
import sys
# os.path.abspath('') resolves to the current working directory; its grandparent is
# appended to sys.path so the project packages imported below can be found
# (presumably the repository root -- verify against where this notebook lives).
sys.path.append(str(Path(os.path.abspath('')).parent.parent))

from gym_trading.utils.data_pipeline import DataPipeline
from configurations import DATA_PATH, EMA_ALPHA, LOGGER, MAX_BOOK_ROWS, TIMEZONE

# File used in the fitting steps below (joined with DATA_PATH before import).
fitting_file = 'XBTUSD_20200101_20200108_merge.csv.xz'
# Alternative fitting file, kept commented out:
#fitting_file = 'XBTUSD_20200101_20200102_merge.csv.xz'
# File passed as `testing_file` to the end-to-end load.
testing_file = 'paper_data/XBTUSD_2020-01-09.csv.xz'
# Feature / output-format flags (presumably meant for load_environment_data -- TODO confirm).
including_imbalances = True
as_pandas = True

# 0.99 is reported by the pipeline log as the EMA smoothing value.
# NOTE(review): EMA_ALPHA is imported above but this literal is used instead -- confirm intent.
pipeline = DataPipeline(0.99)

[2023-05-26 13:58:37,837, ema.py:68] EMA smoothing ENABLED: 0.99


In [4]:
# Import the raw fitting data (recorded feed with Simulator.py already applied).
# The file name is resolved relative to DATA_PATH.
fitting_data_filepath = os.path.join(DATA_PATH, fitting_file)
fitting_data1 = pipeline.import_csv(filename=fitting_data_filepath)
# Print (rows, columns), then display the first rows as the cell output.
print(fitting_data1.shape)
fitting_data1.head()

[2023-05-26 13:58:53,289, data_pipeline.py:49] Imported 101_20200108_merge.csv.xz from a csv in 13 seconds


(690726, 204)


Unnamed: 0_level_0,midpoint,spread,buys,sells,bids_distance_0,bids_distance_1,bids_distance_2,bids_distance_3,bids_distance_4,bids_distance_5,...,asks_market_notional_10,asks_market_notional_11,asks_market_notional_12,asks_market_notional_13,asks_market_notional_14,asks_market_notional_15,asks_market_notional_16,asks_market_notional_17,asks_market_notional_18,asks_market_notional_19
system_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-12-31 20:00:00,7141.75,0.5,0.0,0.0,-3.5e-05,-0.000105,-0.000175,-0.000245,-0.000315,-0.000385,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2019-12-31 20:00:01,7141.75,0.5,0.0,0.0,-3.5e-05,-0.000105,-0.000175,-0.000245,-0.000315,-0.000385,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2019-12-31 20:00:02,7141.75,0.5,0.0,0.0,-3.5e-05,-0.000105,-0.000175,-0.000245,-0.000315,-0.000385,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2019-12-31 20:00:03,7141.75,0.5,4845.0,118233.0,-3.5e-05,-0.000105,-0.000175,-0.000245,-0.000315,-0.000385,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2019-12-31 20:00:04,7141.75,0.5,0.0,342744.0,-3.5e-05,-0.000105,-0.000175,-0.000245,-0.000315,-0.000385,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
import numpy as np

# The first four columns are the non-level summary columns
# (midpoint, spread, buys, sells in the output below).
print(fitting_data1.columns[:4])

# Every remaining column is named `<feature>_<level>`. Dropping the last two
# characters and then removing all underscores collapses both one-digit ("_0")
# and two-digit ("_19") level suffixes onto the same feature-family label.
raw_feature_families = np.unique(
    np.array([col[:-2].replace('_', '') for col in fitting_data1.columns[4:]])
)
print(raw_feature_families)

# Invariant: 4 summary columns + 2 sides * (5 feature families * 20 levels) = 204.
# Checked explicitly so a schema change fails here instead of silently downstream.
expected_raw_columns = 4 + 2 * (5 * 20)
assert fitting_data1.shape[1] == expected_raw_columns, (
    "expected %d raw columns, got %d" % (expected_raw_columns, fitting_data1.shape[1])
)

Index(['midpoint', 'spread', 'buys', 'sells'], dtype='object')
['askscancelnotional' 'asksdistance' 'askslimitnotional'
 'asksmarketnotional' 'asksnotional' 'bidscancelnotional' 'bidsdistance'
 'bidslimitnotional' 'bidsmarketnotional' 'bidsnotional']


In [6]:
# Derive OFI (Order Flow Imbalances) from the raw fitting frame.
fitting_data2 = pipeline._decompose_order_flow_information(data=fitting_data1)
print(fitting_data2.shape)

# The four leading summary columns are expected to be unchanged by the decomposition.
print(fitting_data2.columns[:4])

# Same family-label trick as for the raw frame: strip the trailing level suffix
# (one or two digits) by cutting two characters and removing underscores.
ofi_feature_families = np.unique(
    np.array([col[:-2].replace('_', '') for col in fitting_data2.columns[4:]])
)
print(ofi_feature_families)

# Invariant: 4 summary columns + 2 sides * (2 families * 20 levels + 20 OFI levels) = 124.
# Checked explicitly so a change in the decomposition output is caught immediately.
expected_ofi_columns = 4 + 2 * (2 * 20 + 20)
assert fitting_data2.shape[1] == expected_ofi_columns, (
    "expected %d columns after OFI decomposition, got %d"
    % (expected_ofi_columns, fitting_data2.shape[1])
)

(690726, 124)
Index(['midpoint', 'spread', 'buys', 'sells'], dtype='object')
['asksdistance' 'asksnotional' 'bidsdistance' 'bidsnotional' 'ofiask'
 'ofibid']


In [7]:
# Take the log difference of midpoint prices (the `midpoint` column changes from a
# price level to a difference; the other columns are presumably passed through -- TODO confirm).
fitting_data3 = pipeline._midpoint_diff(fitting_data2)
# Bare last expression: displays the frame (pandas truncates the middle rows).
fitting_data3

Unnamed: 0_level_0,midpoint,spread,buys,sells,bids_distance_0,bids_distance_1,bids_distance_2,bids_distance_3,bids_distance_4,bids_distance_5,...,ofi_ask_10,ofi_ask_11,ofi_ask_12,ofi_ask_13,ofi_ask_14,ofi_ask_15,ofi_ask_16,ofi_ask_17,ofi_ask_18,ofi_ask_19
system_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-12-31 20:00:00,0.0,0.5,0.0,0.0,-0.000035,-0.000105,-0.000175,-0.000245,-0.000315,-0.000385,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2019-12-31 20:00:01,0.0,0.5,0.0,0.0,-0.000035,-0.000105,-0.000175,-0.000245,-0.000315,-0.000385,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2019-12-31 20:00:02,0.0,0.5,0.0,0.0,-0.000035,-0.000105,-0.000175,-0.000245,-0.000315,-0.000385,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2019-12-31 20:00:03,0.0,0.5,4845.0,118233.0,-0.000035,-0.000105,-0.000175,-0.000245,-0.000315,-0.000385,...,-2.0,2798.0,-20697.0,-28802.0,25580.0,0.0,-3.0,-25.0,-59950.0,-9570.0
2019-12-31 20:00:04,0.0,0.5,0.0,342744.0,-0.000035,-0.000105,-0.000175,-0.000245,-0.000315,-0.000385,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1612.0,10000.0,-8152.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-01-08 23:59:55,0.0,0.5,0.0,1001.0,-0.000031,-0.000093,-0.000155,-0.000217,-0.000279,-0.000341,...,5377.0,0.0,0.0,5377.0,0.0,5377.0,0.0,0.0,0.0,5377.0
2020-01-08 23:59:56,0.0,0.5,45826.0,0.0,-0.000031,-0.000093,-0.000155,-0.000217,-0.000279,-0.000341,...,0.0,0.0,50000.0,0.0,0.0,0.0,-45992.0,3769.0,0.0,-40400.0
2020-01-08 23:59:57,0.0,0.5,1.0,10001.0,-0.000031,-0.000093,-0.000155,-0.000217,-0.000279,-0.000341,...,0.0,4223.0,-50000.0,-12946.0,-40400.0,33462.0,0.0,0.0,0.0,0.0
2020-01-08 23:59:58,0.0,0.5,0.0,0.0,-0.000031,-0.000093,-0.000155,-0.000217,-0.000279,-0.000341,...,1000.0,-9502.0,1000.0,1000.0,0.0,0.0,0.0,16004.0,0.0,-16149.0


In [8]:
# Potentially smooth the data with exponential moving average(s), using the
# pipeline's own EMA object.
# NOTE(review): the pipeline log later prints "Reset EMA data.", which suggests
# pipeline.ema is stateful -- confirm that running this cell before
# load_environment_data does not carry EMA state into that call.
from indicators.ema import apply_ema_all_data
fitting_data4 = apply_ema_all_data(pipeline.ema, fitting_data3)
# Bare last expression: displays the smoothed frame (pandas truncates the middle rows).
fitting_data4

[2023-05-26 13:59:22,797, ema.py:94] Applying EMA to data...


Unnamed: 0_level_0,midpoint,spread,buys,sells,bids_distance_0,bids_distance_1,bids_distance_2,bids_distance_3,bids_distance_4,bids_distance_5,...,ofi_ask_10,ofi_ask_11,ofi_ask_12,ofi_ask_13,ofi_ask_14,ofi_ask_15,ofi_ask_16,ofi_ask_17,ofi_ask_18,ofi_ask_19
system_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-12-31 20:00:00,0.000000e+00,0.500000,0.000000,0.000000,-0.000035,-0.000105,-0.000175,-0.000245,-0.000315,-0.000385,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2019-12-31 20:00:01,0.000000e+00,0.500000,0.000000,0.000000,-0.000035,-0.000105,-0.000175,-0.000245,-0.000315,-0.000385,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2019-12-31 20:00:02,0.000000e+00,0.500000,0.000000,0.000000,-0.000035,-0.000105,-0.000175,-0.000245,-0.000315,-0.000385,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2019-12-31 20:00:03,0.000000e+00,0.500000,48.450001,1182.329956,-0.000035,-0.000105,-0.000175,-0.000245,-0.000315,-0.000385,...,-0.020000,27.980000,-206.970001,-288.019989,255.800003,0.000000,-0.030000,-0.250000,-599.500000,-95.699997
2019-12-31 20:00:04,0.000000e+00,0.500000,47.965500,4597.946777,-0.000035,-0.000105,-0.000175,-0.000245,-0.000315,-0.000385,...,-0.019800,27.700199,-204.900299,-285.139801,253.242004,0.000000,-0.029700,-16.367500,-493.505005,-176.263000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-01-08 23:59:55,-1.818556e-07,0.500167,14919.483398,15730.098633,-0.000031,-0.000093,-0.000155,-0.000217,-0.000279,-0.000341,...,-1427.800903,-783.303101,1418.646729,2687.833008,4738.869141,-3052.556885,-2885.273193,-2328.959961,-3608.784180,143.006897
2020-01-08 23:59:56,-1.800370e-07,0.500165,15228.548828,15572.797852,-0.000031,-0.000093,-0.000155,-0.000217,-0.000279,-0.000341,...,-1413.522949,-775.470032,1904.460327,2660.954834,4691.480469,-3022.031250,-3316.340576,-2267.980469,-3572.696289,-262.423157
2020-01-08 23:59:57,-1.782367e-07,0.500164,15076.273438,15517.080078,-0.000031,-0.000093,-0.000155,-0.000217,-0.000279,-0.000341,...,-1399.387695,-725.485352,1385.415649,2504.885254,4240.565430,-2657.190918,-3283.177002,-2245.300537,-3536.969238,-259.798920
2020-01-08 23:59:58,-1.764543e-07,0.500162,14925.510742,15361.909180,-0.000031,-0.000093,-0.000155,-0.000217,-0.000279,-0.000341,...,-1375.393799,-813.250488,1381.561523,2489.836426,4198.160156,-2630.619141,-3250.345215,-2062.807617,-3501.599609,-418.690948


In [24]:
# Fit the scaler on the EMA-smoothed fitting data; this saves _scaler.mean_ and
# _scaler.scale_ (scaling factor for each feature).
pipeline.fit_scaler(fitting_data4)
# Uncomment to inspect the fitted parameters:
#print(pipeline._scaler.mean_)
#print(pipeline._scaler.scale_)

In [9]:
# Run the whole pipeline in a single call. Judging by the log output, this imports
# the fitting and testing files, applies EMA smoothing and adds order imbalances.
# Returns three objects: midpoint prices, the raw data and the normalized data.
# The feature/output flags come from the notebook's setup cell instead of being
# hard-coded here, so changing them in one place actually takes effect.
mid_point_prices, raw_data, normalized_data = pipeline.load_environment_data(
    fitting_file=fitting_file,
    testing_file=testing_file,
    include_imbalances=including_imbalances,
    as_pandas=as_pandas
)

[2023-05-26 13:59:55,545, data_pipeline.py:49] Imported 101_20200108_merge.csv.xz from a csv in 13 seconds
[2023-05-26 13:59:57,334, ema.py:94] Applying EMA to data...
[2023-05-26 14:00:01,630, data_pipeline.py:49] Imported /XBTUSD_2020-01-09.csv.xz from a csv in 1 seconds
[2023-05-26 14:00:01,780, ema.py:94] Applying EMA to data...
[2023-05-26 14:00:02,002, data_pipeline.py:228] Adding order imbalances...
[2023-05-26 14:00:02,012, ema.py:128] Reset EMA data.
[2023-05-26 14:00:02,013, ema.py:94] Applying EMA to data...


In [27]:
# Column labels of the raw frame returned by load_environment_data.
raw_data.columns

Index(['midpoint', 'spread', 'buys', 'sells', 'bids_distance_0',
       'bids_distance_1', 'bids_distance_2', 'bids_distance_3',
       'bids_distance_4', 'bids_distance_5',
       ...
       'asks_market_notional_10', 'asks_market_notional_11',
       'asks_market_notional_12', 'asks_market_notional_13',
       'asks_market_notional_14', 'asks_market_notional_15',
       'asks_market_notional_16', 'asks_market_notional_17',
       'asks_market_notional_18', 'asks_market_notional_19'],
      dtype='object', length=204)

In [29]:
# Column labels of the normalized frame returned by load_environment_data
# (includes the OFI and notional-imbalance features).
normalized_data.columns

Index(['midpoint', 'spread', 'buys', 'sells', 'bids_distance_0',
       'bids_distance_1', 'bids_distance_2', 'bids_distance_3',
       'bids_distance_4', 'bids_distance_5',
       ...
       'ofi_ask_16', 'ofi_ask_17', 'ofi_ask_18', 'ofi_ask_19',
       'notional_imbalance_0', 'notional_imbalance_1', 'notional_imbalance_2',
       'notional_imbalance_3', 'notional_imbalance_4',
       'notional_imbalance_mean'],
      dtype='object', length=130)