In [87]:
import datetime
import json
import os
import random
import shutil
import struct

import pandas as pd
from progiter import ProgIter

In [84]:
# Archive year; every destination path below is derived from it.
year = 2024

# Source tree: the directory the depth files are listed and copied from.
base_data_path = "/mnt/c/SierraChart/Data"
base_depth_path = "/".join([base_data_path, "MarketDepthData"])

# Destination tree: <history>/<year>/Data/MarketDepthData
history_path = "/mnt/h/trading/history"
year_path = "/".join([history_path, str(year)])
data_path = "/".join([year_path, "Data"])
depth_path = "/".join([data_path, "MarketDepthData"])

# Echo the destination layout, one path per line.
print(history_path, year_path, data_path, depth_path, sep="\n")

/mnt/h/trading/history
/mnt/h/trading/history/2024
/mnt/h/trading/history/2024/Data
/mnt/h/trading/history/2024/Data/MarketDepthData


In [16]:
# Inventory of the source depth directory: one row per depth file, with the
# instrument, contract and trading day parsed from names shaped like
# "CAD-202403-CME.2024-01-07.depth", plus the file size in bytes.
#
# Only "*.depth" entries are kept, so a stray file or sub-directory in the
# folder cannot break the name parsing. The listing is sorted because
# os.listdir returns entries in arbitrary order.
depth_files = pd.DataFrame({
    "filename": sorted(
        fn for fn in os.listdir(base_depth_path) if fn.endswith(".depth")
    )
})

# Vectorised parsing; a name missing an expected separator yields NaN in the
# affected column instead of aborting the whole cell.
dash_parts = depth_files["filename"].str.split("-")
dot_parts = depth_files["filename"].str.split(".")
depth_files["instrument"] = dash_parts.str[0]
depth_files["contract"] = dash_parts.str[1]
depth_files["depth_day"] = dot_parts.str[1]

# The size is used by the de-duplication step to pick the larger file.
depth_files["filesize"] = depth_files["filename"].map(
    lambda fn: os.path.getsize(os.path.join(base_depth_path, fn))
)
depth_files

Unnamed: 0,filename,instrument,contract,depth_day,filesize
0,CAD-202403-CME.2024-01-07.depth,CAD,202403,2024-01-07,357784
1,CAD-202403-CME.2024-01-08.depth,CAD,202403,2024-01-08,24001456
2,CAD-202403-CME.2024-01-09.depth,CAD,202403,2024-01-09,23390344
3,CAD-202403-CME.2024-01-10.depth,CAD,202403,2024-01-10,21276808
4,CAD-202403-CME.2024-01-11.depth,CAD,202403,2024-01-11,35073376
...,...,...,...,...,...
3715,ZN-202412-CBOT.2024-10-08.depth,ZN,202412,2024-10-08,98431720
3716,ZN-202412-CBOT.2024-10-09.depth,ZN,202412,2024-10-09,97745200
3717,ZN-202412-CBOT.2024-10-10.depth,ZN,202412,2024-10-10,194253712
3718,ZN-202412-CBOT.2024-10-11.depth,ZN,202412,2024-10-11,96539776


In [29]:
# Two contracts of one instrument can both have a file for the same day.
# Keep only the largest file per (depth_day, instrument) pair, then restore
# filename order with a fresh 0..n-1 index.
depth_files = (
    depth_files
    .sort_values('filesize', ascending=False)
    .drop_duplicates(subset=['depth_day', 'instrument'], keep='first')
    .sort_values('filename')
    .reset_index(drop=True)
)
depth_files

Unnamed: 0,filename,instrument,contract,depth_day,filesize
0,CAD-202403-CME.2024-01-07.depth,CAD,202403,2024-01-07,357784
1,CAD-202403-CME.2024-01-08.depth,CAD,202403,2024-01-08,24001456
2,CAD-202403-CME.2024-01-09.depth,CAD,202403,2024-01-09,23390344
3,CAD-202403-CME.2024-01-10.depth,CAD,202403,2024-01-10,21276808
4,CAD-202403-CME.2024-01-11.depth,CAD,202403,2024-01-11,35073376
...,...,...,...,...,...
3058,ZN-202412-CBOT.2024-10-08.depth,ZN,202412,2024-10-08,98431720
3059,ZN-202412-CBOT.2024-10-09.depth,ZN,202412,2024-10-09,97745200
3060,ZN-202412-CBOT.2024-10-10.depth,ZN,202412,2024-10-10,194253712
3061,ZN-202412-CBOT.2024-10-11.depth,ZN,202412,2024-10-11,96539776


In [39]:
# Distinct instrument symbols present in the file inventory.
instruments = depth_files['instrument'].unique()
instruments

array(['CAD', 'EUR', 'GBP', 'JPY', 'M2K', 'MCL', 'MES', 'MGC', 'MNQ',
       'NG', 'ZN'], dtype=object)

In [54]:
# First and last available depth day per instrument, stored as
# ranges[instrument] = {'start': <day>, 'end': <day>}.
# depth_day holds "YYYY-MM-DD" strings, so the string min/max are the
# earliest/latest days; no per-instrument sort is needed.
groups = depth_files.groupby('instrument')
ranges = {}
for instrument in instruments:
    instrument_days = groups.get_group(instrument)['depth_day']
    ranges[instrument] = {
        'start': instrument_days.min(),
        'end': instrument_days.max(),
    }

In [64]:
# Window covered by every instrument: the latest of the first days and the
# earliest of the last days.
start_common = max(bounds['start'] for bounds in ranges.values())
end_common = min(bounds['end'] for bounds in ranges.values())
start_common, end_common

('2024-01-07', '2024-10-11')

In [66]:
# Keep only files whose day lies inside the common window (both ends inclusive).
in_common_window = depth_files['depth_day'].between(start_common, end_common)
depth_files = depth_files[in_common_window]
depth_files

Unnamed: 0,filename,instrument,contract,depth_day,filesize
0,CAD-202403-CME.2024-01-07.depth,CAD,202403,2024-01-07,357784
1,CAD-202403-CME.2024-01-08.depth,CAD,202403,2024-01-08,24001456
2,CAD-202403-CME.2024-01-09.depth,CAD,202403,2024-01-09,23390344
3,CAD-202403-CME.2024-01-10.depth,CAD,202403,2024-01-10,21276808
4,CAD-202403-CME.2024-01-11.depth,CAD,202403,2024-01-11,35073376
...,...,...,...,...,...
3057,ZN-202412-CBOT.2024-10-07.depth,ZN,202412,2024-10-07,128239000
3058,ZN-202412-CBOT.2024-10-08.depth,ZN,202412,2024-10-08,98431720
3059,ZN-202412-CBOT.2024-10-09.depth,ZN,202412,2024-10-09,97745200
3060,ZN-202412-CBOT.2024-10-10.depth,ZN,202412,2024-10-10,194253712


In [94]:
# Plain Python list of the distinct filenames selected for archiving.
selected_filenames = depth_files['filename'].unique()
depth_files_to_history = selected_filenames.tolist()

In [97]:
# Create the destination directory tree for this year.
# os.makedirs also creates missing parents, is a no-op when the directory
# already exists (exist_ok=True), and raises OSError on a real failure
# instead of being silently ignored like a failed shell `mkdir`.
for directory in (year_path, data_path, depth_path):
    os.makedirs(directory, exist_ok=True)

In [100]:
# Copy every selected depth file into the yearly archive. Files already
# present in the archive are skipped so the cell can be re-run.
# NOTE: only existence is checked; a partially copied file left behind by an
# interrupted run is not detected and would be skipped as well.
print("archive depth files...")
for dfile in ProgIter(depth_files_to_history):
    source_file = os.path.join(base_depth_path, dfile)
    archived_file = os.path.join(depth_path, dfile)
    if os.path.exists(archived_file):
        print(f"{dfile} already exists -> skip")
        continue
    # shutil.copy needs no shell quoting and raises on failure, whereas a
    # failed `cp` run through os.system went unnoticed.
    shutil.copy(source_file, archived_file)

archive depth files...
 100.00% 3035/3035... rate=1.03 Hz, eta=0:00:00, total=0:45:45
