# Parallel multiprocessed data acquisition and dumping to file test

In [1]:
from enum import IntFlag # Для декодирования битовых флагов

import datetime as dt
import time

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

import MetaTrader5 as mt5

import multiprocessing

from os import listdir
from os.path import isfile, join


# from pandas.plotting import register_matplotlib_converters
# register_matplotlib_converters()

# Report the author and version strings exposed by the imported MetaTrader5 package.
print("MetaTrader5 package author: ",mt5.__author__)
print("MetaTrader5 package version: ",mt5.__version__)

MetaTrader5 package author:  MetaQuotes Ltd.
MetaTrader5 package version:  5.0.37


**Ticker**

https://www.moex.com/ru/derivatives/equity/indices/  
Site listing the current RTS futures ticker.

In [18]:
# Symbol name passed to the MetaTrader 5 market-book calls in the cells below.
ticker = 'RIU2'

In [19]:
%%html
<style>
table {float:left}
</style>

In [20]:
# Terminal init: connect to the MetaTrader 5 terminal.
if not mt5.initialize():
    # Raise instead of calling quit(): an exception stops this cell (and a
    # "Run All") while keeping the kernel and the error message available.
    raise RuntimeError(f"initialize() failed, error code = {mt5.last_error()}")

# On success, show the status tuple reported by the terminal.
print(mt5.last_error())

(1, 'Success')


The idea is to not miss any market book data and to dump it to a file in parallel.  
Maybe the variable holding that data should be emptied every hour or upon reaching 1 GB in memory.

Let's run a latency test with different approaches.

We will try:
1. No multiprocessing, sequential execution
2. Simple multiprocessing with 2 parallel processes: one acquires the data and packs it into a dataframe, the other saves it to CSV. A pipe is used for communication between the processes.
3. Same, but with a queue.
4. Complex multiprocessing with 4 processes: the first transfers data to one of two dataframe packers (in order to decrease the latency of creating a big dataframe), and the data then goes to another process that saves it to CSV. Pipes are used for communication.
5. Same, but with queues.

Every approach will be tested with different memory thresholds.

What do we get from any approach? A big df with datetimes. We calculate the differences between those datetimes, and those differences give us a distribution. We then choose the approach with the smallest range as the fastest, lowest-latency one. And that's actually it.

On the other hand, the fastest method may not be the safest: what about data corruption? Redundancy may alleviate the potential risk of data corruption. Can we gather data on different machines? No, as the datetimes will be incomparable, I guess. Can we gather data on one machine, but with 2 processes in parallel? Maybe, but will it add latency?

In [70]:
# FUNCTION TO ORDER BY FILE NUMBER
def file_number(str_):
    """Return the integer encoded in a file's base name, for use as a sort key.

    The directory part and the extension are stripped with ``os.path`` helpers
    instead of a hard-coded ``'/'`` split and a fixed 4-character slice, so the
    function works with the platform's path separators and with extensions of
    any length (``'data/12.csv'`` -> 12, ``'data/10.parquet'`` -> 10).

    Parameters
    ----------
    str_ : str
        Path to a file whose name without extension is an integer.

    Returns
    -------
    int
        The number parsed from the file name.

    Raises
    ------
    ValueError
        If the file name without its extension is not a valid integer.
    """
    # Function-scope import: the notebook's import cell only brings in
    # `isfile` and `join` from os.path.
    from os.path import basename, splitext

    stem, _extension = splitext(basename(str_))
    return int(stem)

In [71]:
# FUNCTION TO GET PATHS TO CSVs

def get_paths_to_csvs(path_to_csvs):
    """Return the paths of the CSV files in a directory, ordered by file number.

    Only regular files whose name ends with ``.csv`` (case-insensitive) are
    returned. Sub-directories and other files are skipped, because their names
    would otherwise be handed to the numeric sort key and raise ``ValueError``.

    Parameters
    ----------
    path_to_csvs : str
        Directory that holds the numbered CSV files (e.g. ``0.csv``, ``1.csv``).

    Returns
    -------
    list of str
        ``path_to_csvs`` joined with each CSV file name, sorted in ascending
        order of ``file_number``.
    """
    result = []
    for entry in listdir(path_to_csvs):
        full_path = join(path_to_csvs, entry)
        if isfile(full_path) and entry.lower().endswith('.csv'):
            result.append(full_path)

    # Numeric sort, so that e.g. 10.csv comes after 9.csv rather than after 1.csv.
    result.sort(key=file_number)
    return result

# test_path = 'data/sequential/'
# get_paths_to_csvs(test_path)

In [81]:
# FUNCTION TO CALCULATE LATENCY DISTRIBUTION
def concate_files(all_files):
    """Read several CSV dumps, merge them and derive the timing columns.

    Each file is expected to provide ``time_before`` and ``time_after``
    columns. After concatenation (with a fresh 0..n-1 index) the function adds:

    * ``delta``     - ``time_after - time_before``
    * ``time_mean`` - midpoint between ``time_before`` and ``time_after``
    * ``diff``      - difference between consecutive ``time_mean`` values
                      after a stable sort by ``time_mean``

    It also prints whether the rows were already in ``time_mean`` order.

    Parameters
    ----------
    all_files : iterable of str
        Paths of the CSV files, in the order they should be concatenated.

    Returns
    -------
    pandas.DataFrame
        The merged frame sorted by ``time_mean`` with the extra columns.
    """
    frames = [pd.read_csv(path) for path in all_files]
    merged = pd.concat(frames, axis=0, ignore_index=True)

    for column in ('time_after', 'time_before'):
        merged[column] = pd.to_datetime(merged[column])
    merged['delta'] = merged['time_after'] - merged['time_before']
    merged['time_mean'] = merged['time_before'] + (merged['delta']/2)

    # Remember the pre-sort index: if the stable sort leaves it untouched,
    # the rows were already chronological.
    original_index = merged.index.copy()
    merged = merged.sort_values('time_mean',kind='stable')

    order_message = (
        'Time order is kept'
        if original_index.equals(merged.index)
        else 'Time order is invalid'
    )
    print(order_message)

    merged['diff'] = merged['time_mean'].diff()
    return merged

def calculate_diff_components(df,delete_zero_diffs=True):
    """Split the ``diff`` timedelta column into its unit components.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a timedelta ``diff`` column (as produced by ``concate_files``).
    delete_zero_diffs : bool, default True
        When True, drop the rows whose diff is exactly zero, i.e. every
        component column equals 0.

    Returns
    -------
    pandas.DataFrame
        The ``.dt.components`` table of ``df['diff']`` (days, hours, minutes,
        seconds, milliseconds, microseconds, nanoseconds), optionally without
        the all-zero rows.
    """
    components = df['diff'].dt.components
    if not delete_zero_diffs:
        return components

    # A zero diff has every component equal to 0. Rows with missing components
    # (NaT diffs) never compare equal to 0, so they are kept.
    is_zero_diff = (components == 0).all(axis=1)
    return components[~is_zero_diff]


# Example usage:
# all_files = ['scripts/0.csv','scripts/1.csv']
# calculate_diff_components(concate_files(all_files))

In [73]:
# def describe_(df):
#     display(df.describe())
#     for col in df.columns:
#         df[col].hist(bins=100)
#         plt.title(col)
#         plt.show()

In [84]:
# test_path = 'data/'
# test_files = get_paths_to_csvs(test_path)
# concate_files(test_files)

Time order is invalid


Unnamed: 0.1,Unnamed: 0,type,price,volume,volume_dbl,time_after,time_before,name,delta,time_mean,diff
0,0,1,106440.0,8,8.0,2022-08-10 18:45:23.064392,2022-08-10 18:45:23.063392,zero,0 days 00:00:00.001000,2022-08-10 18:45:23.063892,NaT
1,1,1,106430.0,8,8.0,2022-08-10 18:45:23.064392,2022-08-10 18:45:23.063392,zero,0 days 00:00:00.001000,2022-08-10 18:45:23.063892,0 days
2,2,1,106420.0,7,7.0,2022-08-10 18:45:23.064392,2022-08-10 18:45:23.063392,zero,0 days 00:00:00.001000,2022-08-10 18:45:23.063892,0 days
3,3,1,106410.0,6,6.0,2022-08-10 18:45:23.064392,2022-08-10 18:45:23.063392,zero,0 days 00:00:00.001000,2022-08-10 18:45:23.063892,0 days
4,4,1,106400.0,28,28.0,2022-08-10 18:45:23.064392,2022-08-10 18:45:23.063392,zero,0 days 00:00:00.001000,2022-08-10 18:45:23.063892,0 days
...,...,...,...,...,...,...,...,...,...,...,...
6250235,27,2,105530.0,1,1.0,2022-08-10 19:00:10.304983,2022-08-10 19:00:10.304983,one,0 days 00:00:00,2022-08-10 19:00:10.304983,0 days
6250236,28,2,105510.0,18,18.0,2022-08-10 19:00:10.304983,2022-08-10 19:00:10.304983,one,0 days 00:00:00,2022-08-10 19:00:10.304983,0 days
6250237,29,2,105500.0,7,7.0,2022-08-10 19:00:10.304983,2022-08-10 19:00:10.304983,one,0 days 00:00:00,2022-08-10 19:00:10.304983,0 days
6250238,30,2,105470.0,7,7.0,2022-08-10 19:00:10.304983,2022-08-10 19:00:10.304983,one,0 days 00:00:00,2022-08-10 19:00:10.304983,0 days


In [None]:
# This should somehow be left running for a whole day,
# from the start of the trading day to its end.
columns = ['type','price','volume','volume_dbl']

# Snapshots are collected in a list and concatenated once at the end.
# Growing a DataFrame with pd.concat on every iteration copies all the data
# gathered so far, which makes each poll slower as the session goes on.
market_book_frames = []
df_mb = pd.DataFrame()

counter = 0

mt5.market_book_add(ticker)
try:
    # Poll the order book until 13:00 local time.
    while dt.datetime.now().hour < 13:
        df_market_book = pd.DataFrame(mt5.market_book_get(ticker),columns=columns)
        df_market_book['time'] = dt.datetime.now()
        df_market_book['batch_number'] = counter
        market_book_frames.append(df_market_book)
        counter += 1
        time.sleep(0.05)
finally:
    # Runs on errors and on a manual kernel interrupt as well: release the
    # subscription and keep whatever has been collected so far.
    mt5.market_book_release(ticker)
    if market_book_frames:
        df_mb = pd.concat(market_book_frames)

df_mb

In [None]:
# df_mb.info()
# df_mb.to_csv('data/market_book.csv')
# df_mb['time'].describe(datetime_is_numeric=True)

In [43]:
# Function for a one-off order book (market depth) request.
# Every call registers the order-book subscription with
# mt5.market_book_add(symbol),
# and then revokes it with
# mt5.market_book_release(symbol)
def get_market_book(symbol):
    """Fetch a single order-book snapshot for ``symbol`` as a DataFrame.

    The function subscribes to the symbol's market book, reads it once, stamps
    every row with the current local time and unsubscribes again. The release
    happens in a ``finally`` block so the subscription is not left open when
    reading the book or building the frame raises.

    Parameters
    ----------
    symbol : str
        Symbol name passed to the MetaTrader 5 market-book calls.

    Returns
    -------
    pandas.DataFrame
        Columns ``type``, ``price``, ``volume``, ``volume_dbl`` and ``time``.
    """
    columns = ['type','price','volume','volume_dbl']

    mt5.market_book_add(symbol)
    try:
        df_market_book = pd.DataFrame(mt5.market_book_get(symbol),columns=columns)
        df_market_book['time'] = dt.datetime.now()
    finally:
        mt5.market_book_release(symbol)

    return df_market_book

# Fetch and display one order-book snapshot for the ticker set earlier in the notebook.
get_market_book(ticker)

Unnamed: 0,type,price,volume,volume_dbl,time
0,1,105530.0,18,18.0,2022-08-10 13:55:15.536588
1,1,105520.0,5,5.0,2022-08-10 13:55:15.536588
2,1,105510.0,4,4.0,2022-08-10 13:55:15.536588
3,1,105500.0,3,3.0,2022-08-10 13:55:15.536588
4,1,105490.0,5,5.0,2022-08-10 13:55:15.536588
5,1,105480.0,10,10.0,2022-08-10 13:55:15.536588
6,1,105470.0,6,6.0,2022-08-10 13:55:15.536588
7,1,105460.0,11,11.0,2022-08-10 13:55:15.536588
8,1,105450.0,9,9.0,2022-08-10 13:55:15.536588
9,1,105440.0,22,22.0,2022-08-10 13:55:15.536588


In [9]:
# Shut down the connection to the MetaTrader 5 terminal once data collection is finished.
mt5.shutdown()

True