In [1]:
import gzip
import itertools
import pandas as pd
import time
import os
from utils import *
from orderbook import OrderBook
from features import FeatureGenerator

In [66]:
test_asks = None
test_bids = None

def make_orderbooks(order_log: list):
    """
    Function for making orderbooks for each spectrum
    
    Return orderbooks, dataframe with spectrums and dataframe with VWAPs
    """
    global test_asks, test_bids
    # creating order book for each seccode
    order_books = dict()
    for secc in feature_seccodes:
        order_books[secc] = OrderBook(secc)

    # creating spectrum for each seccode
    spectrums = dict()
    for secc in feature_seccodes:
        spectrums[secc] = FeatureGenerator(seccode=secc, px_step=instruments_info[secc]['PRICE_STEP'])

    start = time.time()

    list_spec = []
    list_vwap = []
    list_aggressive_band = []
    list_aggressive_time = []
    list_bid_ask_spread = []

    # Count total volume for each period from trades made by aggressors
    # The idea is to keep the volume for the largest period and then derive its subperiods
    # Represented as a list of (time, volume) pair
    volume_bids = []
    volume_asks = []
    
    col_names = ['SECCODE', 'TIMESTAMP', 'BID_ASK']
    feature_names = ['SECCODE', 'TIMESTAMP', 'BID', 'ASK']
    aggressors_names = ['SECCODE', 'TIMESTAMP', 'BID', 'ASK']
    bid_ask_spread_names = ['SECCODE', 'TIMESTAMP', 'SPREAD']

    for row_log in order_log:
        if instruments_info[row_log['SECCODE']]['SCHEDULE'] <= row_log['TIME']:
            continue

        is_ask = row_log['BUYSELL'] == 'S'
        current_time = row_log['TIME'] % int(1E8) / 1E6 # In seconds
        current_volume = row_log['VOLUME']

        order_book = order_books[row_log['SECCODE']]
        spectrum = spectrums[row_log['SECCODE']]
        correct = False
        
        volume_bids = list(itertools.dropwhile(lambda pair: current_time - pair[0] >= FeatureGenerator.PERIODS[-1], volume_bids))
        volume_asks = list(itertools.dropwhile(lambda pair: current_time - pair[0] >= FeatureGenerator.PERIODS[-1], volume_asks))
        
        # CATCHING AGGRESSORS
        if is_ask:
            if spectrum.best_bid > 0 and \
                    (row_log['PRICE'] >= spectrum.best_bid or row_log['PRICE'] == 0):

                volume_asks.append((current_time, current_volume))
                # print('Aggressive ask: ', row_log['NO'], current_time, current_volume)
                continue
        else:
            if spectrum.best_ask < int(1e19) and \
                    (row_log['PRICE'] <= spectrum.best_ask or row_log['PRICE'] == 0):

                volume_bids.append((current_time, current_volume))
                # print('Aggressive bid: ', row_log['NO'], current_time, current_volume)
                continue

        # handle post
        if row_log['ACTION'] == Action.POST:
            order_book.add_entry(entry=row_log, 
                                 ask=is_ask)
            spectrum.update_post(order_book=order_book, new_price=row_log['PRICE'], 
                                 volume=current_volume, ask=is_ask,
                                 aggressive_bids=volume_bids + [(current_time, 0)], aggressive_asks=volume_asks + [(current_time, 0)])
            correct = True

        # handle revoke
        elif row_log['ACTION'] == Action.REVOKE:
            order_book.revoke(orderno=row_log['ORDERNO'], volume=current_volume,
                              ask=is_ask, row_numb=row_log['NO'])
            if correct:
                spectrum.update_revoke(order_book=order_book, new_price=row_log['PRICE'],
                                 volume=current_volume, ask=is_ask,
                                 aggressive_bids=volume_bids + [(current_time, 0)], aggressive_asks=volume_asks + [(current_time, 0)])

        elif row_log['ACTION'] == Action.MATCH:
            order_book.match(orderno=row_log['ORDERNO'], volume=current_volume,
                              ask=is_ask, row_numb=row_log['NO'])
            if correct:
                if is_ask:
                    spectrum.update_match(order_book=order_book, new_price=row_log['PRICE'],
                                     volume=current_volume, ask=is_ask,
                                     aggressive_bids=volume_bids + [(current_time, 0)], aggressive_asks=volume_asks + [(current_time, 0)])
                else:
                    spectrum.update_match(order_book=order_book, new_price=row_log['PRICE'],
                                     volume=current_volume, ask=is_ask,
                                     aggressive_bids=volume_bids + [(current_time, 0)], aggressive_asks=volume_asks + [(current_time, 0)])
        
        # print(row_log['TIME'])
        # print(volume_bids)
        # print(volume_asks)
        # print(spectrum.best_ask)
        # print(spectrum.best_bid, '=====', sep='\n')
        
        if row_log['TIME'] == 100206583723 and row_log['SECCODE'] == 'USD000000TOD':
            print(spectrums['USD000000TOD'].best_ask)
            print(spectrums['USD000000TOD'].best_bid)
            print(order_books['USD000000TOD'].asks)
            print('=' * 10)
            print(order_books['USD000000TOD'].bids)
            test_asks = order_books['USD000000TOD'].asks
            test_bids = order_books['USD000000TOD'].bids
        
        if correct:
            # для каждой новой row считаем спектрум добавляем в df
            values = spectrum.bids_normalized.copy()
            values.extend(spectrum.asks_normalized.copy())
            d_values = [ row_log['SECCODE'], row_log['TIME'], values ]
            list_spec.append(d_values)

            # для каждой новой row считаем VWAPs и добавляем в df
            vwaps_bids = list(spectrum.VWAP_bids.values()).copy()
            vwaps_asks = list(spectrum.VWAP_asks.values()).copy()
            d_vwaps = [ row_log['SECCODE'], row_log['TIME'], vwaps_bids, vwaps_asks ]
            list_vwap.append(d_vwaps)
            
            # Add aggressors (normalized by band)
            aggressive_bids = list(spectrum.aggressive_bids_normalized_band.values()).copy()
            aggressive_asks = list(spectrum.aggressive_asks_normalized_band.values()).copy()
            d_aggressive = [ row_log['SECCODE'], row_log['TIME'], aggressive_bids, aggressive_asks ]
            list_aggressive_band.append(d_aggressive)
            
            # Add aggressors (normalized by periods)
            aggressive_bids = list(spectrum.aggressive_bids_normalized_time.values()).copy()
            aggressive_asks = list(spectrum.aggressive_asks_normalized_time.values()).copy()
            d_aggressive = [ row_log['SECCODE'], row_log['TIME'], aggressive_bids, aggressive_asks ]
            list_aggressive_time.append(d_aggressive)

            # Add bid-ask spread
            d_bid_ask_spread = [ row_log['SECCODE'], row_log['TIME'], spectrum.bid_ask_spread ]
            list_bid_ask_spread.append(d_bid_ask_spread)
    
    # Saving spectrum
    df_spec = pd.DataFrame(list_spec, columns=col_names)
    # Saving VWAPs
    df_vwap = pd.DataFrame(list_vwap, columns=feature_names)
    # Saving aggressive trades
    df_aggressive_band = pd.DataFrame(list_aggressive_band, columns=aggressors_names)
    df_aggressive_time = pd.DataFrame(list_aggressive_time, columns=aggressors_names)
    # Saving bid-ask spread
    df_bid_ask_spread = pd.DataFrame(list_bid_ask_spread, columns=bid_ask_spread_names)

    end = time.time()

    return order_books, df_spec, df_vwap, df_bid_ask_spread, df_aggressive_band, df_aggressive_time, end - start

## Single Day

In [35]:
# Reading
WORKING_DIR = r"D:\Data\MOEX-FX\2018-03\\"
WORKING_DIR = r'D:\Innopolis University\2021 Spring Semester\Data Mining\data-mining\\'

orderlog_filename = WORKING_DIR + 'OrderLog20180330.txt'

order_log = read_orderlog(orderlog_filename)[:20000]

# Preprocessing
order_log = filter(order_log, lambda row: row['SECCODE'] in feature_seccodes)
order_log = preprocess_orderlog(order_log)

In [67]:
# Make orderbooks, spectrum, and vwaps
order_books, df_spec, df_vwap, df_spread, df_aggressive_band, df_aggressive_time, exec_time = make_orderbooks(order_log)

57.1625
57.3075
{49: {'ORDERNO': 49, 'SECCODE': 'USD000000TOD', 'PRICE': 57.805, 'VOLUME': 1000}, 77: {'ORDERNO': 77, 'SECCODE': 'USD000000TOD', 'PRICE': 57.88, 'VOLUME': 1000000}, 154: {'ORDERNO': 154, 'SECCODE': 'USD000000TOD', 'PRICE': 57.445, 'VOLUME': 16000}, 171: {'ORDERNO': 171, 'SECCODE': 'USD000000TOD', 'PRICE': 57.58, 'VOLUME': 20000}, 172: {'ORDERNO': 172, 'SECCODE': 'USD000000TOD', 'PRICE': 58.5, 'VOLUME': 1000}, 177: {'ORDERNO': 177, 'SECCODE': 'USD000000TOD', 'PRICE': 57.85, 'VOLUME': 20000}, 179: {'ORDERNO': 179, 'SECCODE': 'USD000000TOD', 'PRICE': 57.48, 'VOLUME': 60000}, 188: {'ORDERNO': 188, 'SECCODE': 'USD000000TOD', 'PRICE': 57.44, 'VOLUME': 50000}, 191: {'ORDERNO': 191, 'SECCODE': 'USD000000TOD', 'PRICE': 58.585, 'VOLUME': 1000}, 193: {'ORDERNO': 193, 'SECCODE': 'USD000000TOD', 'PRICE': 58.15, 'VOLUME': 10000}, 195: {'ORDERNO': 195, 'SECCODE': 'USD000000TOD', 'PRICE': 57.5, 'VOLUME': 1000}, 230: {'ORDERNO': 230, 'SECCODE': 'USD000000TOD', 'PRICE': 58.5, 'VOLUME': 1

In [68]:
{k:v for k, v in sorted(test_asks.items(), key=lambda row: row[1]['PRICE'])}

{10151: {'ORDERNO': 10151,
  'SECCODE': 'USD000000TOD',
  'PRICE': 57.18,
  'VOLUME': 98000},
 10219: {'ORDERNO': 10219,
  'SECCODE': 'USD000000TOD',
  'PRICE': 57.18,
  'VOLUME': 100000},
 10328: {'ORDERNO': 10328,
  'SECCODE': 'USD000000TOD',
  'PRICE': 57.1825,
  'VOLUME': 500000},
 9429: {'ORDERNO': 9429,
  'SECCODE': 'USD000000TOD',
  'PRICE': 57.185,
  'VOLUME': 100000},
 10299: {'ORDERNO': 10299,
  'SECCODE': 'USD000000TOD',
  'PRICE': 57.185,
  'VOLUME': 50000},
 10300: {'ORDERNO': 10300,
  'SECCODE': 'USD000000TOD',
  'PRICE': 57.185,
  'VOLUME': 50000},
 10350: {'ORDERNO': 10350,
  'SECCODE': 'USD000000TOD',
  'PRICE': 57.185,
  'VOLUME': 1010000},
 10301: {'ORDERNO': 10301,
  'SECCODE': 'USD000000TOD',
  'PRICE': 57.1875,
  'VOLUME': 50000},
 10353: {'ORDERNO': 10353,
  'SECCODE': 'USD000000TOD',
  'PRICE': 57.1875,
  'VOLUME': 50000},
 10157: {'ORDERNO': 10157,
  'SECCODE': 'USD000000TOD',
  'PRICE': 57.19,
  'VOLUME': 50000},
 10175: {'ORDERNO': 10175,
  'SECCODE': 'USD000

In [75]:
test_asks

{49: {'ORDERNO': 49,
  'SECCODE': 'USD000000TOD',
  'PRICE': 57.805,
  'VOLUME': 1000},
 77: {'ORDERNO': 77,
  'SECCODE': 'USD000000TOD',
  'PRICE': 57.88,
  'VOLUME': 1000000},
 154: {'ORDERNO': 154,
  'SECCODE': 'USD000000TOD',
  'PRICE': 57.445,
  'VOLUME': 16000},
 171: {'ORDERNO': 171,
  'SECCODE': 'USD000000TOD',
  'PRICE': 57.58,
  'VOLUME': 20000},
 172: {'ORDERNO': 172,
  'SECCODE': 'USD000000TOD',
  'PRICE': 58.5,
  'VOLUME': 1000},
 177: {'ORDERNO': 177,
  'SECCODE': 'USD000000TOD',
  'PRICE': 57.85,
  'VOLUME': 20000},
 179: {'ORDERNO': 179,
  'SECCODE': 'USD000000TOD',
  'PRICE': 57.48,
  'VOLUME': 60000},
 188: {'ORDERNO': 188,
  'SECCODE': 'USD000000TOD',
  'PRICE': 57.44,
  'VOLUME': 50000},
 191: {'ORDERNO': 191,
  'SECCODE': 'USD000000TOD',
  'PRICE': 58.585,
  'VOLUME': 1000},
 193: {'ORDERNO': 193,
  'SECCODE': 'USD000000TOD',
  'PRICE': 58.15,
  'VOLUME': 10000},
 195: {'ORDERNO': 195,
  'SECCODE': 'USD000000TOD',
  'PRICE': 57.5,
  'VOLUME': 1000},
 230: {'ORDERNO

In [69]:
{k:v for k, v in sorted(test_bids.items(), key=lambda row: row[1]['PRICE'])}

{}

In [70]:
df_spec[df_spec['TIMESTAMP'] == 100206583723]

Unnamed: 0,SECCODE,TIMESTAMP,BID_ASK
5673,USD000000TOD,100206583723,"[0.4417, 0.0152, 3.8182, 3.566, 1.1373, 0.346,..."


In [71]:
df_vwap[df_vwap['TIMESTAMP'] == 100206583723]

Unnamed: 0,SECCODE,TIMESTAMP,BID,ASK
5673,USD000000TOD,100206583723,"[57.165, 57.165, 57.165, 57.165, 57.165, 57.165]","[57.87925, 57.879625, 57.87985, 57.879925, 57...."


In [72]:
df_aggressive_band[df_aggressive_band['TIMESTAMP'] == 100206583723]

Unnamed: 0,SECCODE,TIMESTAMP,BID,ASK
5673,USD000000TOD,100206583723,"[1.402, 3.2992, 152.7747, 152.7747, 152.7747]","[0.001, 0.003, 10.9654, 10.9654, 10.9654]"


In [73]:
df_aggressive_time[df_aggressive_time['TIMESTAMP'] == 100206583723]

Unnamed: 0,SECCODE,TIMESTAMP,BID,ASK
5673,USD000000TOD,100206583723,"[14020000.0, 6598400.0, 101849800.0, 50924900....","[10000.0, 6000.0, 7310266.666666667, 3655133.3..."


In [74]:
df_spread[df_spread['TIMESTAMP'] == 100206583723]

Unnamed: 0,SECCODE,TIMESTAMP,SPREAD
5673,USD000000TOD,100206583723,-58.0


In [None]:
def generate_spectrums(moex_dir: str, months: list, save_dir: str) -> None:
    spectrums_dir = os.path.join(save_dir, 'features')
    try:
        os.mkdir(spectrums_dir)
    except Exception:
        print(f"{spectrums_dir} dir already exists. It may be overwritten.")

    for month in months:
        working_dir = os.path.join(moex_dir, month)
        save_to = os.path.join(spectrums_dir, month)
        try:
            os.mkdir(save_to)
        except Exception:
            print(f"{save_to} dir already exists. It may be overwritten.")
        
        print(f"processing {month}")
        for filename in os.listdir(working_dir):
            # acquiring orderlog
            if 'orderlog' in filename.lower():
                print(f"processing {filename}")
                # reading it
                orderlog_path = os.path.join(working_dir, filename)
                order_log = read_orderlog(orderlog_path)
                # preprocessing
                order_log = filter(order_log, lambda row: row['SECCODE'] in feature_seccodes)
                order_log = preprocess_orderlog(order_log)
                
                # creating order book for each seccode
                order_books = dict()
                for secc in feature_seccodes:
                    order_books[secc] = OrderBook(secc)

                # creating spectrum for each seccode
                spectrums = dict()
                for secc in feature_seccodes:
                    spectrums[secc] = FeatureGenerator(seccode=secc, px_step=instruments_info[secc]['PRICE_STEP'])
                    
                _, df_spec, df_vwap, df_spread, df_aggressive_band, df_aggressive_time, exec_time = make_orderbooks(order_log)
                
                # saving spectrum
                for secc in feature_seccodes:
                    save_name = filename.lower().replace('orderlog', 'spectrum')[:-4] + '_' + secc + '.csv'
                    df_cur = df_spec[df_spec['SECCODE'] == secc].sort_values(by=['TIMESTAMP'])
                    df_cur.to_csv(os.path.join(save_to, save_name))
                    df_cur = None
                    
                    save_name = filename.lower().replace('orderlog', 'vwap')[:-4] + '_' + secc + '.csv'
                    df_cur = df_vwap[df_vwap['SECCODE'] == secc].sort_values(by=['TIMESTAMP'])
                    df_cur.to_csv(os.path.join(save_to, save_name))
                    df_cur = None
                    
                    save_name = filename.lower().replace('orderlog', 'aggressive_band')[:-4] + '_' + secc + '.csv'
                    df_cur = df_aggressive_band[df_aggressive_band['SECCODE'] == secc].sort_values(by=['TIMESTAMP'])
                    df_cur.to_csv(os.path.join(save_to, save_name))
                    df_cur = None
                    
                    save_name = filename.lower().replace('orderlog', 'aggressive_time')[:-4] + '_' + secc + '.csv'
                    df_cur = df_aggressive_time[df_aggressive_time['SECCODE'] == secc].sort_values(by=['TIMESTAMP'])
                    df_cur.to_csv(os.path.join(save_to, save_name))
                    df_cur = None
                    
                    save_name = filename.lower().replace('orderlog', 'spread')[:-4] + '_' + secc + '.csv'
                    df_cur = df_spread[df_spread['SECCODE'] == secc].sort_values(by=['TIMESTAMP'])
                    df_cur.to_csv(os.path.join(save_to, save_name))
                    df_cur = None
                
                order_log = None
                order_books = None
                spectrums = None
                df_spec = None
                
        print('\n')

# All Files

In [None]:
generate_spectrums(moex_dir=r"D:\Downloads\data", months=['2018-03', '2018-04', '2018-03'], save_dir='.')