# Main Code


In [2]:
import gzip
import numpy as np
import pandas as pd
import time
import os
from utils import *
from orderbook import OrderBook
from spectrum import Spectrum

In [2]:
def make_orderbooks(order_log: list, do_spec: bool):
    """
    Replay an order log into one order book (and optionally one spectrum) per seccode.

    order_log - iterable of rows indexable by column name; the keys read here are
                'SECCODE', 'TIME', 'BUYSELL', 'PRICE', 'VOLUME', 'ACTION',
                'ORDERNO' and 'NO'
    do_spec - if True, the spectrum of the affected instrument is updated after
              every successfully applied row and a snapshot of it is recorded

    Always returns a tuple (order_books, df_spec):
    order_books - dict mapping every seccode in SECCODES to its OrderBook
    df_spec - DataFrame with columns 'SECCODE', 'TIMESTAMP', 'BID_ASK' (one row
              per recorded snapshot) when do_spec is True, otherwise None
    """
    col_names = ['SECCODE', 'TIMESTAMP', 'BID_ASK']

    # one order book and one spectrum for each seccode
    order_books = {secc: OrderBook(secc) for secc in SECCODES}
    spectrums = {secc: Spectrum(seccode=secc) for secc in SECCODES}

    list_spec = []

    for row_log in order_log:
        seccode = row_log['SECCODE']

        # skip rows stamped at or after the instrument's SCHEDULE value
        if instruments_info[seccode]['SCHEDULE'] <= row_log['TIME']:
            continue

        is_ask = row_log['BUYSELL'] == 'S'
        price = row_log['PRICE']

        order_book = order_books[seccode]
        spectrum = spectrums[seccode]

        # CATCHING AGGRESSORS
        # Rows with a zero price, and rows whose price reaches the opposite best
        # quote held by the spectrum, are dropped before touching the book.
        # NOTE(review): 0 and 1e19 look like the "side is empty" sentinels of
        # Spectrum.best_bid / best_ask - confirm in spectrum.py.
        # NOTE(review): with do_spec=False the spectrum is never updated in this
        # function, so the filter only ever sees Spectrum's initial best quotes.
        if price == 0:
            continue
        if is_ask:
            if spectrum.best_bid > 0 and price <= spectrum.best_bid:
                continue
        elif spectrum.best_ask < 1e19 and price >= spectrum.best_ask:
            continue

        action = row_log['ACTION']
        # `correct` tells whether the order book accepted the row; the spectrum
        # is only touched (and a snapshot only recorded) when it did
        correct = False

        # handle post
        if action == Action.POST:
            order_book.add_entry(entry=row_log, ask=is_ask)
            if do_spec:
                spectrum.update_post(order_book=order_book, new_price=price,
                                     volume=row_log['VOLUME'], ask=is_ask)
            correct = True

        # handle revoke
        elif action == Action.REVOKE:
            correct = order_book.revoke(orderno=row_log['ORDERNO'], volume=row_log['VOLUME'],
                                        ask=is_ask, row_numb=row_log['NO'])
            if do_spec and correct:
                spectrum.update_revoke(order_book=order_book, new_price=price,
                                       volume=row_log['VOLUME'], ask=is_ask)

        # handle match
        elif action == Action.MATCH:
            correct = order_book.match(orderno=row_log['ORDERNO'], volume=row_log['VOLUME'],
                                       ask=is_ask, row_numb=row_log['NO'])
            if do_spec and correct:
                spectrum.update_match(order_book=order_book, new_price=price,
                                      volume=row_log['VOLUME'], ask=is_ask)

        if do_spec and correct:
            # for every applied row, snapshot the spectrum: normalized bids
            # followed by normalized asks in a single list
            values = spectrum.bids_normalized.copy()
            values.extend(spectrum.asks_normalized)
            list_spec.append([seccode, row_log['TIME'], values])

    df_spec = pd.DataFrame(list_spec, columns=col_names) if do_spec else None

    return order_books, df_spec
   
                
                       

def _ensure_dir(path: str) -> None:
    """Create directory `path` (with missing parents); only warn if it already exists."""
    try:
        os.makedirs(path)
    except FileExistsError:
        # any other failure (permissions, bad path, ...) is allowed to propagate
        # instead of being reported as "already exists"
        print(f"{path} dir already exists. It may be overwritten.")


def generate_spectrums(moex_dir: str, months: list, save_dir: str) -> None:
    """
    Generate spectrum csv files for every order log of the given months.

    For each file in <moex_dir>/<month> whose name contains 'orderlog'
    (case-insensitive) the log is read, preprocessed and replayed with
    make_orderbooks; one csv per seccode, sorted by TIMESTAMP, is then written
    to <save_dir>/spectrums/<month>/.

    moex_dir - an absolute path to MOEX-FX folder storing months folders
    months - a list of months to process, for example ['2018-03', '2018-05']
    save_dir - a path to where spectrums should be saved
    """
    spectrums_dir = os.path.join(save_dir, 'spectrums')
    _ensure_dir(spectrums_dir)

    for month in months:
        working_dir = os.path.join(moex_dir, month)
        save_to = os.path.join(spectrums_dir, month)
        _ensure_dir(save_to)

        print(f"processing {month}")
        # os.listdir gives no ordering guarantee; sort for reproducible runs
        for filename in sorted(os.listdir(working_dir)):
            # acquiring orderlog
            if 'orderlog' not in filename.lower():
                continue
            print(f"processing {filename}")

            # reading and preprocessing
            orderlog_path = os.path.join(working_dir, filename)
            order_log = preprocess_orderlog(read_orderlog(orderlog_path))

            # make_orderbooks builds its own order books and spectrums; only the
            # spectrum frame is kept so the books can be freed right away
            df_spec = make_orderbooks(order_log, True)[1]
            order_log = None  # release memory before writing the csv files

            # e.g. 'OrderLog20180301.txt' -> 'spectrum20180301'
            base_name = os.path.splitext(filename.lower().replace('orderlog', 'spectrum'))[0]

            # saving spectrum, one file per seccode
            for secc in SECCODES:
                save_name = base_name + '_' + secc + '.csv'
                # stable sort: rows sharing a timestamp keep their log order
                df_cur = df_spec[df_spec['SECCODE'] == secc].sort_values(by=['TIMESTAMP'],
                                                                         kind='mergesort')
                df_cur.to_csv(os.path.join(save_to, save_name))
                df_cur = None

            df_spec = None  # release memory before the next file is loaded

        print('\n')

## Tasks 1-3 (for a single file)

In [4]:
# Location of one day's order log
WORKING_DIR = r"D:\Data\MOEX-FX\2018-03\\"
orderlog_filename = WORKING_DIR + 'OrderLog20180301.txt'
# read the raw log and preprocess it in one step
order_log = preprocess_orderlog(read_orderlog(orderlog_filename))
# replay the log into order books while recording spectrums
order_books, df_spec = make_orderbooks(order_log, do_spec=True)
# persist the spectrums; the KS-test cell reloads this file
df_spec.to_csv('spectrum_singl.csv')

### Kolmogorov-Smirnov test for a single file with spectrums

In [3]:
from pdfspec import PdfSpec

# reload the spectrums saved as 'spectrum_singl.csv' by the single-file cell
df_spec = pd.read_csv('spectrum_singl.csv')

# one frame per seccode, each ordered by timestamp
spectrums_by_seccode = {
    secc: df_spec[df_spec['SECCODE'] == secc].sort_values(by=['TIMESTAMP'])
    for secc in SECCODES
}

# the combined frame is no longer needed
df_spec = None

# one PdfSpec per seccode
pdfs_spec = {secc: PdfSpec(seccode=secc) for secc in SECCODES}

# averages are computed per instrument from its own slice
for secc in SECCODES:
    pdf_spec = pdfs_spec[secc]
    pdf_spec.calc_avgs(spectrums_by_seccode[secc])
    # sanity check: both printed sums are expected to be close to 1
    print(sum(pdf_spec.bids_count), sum(pdf_spec.asks_time))

# the per-seccode frames are no longer needed
spectrums_by_seccode = None

0.9999988856337011 0.999999999999995
0.9999993639653042 1.0000000000000102
0.9999999999999948 0.9999999999999939
1.0000000000001537 0.9999999999999998
1.0000000000000004 0.9999999999999986
0.9999986448433577 0.9999999999999997


In [5]:
# Four pairwise KS-tests per instrument: bid vs ask (by count and by time),
# then time vs count within each side.
for secc in SECCODES:
    pdf_spec = pdfs_spec[secc]
    print(f"Performing KS-test for {secc}")

    pval = PdfSpec.kstest(pdf_spec.bids_count, pdf_spec.asks_count)
    print(f"\tbid count vs ask count - pval = {pval}")

    pval = PdfSpec.kstest(pdf_spec.bids_time, pdf_spec.asks_time)
    print(f"\tbid time vs ask time - pval = {pval}")

    pval = PdfSpec.kstest(pdf_spec.bids_time, pdf_spec.bids_count)
    print(f"\tbid time vs bid count - pval = {pval}")

    pval = PdfSpec.kstest(pdf_spec.asks_time, pdf_spec.asks_count)
    print(f"\task time vs ask count - pval = {pval}")
    

Performing KS-test for USD000000TOD
	bid count vs ask count - pval = 0.7869297884777761
	bid time vs ask time - pval = 0.7869297884777761
	bid time vs bid count - pval = 1.0
	ask time vs ask count - pval = 1.0
Performing KS-test for USD000UTSTOM
	bid count vs ask count - pval = 0.9944575548290717
	bid time vs ask time - pval = 0.9944575548290717
	bid time vs bid count - pval = 1.0
	ask time vs ask count - pval = 1.0
Performing KS-test for EUR_RUB__TOD
	bid count vs ask count - pval = 1.0
	bid time vs ask time - pval = 0.9944575548290717
	bid time vs bid count - pval = 1.0
	ask time vs ask count - pval = 1.0
Performing KS-test for EUR_RUB__TOM
	bid count vs ask count - pval = 0.9944575548290717
	bid time vs ask time - pval = 0.9944575548290717
	bid time vs bid count - pval = 1.0
	ask time vs ask count - pval = 1.0
Performing KS-test for EURUSD000TOD
	bid count vs ask count - pval = 0.00021650176448938054
	bid time vs ask time - pval = 0.00021650176448938054
	bid time vs bid count - pval

## Task 4

In [None]:
# Generates spectrums for every day of the listed months (long-running; comment out to skip).
# NOTE(review): '2018-03' used to be listed twice here, which only re-processed March and
# rewrote the same files; add the intended extra month if one is missing.
generate_spectrums(moex_dir=r"D:\Data\MOEX-FX", months=['2018-03', '2018-04'], save_dir='.')

In [8]:
# Reading

from periods import PdfPeriod

WORKING_DIR = "/media/rufina/Seagate/DataMining/spectrums/2018-03/"

# find instruments that trade till 23:50
seccodes_new = [secc for secc in instruments_info
                if instruments_info[secc]['SCHEDULE'] == 235000000000]

# print(seccodes_new)    --- ['USD000UTSTOM', 'EUR_RUB__TOM', 'EURUSD000TOM']

output1 = {}             # store tables for first task - comparison by periods
output2 = {}             # store tables for second task - comparison by days
last_entry = {}          # store pdf (bids, asks) for the previous day

for sec in seccodes_new:
    output1[sec] = [['Date', "10:00 vs 15:00", "15:00 vs 19:00", "10:00 vs 19:00"]]
    output2[sec] = [['Date', "10:00-15:00", "15:00-19:00", "19:00-23:50"]]
    last_entry[sec] = []

# The day-over-day comparison needs the files in chronological order, and
# os.listdir gives no ordering guarantee, so the names are sorted; with the
# date embedded as YYYYMMDD right after a fixed prefix this orders them by day.
for spectrum_file in sorted(os.listdir(WORKING_DIR)):
    # name layout: 8-char prefix + YYYYMMDD + '_' + seccode + '.csv',
    # e.g. spectrum20180301_EURUSD000TOD.csv
    secc = spectrum_file[17:-4]
    date = spectrum_file[8:16]
    if secc not in seccodes_new:
        continue

    print(f"Proccesing. Date: {date}. Instrument: {secc}\n")

    # for output 1
    df_spec = pd.read_csv(os.path.join(WORKING_DIR, spectrum_file))
    current_pdf = PdfPeriod(seccode=secc)
    current_pdf.calc_avgs(df_spec)
    output1[secc].append(current_pdf.compare_by_periods(date))

    # for output 2: the first day of an instrument has nothing to compare with
    if last_entry[secc]:
        output2[secc].append(current_pdf.compare_by_days(last_entry[secc], date))
    else:
        output2[secc].append([date, '-', '-', '-'])
    # today's pdf becomes the reference for the next day
    last_entry[secc] = [current_pdf.bids, current_pdf.asks]


# NOTE(review): the title row is stored as the first data row, so each csv also
# gets pandas' default numeric header line above it - confirm this is intended.
for secc, rows in output1.items():
    pd.DataFrame(rows).to_csv(secc + '_output1.csv', index=False)

for secc, rows in output2.items():
    pd.DataFrame(rows).to_csv(secc + '_output2.csv', index=False)


Proccesing. Date: 20180301. Instrument: EURUSD000TOM

Proccesing. Date: 20180301. Instrument: EUR_RUB__TOM

Proccesing. Date: 20180301. Instrument: USD000UTSTOM

Proccesing. Date: 20180302. Instrument: EURUSD000TOM

Proccesing. Date: 20180302. Instrument: EUR_RUB__TOM

Proccesing. Date: 20180302. Instrument: USD000UTSTOM

Proccesing. Date: 20180305. Instrument: EURUSD000TOM

Proccesing. Date: 20180305. Instrument: EUR_RUB__TOM

Proccesing. Date: 20180305. Instrument: USD000UTSTOM

Proccesing. Date: 20180306. Instrument: EURUSD000TOM

Proccesing. Date: 20180306. Instrument: EUR_RUB__TOM

Proccesing. Date: 20180306. Instrument: USD000UTSTOM

Proccesing. Date: 20180307. Instrument: EURUSD000TOM

Proccesing. Date: 20180307. Instrument: EUR_RUB__TOM

Proccesing. Date: 20180307. Instrument: USD000UTSTOM

Proccesing. Date: 20180309. Instrument: EURUSD000TOM

Proccesing. Date: 20180309. Instrument: EUR_RUB__TOM

Proccesing. Date: 20180309. Instrument: USD000UTSTOM

Proccesing. Date: 20180312. 

"\norderlog_filename = WORKING_DIR + 'OrderLog20180301.txt'\norder_log = read_orderlog(orderlog_filename)\n# preprocesiing\norder_log = preprocess_orderlog(order_log)\n# making orderbooks and spectrum\norder_books, df_spec = make_orderbooks(order_log, True)\ndf_spec.to_csv('spectrum_singl.csv')\n"

20180301


2
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
3
