In [None]:
import numpy as np
import pandas as pd
import time
import os
from utils import *
from orderbook import OrderBook
from spectrum import Spectrum
import matplotlib.pyplot as plt

## Orderbooks and Spectrums

In [None]:
# Reading
# NOTE(review): WORKING_DIR is an absolute, machine-specific Windows path, so the
# notebook only runs on the original author's machine. The raw-string literal
# ends with two backslashes (a raw string cannot end in a single one), so the
# resulting file name contains a doubled separator before the file name.
WORKING_DIR = r"D:\Innopolis University\2021 Spring Semester\Data Mining\data-mining\\"
orderlog_filename = WORKING_DIR + 'OrderLog20180301.txt'
# read_orderlog / preprocess_orderlog are not defined in this notebook;
# presumably they come from the `from utils import *` star import - confirm.
order_log = read_orderlog(orderlog_filename)
# preprocessing
order_log = preprocess_orderlog(order_log)

In [None]:
SECCODES = ['USD000UTSTOM', 'EUR_RUB__TOM']

# One order book and one spectrum per instrument, keyed by SECCODE.
# All order books are built first, then all spectrums.
order_books = {code: OrderBook(code) for code in SECCODES}
spectrums = {code: Spectrum(seccode=code) for code in SECCODES}

start = time.time()

list_spec = []
col_names = ['SECCODE', 'TIMESTAMP', 'BID_ASK']
do_spec = True

# --- Task 7 state -------------------------------------------------------
# Collected (mid-price increment, time offset) pairs per instrument.
mp_increments_USD, mp_increments_EUR = [], []
# Values the replay loop compares spectrum.best_bid / spectrum.best_ask with
# before computing a mid-price (presumably the Spectrum's "empty side"
# defaults - confirm against the Spectrum class).
BEST_BID, BEST_ASK = -1, 1e19
# -1e19 marks "no mid-price recorded yet" for each instrument.
prevUSD = prevEUR = -1e19

# Offset subtracted from every TIME value when a timestamp is stored.
init_time = 100000000000

# Replay the order log row by row: update the per-instrument order book and
# spectrum, and after every successfully applied event record the change of the
# mid-price ((best_bid + best_ask) / 2) together with its time offset.
for row_log in order_log:
    
    # Only the instruments listed in SECCODES are processed.
    if row_log['SECCODE'] not in SECCODES:
        continue
        
    # Skip rows whose TIME is at or past the instrument's SCHEDULE value.
    # `instruments_info` is not defined in this notebook; presumably it comes
    # from the `from utils import *` star import - confirm.
    if instruments_info[row_log['SECCODE']]['SCHEDULE'] <= row_log['TIME']:
        continue

    is_ask = row_log['BUYSELL'] == 'S'

    order_book = order_books[row_log['SECCODE']]
    spectrum = spectrums[row_log['SECCODE']]
    # Set to True only when the event was applied to the order book.
    correct = False

    # CATCHING AGGRESSORS
    # Rows with PRICE == 0 are dropped (presumably market orders - confirm).
    if row_log['PRICE'] == 0:
        continue
    # A sell priced at or below the current best bid, or a buy priced at or
    # above the current best ask, is dropped. The `> 0` / `< 1e19` guards skip
    # the check while the opposite side has no meaningful best price.
    # NOTE(review): this filter runs before the ACTION dispatch, so it applies
    # to REVOKE and MATCH rows as well as POST rows - confirm this is intended.
    if is_ask:
        if spectrum.best_bid > 0 and row_log['PRICE'] <= spectrum.best_bid:
            continue
    else:
        if spectrum.best_ask < 1e19 and row_log['PRICE'] >= spectrum.best_ask:
            continue

    # handle post
    if row_log['ACTION'] == Action.POST:
        order_book.add_entry(entry=row_log, 
                             ask=is_ask)
        if do_spec:
            spectrum.update_post(order_book=order_book, new_price=row_log['PRICE'], 
                             volume=row_log['VOLUME'], ask=is_ask)

        correct = True

    # handle revoke
    elif row_log['ACTION'] == Action.REVOKE:
        # The spectrum is only updated when the order book reports success.
        correct = order_book.revoke(orderno=row_log['ORDERNO'], volume=row_log['VOLUME'], 
                          ask=is_ask, row_numb=row_log['NO'])
        if do_spec and correct:
            spectrum.update_revoke(order_book=order_book, new_price=row_log['PRICE'], 
                             volume=row_log['VOLUME'], ask=is_ask)

    # handle match
    elif row_log['ACTION'] == Action.MATCH:
        correct = order_book.match(orderno=row_log['ORDERNO'], volume=row_log['VOLUME'], 
                          ask=is_ask, row_numb=row_log['NO'])
        if do_spec and correct:
            spectrum.update_match(order_book=order_book, new_price=row_log['PRICE'], 
                             volume=row_log['VOLUME'], ask=is_ask)

    # TASK 7
    # Record one (increment, TIME - init_time) pair per successfully applied event.
    if do_spec and correct:        
        if row_log['SECCODE'] == 'USD000UTSTOM':
            if prevUSD == -1e19:
                # First recorded event for this instrument: store a placeholder
                # increment of 0 and a previous mid-price of 0. The next recorded
                # increment is therefore measured against 0, not a real mid-price.
                incrUSD = 0
                mid_px = 0
            else:
                # No mid-price is recorded while either best price still equals
                # its BEST_BID / BEST_ASK comparison value; the row is skipped.
                if spectrum.best_bid == BEST_BID:
                    continue
                elif spectrum.best_ask == BEST_ASK:
                    continue
                else:
                    mid_px = (spectrum.best_bid + spectrum.best_ask) / 2
                
                incrUSD = mid_px - prevUSD
            
            prevUSD = mid_px
            
            mp_increments_USD.append((incrUSD, row_log['TIME'] - init_time))
        else:
            # Every SECCODE other than 'USD000UTSTOM' lands here; with the
            # two-element SECCODES list above that is 'EUR_RUB__TOM' only.
            # Same logic as the USD branch, on the EUR state variables.
            if prevEUR == -1e19:
                incrEUR = 0
                mid_px = 0
            else:
                if spectrum.best_bid == BEST_BID:
                    continue
                elif spectrum.best_ask == BEST_ASK:
                    continue
                else:
                    mid_px = (spectrum.best_bid + spectrum.best_ask) / 2
                
                incrEUR = mid_px - prevEUR
            
            prevEUR = mid_px
            
            mp_increments_EUR.append((incrEUR, row_log['TIME'] - init_time))

## Get midpoint increments time series

In [None]:
mp_increments_USD[:5]

In [None]:
mp_increments_EUR[:5]

In [None]:
# Drop the first two entries of each series. In the replay loop the first
# recorded entry is a placeholder (increment 0, previous mid-price set to 0),
# so the second entry is measured against 0 rather than against a real mid-price.
# NOTE(review): the same names are reassigned, so every re-run of this cell
# drops two more entries - re-run the replay cell before re-running this one.
mp_increments_USD = mp_increments_USD[2:]
mp_increments_EUR = mp_increments_EUR[2:]

In [None]:
mp_increments_USD[:5]

In [None]:
mp_increments_EUR[:5]

In [None]:
# First component of every (increment, time offset) pair.
increments_USD = [increment for increment, _ in mp_increments_USD]
increments_EUR = [increment for increment, _ in mp_increments_EUR]

In [None]:
# Second component of every (increment, time offset) pair.
timestamps_USD = [offset for _, offset in mp_increments_USD]
timestamps_EUR = [offset for _, offset in mp_increments_EUR]

In [None]:
# Number of recorded observations per instrument.
size_USD, size_EUR = len(mp_increments_USD), len(mp_increments_EUR)

## Center time series

In [None]:
# Arithmetic mean of each increment series: plain sum divided by the count.
total_USD, total_EUR = sum(increments_USD), sum(increments_EUR)
avg_USD, avg_EUR = total_USD / size_USD, total_EUR / size_EUR

In [None]:
print(avg_USD, avg_EUR)

In [None]:
# Subtract the series mean from every increment so each series has zero mean.
centered_mp_increments_USD = [value - avg_USD for value in increments_USD]
centered_mp_increments_EUR = [value - avg_EUR for value in increments_EUR]

In [None]:
centered_mp_increments_USD[:5]

In [None]:
centered_mp_increments_EUR[:5]

## Hayashi & Yoshida method

In [None]:
def find_first(arr, bound, tau):
    """Locate the lower index of the window in ``arr`` for ``bound + tau``.

    Args:
        arr: Sequence of values sorted in ascending order (duplicates allowed).
        bound: Base value to look up.
        tau: Shift added to ``bound`` before the lookup.

    Returns:
        * ``0`` if ``bound + tau`` is smaller than every element;
        * ``-1`` if ``bound + tau`` is larger than every element, or ``arr``
          is empty (no usable index);
        * the index of the leftmost element equal to ``bound + tau`` if one
          exists;
        * otherwise the index of the last element smaller than ``bound + tau``.

    The original hand-written bisection returned whichever of several equal
    elements it happened to land on; the leftmost one is returned here so the
    window start is well defined when the array contains duplicates.
    """
    # Local import keeps this cell runnable on its own.
    from bisect import bisect_left

    if len(arr) == 0:
        return -1

    value = bound + tau
    if value < arr[0]:
        return 0
    if value > arr[-1]:
        return -1

    # arr[0] <= value <= arr[-1] holds here, so 0 <= pos <= len(arr) - 1.
    pos = bisect_left(arr, value)
    if arr[pos] == value:
        return pos
    # No exact match: arr[pos] is the first element above value, step back one.
    return pos - 1

In [None]:
def find_last(arr, bound, tau):
    """Locate the upper index of the window in ``arr`` for ``bound + tau``.

    Args:
        arr: Sequence of values sorted in ascending order (duplicates allowed).
        bound: Base value to look up.
        tau: Shift added to ``bound`` before the lookup.

    Returns:
        * ``-1`` if ``bound + tau`` is smaller than every element, or ``arr``
          is empty (no usable index);
        * ``len(arr) - 1`` if ``bound + tau`` is larger than every element;
        * the index of the rightmost element equal to ``bound + tau`` if one
          exists;
        * otherwise the index of the first element greater than ``bound + tau``.

    The original hand-written bisection returned whichever of several equal
    elements it happened to land on; the rightmost one is returned here so the
    window end is well defined when the array contains duplicates.
    """
    # Local import keeps this cell runnable on its own.
    from bisect import bisect_right

    if len(arr) == 0:
        return -1

    value = bound + tau
    if value < arr[0]:
        return -1
    if value > arr[-1]:
        return len(arr) - 1

    # arr[0] <= value <= arr[-1] holds here, so pos >= 1.
    pos = bisect_right(arr, value)
    if arr[pos - 1] == value:
        return pos - 1
    # No exact match: pos already points at the first element above value.
    return pos

In [None]:
# Variance of each centred series, computed with NumPy's default settings.
varX = np.asarray(centered_mp_increments_USD).var()
varY = np.asarray(centered_mp_increments_EUR).var()

In [None]:
print(varX, varY)

In [None]:
# Lead-lag scan: for every shift tau on the grid, pair each USD observation
# window [timestamps_USD[i], timestamps_USD[i + 1]] (shifted by tau) with the
# EUR observations selected by find_first / find_last, accumulate the products
# of the centred increments, and normalise into a correlation-like score.
# The tau with the largest positive score is kept in bestTau.

# Shift grid; tau is in the same units as the stored time offsets.
TAU_MIN, TAU_MAX, TAU_STEP = -5000, 5000, 100

maxCorr = 0
bestTau = -5001  # placeholder just outside the grid: "no tau selected yet"
output_file = []

# The normalisation term does not depend on tau, so compute it once.
norm = (varX * varY) ** 0.5

for tau in range(TAU_MIN, TAU_MAX + TAU_STEP, TAU_STEP):
    print('Processing tau: ', tau)

    cov = 0
    n = 0

    for i in range(size_USD - 1):
        time_start = timestamps_USD[i]
        time_end = timestamps_USD[i + 1]

        i_first = find_first(timestamps_EUR, time_start, tau)
        i_last = find_last(timestamps_EUR, time_end, tau)

        # -1 from either helper means the shifted window has no usable EUR index.
        if i_first == -1 or i_last == -1:
            continue

        usd_increment = centered_mp_increments_USD[i]
        for j in range(i_first, i_last + 1):
            cov += usd_increment * centered_mp_increments_EUR[j]
            n += 1

    if n < 2 or norm == 0:
        # With fewer than two pairs (or a zero-variance series) the score is
        # undefined; previously n == 1 raised ZeroDivisionError and n == 0
        # produced a meaningless -0.0. NaN never compares greater than maxCorr.
        corr = float('nan')
    else:
        corr = (1 / (n - 1)) * cov / norm
    result = f'{tau}: {corr}'
    output_file.append(result)

    print('Correlation: ', corr)

    if corr > maxCorr:
        maxCorr = corr
        bestTau = tau

In [None]:
# Build the Task 7 report from the per-tau lines plus a summary and write it
# to CSV. The summary is added to a copy of `output_file`, so re-running this
# cell does not append the summary lines a second time.
report_lines = list(output_file)
report_lines.append(f'Tau: {bestTau}')

if not maxCorr > 0:
    # maxCorr only moves above its initial 0 when some tau gave a positive
    # correlation; otherwise bestTau is still its initial placeholder.
    report_lines.append('No positive correlation found, lead-lag is undetermined')
elif bestTau > 0:
    report_lines.append('USD000UTSTOM is leading')
elif bestTau < 0:
    report_lines.append('EUR_RUB__TOM is leading')
else:
    # A best shift of exactly 0 does not indicate that either series leads.
    report_lines.append('No lead-lag detected (best tau is 0)')

pd.DataFrame(report_lines).to_csv("task7.csv")