In [277]:
import pandas as pd
import json
import numpy as np
import time

with open('data.json') as f:
    data = json.load(f)

%matplotlib qt


In [278]:
# Reading to dataframe
# Creating additional columns
gen = ((k, v['assetA']['bid'], v['assetA']['ask'], v['assetB']['bid'], v['assetB']['ask']) for k, v in data.items())
data_list = [x for x in gen]
df = pd.DataFrame(data_list, columns=['timestamp', 'assetA_bid', 'assetA_ask', 'assetB_bid', 'assetB_ask'])
df['delta_t'] = df['timestamp'].astype('uint64').diff()
df['trade_freq'] = df.apply (lambda row: 1/row['delta_t'], axis=1)
df['trade_freq_avg'] = df['trade_freq'].rolling(window=15).mean()
df['ts'] = df['timestamp']
df = df.set_index('timestamp')


In [279]:
# Lets chart some analytics - trade frequency, tick frequency
import matplotlib.pyplot as plt

fig0, (ax00, ax01) = plt.subplots(2, 1)
ax00.set_title('Tick time delta histogram (up to 2 seconds)')
ax01.set_title('Tick time delta averaged in time')
df['delta_t'].hist(ax = ax00, bins = np.arange(10,2000,100))
df['delta_t'].rolling(window=30).mean().rolling(window=30).mean().rolling(window=30).mean().plot(ax = ax01)

# Conclutions:
# 1. the samples are highlty uneaven with rare time deltas up to several minutes. Most time deltas between the samples however are between 50 and 500 ms
# 2. Due to (1) several approaches can be taked to fight non uniformity of samples
#   a. upsample missing timepoints
#   b. craete new more uniform ticks by special transformation (time, volume, cumulative price ticks or entropy based ticks)
#   c. train predictor in such a way that accounts for timme deltas between samples
#   d. assume low impact of non uniformity and treat as uniform (probably a bad idea but will do for start)

<matplotlib.axes._subplots.AxesSubplot at 0x24f7acd1ca0>

In [280]:
# Plot the ask/bid data for analysis
# Get the timeline data 
start = df.iloc[[0]].index.astype('uint64')
end = df.iloc[[-1]].index.astype('uint64')
timedelta = end-start
print(f'The time period is: {timedelta/1000/60/60/24} days')
print(f'Maximum possible trades in this period is: {timedelta/1000/30} trades')

# Setting up the grids
delta_t = end.to_numpy()-start.to_numpy()
number_of_ticks = len(df)
minor_grid_min = 0.5
ticksPerHour = number_of_ticks/(delta_t/1000/60/60)
ticksPerMinorGrid = number_of_ticks/(delta_t/1000/60)*minor_grid_min

hour_grid = np.arange(-0.2*number_of_ticks, 1.2*number_of_ticks, ticksPerHour)
minor_grid = np.arange(-0.2*number_of_ticks, 1.2*number_of_ticks, ticksPerMinorGrid)

# Plot the next graphs:
# Subplot 1
# a. A spread
# b. B spread
# c. A ask and bid
# c. B ask and bid

SpreadA = df['assetA_ask'].sub(df['assetA_bid'])
SpreadB = df['assetB_ask'].sub(df['assetB_bid'])
bidA = df['assetA_bid']
askA = df['assetA_ask']
bidB = df['assetB_bid']
askB = df['assetB_ask']

fig1, (ax10, ax11, ax12) = plt.subplots(3, 1)
#ax10.set_xticks(hour_grid)
#ax10.set_xticks(minor_grid, minor=True)
#ax10.grid(which='both')

for sig in [bidA, askA, bidB, askB]:
    sig_scaled = sig-7050
    sig_scaled.plot(ax = ax10)
    sig_scaled.rolling(window=30).mean().plot(ax = ax10, style=':')

i = -20
for sig in [SpreadA, SpreadB]:
    i=i+20
    sig_scaled = sig*5+i # Just a scaling to fit on one axes
    sig_scaled.plot(ax = ax10)
    sig_scaled.rolling(window=30).mean().plot(ax = ax10, style=':')

ax10.legend(['bidA', 'bidA avg', 'askA', 'askA avg', 'bidB', 'bidB avg', 'askB', 'askB avg', 'SpreadA', 'SpreadA avg', 'SpreadB', 'SpreadB avg'])
ax10.set_title('A and B ask/bid and spread')

# Subplot 2
# a. A ask and bid zoomin

zoomin_range = np.arange(4000,5000,1)

for sig in [bidA, askA]:
    sig.iloc[zoomin_range].plot(ax = ax11)
    sig_mov = sig.rolling(window=30).mean()
    sig_mov.iloc[zoomin_range].plot(ax = ax11, style=':')
    sig.iloc[zoomin_range].plot(ax = ax11, style = '.')

ax11.legend(['bidA', 'bidA avg', 'bidA scatter', 'askA', 'askA avg', 'askA scatter'])
ax11.set_title('A Signals zoomin')

# Subplot 3
# a. B ask and bid zoomin 

for sig in [bidB, askB]:
    sig.iloc[zoomin_range].plot(ax = ax12)
    sig_mov = sig.rolling(window=30).mean()
    sig_mov.iloc[zoomin_range].plot(ax = ax12, style=':')
    sig.iloc[zoomin_range].plot(ax = ax12, style = '.')

ax12.legend(['bidB', 'bidB avg', 'bidB scatter', 'askB', 'askB avg', 'askB scatter'])
ax12.set_title('B Signals zoomin')

# Conclutions:
# 1. Zoomout: 
#   a. The trading dataframe is of one day aproximately
#   b. A and B are hightly correlated at least at low resolution
#   c. There are negative spreads in the data up to tens of seconds. Three of possible explanations are 
#       i. Bullish of Bearish trades beyond optimal opposite price point 
#       ii. missing data points/lags in an order book that creates opposite price optimization lag
#       iii. Lagging price correction 

# 2. Zoomin
#   a. Uneven sampling can be seen
#   b. A and B have different trading dynamics
#   c. A and B have different spread
#   d. A and B have different trading patterns
#   e. As a result of (b), (c), and (d) we can deduct different number of traders, hightly different volumes traded for assets A ndd B 

The time period is: Float64Index([1.083294814814815], dtype='float64', name='timestamp') days
Maximum possible trades in this period is: Float64Index([3119.889066666667], dtype='float64', name='timestamp') trades


Text(0.5, 1.0, 'B Signals zoomin')

In [281]:
# Plot the ask/bid data analitics

fig2, (ax20, ax21, ax22) = plt.subplots(3, 1)

# Subplot 1
# a. A trailing window var
# b. B trailing window var

ax20.set_title('Rolling window signal variance')

i=0
for sig in [bidA, askA, bidB, askB]:
    i=i+10
    sig_mov = sig.rolling(window=30).var()+i
    sig_mov.plot(ax = ax20)

ax20.legend(['bidA var', 'askA var','bidB var', 'askB var'])
ax20.get_xaxis().set_visible(False)

# Subplot 2
# a. ask/bid trailing window cov

ax21.set_title('Rolling window ask/bid covariance')
cov_A = bidA.rolling(window=30).cov(other = askA)+10
cov_B = bidB.rolling(window=30).cov(other = askB)-10
cov_A.plot(ax = ax21)
cov_B.plot(ax = ax21)
ax21.legend(['A bid/ask cov', 'B bid/ask cov'])
ax21.get_xaxis().set_visible(False)

# Subplot 3
# a. AB trailing window cov

ax22.set_title('Rolling window A/B covariance')
cov_bid = bidA.rolling(window=30).cov(other = bidB)+10
cov_ask = askA.rolling(window=30).cov(other = askB)-10
cov_bid.plot(ax = ax22)
cov_ask.plot(ax = ax22)
ax22.legend(['A/B bid cov', 'A/B ask cov'])
ax22.get_xaxis().set_visible(False)

# Conclutions:
#   a. Hmmmm... Not sure what can be seen here at low resolution in regard of var/covar between signals. 
#   b. Peaks in A/B and bid/ask covariance correlates to occations of magor trading (volume, price change) likely (?)
#   c. var and covar graphs for all the signals look alike which probably reflect the fact that assets are closely related. Especially it shows at major trading periods (spikes in var/covar) 


In [282]:
##################################################################
##################################################################
###############    Trading bot
##################################################################

# A simple trading system can be sketched as this:
# (A) marketInputs -> (B) features/signals -> (C) assetPredictions [t, price, prob] -> (D) portfolioOtimization -> (E) orders

# For (A) we have our initial signals

# For (B) we can generate new features like trading frequency, volume, signal variance. In general case this can include outside signals like semantic market analysis or other exchanges data. 

# For (C) possibilities are wast. The predictor can be short termed, long termed, tech analysis based, ML/NN based. In this assignment we will check 3 options for the predictor:
#   a. Ideal predictor. The predictor that knows the future exactly. Built by using existing dataframe. It is good for initial debug and as a performance reference for other predictors
#   b. overfitted NN predictor. We will train the NN on the entire dataframe and use it on the same dataframe in inference mode for trading. It is not a fair predictor as it uses information from the future for learning. However it can be used as a refeerence for future predictors
#   c. NN predictor that is trained on a past window and is used for some time until the model get updated with new data. 

# For (D) we will use a strategy that chooses to maximize predicted return on portfolio. For the sake of assignment we assume oreders of 1 only so we will not create risk-return optimized and distributed portfolios. In real world however it is a good idea to predict prices/signals/returns with probabilities and use them to maintain return-risk optimized portfolios be choosing assets to be close to efficiency frontier. 

# (E) Orders should be of size of one every time and spaced at least 30 seconds each


# Lets start!

In [283]:
class silding_window():
# base class for sliding window functions 
    def __init__(self, size):
        self.size = size
        self.arr = []
        self.val = None
        self.last = None

    def update(self, val):
        if val == None:
            val = self.last
        self.last = val
        if len(self.arr) == self.size:
            self.arr.pop(0)
            self.arr.append(val)
            self.val = self.func()
            return

        self.arr.append(val)

    def getval(self):
        return self.val

    def next_tick(self, val):
        self.update(val)
        return self.getval

    def func(self):
    # This function is to be rewritten in downstream classees
        return None

class avg_silding_window(silding_window):
# mooving average class
    def func(self):
        return np.mean(self.arr)

def lookAheadTradeReturnPredictorFunc(pred, active_asset, N):
    # Input:
    #   Array of [K x 2N] (2 * trading delay) prices predictions for K assets
    # Output:
    #   [K x N] Array of minimal return predictions for each of next N ticks (1*trading delay delta) for each asset
    # Note: later to each element of future return predictions in non active asset the delta of active asset for the preceeding period should be added to account for a price of not taking a trading action 
    # Note: Each of the points in the array answers the question "what would be my return after minimal trading delay if I choose to go into asset k at timepoint 0<=n<N". Why so many optional point n instead of just a single time point in the present? Because by being gridy we are risking missing the "good" trade point in the nearest future because of the introduced trading delay   
    # A1 [1   2 3 1  3 1 3  4 4 23 32  32 34 ]
    # A2 [-1 -2 3 1 -3 1 3 -4 4 2  12 -12 14 ]
    # Note: How the trade return is calculated? At tick 0<=n<N asset price[buy/ask] (active asset is an exception) is taken and substracted from the sell/bid price at point n'=n+N (Next nearest possible trade)
    
    try:
        # first compute the return of active asset up to timepoint n if no decicion is made now at n=0
        if active_asset != -1 and active_asset < len(pred):
            bid = pred[active_asset][1]
            aa_ret_up_to_n = [bid[i]-bid[0] for i in range(0,N)]
        else:
            # no active asset means no return on it
            aa_ret_up_to_n = [0 for i in range(0,N)]

        returns = []
        for asset, data in enumerate(pred):
            [ask, bid] = data
            ret = []
            if asset == active_asset:
                for n in range(0,N):
                    r = bid[n+N] - bid[n] + aa_ret_up_to_n[n] # if asset is active then we dont have to buy it hence bid instead of ask
                    ret.append(r)
            else:
                for n in range(0,N):
                    r = bid[n+N] - ask[n] + aa_ret_up_to_n[n]
                    ret.append(r)
            returns.append(ret)
        return returns
    except: # in case of too short prediction vector for example
        return []

def lookAheadTradeDescitionMakerFunc(return_pred, greediness_w ):
    # Input:
    # [K x N] trading return predictions array
    # [N] Greediness vector. This is a vector that favors the earlier trades by multiplying the predicted returns vector for each asset. This is to caount for time value lost in delayed trades and rising uncertanty for farther predictions
    # Output:
    # (asset_idx, min_return) Returns index of asset with best predicted return and the predicted return itself 
    max_asset = -1
    max_return = 0
    max_time = 0
    for asset, ret in enumerate(return_pred):
        # weight the trading delay
        for i in range(len(ret)):
            ret[i] = ret[i]*greediness_w[i]
        # find the max for this asset return    
        m = max(ret)
        if m > max_return:
            max_return = m
            max_asset = asset
            max_time = ret.index(m)

    return (max_return, max_asset, max_time)

class idealPredictor():
    # This just looks into the future of dataframe
    def __init__(self, df, cols, lookahead):
        self.df = df
        self.lookahead = lookahead
        self.cols = cols
    
    def update(self, data):
        pass

    def predict(self, ts):
        try:
            its = df.index.get_loc(ts)
            sub_df = df.iloc[its:its+self.lookahead][self.cols]
            pred = []
            for c in self.cols:
                pred.append(sub_df[c].to_numpy())
            return pred
        except:
            return []

class tradingBot():
    # Note: 
    # Simple bot scheme
    # bare_signals -> features -> price_prediction  -> trade_orders  
    #                                     strategy  ->
    def __init__(self, asset_names, trading_delay, ticks_per_trade_delay, predictor, calcRetFunc, calcDescisionFunc ):
        self.last_trade = 0
        self.trading_delay = trading_delay 
        self.orders = []
        self.last_order = {}
        self.cur_asset_id = -1
        self.cur_qty = 0
        self.cur_ts = 0
        self.N = ticks_per_trade_delay
        self.predictor = predictor
        self.calcRetFunc = calcRetFunc
        self.calcDescisionFunc = calcDescisionFunc
        self.greediness_w = [1 for i in range(self.N)]
        self.last_data = []
        self.asset_names = asset_names

        
    def compute_tick(self, ts, data):
        # this handles new data arriving to the bot
        # done
        self.append_data(data)
        self.pred = self.predictor.update(data)
        self.cur_ts = ts
        # perform action if enough time passed since last action
        if (int(ts)-int(self.last_trade)) > self.trading_delay*1000:
            pred = self.predictor.predict(self.cur_ts)
            # convert prediction signals to ask/bid pairs list: [[askA, bidA], [askB, bidB]..... ]
            self.pred = [[pred[2*i], pred[2*i+1]] for i in range(int(len(pred)/2))]
            new_asset_id = self.compute_trade()
            self.update_state(new_asset_id)

    def append_data(self, data):
        # EXTEND LATER
        self.last_data = data
        #for asset, val in items(data):
        pass

    def compute_trade(self):
        # this computes trade based on the current market
        return_pred = self.calcRetFunc(self.pred, self.cur_asset_id, self.N)
        # kinda error handling - REFACTOR!!!
        if return_pred == []:
            return self.cur_asset_id
        (expected_ret, asset_id, trade_time) = self.calcDescisionFunc(return_pred, self.greediness_w)
        # if the optimum in the future skip the trade
        if trade_time>0:
            return self.cur_asset_id
        return asset_id

    def update_state(self, tgt_asset_id):
        # this updates bot state based on target asset_id
        # done
        tgt_asset_id = int(tgt_asset_id)
        if tgt_asset_id != self.cur_asset_id:
            if self.cur_asset_id == -1:
                self.issue_trades([('Buy', tgt_asset_id)])
            elif tgt_asset_id == -1:
                self.issue_trades([('Sell', self.cur_asset_id)])
            else:
                self.issue_trades([('Sell', self.cur_asset_id), ('Buy', tgt_asset_id)])
    
    def issue_trades(self, orders):
        # done
        actions = []
        for (order_type, asset_id) in orders:
            # update the quantities and asset stocks
            if order_type == 'Sell':
                self.cur_qty = self.cur_qty*self.get_sell_price(asset_id)
                self.cur_asset_id = -1
            elif order_type == 'Buy':
                self.cur_asset_id = asset_id
                self.cur_qty = self.cur_qty/self.get_buy_price(asset_id)
            # update the orders
            actions.append(order_type+self.asset_names[asset_id])
            
        # update an order book
        self.orders.append({
                "time": self.cur_ts,
                "actions": actions
                })
        self.last_trade = self.cur_ts

    def get_sell_price(self, asset_id):
        return self.last_data[asset_id]['bid']

    def get_buy_price(self, asset_id):
        return self.last_data[asset_id]['ask']
       
    def getOrderList(self):
        return self.orders


            


In [284]:
# "print unit tests" for return prediction calculation and descition functions
p = idealPredictor(df, ['assetA_ask', 'assetA_bid', 'assetB_ask', 'assetB_bid'], 8)
#print(df.head(3))
a= p.predict('1577836803078')
print(a[0])
print(a[1])
print(a[2])
print(a[3])
print(type(a[2]))

#a =[]
#a.append([101, 102, 103, 104])
#a.append([100, 101, 102, 103])
#a.append([201, 202, 203, 204])
#a.append([200, 201, 202, 203])
return_pred = lookAheadTradeReturnPredictorFunc([[a[0], a[1]],[a[2], a[3]]], -1, 4)
print(return_pred)
(max_return, max_asset, max_time) = lookAheadTradeDescitionMakerFunc(return_pred, [1 for n in range(4)])
print(max_return)
print(max_asset)
print(max_time)

[7189.   7190.09 7190.09 7190.43 7190.39 7190.33 7190.25 7190.21]
[7188.89 7190.05 7190.05 7190.05 7190.05 7189.92 7189.92 7189.92]
[7170.25 7170.25 7170.5  7170.5  7170.5  7170.5  7170.5  7170.5 ]
[7169.5 7169.5 7169.5 7169.5 7169.5 7169.5 7169.5 7169.5]
<class 'numpy.ndarray'>
[[1.050000000000182, -0.17000000000007276, -0.17000000000007276, -0.5100000000002183], [-0.75, -0.75, -1.0, -1.0]]
1.050000000000182
0
0


In [285]:
# Auxilary functions to run the bot
def read_tick(df):
    for i, row in df.iterrows():
        tick = {'timestamp':row['ts'],
                'assets':[  {'ask':row['assetA_ask'],'bid':row['assetA_bid']}, 
                            {'ask':row['assetB_ask'],'bid':row['assetB_bid']}
                        ]
                }
        yield tick



# Creating the bot
ticks_per_trade_delay = 60
trade_delay = 30
asset_names = ['A', 'B']
idealP = idealPredictor(df, ['assetA_ask', 'assetA_bid', 'assetB_ask', 'assetB_bid'], ticks_per_trade_delay*2)
bot = tradingBot(asset_names, trade_delay, ticks_per_trade_delay, idealP, lookAheadTradeReturnPredictorFunc, lookAheadTradeDescitionMakerFunc)

# Running the simulation
i = 0
j = 0
total_kts = round(len(df)/1000)
start = time.time()
total_process_time = 0
for t in read_tick(df):
    i=i+1
    bot.compute_tick(t['timestamp'], t['assets'])
    if i == 1000:
        end = time.time()
        delta = end-start
        total_process_time = total_process_time + delta
        avg_k_process_time = float(total_process_time)/j
        start = time.time()
        i = 0
        j=j+1
        print(f'{j}/{total_kts}k tamesamples are processed. It took {delta:.2f} seconds. Estimated time left is: {avg_k_process_time*(total_kts-j):.2f} seconds ')

print(f'Done! All {total_kts}k tamesamples were processed')

output = bot.getOrderList()

 It took 1.98 seconds. Estimated time left is: 408.76 seconds 
9/214k tamesamples are processed. It took 1.19 seconds. Estimated time left is: 244.55 seconds 
10/214k tamesamples are processed. It took 2.35 seconds. Estimated time left is: 478.96 seconds 
11/214k tamesamples are processed. It took 1.34 seconds. Estimated time left is: 271.94 seconds 
12/214k tamesamples are processed. It took 1.00 seconds. Estimated time left is: 202.31 seconds 
13/214k tamesamples are processed. It took 1.79 seconds. Estimated time left is: 359.02 seconds 
14/214k tamesamples are processed. It took 1.75 seconds. Estimated time left is: 349.75 seconds 
15/214k tamesamples are processed. It took 1.60 seconds. Estimated time left is: 319.06 seconds 
16/214k tamesamples are processed. It took 1.04 seconds. Estimated time left is: 206.21 seconds 
17/214k tamesamples are processed. It took 1.33 seconds. Estimated time left is: 262.41 seconds 
18/214k tamesamples are processed. It took 0.94 seconds. Estimate

In [287]:
#print(output)

import json
with open('IdealPredictorOutput.json', 'w') as fp:
    json.dump(output, fp)