# imports

In [None]:
import datetime

import plotly
import plotly.graph_objects as go
import seaborn as sns
sns.set(style="ticks")
sns.set_style("darkgrid")
import random 
import sys

import json
from numba.types import bool_, int_, float32

import numpy as np
import pandas as pd

import os
import inspect
from tqdm import tqdm
from multiprocessing.pool import ThreadPool
from datetime import datetime
import matplotlib.pyplot as plt

import plotly.express as px
import plotly.graph_objects as go
from tqdm.notebook import trange, tqdm
from plotly.subplots import make_subplots


# Load IPython's autoreload extension.
get_ipython().run_line_magic('load_ext', 'autoreload')

# Switch autoreload to mode 2 so edits to imported modules are picked up
# without restarting the kernel.
get_ipython().run_line_magic('autoreload', '2')
# NOTE(review): reloading the extension right after loading it looks redundant — confirm it is needed.
get_ipython().run_line_magic('reload_ext', 'autoreload')

# download_data

In [None]:
# Label of the data set; presumably exchange + instrument — not referenced in the visible cells.
name = 'BinanceFuturesBTCUSDT'

# Scaling factors: `aggregate_trades` multiplies raw prices by the tick size and
# raw volumes by the contract size; the profit cells below reuse both.
instrument_tick_size = 0.1
instrument_contract_size = 0.001

# NOTE(review): start_time / end_time are not used in the visible cells —
# presumably the intended data window; confirm or remove.
start_time = '2023-04-01 00:00:00'
end_time = '2023-04-21 00:00:00'

In [None]:
def aggregate_trades(trades, instrument_contract_size, instrument_tick_size):
    """Collapse individual trades into one row per (exchange_ts, price, maker side).

    Parameters
    ----------
    trades : pandas.DataFrame
        Raw trades. Must contain the columns ``exchange_ts``, ``price`` and
        ``volume``. Rows are also read *by position*: column 3 is taken as the
        price, column 4 as the volume and column 5 as the maker side, so the
        frame is expected to have exactly six columns on entry.
        A ``volume_usd`` column is added to this frame in place (it becomes
        column 6).
    instrument_contract_size : float
        Multiplier applied to ``volume``.
    instrument_tick_size : float
        Multiplier applied to ``price``.

    Returns
    -------
    pandas.DataFrame
        Columns ``exchange_ts``, ``price``, ``volume``, ``volume_usd``,
        ``maker_side``, ``ts_fr``. ``volume`` and ``volume_usd`` are summed over
        all trades sharing the same timestamp, price and maker side; ``ts_fr``
        is ``exchange_ts / 1e9`` converted to a local-time ``datetime``.
        Rows keep the order in which each key first appears in ``trades``.
    """
    # Local imports under private names: the notebook's import cell binds
    # `datetime` first to the module and then to the class, so
    # `datetime.datetime` raises AttributeError at module scope.
    from collections import defaultdict
    from datetime import datetime as _datetime

    trades['volume_usd'] = (trades['volume'] * instrument_contract_size) * (trades['price'] * instrument_tick_size)

    # exchange_ts -> (price, maker_side) -> [(volume, volume_usd), ...]
    grouped = defaultdict(lambda: defaultdict(list))
    for trade_ts, row in zip(trades['exchange_ts'], trades.values):
        price = row[3]
        volume = row[4]
        maker_side = row[5]
        volume_usd = row[6]
        grouped[trade_ts][(price, maker_side)].append((volume, volume_usd))

    trades_data = []
    for exchange_ts, ts_data in grouped.items():
        # exchange_ts is divided by 1e9 before conversion, i.e. treated as nanoseconds.
        ts_fr = _datetime.fromtimestamp(exchange_ts / 1e9)
        for (price, maker_side), volumes in ts_data.items():
            np_volumes = np.array(volumes)
            trades_data.append((
                exchange_ts,
                price,
                np_volumes[:, 0].sum(),
                np_volumes[:, 1].sum(),
                maker_side,
                ts_fr,
            ))

    return pd.DataFrame(trades_data, columns=[
        'exchange_ts',
        'price',
        'volume',
        'volume_usd',
        'maker_side',
        'ts_fr',
    ])

In [None]:
def get_trades_agg(market_data_path, instrument_tick_size, instrument_contract_size):
    """Read a trades CSV and return it aggregated by `aggregate_trades`.

    The ``instrument`` and ``trade_id`` columns are dropped before aggregation.
    The result gets an extra ``time`` column: ``exchange_ts`` passed through
    ``pd.to_datetime``.
    """
    raw_trades = pd.read_csv(market_data_path)
    slim_trades = raw_trades.drop(columns=['instrument', 'trade_id'])

    aggregated = aggregate_trades(
        trades=slim_trades,
        instrument_tick_size=instrument_tick_size,
        instrument_contract_size=instrument_contract_size,
    )

    aggregated['time'] = pd.to_datetime(aggregated['exchange_ts'])
    return aggregated

In [None]:
# Load the order-book snapshots. 'order_books' is a placeholder path — replace
# it with the real CSV location. Later cells read the columns exchange_ts,
# ask_price_1, bid_price_1 and {side}_volume_{i} from this frame.
order_books = pd.read_csv('order_books') # add_path

In [None]:
# Read the trades CSV and aggregate it per (exchange_ts, price, maker side).
# 'trades' is a placeholder path — replace it with the real CSV location.
trades_agg = get_trades_agg(
    'trades', # add_path
    instrument_tick_size, instrument_contract_size)

In [None]:
# Cast aggregated trade prices to int64 (any fractional part is dropped).
# NOTE(review): presumably prices are integer tick counts — confirm.
trades_agg['price'] = trades_agg['price'].astype(np.int64)

# make stats

In [None]:
def add_target_to_ob(order_books, trades_agg, max_ts):
    """Add ``target_ask`` / ``target_bid`` columns to ``order_books`` in place.

    For every order-book row the trades with
    ``exchange_ts < trade_ts <= exchange_ts + max_ts`` are inspected:

    * ``target_ask`` = max(``ask_price_1``, highest price among trades with
      ``maker_side == -1.0``) - ``ask_price_1``
    * ``target_bid`` = ``bid_price_1`` - min(``bid_price_1``, lowest price among
      trades with ``maker_side == 1.0``)

    Both values are therefore >= 0, and 0 when no trade in the window went
    beyond the best price. They are stored as int64.

    Parameters
    ----------
    order_books : pandas.DataFrame
        Needs ``exchange_ts``, ``ask_price_1`` and ``bid_price_1``; must be
        sorted by ``exchange_ts`` ascending (the window is a two-pointer sweep).
    trades_agg : pandas.DataFrame
        Needs ``exchange_ts``, ``price`` and ``maker_side``; each side must be
        sorted by ``exchange_ts`` ascending.
    max_ts : int
        Length of the look-ahead window, in the same unit as ``exchange_ts``.

    Returns
    -------
    None
    """
    print(max_ts)

    # `nb` was never imported at notebook level; import it here. numba is only
    # an accelerator for the loop below, so fall back to plain Python if it is
    # not importable.
    try:
        import numba as nb
        jit = nb.njit()
    except ImportError:
        def jit(func):
            return func

    @jit
    def fill_side(ob_ts, best_px, trade_ts, trade_px, max_ts, sign, out):
        # sign = +1: out = max(best, trade prices) - best   (ask side)
        # sign = -1: out = best - min(best, trade prices)   (bid side)
        n_trades = len(trade_ts)
        lo = 0  # first trade strictly after the snapshot
        hi = 0  # first trade strictly after the end of the window
        for ind in range(len(ob_ts)):
            # Bounds are checked with `< n_trades` so that the pointers may
            # reach the end of the array: the last trade is then included in
            # the window and an empty trade array is never indexed.
            while lo < n_trades and trade_ts[lo] <= ob_ts[ind]:
                lo += 1
            while hi < n_trades and trade_ts[hi] <= ob_ts[ind] + max_ts:
                hi += 1

            ref = sign * best_px[ind]
            ext = ref
            for i in range(lo, hi):
                ext = max(ext, sign * trade_px[i])
            out[ind] = ext - ref

    np_ob_ts = order_books['exchange_ts'].to_numpy()
    ask_trades = trades_agg[trades_agg.maker_side == -1.0]
    bid_trades = trades_agg[trades_agg.maker_side == 1.0]

    # Fresh writable buffers: arrays returned by `to_numpy()` may be read-only
    # views of the frame.
    target_ask = np.zeros(len(order_books), dtype=np.int64)
    target_bid = np.zeros(len(order_books), dtype=np.int64)

    fill_side(np_ob_ts, order_books['ask_price_1'].to_numpy(),
              ask_trades['exchange_ts'].to_numpy(), ask_trades['price'].to_numpy(),
              max_ts, 1, target_ask)
    fill_side(np_ob_ts, order_books['bid_price_1'].to_numpy(),
              bid_trades['exchange_ts'].to_numpy(), bid_trades['price'].to_numpy(),
              max_ts, -1, target_bid)

    order_books['target_ask'] = target_ask
    order_books['target_bid'] = target_bid

In [None]:
def add_midprice(order_books, ts):
    """Add the current and the look-ahead mid-price to ``order_books`` in place.

    Two columns are written:

    * ``midprice`` = (``ask_price_1`` + ``bid_price_1``) / 2
    * ``midprice_{int(ts / 1e9)}sec`` = ``midprice`` of the first row whose
      ``exchange_ts`` is strictly greater than this row's ``exchange_ts + ts``,
      or of the last row when no such row exists.

    NOTE(review): the look-ahead value comes from the first snapshot *after*
    the horizon, not the last one at or before it — confirm this is intended.

    Parameters
    ----------
    order_books : pandas.DataFrame
        Needs ``exchange_ts``, ``ask_price_1`` and ``bid_price_1``; must be
        sorted by ``exchange_ts`` ascending (the pointer only moves forward).
    ts : int
        Horizon in the same unit as ``exchange_ts``; the column name divides it
        by 1e9 to label it in seconds.

    Returns
    -------
    None
    """
    # `nb` was never imported at notebook level; import it here. numba is only
    # an accelerator for the loop below, so fall back to plain Python if it is
    # not importable.
    try:
        import numba as nb
        jit = nb.njit()
    except ImportError:
        def jit(func):
            return func

    feature_name = f'midprice_{int(ts/1e9)}sec'
    # Created first so the column order of the frame stays the same as before.
    order_books[feature_name] = 0.0
    order_books['midprice'] = (order_books['ask_price_1'] + order_books['bid_price_1']) / 2.0

    np_ob_ts = order_books['exchange_ts'].to_numpy()
    np_midprice = order_books['midprice'].to_numpy()
    # Fresh writable buffer: the array returned by `to_numpy()` on the column
    # may be a read-only view of the frame.
    np_result = np.zeros(len(order_books), dtype=np.float64)

    @jit
    def fast_add_midprice(np_ob_ts, np_midprice, np_result, ts):
        mx_ind = 0
        for ind in range(len(np_ob_ts)):
            # Advance to the first snapshot past the horizon, capped at the last row.
            while mx_ind + 1 < len(np_ob_ts) and np_ob_ts[ind] + ts >= np_ob_ts[mx_ind]:
                mx_ind += 1
            np_result[ind] = np_midprice[mx_ind]

    fast_add_midprice(np_ob_ts, np_midprice, np_result, ts)
    order_books[feature_name] = np_result

In [None]:
# Look-ahead window of 100_000_000 timestamp units. exchange_ts is divided by
# 1e9 to obtain seconds in aggregate_trades, so this is 0.1 s if it is in nanoseconds.
add_target_to_ob(order_books, trades_agg, 100000000)

In [None]:
# Horizon of 5 * 1e9 timestamp units; add_midprice names the resulting column 'midprice_5sec'.
add_midprice(order_books, 5 * 1000000000)

In [None]:
# Keep only snapshots where at least one side has a non-zero target.
# `.copy()` makes `data` an independent frame: later cells assign new columns to
# it and modify existing ones, which on a bare slice of `order_books` triggers
# SettingWithCopyWarning and may not behave as intended.
data = order_books[(order_books.target_ask != 0) | (order_books.target_bid != 0)].copy()

In [None]:
# Shift both targets down by one. A side whose target was 0 becomes -1 here.
# NOTE(review): this cell is not idempotent — every re-run subtracts one more;
# re-create `data` before running it again.
data['target_ask'] -= 1
data['target_bid'] -= 1

In [None]:
# Profit of a fill at the target distance, valued against the 5-second mid-price:
#   ask side: (ask_price_1 + target_ask) - midprice_5sec
#   bid side: midprice_5sec - (bid_price_1 - target_bid)
# Subtracting `price * (-5e-5)` adds 0.005% of the best price to the result —
# presumably a maker rebate; confirm. The sum is scaled by instrument_contract_size.
data['midprice_profit_5sec_ask'] = (data['ask_price_1'] + data['target_ask'] - data['midprice_5sec']
                                    - data['ask_price_1'] * (-5e-5)) * instrument_contract_size

data['midprice_profit_5sec_bid'] = (data['midprice_5sec'] - (data['bid_price_1'] - data['target_bid']) 
                                    - data['bid_price_1'] * (-5e-5)) * instrument_contract_size

# profit calculation

In [None]:
# Bounds of the target_bid range to summarise. They are defined here because
# this cell previously relied on `mn` / `mx` leaking from the plotting cell
# below, which raises NameError on a fresh kernel. The values are the ones that
# plotting cell leaves behind (mn = 15, mx = 100).
# NOTE(review): confirm this is the range the summary was meant for.
mn = 15
mx = 100

# Select once and reuse for both figures.
in_range = data[(data.target_bid >= mn) & (data.target_bid < mx)]

print('profit: ', in_range.midprice_profit_5sec_bid.sum() * instrument_tick_size)
print('cnt: ', len(in_range))

In [None]:
# Histogram of bid-side 5-second mid-price profit by target distance, drawn
# once for the full range and once for distances >= 15. One bin per unit of
# distance. The loop variables are the notebook globals `mn` / `mx`, which end
# up as 15 / 100 after the cell has run.
for mn, mx in ((0, 100), (15, 100)):
    to_plot = data[(data.target_bid >= mn) & (data.target_bid < mx)]

    plt.figure(figsize=(10,6))
    plt.hist(
        to_plot['target_bid'],
        weights=to_plot['midprice_profit_5sec_bid'] * instrument_tick_size,
        bins=(mx - mn),
    )
    plt.ylabel('mid-price profit 5sec sum')
    plt.xlabel('max_distance')
    plt.show()

# CATBOOST

In [None]:
def get_catboost_by_side(side):
    """Train a CatBoost regressor for one side of the book.

    Reads the notebook globals ``data`` (the filtered order-book frame) and
    ``mx`` (features ``{side}_volume_1`` .. ``{side}_volume_{mx - 1}`` are used).

    Rows with ``target_{side} >= 15`` are kept, in their original order. The
    first 4/5 of them form the training set and the rest the test set (no
    shuffling). The model is fitted on the volume features to predict
    ``target_{side}`` with RMSE loss.

    Parameters
    ----------
    side : str
        ``'ask'`` or ``'bid'``; used to build the column names.

    Returns
    -------
    list
        ``[model, x_train, x_test, y_train, y_test]``. The ``y_*`` frames hold
        ``target_{side}``, ``{side}_price_1``, ``midprice_5sec`` and
        ``midprice_profit_5sec_{side}``.
    """
    from catboost import CatBoostRegressor

    mn = 15
    target = f'target_{side}'

    # Filter on the requested side's own target. The filter used to be
    # hard-coded to `target_bid`, so the ask model was trained on rows selected
    # by the bid target.
    df = data[data[target] >= mn].reset_index(drop=True)

    features_to_model = [f'{side}_volume_{i}' for i in range(1, mx)]

    full_x = df.loc[:, features_to_model]
    full_y = df.loc[:, [target, f'{side}_price_1', 'midprice_5sec', f'midprice_profit_5sec_{side}']]

    # Chronological 80/20 split: train on the head, test on the tail.
    split = int((4/5) * len(full_x))
    x_train = full_x[:split]
    y_train = full_y[:split]
    x_test = full_x[split:].reset_index(drop=True)
    y_test = full_y[split:].reset_index(drop=True)

    print(len(x_train))

    catboost = CatBoostRegressor(loss_function='RMSE', n_estimators=1500)
    catboost.fit(x_train, y_train[target], verbose=False, plot=True)

    return [catboost, x_train, x_test, y_train, y_test]

In [None]:
# One model per side. The calls previously passed an undefined name `side`,
# and both calls were identical.
ask_catboost = get_catboost_by_side('ask')
bid_catboost = get_catboost_by_side('bid')

In [None]:
# Top 60 features of the ask model, most important first.
importances = pd.DataFrame({
    'names': ask_catboost[0].feature_names_,
    'importances': ask_catboost[0].feature_importances_,
})
importances.sort_values('importances').iloc[::-1].reset_index(drop=True).head(60)

In [None]:
# Top 60 features of the bid model, most important first.
importances = pd.DataFrame({
    'names': bid_catboost[0].feature_names_,
    'importances': bid_catboost[0].feature_importances_,
})
importances.sort_values('importances').iloc[::-1].reset_index(drop=True).head(60)

# predicted profit calculation

In [None]:
# Lower bound of the quoting distance passed to the profit helpers below; same
# value as the `mn` used inside get_catboost_by_side.
mn = 15

In [None]:
def calc_rand_profit(data, mn, mx, side):
    """Sum, over all rows, the profit averaged across every distance in [mn, mx).

    For each row and each integer distance ``j`` in ``range(mn, mx)`` the
    contribution is 0 when ``target_{side} < j``; otherwise it is the profit of
    a fill ``j`` away from ``{side}_price_1`` valued against ``midprice_5sec``
    (same formula as the ``midprice_profit_5sec_*`` columns, scaled by the
    notebook global ``instrument_contract_size``). The per-row sum is divided
    by ``mx - mn`` before being accumulated.

    Any ``side`` other than ``'bid'`` is handled with the ask formula.
    """
    targets = data[f'target_{side}'].to_numpy()
    future_mid = data['midprice_5sec'].to_numpy()
    best_price = data[f'{side}_price_1'].to_numpy()
    is_bid = side == 'bid'

    total = 0
    for row in tqdm(range(len(data))):
        row_sum = 0
        for distance in range(mn, mx):
            if targets[row] < distance:
                # Quote further away than the observed target: contributes nothing.
                continue
            if is_bid:
                row_sum += (future_mid[row] - (best_price[row] - distance)
                            - best_price[row] * (-5e-5)) * instrument_contract_size
            else:
                row_sum += ((best_price[row] + distance) - future_mid[row]
                            - best_price[row] * (-5e-5)) * instrument_contract_size
        row_sum /= (mx - mn)
        total += row_sum

    return total

In [None]:
def calc_pred_profit(np_predict, data, mn, side):
    """Sum the profit obtained when quoting at the predicted distance per row.

    For row ``i`` the contribution is 0 when ``target_{side} < np_predict[i]``;
    otherwise it is the profit of a fill ``np_predict[i]`` away from
    ``{side}_price_1`` valued against ``midprice_5sec``, scaled by the notebook
    global ``instrument_contract_size``.

    ``mn`` is accepted but not used. Any ``side`` other than ``'bid'`` is
    handled with the ask formula.
    """
    targets = data[f'target_{side}'].to_numpy()
    future_mid = data['midprice_5sec'].to_numpy()
    best_price = data[f'{side}_price_1'].to_numpy()
    is_bid = side == 'bid'

    total = 0
    for row in tqdm(range(len(data))):
        distance = np_predict[row]
        row_profit = 0
        # Only rows whose target is not below the predicted distance contribute.
        if not (targets[row] < distance):
            if is_bid:
                row_profit += (future_mid[row] - (best_price[row] - distance)
                               - best_price[row] * (-5e-5)) * instrument_contract_size
            else:
                row_profit += ((best_price[row] + distance) - future_mid[row]
                               - best_price[row] * (-5e-5)) * instrument_contract_size
        total += row_profit

    return total

In [None]:
# Baseline: profit averaged over every distance in [mn, mx), both sides summed.
# The expression is parenthesised because a trailing `+` before a newline is a
# SyntaxError, and `mx` is now passed (calc_rand_profit takes data, mn, mx, side).
# `mx` is the notebook global left by the plotting cell.
rnd_profit = (
    calc_rand_profit(data, mn, mx, 'ask')
    + calc_rand_profit(data, mn, mx, 'bid')
)
rnd_profit

In [None]:
# Profit of quoting at the model's predicted distance on the training split.
# Index 1 of each model list is x_train and index 3 is y_train.
ask_train_predict = ask_catboost[0].predict(ask_catboost[1])
bid_train_predict = bid_catboost[0].predict(bid_catboost[1])

# Parenthesised (a trailing `+` before a newline is a SyntaxError) and with the
# required `side` argument supplied.
(
    calc_pred_profit(ask_train_predict, ask_catboost[3], mn, 'ask')
    + calc_pred_profit(bid_train_predict, bid_catboost[3], mn, 'bid')
)

In [None]:
# Profit of quoting at the model's predicted distance on the held-out split.
# Index 2 of each model list is x_test and index 4 is y_test.
ask_test_predict = ask_catboost[0].predict(ask_catboost[2])
bid_test_predict = bid_catboost[0].predict(bid_catboost[2])

# Parenthesised (a trailing `+` before a newline is a SyntaxError) and with the
# required `side` argument supplied.
(
    calc_pred_profit(ask_test_predict, ask_catboost[4], mn, 'ask')
    + calc_pred_profit(bid_test_predict, bid_catboost[4], mn, 'bid')
)