In [None]:
import sys
sys.path.append("..")

import datetime as dt
from ipywidgets import interact
import itertools
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pandas_ta as pta
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import statsmodels.api as sm
import vectorbtpro as vbt

from lib import bitget_loader, utils

# Setup

In [None]:
# Instrument under study.
symbol = 'SOLUSDT'

# In-sample window.
is_start = dt.date(2022,1,1)
is_end = dt.date(2022,12,31)

# Out-of-sample window.
os_start = dt.date(2023,1,1)
os_end = dt.date(2023,5,30)

# Load both windows into one frame. The index is converted to tz-naive so the
# date-based .loc lookups further down can be used against it.
df = bitget_loader.load_klines_in_date_range(symbol, is_start, os_end).tz_convert(None)

# Bitget has overlaps in their data, so the same candle can appear twice.
# De-duplicate on the timestamp index: DataFrame.drop_duplicates() compares row
# values only and ignores the index, so it would also discard distinct candles
# that happen to share identical values and would keep a repeated timestamp
# whose values differ.
df = df[~df.index.duplicated(keep='first')]

# Target

In [None]:
# Direction of each candle body: +1 when close > open, -1 when close < open,
# NaN when the candle is flat.
candle_body = df['close'] - df['open']
candle_direction = np.sign(candle_body).where(candle_body != 0)

# The target for a row is the direction of the *next* candle.
df['y_return'] = candle_direction.shift(-1)

# Drops every row containing a NaN: the last row (no next candle) and any row
# whose next candle is flat.
df = df.dropna()

# Features

## Momentum

In [None]:
# Z-score (length 200) of the row-over-row percentage change in close.
close_returns = df['close'].pct_change()
df['x_return_zs'] = pta.zscore(close_returns, 200)

In [None]:
sma_windows = [7, 25, 99]

# For each window: the SMA of close, the SMA's row-over-row rate of change,
# and the distance of close from the SMA relative to the SMA.
for window in sma_windows:
    sma = df['close'].rolling(window).mean()
    df[f'sma_{window}'] = sma
    df[f'x_sma_{window}_roc'] = sma.pct_change()
    df[f'x_sma_{window}_distance'] = (df['close'] - sma) / sma

# Relative gap between every pair of SMAs, expressed against the second
# (longer-window) SMA of the pair.
for first, second in itertools.combinations(sma_windows, 2):
    first_sma = df[f'sma_{first}']
    second_sma = df[f'sma_{second}']
    df[f'x_sma_{first}_{second}_distance'] = (first_sma - second_sma) / second_sma

In [None]:
# Store the three columns returned by pta.adx for each lookback under
# x_-prefixed feature names (assigned in the order pta.adx returns them).
for period in (7, 25, 99):
    adx_feature_cols = [f'x_adx_{period}', f'x_dmp_{period}', f'x_dmn_{period}']
    df[adx_feature_cols] = pta.adx(df['high'], df['low'], df['close'], period)

## Volume

In [None]:
# Natural log of volume. A candle with zero volume would give log(0) = -inf,
# which breaks the histogram below and the rolling statistics built on this
# column, so non-positive volume is masked to NaN before taking the log.
df['logvolume'] = np.log(df['volume'].where(df['volume'] > 0))

In [None]:
# Side-by-side histograms: raw volume (rows whose percentile rank is below 0.9)
# on the left, log-volume on the right.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))
left_ax, right_ax = ax

below_rank_cutoff = df['volume'].rank(pct=True) < 0.9
df.loc[below_rank_cutoff, 'volume'].hist(ax=left_ax)
df['logvolume'].hist(ax=right_ax)

left_ax.set_title("Distribution of Volume")
right_ax.set_title("Distribution of log(Volume)")

In [None]:
# Length-20 z-score of log-volume over consecutive rows, and its 20-row mean.
df['x_logvolume_zs'] = pta.zscore(df['logvolume'], 20)
df['x_logvolume_zs_ma'] = df['x_logvolume_zs'].rolling(20).mean()

# Same z-score, computed separately inside each time-of-day group, so a row is
# compared with earlier rows carrying the same clock time.
# transform() always returns a result aligned to the original index, so this
# does not depend on the pandas-version-specific index shape that
# groupby.apply() produces (which the previous droplevel(0) relied on).
df['x_relative_volume_zs'] = (
    df['logvolume']
    .groupby(df.index.time)
    .transform(lambda same_time: pta.zscore(same_time, 20))
)
df['x_relative_volume_zs_ma'] = df['x_relative_volume_zs'].rolling(20).mean()

# 20-row rolling correlation between volume and the absolute close-to-close return.
df['x_volume_corr'] = df['volume'].rolling(20).corr(df['close'].pct_change().abs())

## Volatility

In [None]:
df['tr'] = pta.true_range(df['high'], df['low'], df['close'])
# A zero true range would give log(0) = -inf and contaminate the rolling
# statistics below, so non-positive values are masked to NaN first.
df['logtr'] = np.log(df['tr'].where(df['tr'] > 0))

# Length-20 z-score of log true range over consecutive rows, and its 20-row mean.
df['x_tr_zs'] = pta.zscore(df['logtr'], 20)
df['x_tr_zs_ma'] = df['x_tr_zs'].rolling(20).mean()

# Same z-score, computed separately inside each time-of-day group.
# transform() always returns a result aligned to the original index, so this
# does not depend on the pandas-version-specific index shape that
# groupby.apply() produces (which the previous droplevel(0) relied on).
df['x_relative_tr_zs'] = (
    df['logtr']
    .groupby(df.index.time)
    .transform(lambda same_time: pta.zscore(same_time, 20))
)
df['x_relative_tr_zs_ma'] = df['x_relative_tr_zs'].rolling(20).mean()

# Length-200 z-score of the log high-low range relative to open. Candles with
# high == low are masked to NaN for the same log(0) reason as above.
relative_range = (df['high'] - df['low']) / df['open']
df['x_range_zs'] = pta.zscore(np.log(relative_range.where(relative_range > 0)), 200)

## Plot

In [None]:
# Three stacked panels sharing one x axis: a candlestick trace in row 1 and a
# line trace in each of rows 2 and 3. Traces start empty and are filled by update().
fig = go.FigureWidget(
    make_subplots(rows=3, cols=1, shared_xaxes=True, row_heights=[0.6, 0.2, 0.2])
)
for panel_row, empty_trace in enumerate([go.Candlestick(), go.Scatter(), go.Scatter()], start=1):
    fig.add_trace(empty_trace, row=panel_row, col=1)
fig.update_layout(
    height=600,
    margin=dict(l=20, r=20, b=20, t=20),
    xaxis=dict(rangeslider=dict(visible=False)),
)


@interact(date=np.unique(df.index.date), col=df.columns, col2=df.columns)
def update(date, col, col2):
    """Fill the three traces with one day of data.

    date: calendar day to show (selected from the days present in df.index).
    col:  df column drawn in the second panel.
    col2: df column drawn in the third panel.
    """
    with fig.batch_update():
        day_rows = df.loc[str(date)]
        candle_trace, second_trace, third_trace = fig.data

        candle_trace.x = day_rows.index
        candle_trace.open = day_rows['open']
        candle_trace.high = day_rows['high']
        candle_trace.low = day_rows['low']
        candle_trace.close = day_rows['close']

        second_trace.x = day_rows.index
        second_trace.y = day_rows[col]

        third_trace.x = day_rows.index
        third_trace.y = day_rows[col2]

        fig.update_layout()


fig


# Training

In [None]:
from pycaret.classification import ClassificationExperiment

In [None]:
# Everything up to is_end is the modelling sample; features are the x_-prefixed
# columns and the target is y_return.
# NOTE(review): is_end is a datetime.date — confirm whether this slice keeps all
# of the final day's intraday rows or stops at that day's midnight.
in_sample_rows = df.loc[:is_end]
feature_columns = utils.get_prefixed_cols(df, 'x_')

x_train = in_sample_rows[feature_columns]
y_train = in_sample_rows['y_return']

In [None]:
exp = ClassificationExperiment()
exp.setup(
    data=x_train, target=y_train,
    # Fixed seed so model comparison, tuning and feature selection are
    # reproducible across Restart & Run All.
    session_id=42,
    # Chronological 70/30 split: no shuffling, no stratification, so the
    # holdout is the most recent 30% of the in-sample rows.
    train_size=0.7,
    data_split_shuffle=False,
    data_split_stratify=False,
    # Rows with missing feature values (e.g. rolling warm-up) are dropped.
    numeric_imputation='drop',
    remove_multicollinearity=True,
    multicollinearity_threshold=0.9,
    normalize=True,
    pca=False,
    feature_selection=True,
    n_features_to_select=0.5,
    remove_outliers=False,
    # The rows are time-ordered, so use time-series folds: each fold validates
    # only on data that comes after its training data. Plain 'kfold' (even
    # unshuffled) trains the early folds on future rows.
    fold_strategy='timeseries',
    fold=5,
    fold_shuffle=False,
    )

In [None]:
# Compare the candidate models available to the experiment and keep the returned one.
# NOTE(review): `best` is reassigned by the create_model('lr') cell that follows,
# so this value is only used for the comparison output — confirm that is intended.
best = exp.compare_models()

In [None]:
# Fit the 'lr' estimator and use it as the working model from here on; this
# replaces whatever compare_models() returned in the previous cell.
best = exp.create_model('lr')

In [None]:
# Run hyper-parameter tuning on the working model.
# NOTE(review): the return value is not assigned, so `best` remains the untuned
# model in every later cell — confirm this is intended.
exp.tune_model(best)

## Backtest in Modelling Holdout

In [None]:
# Rows of df that belong to the experiment's test split, with the model's
# predicted label and score attached (matched on the index).
prediction_columns = ['prediction_label', 'prediction_score']
holdout_predictions = exp.predict_model(best)

bdf = df.loc[exp.test.index]
bdf[prediction_columns] = holdout_predictions[prediction_columns]

In [None]:
# Only act on predictions whose score is above the threshold.
score_threshold = 0.65
is_confident = bdf['prediction_score'] > score_threshold

le = (bdf['prediction_label'] == 1) & is_confident    # long entries
se = (bdf['prediction_label'] == -1) & is_confident   # short entries

# Simulate the signals on the holdout candles.
# NOTE(review): td_stop=2 with time_delta_format=0 presumably closes a position
# two rows after entry — confirm against the vectorbtpro documentation.
pf = vbt.Portfolio.from_signals(
    bdf['close'],
    open=bdf['open'],
    high=bdf['high'],
    low=bdf['low'],
    entries=le,
    short_entries=se,
    freq='1min',
    td_stop=2,
    time_delta_format=0,
)

In [None]:
# Summary statistics of the holdout portfolio built in the previous cell.
pf.stats()

In [None]:
# Skewness of the per-trade 'return' values.
pf.trades.records['return'].skew()

In [None]:
# Plot the portfolio value series.
pf.value.plot()

In [None]:
# Re-run the holdout simulation with the same entry signals (le / se from the
# earlier cell), now adding a stop-loss and slippage.
stop_and_slippage = dict(sl_stop=0.01, slippage=0.0001)

pf = vbt.Portfolio.from_signals(
    bdf['close'],
    open=bdf['open'],
    high=bdf['high'],
    low=bdf['low'],
    entries=le,
    short_entries=se,
    freq='1min',
    td_stop=2,
    time_delta_format=0,
    **stop_and_slippage,
)

In [None]:
# Summary statistics of the holdout portfolio re-run with sl_stop and slippage.
pf.stats()

In [None]:
# Per-trade return divided by 0.01 (the sl_stop used for this portfolio),
# accumulated across trades and rendered as a static PNG.
return_per_stop = pf.trades.records['return'] / 0.01
realized_r = return_per_stop.cumsum().rename('realized_r')
realized_r.vbt.plot().show(renderer='png')

# Backtest OOS

In [None]:
# Produce the final model object used for the out-of-sample predictions below.
# NOTE(review): finalize_model presumably refits `best` on the whole setup data,
# holdout included — confirm against the PyCaret documentation.
final_model = exp.finalize_model(best)

In [None]:
# Out-of-sample rows. Take an explicit copy: the slice is mutated below, and
# assigning new columns to a slice of df triggers SettingWithCopyWarning.
os_df = df.loc[os_start:].copy()

# Build the feature frame once and reuse it for both predictions.
# (The original referenced an undefined name, `oos_df`, here, which raises
# NameError on a fresh kernel.)
os_features = os_df[utils.get_prefixed_cols(os_df, 'x_')]

os_df['prediction_label'] = final_model.predict(os_features).values

# Column 1 of predict_proba is used as the score for label 1; for any other
# label the score is its complement, so prediction_score is the probability of
# the predicted class.
# NOTE(review): assumes predict() returns the original -1/1 labels and that
# predict_proba column 1 corresponds to label 1 — confirm for the finalized pipeline.
second_column_proba = final_model.predict_proba(os_features)[:, 1]
os_df['prediction_score'] = np.where(
    os_df['prediction_label'] == 1,
    second_column_proba,
    1 - second_column_proba,
)

In [None]:
# Only act on out-of-sample predictions whose score is above the threshold.
score_threshold = 0.65
is_confident = os_df['prediction_score'] > score_threshold

le = (os_df['prediction_label'] == 1) & is_confident    # long entries
se = (os_df['prediction_label'] == -1) & is_confident   # short entries

# Same simulation settings as the holdout run with stop-loss and slippage,
# applied to the out-of-sample candles.
stop_and_slippage = dict(sl_stop=0.01, slippage=0.0001)

pf = vbt.Portfolio.from_signals(
    os_df['close'],
    open=os_df['open'],
    high=os_df['high'],
    low=os_df['low'],
    entries=le,
    short_entries=se,
    freq='1min',
    td_stop=2,
    time_delta_format=0,
    **stop_and_slippage,
)

In [None]:
# Summary statistics of the out-of-sample portfolio built in the previous cell.
pf.stats()

In [None]:
# Plot the out-of-sample portfolio value series.
pf.value.plot()

In [None]:
# Per-trade return divided by 0.01 (the sl_stop used for this portfolio),
# accumulated across trades and rendered as a static PNG.
return_per_stop = pf.trades.records['return'] / 0.01
realized_r = return_per_stop.cumsum().rename('realized_r')
realized_r.vbt.plot().show(renderer='png')