In [None]:
import sys
sys.path.append("..")

import datetime as dt
from ipywidgets import interact
import itertools
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import seaborn as sns
import statsmodels.api as sm
import tqdm
import vectorbtpro as vbt

import legitindicators as li
import pandas_ta as pta

from lib import bitget_loader, utils, indicators

# Setup

In [None]:
# Instrument and bar size used throughout the notebook
symbol = 'SOLUSDT'
freq = '5min'

# In-sample window: rows up to is_end are used to build the training set
is_start = dt.date(2021,1,1)
is_end = dt.date(2022,12,31)

# Out-of-sample window for the final backtest
os_start = dt.date(2023,1,1)
os_end = dt.date(2023,6,30)

# Raw candles for the symbol, read from a local parquet file
df = pd.read_parquet(f"../data/candles/{symbol}.pq")

In [None]:
# Resample the raw candles to `freq` bars: OHLC uses first/max/min/last,
# while the volume / trade-count columns are summed within each bar.
ohlc_rules = {'open': 'first', 'high': 'max', 'low': 'min', 'close': 'last'}
summed_cols = ['volume', 'quoteVolume', 'nTrades', 'upVolume', 'upQuoteVolume']
df = df.resample(freq).agg({**ohlc_rules, **{name: 'sum' for name in summed_cols}})

In [None]:
# Treat a zero in any summed activity column as missing, so later logs and
# ratios yield NaN instead of -inf / division artefacts.
for activity_col in ('volume', 'quoteVolume', 'nTrades', 'upVolume', 'upQuoteVolume'):
    df[activity_col] = df[activity_col].replace({0: np.nan})

In [None]:
# Inspect the resampled frame
df

# Target

In [None]:
# Direction of each bar's body: +1 for close > open, -1 for close < open,
# NaN for a flat bar (or missing prices).
bar_body = df['close'] - df['open']
bar_direction = np.select([bar_body > 0, bar_body < 0], [1, -1], default=np.nan)
# Alternative target kept for reference: df['close'].diff(5) > 0
# The target for row t is the direction of bar t+1.
df['y_return'] = pd.Series(bar_direction, index=df.index).shift(-1)
# The final row has no following bar and therefore no target; drop it.
df = df.iloc[:-1].copy()

# Features

## Momentum

In [None]:
# 90-bar rolling z-score of the one-bar close-to-close return
close_return = df['close'].pct_change()
df['x_return_zs'] = pta.zscore(close_return, 90)

In [None]:
sma_windows = [7, 25, 99]

# Per window: the SMA itself, its one-bar rate of change, and the relative
# distance of the close from it.
for window in sma_windows:
    sma = df['close'].rolling(window).mean()
    df[f'sma_{window}'] = sma
    df[f'x_sma_{window}_roc'] = sma.pct_change()
    df[f'x_sma_{window}_distance'] = (df['close'] - sma) / sma

# Relative spread between every (shorter, longer) pair of SMAs.
for shorter, longer in itertools.combinations(sma_windows, 2):
    longer_sma = df[f'sma_{longer}']
    df[f'x_sma_{shorter}_{longer}_distance'] = (df[f'sma_{shorter}'] - longer_sma) / longer_sma

## Volume

In [None]:
# Natural log of bar volume (zero volumes were already replaced by NaN above)
df['logvolume'] = np.log(df['volume'])

In [None]:
# Side-by-side histograms: raw volume (restricted to the lowest 90% by rank so the
# tail does not flatten the plot) versus log-volume.
fig, ax = plt.subplots(1, 2, figsize=(12, 4))
ax_raw, ax_log = ax
below_p90 = df['volume'].rank(pct=True) < 0.9
df.loc[below_p90, 'volume'].hist(ax=ax_raw)
df['logvolume'].hist(ax=ax_log)
ax_raw.set_title("Distribution of Volume")
ax_log.set_title("Distribution of log(Volume)")

In [None]:
log_volume = df['logvolume']
abs_return = df['close'].pct_change().abs()

# Rolling z-score of log-volume and its 20-bar average
df['x_logvolume_zs'] = pta.zscore(log_volume, 90)
df['x_logvolume_zs_ma'] = df['x_logvolume_zs'].rolling(20).mean()

# "Relative" z-score: each bar is compared only with bars at the same time of day
volume_by_slot = log_volume.groupby(df.index.time).apply(lambda d: pta.zscore(d, 40))
df['x_relative_volume_zs'] = volume_by_slot.droplevel(0).sort_index()
df['x_relative_volume_zs_ma'] = df['x_relative_volume_zs'].rolling(20).mean()

# 20-bar rolling correlation between volume and absolute return
df['x_volume_corr'] = df['volume'].rolling(20).corr(abs_return)

In [None]:
# Same feature family as for volume, built on the number of trades per bar.
df['logticks'] = np.log(df['nTrades'])
log_ticks = df['logticks']
abs_return = df['close'].pct_change().abs()

df['x_logticks_zs'] = pta.zscore(log_ticks, 90)
df['x_logticks_zs_ma'] = df['x_logticks_zs'].rolling(20).mean()

# Z-score against bars at the same time of day
ticks_by_slot = log_ticks.groupby(df.index.time).apply(lambda d: pta.zscore(d, 40))
df['x_relative_ticks_zs'] = ticks_by_slot.droplevel(0).sort_index()
df['x_relative_ticks_zs_ma'] = df['x_relative_ticks_zs'].rolling(20).mean()

# 20-bar rolling correlation between trade count and absolute return
df['x_ticks_corr'] = df['nTrades'].rolling(20).corr(abs_return)

## Volatility

In [None]:
df['tr'] = pta.true_range(df['high'], df['low'], df['close'])
# A completely flat bar has a true range of exactly 0 and log(0) = -inf, which then
# contaminates every rolling z-score window it falls into. Mask zeros as NaN before
# taking the log, the same way zero volume / trade counts are handled earlier.
# df['tr'] itself is left untouched.
df['logtr'] = np.log(df['tr'].replace({0: np.nan}))

# Rolling z-score of log true range and its 20-bar average
df['x_tr_zs'] = pta.zscore(df['logtr'], 90)
df['x_tr_zs_ma'] = df['x_tr_zs'].rolling(20).mean()
# Z-score against bars at the same time of day
df['x_relative_tr_zs'] = df['logtr'].groupby(df.index.time).apply(lambda d: pta.zscore(d, 40)).droplevel(0).sort_index()
df['x_relative_tr_zs_ma'] = df['x_relative_tr_zs'].rolling(20).mean()

# High-low range relative to the open; zero ranges are masked for the same reason as above.
bar_range = (df['high']-df['low'])/df['open']
df['x_range_zs'] = pta.zscore(np.log(bar_range.replace({0: np.nan})), 90)

In [None]:
# Normalised ATR: exponentially weighted mean of true range (ewm com=20) divided by the close,
# i.e. expressed as a fraction of price.
df['natr'] = df['tr'].ewm(20).mean()/df['close']

## Delta and Ticks

In [None]:
# Volume delta = upVolume - (volume - upVolume) = 2*upVolume - volume
df['volume_delta'] = 2*df['upVolume'] - df['volume']
# 200-bar z-score of the volume delta summed over 1, 4 and 12 bars
for l in [1, 4, 12]:
    df[f'x_volume_delta_{l}_zs'] = pta.zscore(df['volume_delta'].rolling(l).sum(), 200)
# 20-bar rolling correlation between volume delta and the bar return
df['x_delta_corr'] = df['volume_delta'].rolling(20).corr(df['close'].pct_change())

# Average trade size, in quote and base units respectively. The two names were
# previously swapped: the "quote" feature was computed from base volume and vice versa.
df['x_quote_vol_per_trade'] = pta.zscore(np.log(df['quoteVolume']/df['nTrades']), 90)
df['x_vol_per_trade'] = pta.zscore(np.log(df['volume']/df['nTrades']), 90)

## Test

In [None]:
#df['x_dvolm'] = li.damiani_volatmeter(df[['open', 'high', 'low', 'close']].values, 13, 20, 40, 100, 1.4)

## Plot

In [None]:
# Three stacked panels sharing the x axis: candles on top, two column plots below.
fig = go.FigureWidget(
    make_subplots(rows=3, cols=1, shared_xaxes=True, row_heights=[0.6, 0.2, 0.2])
)
fig.add_trace(go.Candlestick(), row=1, col=1)
for panel_row in (2, 3):
    fig.add_trace(go.Scatter(), row=panel_row, col=1)
fig.update_layout(
    height=600,
    margin=dict(l=20, r=20, b=20, t=20),
    xaxis=dict(rangeslider=dict(visible=False)),
)


@interact(date=np.unique(df.index.date), col=df.columns, col2=df.columns)
def update(date, col, col2):
    """Redraw the figure for one calendar day.

    date -- day to display (one of the dates present in the index)
    col  -- column plotted in the middle panel
    col2 -- column plotted in the bottom panel
    """
    day = df.loc[str(date)]
    with fig.batch_update():
        candles = fig.data[0]
        candles.x = day.index
        candles.open = day['open']
        candles.high = day['high']
        candles.low = day['low']
        candles.close = day['close']
        for trace, column in ((fig.data[1], col), (fig.data[2], col2)):
            trace.x = day.index
            trace.y = day[column]
        fig.update_layout()
fig


# Training

In [None]:
from pycaret.classification import ClassificationExperiment

In [None]:
# Rows that have a target
ymask = ~pd.isna(df['y_return'])

# In-sample slice (everything up to is_end) restricted to labelled rows
labelled_is = df[ymask].loc[:is_end]
x_columns = utils.get_prefixed_cols(df, 'x_')

# Features: the 'x_' columns, with +/-inf turned into NaN so they can be imputed
x_train = labelled_is[x_columns].replace({np.inf:np.nan, -np.inf:np.nan})
y_train = labelled_is['y_return']

In [None]:
exp = ClassificationExperiment()

# All preprocessing / splitting options in one place. The split and the folds are
# kept in chronological order (no shuffling, no stratification).
setup_options = dict(
    train_size=0.7,
    data_split_shuffle=False,
    data_split_stratify=False,
    numeric_imputation='mean',
    remove_multicollinearity=False,
    #multicollinearity_threshold=0.8,
    normalize=False,
    pca=False,
    feature_selection=False,
    n_features_to_select=0.5,
    remove_outliers=True,
    fold_strategy='kfold',
    fold=5,
    fold_shuffle=False,
)
exp.setup(data=x_train, target=y_train, **setup_options)

In [None]:
# Feature columns as they look after the experiment's preprocessing
exp.X_transformed.columns

In [None]:
#best = exp.compare_models(n_select=3, cross_validation=False)

In [None]:
# Fit a single 'lr' model on the training split, skipping cross-validation
best = exp.create_model('lr', cross_validation=False)

In [None]:
# AUC plot for the fitted model
exp.plot_model(best, 'auc')

In [None]:
# Threshold plot for the fitted model
exp.plot_model(best, 'threshold')

## Modelling Holdout Accuracy

## Backtest in Modelling Holdout

### 1 Bar Exit Threshold Optimization

In [None]:
# Score the modelling holdout once and reuse the result for both columns
# (the original ran predict_model twice).
holdout_pred = exp.predict_model(best)

# Work on an explicit copy so adding columns neither warns about nor depends on
# writing into a slice of df.
bdf = df.loc[exp.test.index].copy()
bdf['prediction_label'] = holdout_pred['prediction_label']
bdf['prediction_score'] = holdout_pred['prediction_score']

In [None]:
thresholds = np.arange(0.5, 0.7, 0.05)
predicted_up = bdf['prediction_label'] == 1
predicted_down = bdf['prediction_label'] == -1


def _entries_at(side_mask, th):
    """Entry signal for one side: the (label matches AND score > th) mask run through utils.crossover."""
    active = side_mask & (bdf['prediction_score'] > th)
    return utils.crossover(active, 0.5).rename(th)


# One column per score threshold, for long entries and short entries respectively
le = pd.concat([_entries_at(predicted_up, th) for th in thresholds], axis=1)
se = pd.concat([_entries_at(predicted_down, th) for th in thresholds], axis=1)

In [None]:
# Settings shared by the three backtests: holdout OHLC and a fixed time-based exit
# (td_stop=2, time_delta_format=0).
fixed_exit_kwargs = dict(
    open=bdf['open'], high=bdf['high'], low=bdf['low'],
    freq=freq,
    td_stop=2,
    time_delta_format=0,
)

# Longs only, shorts only, and both sides combined
pf_l = vbt.Portfolio.from_signals(bdf['close'], entries=le, **fixed_exit_kwargs)
pf_s = vbt.Portfolio.from_signals(bdf['close'], short_entries=se, **fixed_exit_kwargs)
pf = vbt.Portfolio.from_signals(bdf['close'], entries=le, short_entries=se, **fixed_exit_kwargs)

In [None]:
fig, ax = plt.subplots(1, 3, figsize=(15, 4))

# One panel per metric; each panel overlays combined / long-only / short-only results.
panels = [
    ('Win Rate', lambda p: p.trades.win_rate),
    ('Total Profit', lambda p: p.total_profit),
    ('Sharpe Ratio', lambda p: p.sharpe_ratio),
]
for axis, (panel_title, metric) in zip(ax, panels):
    metric(pf).plot(ax=axis, color='blue', label='Combined')
    metric(pf_l).plot(ax=axis, color='green', label='Longs')
    metric(pf_s).plot(ax=axis, color='red', title=panel_title, xlabel='Threshold', label='Shorts')

In [None]:
# Score threshold chosen from the sweep above
best_th = 0.6

# A signal requires the predicted side AND a score above the threshold
score_ok = bdf['prediction_score'] > best_th
le = utils.crossover((bdf['prediction_label'] == 1) & score_ok, 0.5)
se = utils.crossover((bdf['prediction_label'] == -1) & score_ok, 0.5)

In [None]:
# Number of long and short entry signals at best_th
le.sum(), se.sum()

### Signal Half-Life

In [None]:
def _time_exit_sweep(**signals):
    """Backtest the given entry signals on the holdout for every td_stop in 2..19."""
    return vbt.Portfolio.from_signals(
        bdf['close'], open=bdf['open'], high=bdf['high'], low=bdf['low'],
        freq=freq,
        td_stop=vbt.Param(np.arange(2, 20)),
        time_delta_format=0,
        **signals,
    )


# Longs only, shorts only, and both sides combined
pf_l = _time_exit_sweep(entries=le)
pf_s = _time_exit_sweep(short_entries=se)
pf = _time_exit_sweep(entries=le, short_entries=se)

In [None]:
fig, ax = plt.subplots(1, 3, figsize=(15, 4))

# One panel per metric, plotted against the time-stop length for each side.
panels = [
    ('Win Rate', lambda p: p.trades.win_rate),
    ('Total Profit', lambda p: p.total_profit),
    ('Sharpe Ratio', lambda p: p.sharpe_ratio),
]
for axis, (panel_title, metric) in zip(ax, panels):
    metric(pf).plot(ax=axis, color='blue', label='Combined')
    metric(pf_l).plot(ax=axis, color='green', label='Longs')
    metric(pf_s).plot(ax=axis, color='red', title=panel_title, xlabel='Exit N Bars', label='Shorts')

In [None]:
# Time-stop length passed as td_stop to the stop-loss / take-profit backtests below
best_td_stop = 9

### TPSL Opt, Pct Based

In [None]:
# Candidate stop sizes as a fraction of price: 0.001 up to (but excluding) 0.02
tpsl_mults = np.arange(0.001, 0.02, 0.001)

# Stop-loss and take-profit are swept as two separate parameters over the same grid.
pct_stop_params = dict(
    sl_stop=vbt.Param(tpsl_mults),
    tp_stop=vbt.Param(tpsl_mults),
)
pf = vbt.Portfolio.from_signals(
    bdf['close'],
    open=bdf['open'],
    high=bdf['high'],
    low=bdf['low'],
    entries=le,
    short_entries=se,
    freq=freq,
    td_stop=best_td_stop,
    time_delta_format=0,
    **pct_stop_params,
)

In [None]:
# Alternative metrics for the same grid:
#   pf.trades.get_profit_factor().unstack()
#   pf.trades.win_rate.unstack()
# Sharpe ratio as a 2-D grid, both axes sorted ascending.
stat_result = pf.sharpe_ratio.unstack().sort_index().sort_index(axis=1)
# Round the float labels so the heatmap ticks stay readable
stat_result.index = stat_result.index.round(4)
stat_result.columns = stat_result.columns.round(4)

In [None]:
# Heatmap of the Sharpe-ratio grid computed in the previous cell
ax = sns.heatmap(stat_result, annot=False)
ax.set_title('Sharpe Ratio by SL and TP (%)')

In [None]:
# Single backtest on the holdout with one fixed pair of percent-based stops
# (sl_stop=0.005, tp_stop=0.01) plus the time stop.
pf = vbt.Portfolio.from_signals(
    bdf['close'], open=bdf['open'], high=bdf['high'], low=bdf['low'],
    entries=le, short_entries=se,
    freq=freq,
    td_stop=best_td_stop,
    time_delta_format=0,
    sl_stop=0.005,
    tp_stop=0.01
)

In [None]:
# Summary statistics for the fixed percent-stop backtest
pf.stats()

In [None]:
# Portfolio value over time for the fixed percent-stop backtest
pf.value.plot()

### TPSL Opt, ATR Based

In [None]:
# Candidate stop sizes as multiples of natr: 0.05 up to (but excluding) 2
tpsl_mults = np.arange(0.05, 2, 0.05)
holdout_natr = bdf['natr']

# Each parameter value is a whole series (multiplier * natr), so the stop distance
# varies bar by bar.
pf = vbt.Portfolio.from_signals(
    bdf['close'],
    open=bdf['open'],
    high=bdf['high'],
    low=bdf['low'],
    entries=le,
    short_entries=se,
    freq=freq,
    td_stop=best_td_stop,
    time_delta_format=0,
    sl_stop=vbt.Param([mult * holdout_natr for mult in tpsl_mults]),
    tp_stop=vbt.Param([mult * holdout_natr for mult in tpsl_mults]),
    #slippage=0.0001,
)

In [None]:
# Profit factor for every (sl_stop, tp_stop) combination, reshaped into a 2-D grid.
stat_result = pf.trades.get_profit_factor().unstack()
# The parameter labels are strings: drop their first 7 characters and parse the rest as an
# integer. NOTE(review): this assumes vectorbtpro labels series-valued parameters with a
# fixed 7-character prefix followed by their position — confirm against the installed version.
stat_result.index = stat_result.index.str[7:].astype(int)
stat_result.columns = stat_result.columns.str[7:].astype(int)
# Sort both axes numerically (string labels would not sort in numeric order) ...
stat_result = stat_result.sort_index().sort_index(axis=1)
# ... so they can be relabelled positionally with the actual multipliers.
stat_result.index = tpsl_mults
stat_result.columns = tpsl_mults
# Replace infinite values with NaN (presumably so they do not dominate the heatmap scale).
stat_result = stat_result.replace({np.inf:np.nan})

In [None]:
# Round the float multipliers so the heatmap tick labels stay readable
stat_result.index = stat_result.index.round(2)
stat_result.columns = stat_result.columns.round(2)

In [None]:
# stat_result here holds the profit factor (not the Sharpe ratio) and its axes are
# natr multipliers (not percentages), so the title states exactly that.
ax = sns.heatmap(stat_result, annot=False)
ax.set_title('Profit Factor by SL and TP (ATR multiples)')

In [None]:
# Stop-loss / take-profit sizes, as multiples of natr, used by the backtests that follow
best_sl = 1
best_tp = 1.5

### Final Optimized System

In [None]:
# Holdout backtest with all chosen settings: best_th signals, best_td_stop time stop,
# and stops sized as best_sl / best_tp times natr.
pf = vbt.Portfolio.from_signals(
    bdf['close'], open=bdf['open'], high=bdf['high'], low=bdf['low'],
    entries=le, short_entries=se,
    freq=freq,
    td_stop=best_td_stop,
    time_delta_format=0,
    sl_stop=best_sl*bdf['natr'],
    tp_stop=best_tp*bdf['natr'],
    #slippage=0.0001,
)

In [None]:
# Summary statistics for the final holdout backtest
pf.stats()

In [None]:
# Portfolio value over time for the final holdout backtest
pf.value.plot()

# Backtest OOS

In [None]:
# Finalize the chosen model; the result is what later cells call predict / predict_proba / fit on.
# NOTE(review): per PyCaret docs finalize_model refits on the full setup data including the holdout — confirm.
final_model = exp.finalize_model(best)
# final_model = exp.finalize_model(best, model_only=True)

In [None]:
# Out-of-sample slice, bounded on BOTH sides. os_end was defined but never applied, so the
# "OOS" frame silently ran to the end of the data. String bounds use pandas' partial-string
# slicing, which includes the whole of the os_end day.
# .copy() makes os_df an independent frame, so adding columns is not a write into a slice of df.
os_df = df.sort_index().loc[str(os_start):str(os_end)].copy()

# Same cleaning as the training matrix: +/-inf become NaN so the model's imputation can
# handle them instead of the estimator rejecting the input.
os_x = os_df[utils.get_prefixed_cols(os_df, 'x_')].replace({np.inf: np.nan, -np.inf: np.nan})

os_df['prediction_label'] = final_model.predict(os_x).values
# Column 1 of predict_proba is the probability of the second class (+1 for labels -1/+1).
# Turn it into the probability of whichever label was actually predicted.
prob_up = final_model.predict_proba(os_x)[:, 1]
os_df['prediction_score'] = np.where(os_df['prediction_label'] == 1, prob_up, 1 - prob_up)

In [None]:
# Entry signals: predicted side AND score above the chosen threshold
score_ok = os_df['prediction_score'] > best_th
le = utils.crossover((os_df['prediction_label'] == 1) & score_ok, 0.5)
se = utils.crossover((os_df['prediction_label'] == -1) & score_ok, 0.5)

# Out-of-sample backtest with the settings chosen on the holdout, plus slippage
oos_natr = os_df['natr']
pf = vbt.Portfolio.from_signals(
    os_df['close'],
    open=os_df['open'],
    high=os_df['high'],
    low=os_df['low'],
    entries=le,
    short_entries=se,
    freq=freq,
    td_stop=best_td_stop,
    time_delta_format=0,
    sl_stop=best_sl * oos_natr,
    tp_stop=best_tp * oos_natr,
    slippage=0.0001,
)

In [None]:
# Summary statistics for the out-of-sample backtest
pf.stats()

In [None]:
# Portfolio value over time for the out-of-sample backtest
pf.value.plot()

In [None]:
# Trade-level table enriched with timestamps and the return expressed in units of risk.
records = pf.trades.records
entry_pos = records['entry_idx']
exit_pos = records['exit_idx']

records['dt'] = os_df.index[entry_pos]
records['exit_dt'] = os_df.index[exit_pos]
# Stop distance in force at entry (best_sl * natr on the entry bar)
records['sl'] = best_sl * os_df['natr'].iloc[entry_pos].values
# Trade return divided by that stop distance
records['realized_r'] = records['return'] / records['sl']
records = records.set_index('dt')

In [None]:
# Cumulative realized R across out-of-sample trades, rendered as a static PNG
records['realized_r'].cumsum().vbt.plot().show(renderer='png')

# Rolling Weekly Train-Predict

In [None]:
# Number of weeks in each training window
training_window = 104

# Running week counter: increments whenever the weekday number drops from one bar to the next
# (i.e. at a week rollover).
df['weeknum'] = (df.index.weekday.diff() < 0).cumsum()
df['prediction_label'] = np.nan
df['prediction_score'] = np.nan

# The feature column list does not change inside the loop, so resolve it once.
feature_cols = utils.get_prefixed_cols(df, 'x_')

for week in tqdm.tqdm(range(training_window+2, df['weeknum'].max()+1)):
    # Train on the `training_window` weeks before `week`, predict `week` itself.
    train_df = df[df['weeknum'].between(week-training_window, week-1)]
    pred_df = df[df['weeknum']==week]

    # Same cleaning as the initial training matrix: +/-inf become NaN. Training rows with any
    # NaN feature are then dropped by xmask; prediction rows are passed on with NaN.
    x_train = train_df[feature_cols].replace({np.inf: np.nan, -np.inf: np.nan})
    x_pred = pred_df[feature_cols].replace({np.inf: np.nan, -np.inf: np.nan})
    ymask = ~train_df['y_return'].isna()
    xmask = ~x_train.isna().any(axis=1)
    fit_mask = ymask & xmask

    # Refits the existing final_model object in place on the current window.
    final_model.fit(x_train[fit_mask], train_df['y_return'][fit_mask])

    # Write the predictions with .loc. The previous `df['col'].update(...)` was a chained
    # in-place modification, which does not reach df under pandas Copy-on-Write.
    df.loc[pred_df.index, 'prediction_label'] = np.asarray(final_model.predict(x_pred))
    df.loc[pred_df.index, 'prediction_score'] = final_model.predict_proba(x_pred)[:, 1]

# Flip score based on label: keep P(class 1) when the label is 1, otherwise use 1 - P(class 1)
df['prediction_score'] = np.where(df['prediction_label']==1, df['prediction_score'], 1-df['prediction_score'])

In [None]:
# Entry signals from the rolling predictions: predicted side AND score above the threshold.
# Rows without a prediction (NaN label) match neither side.
score_ok = df['prediction_score'] > best_th
le = utils.crossover((df['prediction_label'] == 1) & score_ok, 0.5)
se = utils.crossover((df['prediction_label'] == -1) & score_ok, 0.5)

# Backtest over the full frame with the settings chosen on the holdout, plus slippage
full_natr = df['natr']
pf = vbt.Portfolio.from_signals(
    df['close'],
    open=df['open'],
    high=df['high'],
    low=df['low'],
    entries=le,
    short_entries=se,
    freq=freq,
    td_stop=best_td_stop,
    time_delta_format=0,
    sl_stop=best_sl * full_natr,
    tp_stop=best_tp * full_natr,
    slippage=0.0001,
)

In [None]:
# Summary statistics for the rolling train-predict backtest
pf.stats()

In [None]:
# Portfolio value over time for the rolling train-predict backtest
pf.value.plot()

In [None]:
# Trade-level table enriched with timestamps and the return expressed in units of risk.
records = pf.trades.records
entry_pos = records['entry_idx']
exit_pos = records['exit_idx']

records['dt'] = df.index[entry_pos]
records['exit_dt'] = df.index[exit_pos]
# Stop distance in force at entry (best_sl * natr on the entry bar)
records['sl'] = best_sl * df['natr'].iloc[entry_pos].values
# Trade return divided by that stop distance
records['realized_r'] = records['return'] / records['sl']
records = records.set_index('dt')

In [None]:
# Cumulative realized R: all trades first, then the direction == 0 and direction == 1 subsets.
realized_r = records['realized_r']
trade_direction = records['direction']

realized_r.cumsum().plot()
realized_r[trade_direction == 0].cumsum().plot()
realized_r[trade_direction == 1].cumsum().plot()

In [None]:
#fig = go.FigureWidget(make_subplots(rows=1, cols=1, shared_xaxes=True))
#fig.add_trace(go.Candlestick(), row=1, col=1)
#fig.add_trace(go.Scatter(), row=1, col=1)
#fig.add_trace(go.Scatter(), row=1, col=1)
#fig.add_trace(go.Scatter(mode='markers', marker=dict(symbol='x', size=12, color='green')), row=1, col=1)
#fig.add_trace(go.Scatter(mode='markers', marker=dict(symbol='x', size=12, color='red')), row=1, col=1)
#fig.update_layout(height=800, margin=dict(l=20,r=20,b=20,t=20), xaxis=dict(rangeslider=dict(visible=False)))

#@interact(r=records.index, col=df.columns, col2=df.columns)
#def update(r, col, col2):
#   with fig.batch_update():
#      _r = records.loc[r]
#      _sdf = df1.loc[str(_r.name.date())]
      
#      print(_r['return'])
#      fig.data[0].x, fig.data[0].open, fig.data[0].high = _sdf.index, _sdf['open'], _sdf['high']
#      fig.data[0].low, fig.data[0].close = _sdf['low'], _sdf['close']
#    #   fig.data[1].x, fig.data[1].y = [_r.name, _r['exit_dt']], [_r['sl'],_r['sl']]
#    #   fig.data[2].x, fig.data[2].y = [_r.name, _r['exit_dt']], [_r['tp'],_r['tp']]
#      fig.data[3].x, fig.data[3].y = [_r.name], [_r['entry_price']]
#      fig.data[4].x, fig.data[4].y = [_r['exit_dt']], [_r['exit_price']]
#      fig.update_layout()
#fig