In [14]:
import json
import pandas as pd
import numpy as np
import ccxt
from tqdm import tqdm
import seaborn as sns
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import f1_score 
import pandas_ta as ta
from pytorch_tabnet.tab_model import TabNetClassifier 

In [15]:
# Load the chart data (the file name suggests sentiment features are already merged in).
chart_df = pd.read_csv("chart_df_with_deberta_sentiments.csv")

high = chart_df["high"].values
low = chart_df["low"].values
close = chart_df["close"].values

# Relative move (1%) the next candle must reach, measured from the current close.
threshold = 0.01

# Label every row by what the NEXT candle does relative to this row's close:
#   0 -> next high is at least `threshold` above the close (checked first, so it
#        wins when both conditions hold)
#   1 -> otherwise, next low is at least `threshold` below the close
#   2 -> neither
targets = []
for curr_close, next_high, next_low in zip(close[:-1], high[1:], low[1:]):
    up_move = (next_high - curr_close) / curr_close
    down_move = (next_low - curr_close) / curr_close
    if up_move >= threshold:
        label = 0
    elif down_move <= -threshold:
        label = 1
    else:
        label = 2
    targets.append(label)

# The final row has no following candle, so its label is left missing.
targets.append(None)

chart_df["Targets"] = targets

In [16]:
# Index the frame by timestamp (the "datetime" column itself is kept).
# NOTE(review): presumably needed by the time-anchored indicators such as vwap — confirm.
chart_df.set_index(pd.DatetimeIndex(chart_df["datetime"]), inplace=True)

### technical-indicator features ###
# Column insertion order is preserved on purpose: it becomes the feature order
# fed to the models further down.
for indicator in ("bop", "ebsw", "cmf", "vwap"):
    chart_df[indicator] = getattr(chart_df.ta, indicator)(lookahead=False)
chart_df["rsi/100"] = chart_df.ta.rsi(lookahead=False) / 100

# Intra-candle price ratios.
open_price = chart_df["open"]
chart_df["high/low"] = chart_df["high"] / chart_df["low"]
chart_df["close/open"] = chart_df["close"] / open_price
chart_df["high/open"] = chart_df["high"] / open_price
chart_df["low/open"] = chart_df["low"] / open_price

# Smoothed price levels, each also expressed relative to the close.
chart_df["hwma"] = chart_df.ta.hwma(lookahead=False)
chart_df["linreg"] = chart_df.ta.linreg(lookahead=False)
chart_df["hwma/close"] = chart_df["hwma"] / chart_df["close"]
chart_df["linreg/close"] = chart_df["linreg"] / chart_df["close"]
chart_df["sma"] = chart_df.ta.sma(lookahead=False)
chart_df["sma/close"] = chart_df["sma"] / chart_df["close"]


### lagged ratio features ###
# For each lag 1..11 and each base column, store value[t] / value[t - lag].
# The first `lag` rows have no reference value and stay missing; a zero
# reference value yields a neutral ratio of 1 instead of dividing by zero.
for lag in tqdm(range(1, 12), position=0, leave=True):
    for col in ["high", "low", "volume", "vwap"]:
        series = chart_df[col].values
        ratios = [None] * lag
        ratios.extend(
            1 if series[i - lag] == 0 else series[i] / series[i - lag]
            for i in range(lag, len(series))
        )
        chart_df["{}_change_{}".format(col, lag)] = ratios

### drop the raw price/volume levels and intermediate indicator columns ###
raw_columns = ["open", "high", "low", "close", "volume", "vwap", "hwma", "linreg", "sma"]
chart_df.drop(columns=raw_columns, inplace=True)

# Remove every row with a missing value (indicator warm-up rows, the first
# lagged rows, and the final row whose target is missing).
chart_df.dropna(inplace=True)

print(chart_df.shape)

100%|██████████| 11/11 [00:00<00:00, 46.56it/s]

(10929, 63)





In [17]:
columns = chart_df.columns

# Every column except the identifiers and the label is used as a model input.
non_feature_columns = ["year", "datetime", "Targets"]
train_columns = [name for name in columns if name not in non_feature_columns]

# Positional 80 / 10 / 10 split in row order, without shuffling; the test
# partition receives whatever rows remain after train and validation.
n_rows = chart_df.shape[0]
train_idx = int(n_rows * 0.8)   # end position of the training rows
val_idx = int(n_rows * 0.1)     # number of validation rows
val_end = train_idx + val_idx

train_df = chart_df.iloc[:train_idx]
val_df = chart_df.iloc[train_idx:val_end]
test_df = chart_df.iloc[val_end:]

train_df.shape, val_df.shape, test_df.shape

((8743, 63), (1092, 63), (1094, 63))

In [18]:
# Separate each partition into its feature matrix and label vector.
X_train = train_df[train_columns]
Y_train = train_df["Targets"]

X_val = val_df[train_columns]
Y_val = val_df["Targets"]

X_test = test_df[train_columns]
Y_test = test_df["Targets"]

# "balanced" class weights computed from the training labels only.
classes = np.unique(Y_train)
class_weights = compute_class_weight(class_weight="balanced",
                                     classes=classes,
                                     y=Y_train)

# Pair every class that actually occurs in the training labels with its own
# weight, instead of assuming that exactly the classes 0, 1 and 2 are present
# in that order. Keys are cast to int so they read as 0/1/2 even when the
# labels are stored as floats.
d = {int(label): weight for label, weight in zip(classes, class_weights)}

print(d)

{0: 0.8696906396100667, 1: 1.2178576403398802, 2: 0.9717683672335223}


In [19]:
# Convert features and labels to NumPy arrays for the model calls below.
# np.asarray() is used rather than `.values` so that re-running this cell is
# harmless: an input that is already an ndarray is returned as-is instead of
# raising AttributeError.
X_train = np.asarray(X_train)
Y_train = np.asarray(Y_train)

X_val = np.asarray(X_val)
Y_val = np.asarray(Y_val)

X_test = np.asarray(X_test)
Y_test = np.asarray(Y_test)



In [34]:
# TabNet classifier with the chosen architecture hyperparameters.
clf = TabNetClassifier(n_d=10, n_a=10, n_steps=6)

# Training options: the validation split is scored with both metrics every
# epoch and `d` supplies the per-class weights. `patience` equals
# `max_epochs`, so training is not cut short before the last epoch.
fit_options = dict(
    eval_set=[(X_val, Y_val)],
    eval_metric=["logloss", "balanced_accuracy"],
    weights=d,
    max_epochs=200,
    patience=200,
)

clf.fit(X_train, Y_train, **fit_options)

Device used : cuda
epoch 0  | loss: 2.05617 | val_0_logloss: 1.54466 | val_0_balanced_accuracy: 0.33235 |  0:00:00s
epoch 1  | loss: 1.29919 | val_0_logloss: 1.13229 | val_0_balanced_accuracy: 0.33498 |  0:00:00s
epoch 2  | loss: 1.13934 | val_0_logloss: 1.13859 | val_0_balanced_accuracy: 0.35422 |  0:00:01s
epoch 3  | loss: 1.08385 | val_0_logloss: 1.16988 | val_0_balanced_accuracy: 0.31845 |  0:00:01s
epoch 4  | loss: 1.04898 | val_0_logloss: 1.12237 | val_0_balanced_accuracy: 0.32512 |  0:00:02s
epoch 5  | loss: 1.02266 | val_0_logloss: 1.21318 | val_0_balanced_accuracy: 0.32034 |  0:00:02s
epoch 6  | loss: 1.0104  | val_0_logloss: 1.21495 | val_0_balanced_accuracy: 0.32706 |  0:00:03s
epoch 7  | loss: 0.99415 | val_0_logloss: 1.37174 | val_0_balanced_accuracy: 0.33333 |  0:00:03s
epoch 8  | loss: 0.97279 | val_0_logloss: 1.21554 | val_0_balanced_accuracy: 0.34256 |  0:00:04s
epoch 9  | loss: 0.99039 | val_0_logloss: 1.18382 | val_0_balanced_accuracy: 0.33333 |  0:00:04s
epoch 10 | 

In [35]:
# Accuracy of the TabNet model alone on the test split.
Y_pred = clf.predict(X_test)

# Count exact label matches; predictions are cast to float before comparing
# with the stored labels.
cnt = sum(1 for truth, guess in zip(Y_test, Y_pred) if truth == float(guess))

# Divide by the number of predictions made in this cell, not by a name that
# only exists as leftover kernel state.
print("accuracy = {}".format(cnt / len(Y_pred) * 100))

accuracy = 44.24131627056673


In [36]:
# Macro-averaged F1 of the current `Y_pred` against the test labels.
f1_score(y_true=Y_test, y_pred=Y_pred, average='macro')

0.433729748008911

In [51]:
import joblib
import lightgbm  # no alias: the name `lgbm` below refers to the loaded model, not the module
from xgboost import XGBClassifier

# NOTE(security): joblib.load unpickles the file, which can execute arbitrary
# code. Only load model files that come from a trusted source.
lgbm = joblib.load('lgbm_btc.pkl')  # presumably a fitted LightGBM classifier — confirm

# The XGBoost model is restored into a fresh classifier from its saved file.
xgboost = XGBClassifier()
xgboost.load_model("xgboost_btc_3")

In [54]:
# Class-probability predictions on the test split from each of the three
# models, evaluated in the order LightGBM, XGBoost, TabNet.
lgbm_probs, xgboost_probs, tabnet_probs = (
    model.predict_proba(X_test) for model in (lgbm, xgboost, clf)
)

In [56]:
# Soft-voting ensemble: unweighted mean of the three models' probabilities.
prob_sum = lgbm_probs + xgboost_probs + tabnet_probs
avg_probs = prob_sum / 3.0

avg_probs

array([[0.22124264, 0.21160304, 0.5671543 ],
       [0.36422471, 0.32866346, 0.3071118 ],
       [0.13771362, 0.64507033, 0.21721604],
       ...,
       [0.14050208, 0.36051689, 0.49898102],
       [0.11572027, 0.24580406, 0.63847565],
       [0.20679073, 0.19489644, 0.59831286]])

In [62]:
# Ensemble prediction: the column index with the highest averaged probability.
Y_pred = np.argmax(avg_probs, axis=1)

# Number of test rows whose predicted label equals the true label.
cnt = sum(1 for guess, truth in zip(Y_pred, Y_test) if guess == truth)

print("accuracy = {}%".format(cnt / len(Y_pred) * 100.))

accuracy = 48.446069469835464%


In [63]:
# Persist the trained TabNet model; the returned path is shown as the cell output.
saved_path = clf.save_model("tabnet_btc")
saved_path

Successfully saved model at tabnet_btc.zip


'tabnet_btc.zip'

In [64]:
# Macro-averaged F1 of the current `Y_pred` against the test labels.
f1_score(y_true=Y_test, y_pred=Y_pred, average="macro")

0.4656160131978309