In [219]:
import json
import pandas as pd
import numpy as np
import ccxt
from tqdm import tqdm
import seaborn as sns
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import f1_score 
import pandas_ta as ta
from xgboost import XGBClassifier  

In [220]:
chart_df = pd.read_csv("chart_df_with_deberta_sentiments.csv") 
targets = [] 
high = chart_df["high"].values 
low = chart_df["low"].values 
close = chart_df["close"].values 

threshold = 0.01 

for i in range(len(close)-1):
    high_vol = (high[i+1] - close[i]) / close[i] 
    low_vol = (low[i+1] - close[i]) / close[i] 
    if high_vol >= threshold: 
        targets.append(0) 
    elif low_vol <= -threshold:
        targets.append(1) 
    else:
        targets.append(2) 
        
targets.append(None) 

chart_df["Targets"] = targets 

In [221]:
chart_df.set_index(pd.DatetimeIndex(chart_df["datetime"]), inplace=True)

### addition of chart features ### 
chart_df["bop"] = chart_df.ta.bop(lookahead=False) 
chart_df["ebsw"] = chart_df.ta.ebsw(lookahead=False) 
chart_df["cmf"] = chart_df.ta.cmf(lookahead=False) 
chart_df["vwap"] = chart_df.ta.vwap(lookahead=False) 
chart_df["rsi/100"] = chart_df.ta.rsi(lookahead=False) / 100
chart_df["high/low"] = chart_df["high"] / chart_df["low"] 
chart_df["close/open"] = chart_df["close"] / chart_df["open"] 
chart_df["high/open"] = chart_df["high"] / chart_df["open"] 
chart_df["low/open"] = chart_df["low"] / chart_df["open"] 
chart_df["hwma"] = chart_df.ta.hwma(lookahead=False) 
chart_df["linreg"] = chart_df.ta.linreg(lookahead=False) 
chart_df["hwma/close"] = chart_df["hwma"] / chart_df["close"] 
chart_df["linreg/close"] = chart_df["linreg"] / chart_df["close"]
chart_df["sma"] = chart_df.ta.sma(lookahead=False) 
chart_df["sma/close"] = chart_df["sma"] / chart_df["close"] 


### addition of recent differenced features ### 
for l in tqdm(range(1, 12), position=0, leave=True): 
    for col in ["high", "low", "volume", "vwap"]:
        val = chart_df[col].values 
        val_ret = [None for _ in range(l)]
        for i in range(l, len(val)):
            if val[i-l] == 0: 
                ret = 1 
            else:
                ret = val[i] / val[i-l]  
            val_ret.append(ret) 
        chart_df["{}_change_{}".format(col, l)] = val_ret
        
### drop unnecessary columns ### 
chart_df.drop(columns={"open","high","low","close","volume","vwap","hwma","linreg", "sma"}, inplace=True) 


chart_df.dropna(inplace=True)

print(chart_df.shape)  

100%|██████████| 11/11 [00:00<00:00, 47.49it/s]

(10929, 63)





In [222]:
columns = chart_df.columns 

train_columns = [] 

for c in columns:
    if c not in ["year","datetime","Targets"]: 
        train_columns.append(c) 
        
        
train_idx = int(chart_df.shape[0] * 0.8) 
val_idx = int(chart_df.shape[0] * 0.1)
train_df, val_df, test_df = chart_df.iloc[:train_idx], chart_df.iloc[train_idx:train_idx+val_idx], chart_df.iloc[train_idx+val_idx:]


train_df.shape, val_df.shape, test_df.shape

((8743, 63), (1092, 63), (1094, 63))

In [223]:
chart_df.dropna(inplace=True)
X_train = train_df[train_columns] 
Y_train = train_df["Targets"]

X_val = val_df[train_columns] 
Y_val = val_df["Targets"] 

X_test = test_df[train_columns] 
Y_test = test_df["Targets"] 


class_weights = compute_class_weight(class_weight = "balanced",
                                     classes = np.unique(Y_train),
                                     y = Y_train) 

d = {0:class_weights[0], 1:class_weights[1], 2:class_weights[2]} 

print(d) 

{0: 0.8696906396100667, 1: 1.2178576403398802, 2: 0.9717683672335223}


In [224]:
clf = XGBClassifier(silent=False, 
                    n_estimators=200,
                    class_weight=d, 
                    metric="logloss")

clf.fit(X_train, 
        Y_train, 
        eval_set=[(X_val, Y_val)],
        verbose=20)

Parameters: { "class_weight", "metric", "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	validation_0-mlogloss:1.08611
[20]	validation_0-mlogloss:1.10654
[40]	validation_0-mlogloss:1.12927
[60]	validation_0-mlogloss:1.15398
[80]	validation_0-mlogloss:1.17782
[100]	validation_0-mlogloss:1.20497
[120]	validation_0-mlogloss:1.23242
[140]	validation_0-mlogloss:1.25229
[160]	validation_0-mlogloss:1.27764
[180]	validation_0-mlogloss:1.30256
[199]	validation_0-mlogloss:1.32233


In [225]:
Y_pred = clf.predict(X_test) 
cnt = 0 
for i in range(len(Y_pred)):
    if Y_pred[i] == Y_test.values[i]:
        cnt += 1 
        
print("accuracy = {}%".format(cnt / len(Y_pred) * 100.))

accuracy = 47.80621572212066%


In [226]:
f1_score(Y_test, Y_pred, average='macro')

0.4599714868386508

In [227]:
#clf.save_model("xgboost_btc_3") 

In [228]:
test_model = XGBClassifier() 
test_model.load_model("xgboost_btc_3") 

In [229]:
Y_pred = test_model.predict(X_test) 
cnt = 0 
for i in range(len(Y_pred)): 
    if Y_pred[i] == Y_test.values[i]: 
        cnt += 1 

print(cnt / len(Y_pred) * 100) 

47.80621572212066
