In [1]:
!pip install seaborn 
!pip install ccxt 
!pip install tabpfn 
!pip install pandas-ta 
!pip install optuna

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [1]:
import numpy as np 
import pandas as pd 
import json 
import ccxt 
from tqdm import tqdm 
from xgboost import XGBClassifier 
import json
from tabpfn import TabPFNClassifier 
import seaborn as sns
import pandas_ta as ta
from sklearn.utils.class_weight import compute_class_weight
import optuna
from sklearn.metrics import accuracy_score

In [2]:
with open("/content/BTC_USDT-4h_20230531.json") as f: 
  d = json.load(f) 

chart_df = pd.DataFrame(d) 
chart_df = chart_df.rename(columns={0:"timestamp", 
                                    1:"open",
                                    2:"high",
                                    3:"low",
                                    4:"close",
                                    5:"volume"})  


def process(df):
  binance = ccxt.binance() 
  dates = df["timestamp"].values 
  timestamp = [] 
  for i in range(len(dates)):
    date_string = binance.iso8601(int(dates[i])) 
    date_string = date_string[:10] + " " + date_string[11:-5] 
    timestamp.append(date_string) 
  df["datetime"] = timestamp 
  df = df.drop(columns={"timestamp"}) 
  return df

chart_df = process(chart_df) 

hours, days, months = [], [], [] 
for dt in tqdm(chart_df["datetime"].values):
  dtobj = pd.to_datetime(dt) 
  hours.append(dtobj.hour) 
  days.append(dtobj.day) 
  months.append(dtobj.month) 

chart_df["months"] = months
chart_df["days"] = days 
chart_df["hours"] = hours 
chart_df.set_index(pd.DatetimeIndex(chart_df["datetime"]), inplace=True) 
chart_df.drop(columns={"datetime"}, inplace=True)


100%|██████████| 12662/12662 [00:00<00:00, 14811.30it/s]


In [3]:
def preprocess(df, threshold = 0.0075): 
  targets = [] 
  high = chart_df["high"].values 
  low = chart_df["low"].values 
  close = chart_df["close"].values 
  for i in range(close.shape[0]-1): 
    high_vol = (high[i+1] - close[i]) / close[i] 
    low_vol = (low[i+1] - close[i]) / close[i] 
    if high_vol >= threshold:
      targets.append(0) 
    elif low_vol <= -threshold:
      targets.append(1)
    else:
      targets.append(2) 
  targets.append(None)
  df["targets"] = targets 

  columns_to_drop = ["open", "high", "low", "close", "volume"] 
  
  df["bop"] = chart_df.ta.bop(lookahead=False) 
  df["obv"] = chart_df.ta.obv(lookahead=False)
  df["ebsw"] = chart_df.ta.ebsw(lookahead=False) 
  df["cmf"] = chart_df.ta.cmf(lookahead=False)
  df["rsi/100"] = chart_df.ta.rsi(lookahead=False) / 100   
  df["linreg"] = chart_df.ta.linreg(lookahead=False) 
  df["linreg_ratio"] = df["linreg"] / df["close"] 
  columns_to_drop.append("linreg")
  
  windows = [5, 10, 20, 60, 120] 
  for window in windows: 
    df[f"close_ma{window}"] = df["close"].rolling(window).mean() 
    df[f"volume_ma{window}"] = df["volume"].rolling(window).mean() 
    df[f"close_ma{window}_ratio"] = (df["close"] - df[f"close_ma{window}"]) / df[f"close_ma{window}"] 
    df[f"volume_ma{window}_ratio"] = (df["volume"] - df[f"volume_ma{window}"]) / df[f"volume_ma{window}"] 
    columns_to_drop.append(f"close_ma{window}")
    columns_to_drop.append(f"volume_ma{window}")

  df["open_lastclose_ratio"] = np.zeros(len(df)) 
  df.loc[1:, "open_lastclose_ratio"] = (df["open"][1:].values - df["close"][:-1].values) / df["close"][:-1].values 
  df["high_close_ratio"] = (df["high"].values - df["close"].values) / df["close"].values 
  df["low_close_ratio"] = (df["low"].values - df["close"].values) / df["close"].values 
  df["close_lastclose_ratio"] = np.zeros(len(df))
  df.loc[1:, "close_lastclose_ratio"] = (df["close"][1:].values - df["close"][:-1].values) / df["close"][:-1].values 
  df["volume_lastvolume_ratio"] = np.zeros(len(df))  
  df.loc[1:, "volume_lastvolume_ratio"] = ((df["volume"][1:].values - df["volume"][:-1].values) / df["volume"][:-1].replace(to_replace=0, method="ffill").replace(to_replace=0, method="bfill").values)
  df.dropna(inplace=True) 
  df.drop(columns=columns_to_drop,inplace=True) 
  return df

In [4]:
chart_df = preprocess(chart_df)

  df.loc[1:, "open_lastclose_ratio"] = (df["open"][1:].values - df["close"][:-1].values) / df["close"][:-1].values
  df.loc[1:, "close_lastclose_ratio"] = (df["close"][1:].values - df["close"][:-1].values) / df["close"][:-1].values
  df.loc[1:, "volume_lastvolume_ratio"] = ((df["volume"][1:].values - df["volume"][:-1].values) / df["volume"][:-1].replace(to_replace=0, method="ffill").replace(to_replace=0, method="bfill").values)


In [5]:
train_size = int(0.8 * chart_df.shape[0]) 
val_size = int(0.1 * chart_df.shape[0]) 
train_df = chart_df.iloc[:train_size] 
val_df = chart_df.iloc[train_size:train_size+val_size] 
test_df = chart_df.iloc[train_size+val_size:] 

train_df.shape, val_df.shape, test_df.shape 

((10033, 25), (1254, 25), (1255, 25))

In [6]:
input_features = [] 
for col in train_df.columns: 
  if col != "targets": 
    input_features.append(col) 

X_train = train_df[input_features].values  
Y_train = train_df["targets"].values  

X_valid = val_df[input_features].values 
Y_valid = val_df["targets"].values 

X_test = test_df[input_features].values 
Y_test = test_df["targets"].values 

X_train.shape, Y_train.shape, X_valid.shape, Y_valid.shape, X_test.shape, Y_test.shape

((10033, 24), (10033,), (1254, 24), (1254,), (1255, 24), (1255,))

In [7]:
X_full = np.concatenate([X_train, X_valid], axis=0)
Y_full = np.concatenate([Y_train, Y_valid], axis=0)  

X_full.shape, Y_full.shape

((11287, 24), (11287,))

In [8]:
class_weights = compute_class_weight(class_weight="balanced", 
                                     classes=np.unique(Y_train), 
                                     y=np.array(Y_train)) 

In [9]:
class_weights

array([0.67236295, 1.12000447, 1.61328188])

In [10]:
# optuna - will this give better results?  
def objective(trial): 
  param = {
      "tree_method": "gpu_hist", 
      "lambda": trial.suggest_loguniform("lambda", 1e-3, 10), 
      "alpha": trial.suggest_loguniform("alpha", 1e-3, 10), 
      "colsample_bytree": trial.suggest_categorical("colsample_bytree", [0.5, 0.7, 0.9, 1.0]), 
      "subsample": trial.suggest_categorical("subsample", [0.5, 0.7, 0.9, 1.0]), 
      "learning_rate": trial.suggest_loguniform("learning_rate", 0.005, 0.5), 
      "n_estimators": trial.suggest_int("n_estimators", 100, 1500), 
      "max_depth": trial.suggest_int("max_depth", 3, 10), 
      "random_state": 42, 
      "min_child_weight": trial.suggest_int("min_child_weight", 3, 10), 
  }
  model = XGBClassifier(**param) 
  model.fit(X_full, Y_full, verbose=False)  
  preds = model.predict(X_test) 
  pred_labels = np.rint(preds) 
  accuracy = accuracy_score(Y_test, pred_labels) 
  return accuracy 

study = optuna.create_study(direction="maximize") 
study.optimize(objective, n_trials=100) 

trial = study.best_trial 

print(trial.value)
print("="*100) 
for key, value in trial.params.items(): 
  print("{}:{}".format(key, value))

[I 2023-05-31 10:21:34,267] A new study created in memory with name: no-name-5bb033fe-8439-438f-9199-d1d7e0187597
  "lambda": trial.suggest_loguniform("lambda", 1e-3, 10),
  "alpha": trial.suggest_loguniform("alpha", 1e-3, 10),
  "learning_rate": trial.suggest_loguniform("learning_rate", 0.005, 0.5),
[I 2023-05-31 10:21:36,087] Trial 0 finished with value: 0.500398406374502 and parameters: {'lambda': 2.332595155691973, 'alpha': 0.6934668663531006, 'colsample_bytree': 0.7, 'subsample': 0.5, 'learning_rate': 0.16902190768891953, 'n_estimators': 311, 'max_depth': 3, 'min_child_weight': 3}. Best is trial 0 with value: 0.500398406374502.
  "lambda": trial.suggest_loguniform("lambda", 1e-3, 10),
  "alpha": trial.suggest_loguniform("alpha", 1e-3, 10),
  "learning_rate": trial.suggest_loguniform("learning_rate", 0.005, 0.5),
[I 2023-05-31 10:21:37,740] Trial 1 finished with value: 0.5282868525896415 and parameters: {'lambda': 0.7079519849189722, 'alpha': 0.001372548282267134, 'colsample_bytree

0.5322709163346614
lambda:0.41612353749453695
alpha:0.03172778795858678
colsample_bytree:0.9
subsample:0.7
learning_rate:0.03875879673183694
n_estimators:307
max_depth:4
min_child_weight:9


In [11]:
trial.params

{'lambda': 0.41612353749453695,
 'alpha': 0.03172778795858678,
 'colsample_bytree': 0.9,
 'subsample': 0.7,
 'learning_rate': 0.03875879673183694,
 'n_estimators': 307,
 'max_depth': 4,
 'min_child_weight': 9}

In [12]:
print("done!")

done!


In [17]:
clf = XGBClassifier(**trial.params, tree_method="gpu_hist")  

clf.fit(X_full, Y_full, verbose=100) 

In [18]:
Y_pred = clf.predict(X_test) 
cnt = 0 
for i in range(len(Y_pred)): 
  if Y_pred[i] == Y_test[i]: 
    cnt += 1 
  
cnt / len(Y_pred) * 100 

52.589641434262944

In [19]:
Y_pred

array([2, 2, 2, ..., 1, 1, 0])