In [1]:
!nvidia-smi

Sat Jul  1 11:12:35 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   31C    P0    42W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
import pandas as pd
import numpy as np
import os

chart_df = pd.read_csv("/content/drive/MyDrive/chart_with_sentiment_scores.csv")

In [7]:
# add technical indicators
import pandas_ta as ta
from tqdm.auto import tqdm
hours, days, months = [], [], []
for dt in tqdm(chart_df["datetime"]):
  dtobj = pd.to_datetime(dt)
  hours.append(dtobj.hour)
  days.append(dtobj.day)
  months.append(dtobj.month)

chart_df["hours"] = hours
chart_df["days"] = days
chart_df["months"] = months

chart_df["ebsw"] = chart_df.ta.ebsw(lookahead=False)
chart_df["cmf"] = chart_df.ta.cmf(lookahead=False)
chart_df["bop"] = chart_df.ta.bop(lookahead=False)
chart_df["rsi/100"] = chart_df.ta.rsi(lookahead=False) / 100.0
chart_df["hwma"] = chart_df.ta.hwma(lookahead=False)
chart_df["linreg"] = chart_df.ta.linreg(lookahead=False)
chart_df["hwma/close"] = chart_df["hwma"] / chart_df["close"]
chart_df["linreg/close"] = chart_df["linreg"] / chart_df["close"]
chart_df["high/low"] = chart_df["high"] / chart_df["low"]
chart_df["high/open"] = chart_df["high"] / chart_df["open"]
chart_df["low/open"] = chart_df["low"] / chart_df["open"]
chart_df["close/open"] = chart_df["close"] / chart_df["open"]
chart_df["high/close"] = chart_df["high"] / chart_df["close"]
chart_df["low/close"]  = chart_df["low"] / chart_df["close"]

for l in range(1, 6):
  for col in ["open", "high", "low", "close", "volume"]:
    val = chart_df[col].values
    val_ret = [None for _ in range(l)]
    for i in range(l, len(val)):
      if val[i-l] == 0:
        ret = 1
      else:
        ret = val[i] / val[i-l]
      val_ret.append(ret)
    chart_df[f"{col}_change_{l}"] = val_ret

chart_df = chart_df.drop(columns={"datetime", "open", "high", "low", "close", "volume", "linreg", "hwma"})

  0%|          | 0/9335 [00:00<?, ?it/s]

In [9]:
chart_df.dropna(inplace=True)

In [12]:
train_columns = []

for col in chart_df.columns:
  if col != "targets":
    train_columns.append(col)

X = chart_df[train_columns]
Y = chart_df["targets"]

In [19]:
import optuna
import xgboost as xgb
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.metrics import accuracy_score
import catboost as cb

def objective(trial):
  params = {
      "iterations": trial.suggest_int("iterations", 50, 5000),
      "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1),
      "depth": trial.suggest_int("depth", 3, 10),
      "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1, 10),
      "border_count": trial.suggest_int("border_count", 32, 255),
      "task_type": "GPU",
      "loss_function": "MultiClass",
      "eval_metric": "Accuracy",
      "random_seed": 42,
      "verbose": False
  }
  tscv = TimeSeriesSplit(n_splits=5)
  accuracies = []
  for train_idx, val_idx in tscv.split(X):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    Y_train, Y_val = Y.iloc[train_idx], Y.iloc[val_idx]
    model = cb.CatBoostClassifier(**params)
    model.fit(X_train, Y_train, eval_set=(X_val, Y_val), early_stopping_rounds=10, verbose=False)
    Y_pred = model.predict(X_val)
    accuracy = accuracy_score(Y_val, Y_pred)
    accuracies.append(accuracy)
  return sum(accuracies) / len(accuracies)

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)
best_params = study.best_params
print(f"Best Parameters: {best_params}")

[I 2023-07-01 11:23:54,391] A new study created in memory with name: no-name-cf4cde7e-1f4a-4c11-9c33-b52d009bc361
[I 2023-07-01 11:24:00,556] Trial 0 finished with value: 0.46559070367979344 and parameters: {'iterations': 2405, 'learning_rate': 0.012336740933882715, 'depth': 8, 'l2_leaf_reg': 9.647821727261267, 'border_count': 123}. Best is trial 0 with value: 0.46559070367979344.
[I 2023-07-01 11:24:06,909] Trial 1 finished with value: 0.46998063266623624 and parameters: {'iterations': 1369, 'learning_rate': 0.05613660914127877, 'depth': 4, 'l2_leaf_reg': 5.199876280440501, 'border_count': 55}. Best is trial 1 with value: 0.46998063266623624.
[I 2023-07-01 11:24:13,383] Trial 2 finished with value: 0.4661071659134926 and parameters: {'iterations': 4160, 'learning_rate': 0.016769978801398324, 'depth': 6, 'l2_leaf_reg': 7.903509110725789, 'border_count': 139}. Best is trial 1 with value: 0.46998063266623624.
[I 2023-07-01 11:24:19,781] Trial 3 finished with value: 0.46946417043253713 an

Best Parameters: {'iterations': 1721, 'learning_rate': 0.049627662077003816, 'depth': 8, 'l2_leaf_reg': 2.2763608031118214, 'border_count': 253}


In [20]:
clf = cb.CatBoostClassifier(**best_params)

clf

<catboost.core.CatBoostClassifier at 0x7f853fd343d0>