In [1]:
!nvidia-smi

Sat Jul  1 11:42:56 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P0    50W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
import pandas as pd
import numpy as np
import os

chart_df = pd.read_csv("/content/drive/MyDrive/chart_with_sentiment_scores.csv")

In [5]:
# add technical indicators
import pandas_ta as ta
from tqdm.auto import tqdm
hours, days, months = [], [], []
for dt in tqdm(chart_df["datetime"]):
  dtobj = pd.to_datetime(dt)
  hours.append(dtobj.hour)
  days.append(dtobj.day)
  months.append(dtobj.month)

chart_df["hours"] = hours
chart_df["days"] = days
chart_df["months"] = months

chart_df["ebsw"] = chart_df.ta.ebsw(lookahead=False)
chart_df["cmf"] = chart_df.ta.cmf(lookahead=False)
chart_df["bop"] = chart_df.ta.bop(lookahead=False)
chart_df["rsi/100"] = chart_df.ta.rsi(lookahead=False) / 100.0
chart_df["hwma"] = chart_df.ta.hwma(lookahead=False)
chart_df["linreg"] = chart_df.ta.linreg(lookahead=False)
chart_df["hwma/close"] = chart_df["hwma"] / chart_df["close"]
chart_df["linreg/close"] = chart_df["linreg"] / chart_df["close"]
chart_df["high/low"] = chart_df["high"] / chart_df["low"]
chart_df["high/open"] = chart_df["high"] / chart_df["open"]
chart_df["low/open"] = chart_df["low"] / chart_df["open"]
chart_df["close/open"] = chart_df["close"] / chart_df["open"]
chart_df["high/close"] = chart_df["high"] / chart_df["close"]
chart_df["low/close"]  = chart_df["low"] / chart_df["close"]

for l in range(1, 6):
  for col in ["open", "high", "low", "close", "volume"]:
    val = chart_df[col].values
    val_ret = [None for _ in range(l)]
    for i in range(l, len(val)):
      if val[i-l] == 0:
        ret = 1
      else:
        ret = val[i] / val[i-l]
      val_ret.append(ret)
    chart_df[f"{col}_change_{l}"] = val_ret

chart_df = chart_df.drop(columns={"datetime", "open", "high", "low", "close", "volume", "linreg", "hwma"})

  0%|          | 0/9335 [00:00<?, ?it/s]

In [6]:
chart_df.dropna(inplace=True)

In [7]:
chart_df.head()

Unnamed: 0,targets,news_positive_scores,news_negative_scores,hours,days,months,ebsw,cmf,bop,rsi/100,...,open_change_4,high_change_4,low_change_4,close_change_4,volume_change_4,open_change_5,high_change_5,low_change_5,close_change_5,volume_change_5
39,2.0,0.0,0.0,4,1,4,0.0,0.221446,-0.045573,0.659537,...,1.004453,1.007434,1.006192,1.007774,1.143676,1.010897,1.006783,1.008813,1.004198,1.477008
40,2.0,0.0,0.0,8,1,4,0.57735,0.201553,-0.005917,0.658919,...,1.007774,1.006996,1.003691,1.005484,1.730719,1.004198,1.006005,1.00378,1.007735,1.223106
41,2.0,13.904989,8.095011,12,1,4,0.796874,0.172388,0.514761,0.690842,...,1.00548,1.011357,1.006952,1.01002,3.14004,1.007732,1.011994,1.007534,1.009474,2.201117
42,2.0,8.3051,3.694901,16,1,4,0.983671,0.021719,-0.328696,0.6599,...,1.010377,1.003634,1.016228,1.001828,0.437554,1.009456,1.0107,1.00845,1.008163,2.220246
43,0.0,8.310017,6.689983,20,1,4,0.986626,0.075315,0.291923,0.674547,...,1.002053,1.002638,1.003106,1.003805,0.600674,1.008761,1.003386,1.016425,1.00355,0.273292


In [8]:
train_columns = []

for col in chart_df.columns:
  if col != "targets":
    train_columns.append(col)

X = chart_df[train_columns]
Y = chart_df["targets"]

In [16]:
import warnings

# Disable all user warnings
warnings.filterwarnings("ignore")

In [19]:
import optuna
import lightgbm as lgb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import TimeSeriesSplit

def objective(trial):
  params = {
      "objective": "multiclass",
      "num_class": 3,
      "metric": "multi_logloss",
      "boosting_type": "gbdt",
      "num_leaves": trial.suggest_int("num_leaves", 10, 100),
      "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1),
      "max_depth": trial.suggest_int("max_depth", 3, 10),
      "min_child_samples": trial.suggest_int("min_child_samples", 1, 20),
      "subsample": trial.suggest_float("subsample", 0.5, 1.0),
      "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
      "reg_alpha": trial.suggest_float("reg_alpha", 0, 1),
      "reg_lambda": trial.suggest_float("reg_lambda", 0, 1),
      "random_state": 42,
      "n_estimators": trial.suggest_int("n_estimators", 50, 5000)
  }
  tscv = TimeSeriesSplit(n_splits=5)
  accuracies = []
  for train_idx, val_idx in tscv.split(X):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    Y_train, Y_val = Y.iloc[train_idx], Y.iloc[val_idx]
    model = lgb.LGBMClassifier(**params)
    model.fit(X_train, Y_train, eval_set=[(X_val, Y_val)], verbose=False)
    Y_pred = model.predict(X_val)
    accuracy = accuracy_score(Y_val, Y_pred)
    accuracies.append(accuracy)
  return sum(accuracies) / len(accuracies)
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)
best_params = study.best_params
print(f"Best Parameters: {best_params}")

[I 2023-07-01 12:00:00,324] A new study created in memory with name: no-name-12c58db1-7ae1-47c6-82f3-e7dfdacd2786
[I 2023-07-01 12:00:53,742] Trial 0 finished with value: 0.45823111684958046 and parameters: {'num_leaves': 26, 'learning_rate': 0.04464018917014528, 'max_depth': 7, 'min_child_samples': 5, 'subsample': 0.6922439695596225, 'colsample_bytree': 0.9759518564239358, 'reg_alpha': 0.7505837717488202, 'reg_lambda': 0.15478221137444248, 'n_estimators': 4695}. Best is trial 0 with value: 0.45823111684958046.
[I 2023-07-01 12:01:45,063] Trial 1 finished with value: 0.45668173014848285 and parameters: {'num_leaves': 91, 'learning_rate': 0.057009116148804106, 'max_depth': 4, 'min_child_samples': 1, 'subsample': 0.5486075620905257, 'colsample_bytree': 0.8629080866814253, 'reg_alpha': 0.1021467823184995, 'reg_lambda': 0.5868171965520286, 'n_estimators': 4246}. Best is trial 0 with value: 0.45823111684958046.
[I 2023-07-01 12:01:52,931] Trial 2 finished with value: 0.46468689477081987 and

Best Parameters: {'num_leaves': 98, 'learning_rate': 0.010726258300226442, 'max_depth': 6, 'min_child_samples': 4, 'subsample': 0.6186710352439998, 'colsample_bytree': 0.5690642326572914, 'reg_alpha': 0.37843237624912734, 'reg_lambda': 0.14538913425870043, 'n_estimators': 255}
