In [1]:
import pandas as pd
import numpy as np

# Data cleaning and preparation

In [2]:
messages = pd.read_csv("AMZN_2012-06-21_34200000_57600000_message_10.csv")
orderbook = pd.read_csv("AMZN_2012-06-21_34200000_57600000_orderbook_10.csv")

In [3]:
window = 50

In [4]:
columns = []
for i in range(10):
    columns += [f"ask_price_{i+1}", f"ask_size_{i+1}", f"bid_price_{i+1}", f"bid_size_{i+1}"]
orderbook.columns = columns

In [5]:
messages.columns = ["time", "type", "order_id", "size", "price", "direction"] 

In [6]:
print(orderbook.shape)
print(orderbook.head())
print(messages.shape)
print(messages.head())

(269747, 40)
   ask_price_1  ask_size_1  bid_price_1  bid_size_1  ask_price_2  ask_size_2  \
0      2239500         100      2238100          21      2239900         100   
1      2239500         100      2238100          21      2239600          20   
2      2239500         100      2238100          21      2239600          20   
3      2239500         100      2238100          21      2239600          20   
4      2239500         100      2238100          21      2239600          20   

   bid_price_2  bid_size_2  ask_price_3  ask_size_3  ...  bid_price_8  \
0      2231800         100      2240000         220  ...      2204000   
1      2231800         100      2239900         100  ...      2204000   
2      2237500         100      2239900         100  ...      2213000   
3      2237500         100      2239900         100  ...      2213000   
4      2237500         100      2239900         100  ...      2226200   

   bid_size_8  ask_price_9  ask_size_9  bid_price_9  bid_size_9  as

In [7]:
midprice = (orderbook['ask_price_1']+orderbook['bid_price_1'])/2
future_price = midprice.shift(-window).dropna() # where window is the event horizon over which we will assess stock price changes.
delta = future_price - midprice
labels = delta.apply(lambda x: 1 if x>0 else (-1 if x<0 else 0))
midprice = midprice[:-window]
labels = labels[:-window]

In [8]:
print(labels.value_counts())

-1    121777
 1    113046
 0     34874
Name: count, dtype: int64


In [9]:
print(len(midprice), len(labels), len(future_price))

269697 269697 269697


In [10]:
midprice[50]

2238700.0

In [11]:
future_price[0]

2238700.0

In [12]:
midprice.iloc[-1]

2205900.0

In [13]:
future_price.iloc[-51]

2205900.0

In [14]:
future_price.iloc[-1]

2205750.0

In [15]:
delta

0        -100.0
1        -100.0
2        -100.0
3        -100.0
4        -100.0
          ...  
269742      NaN
269743      NaN
269744      NaN
269745      NaN
269746      NaN
Length: 269747, dtype: float64

# Feature Engineering

### Spread, Depth ratio, orderbook imbalance and Rolling Trade imbalance

In [16]:
spread = orderbook['ask_price_1'] - orderbook['bid_price_1']

In [17]:
bid_vol_1 = orderbook['bid_size_1']
ask_vol_1 = orderbook['ask_size_1']
depth_ratio = bid_vol_1/ask_vol_1
orderbook_imbalance = (bid_vol_1-ask_vol_1)/(bid_vol_1 + ask_vol_1)

In [18]:
trade_mask = (messages["type"] == 4).astype(int)

In [19]:
trade_dir = messages["direction"]

In [20]:
trade_dir.value_counts()

direction
-1    139403
 1    130344
Name: count, dtype: int64

In [21]:
trade_signal = trade_mask*trade_dir

In [22]:
rolling_trade_imbalance = trade_signal.rolling(window=window).sum() # feature

In [23]:
print(trade_mask.shape, trade_dir.shape, trade_signal.shape, rolling_trade_imbalance.shape)

(269747,) (269747,) (269747,) (269747,)


### Cancel to add ratio

In [24]:
adds = (messages['type'] == 1).astype(int)
cancels = (messages['type'] == 2).astype(int)
rolling_adds = adds.rolling(window).sum()
rolling_cancels = cancels.rolling(window).sum()
cancel_to_add_ratio = rolling_cancels/(rolling_adds + 1e-6)

In [25]:
cancel_to_add_ratio

0         NaN
1         NaN
2         NaN
3         NaN
4         NaN
         ... 
269742    0.0
269743    0.0
269744    0.0
269745    0.0
269746    0.0
Name: type, Length: 269747, dtype: float64

### Aggressive volume ratio (market order volume/total volume)

In [26]:
is_exec = (messages['type'] == 4).astype(int)
exec_vol = is_exec * messages['size']
rolling_market_order_vol = exec_vol.rolling(window).sum()
rolling_total_vol = messages['size'].rolling(window).sum()
aggressive_volume_ratio = rolling_market_order_vol/(rolling_total_vol + 1e-6)

In [27]:
aggressive_volume_ratio

0              NaN
1              NaN
2              NaN
3              NaN
4              NaN
            ...   
269742    0.030232
269743    0.030232
269744    0.031825
269745    0.033816
269746    0.033816
Length: 269747, dtype: float64

### Rolling midprice change and rolling return volatility

In [28]:
rolling_mid_price_change = midprice.rolling(window=window).apply(lambda x: x[-1]-x[0], raw=True)

In [29]:
rolling_mid_price_change

0           NaN
1           NaN
2           NaN
3           NaN
4           NaN
          ...  
269692   -350.0
269693   -350.0
269694   -350.0
269695   -350.0
269696   -350.0
Length: 269697, dtype: float64

In [30]:
mid_price_return = midprice.pct_change().fillna(0)
rolling_return_volatility = mid_price_return.rolling(window=window).std()

### Message rate

In [31]:
time = messages["time"]
time_diff = time.diff()

In [32]:
instant_msg_rate = 1 / time_diff

In [33]:
instant_msg_rate = instant_msg_rate.replace(np.inf, np.nan)

In [34]:
instant_msg_rate

0                NaN
1                NaN
2                NaN
3                NaN
4                NaN
             ...    
269742      3.085058
269743     32.002293
269744     19.511078
269745    333.040702
269746    896.833639
Name: time, Length: 269747, dtype: float64

In [35]:
instant_msg_rate = instant_msg_rate.fillna(instant_msg_rate.max())

In [36]:
instant_msg_rate

0         4.347820e+06
1         4.347820e+06
2         4.347820e+06
3         4.347820e+06
4         4.347820e+06
              ...     
269742    3.085058e+00
269743    3.200229e+01
269744    1.951108e+01
269745    3.330407e+02
269746    8.968336e+02
Name: time, Length: 269747, dtype: float64

In [37]:
rolling_message_rate = instant_msg_rate.rolling(window=window).mean()

In [38]:
rolling_message_rate = rolling_message_rate.fillna(rolling_message_rate.max())

In [39]:
rolling_message_rate

0         2.957504e+06
1         2.957504e+06
2         2.957504e+06
3         2.957504e+06
4         2.957504e+06
              ...     
269742    1.898794e+05
269743    1.898800e+05
269744    1.898798e+05
269745    1.898829e+05
269746    1.899007e+05
Name: time, Length: 269747, dtype: float64

# Feature matrix

In [40]:
X = pd.DataFrame({
    'spread': spread,
    'depth_ratio': depth_ratio,
    'orderbook_imbalance': orderbook_imbalance,
    'rolling_trade_imbalance': rolling_trade_imbalance,
    'cancel_to_add_ratio': cancel_to_add_ratio,
    'aggressive_volume_ratio': aggressive_volume_ratio,
    'rolling_mid_price_change': rolling_mid_price_change,
    'rolling_return_volatility': rolling_return_volatility,
    'rolling_message_rate': rolling_message_rate,
    'mid_price': midprice
})

In [41]:
X

Unnamed: 0,spread,depth_ratio,orderbook_imbalance,rolling_trade_imbalance,cancel_to_add_ratio,aggressive_volume_ratio,rolling_mid_price_change,rolling_return_volatility,rolling_message_rate,mid_price
0,1400,0.21,-0.652893,,,,,,2.957504e+06,2238800.0
1,1400,0.21,-0.652893,,,,,,2.957504e+06,2238800.0
2,1400,0.21,-0.652893,,,,,,2.957504e+06,2238800.0
3,1400,0.21,-0.652893,,,,,,2.957504e+06,2238800.0
4,1400,0.21,-0.652893,,,,,,2.957504e+06,2238800.0
...,...,...,...,...,...,...,...,...,...,...
269742,1100,2.49,0.426934,4.0,0.0,0.030232,,,1.898794e+05,
269743,1300,2.49,0.426934,4.0,0.0,0.030232,,,1.898800e+05,
269744,1300,2.49,0.426934,4.0,0.0,0.031825,,,1.898798e+05,
269745,1200,2.49,0.426934,4.0,0.0,0.033816,,,1.898829e+05,


In [42]:
print(X.isna().sum())

spread                        0
depth_ratio                   0
orderbook_imbalance           0
rolling_trade_imbalance      49
cancel_to_add_ratio          49
aggressive_volume_ratio      49
rolling_mid_price_change     99
rolling_return_volatility    99
rolling_message_rate          0
mid_price                    50
dtype: int64


In [43]:
X = X.dropna().reset_index(drop=True)
labels = labels.iloc[-len(X):].reset_index(drop=True)

In [44]:
X

Unnamed: 0,spread,depth_ratio,orderbook_imbalance,rolling_trade_imbalance,cancel_to_add_ratio,aggressive_volume_ratio,rolling_mid_price_change,rolling_return_volatility,rolling_message_rate,mid_price
0,2400,0.740,-0.149425,-2.0,0.0,0.125997,-100.0,0.000022,2.957504e+06,2238700.0
1,2400,0.740,-0.149425,-2.0,0.0,0.123769,-100.0,0.000022,2.870691e+06,2238700.0
2,2400,0.740,-0.149425,-2.0,0.0,0.121592,-100.0,0.000022,2.783746e+06,2238700.0
3,2400,0.740,-0.149425,-2.0,0.0,0.124047,-100.0,0.000022,2.696807e+06,2238700.0
4,2400,7.400,0.761905,-3.0,0.0,0.141786,-100.0,0.000022,2.696807e+06,2238700.0
...,...,...,...,...,...,...,...,...,...,...
269643,1000,0.625,-0.230769,-3.0,0.0,0.096890,-350.0,0.000022,1.307632e+06,2205900.0
269644,1000,0.625,-0.230769,-3.0,0.0,0.096890,-350.0,0.000021,1.307630e+06,2205900.0
269645,1000,0.625,-0.230769,-3.0,0.0,0.086581,-350.0,0.000021,1.307592e+06,2205900.0
269646,1000,0.625,-0.230769,-3.0,0.0,0.078256,-350.0,0.000021,1.220640e+06,2205900.0


# First Model (M1) using XGBoost

In [45]:
import xgboost as xgb

In [46]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, log_loss

In [47]:
X_train, X_test, y_train, y_test = train_test_split(
    X, labels, test_size=0.2, random_state=42, stratify=labels
)

In [48]:
y_train_mapped = y_train.map({-1: 0, 0: 1, 1: 2})
y_test_mapped  = y_test.map({-1: 0, 0: 1, 1: 2})

In [49]:
clf = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='multi:softmax',
    num_class=3,
    random_state=42
)

In [50]:
clf.fit(X_train, y_train_mapped)

In [51]:
y_pred_mapped = clf.predict(X_test)

In [52]:
y_pred_mapped

array([2, 0, 0, ..., 2, 2, 0])

In [53]:
inverse_map = {0: -1, 1: 0, 2: 1}
y_pred = pd.Series(y_pred_mapped).map(inverse_map)

In [54]:
print(confusion_matrix(y_test, y_pred, labels=[-1, 0, 1]))
print(classification_report(y_test, y_pred, target_names=["down", "neutral", "up"]))

[[18291    51  6007]
 [ 3916   327  2732]
 [ 7908    69 14629]]
              precision    recall  f1-score   support

        down       0.61      0.75      0.67     24349
     neutral       0.73      0.05      0.09      6975
          up       0.63      0.65      0.64     22606

    accuracy                           0.62     53930
   macro avg       0.65      0.48      0.47     53930
weighted avg       0.63      0.62      0.58     53930



### M1 advanced

In [55]:
n = len(X)
train_sz = int(n * 0.70)
val_sz = int(n * 0.15)

X_train, y_train = X.iloc[:train_sz], labels.iloc[:train_sz]
X_val, y_val = X.iloc[train_sz:train_sz+val_sz], labels.iloc[train_sz:train_sz+val_sz]
X_test, y_test = X.iloc[train_sz+val_sz:], labels.iloc[train_sz+val_sz:]

In [56]:
label_map = {-1: 0, 0: 1, 1: 2}
y_train_mapped = y_train.map(label_map)
y_val_mapped   = y_val.map(label_map)
y_test_mapped  = y_test.map(label_map)

In [57]:
dtrain = xgb.DMatrix(X_train, label=y_train_mapped)
dval   = xgb.DMatrix(X_val, label=y_val_mapped)
dtest  = xgb.DMatrix(X_test, label=y_test_mapped)

In [58]:
params = {
    "objective": "multi:softprob",
    "num_class": 3,
    "eval_metric": "mlogloss",
    "eta": 0.05,
    "max_depth": 6,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "min_child_weight": 1,
    "gamma": 0.1,
    "lambda": 1.0,
    "nthread": 4,
    "seed": 42
}

In [59]:
watchlist = [(dtrain, "train"), (dval, "val")]

model = xgb.train(
    params,
    dtrain,
    num_boost_round=800,
    early_stopping_rounds=40,
    evals=watchlist,
    verbose_eval=50
)

[0]	train-mlogloss:1.08549	val-mlogloss:1.08511
[50]	train-mlogloss:0.86730	val-mlogloss:0.91744
[94]	train-mlogloss:0.82292	val-mlogloss:0.92151


In [60]:
y_prob = model.predict(dtest)
y_pred_mapped = np.argmax(y_prob, axis=1)
inverse_map = {0: -1, 1: 0, 2: 1}
y_pred = pd.Series(y_pred_mapped).map(inverse_map)

In [61]:
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred, labels=[-1, 0, 1]))

Confusion Matrix:
[[10752     0  7445]
 [ 2341     9  2440]
 [ 5856    19 11586]]


In [62]:
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=["down", "neutral", "up"]))


Classification Report:
              precision    recall  f1-score   support

        down       0.57      0.59      0.58     18197
     neutral       0.32      0.00      0.00      4790
          up       0.54      0.66      0.60     17461

    accuracy                           0.55     40448
   macro avg       0.48      0.42      0.39     40448
weighted avg       0.53      0.55      0.52     40448



In [63]:
print("Multiclass Log Loss:", log_loss(y_test_mapped, y_prob))

Multiclass Log Loss: 0.9520949800881502


#### Parameter Optimization for M1 advanced

In [64]:
import optuna

In [65]:
from sklearn.preprocessing import LabelEncoder

In [66]:
y_mapped = labels.map({-1:0, 0:1, 1:2})

X_train, X_val, y_train, y_val = train_test_split(
    X, y_mapped, test_size=0.2, random_state=42, shuffle=False
)

dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)

In [67]:
def objective(trial):
    param = {
        "objective":      "multi:softprob",
        "num_class":      3,
        "eval_metric":    "mlogloss",
        "tree_method":    "gpu_hist",
        "predictor":      "gpu_predictor",
        "verbosity":      0,
        "eta":            trial.suggest_float("eta", 0.01, 0.3, log=True),
        "max_depth":      trial.suggest_int("max_depth", 3, 10),
        "subsample":      trial.suggest_float("subsample", .5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", .5, 1.0),
        "gamma":          trial.suggest_float("gamma", 0, 5),
        "alpha":          trial.suggest_float("alpha", 0, 5),
        "lambda":         trial.suggest_float("lambda", 0, 5),
    }

    num_boost = trial.suggest_int("n_estimators", 200, 800)

    bst = xgb.train(
        param,
        dtrain,
        num_boost,
        evals=[(dval, "val")],
        early_stopping_rounds=20,
        verbose_eval=False
    )

    preds = bst.predict(dval)
    loss  = log_loss(y_val, preds)
    return loss

In [68]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=30)
print("Best params:", study.best_trial.params)

[I 2025-09-04 20:28:40,867] A new study created in memory with name: no-name-02aabf94-a7f9-4fc6-aaef-0788fe7f457b
[I 2025-09-04 20:28:43,496] Trial 0 finished with value: 0.9126743503805396 and parameters: {'eta': 0.05727864561144103, 'max_depth': 3, 'subsample': 0.6597393326600587, 'colsample_bytree': 0.6769459794667884, 'gamma': 1.325469094557718, 'alpha': 1.5399997598333381, 'lambda': 1.385884028174989, 'n_estimators': 479}. Best is trial 0 with value: 0.9126743503805396.
[I 2025-09-04 20:28:49,405] Trial 1 finished with value: 0.9115010240695722 and parameters: {'eta': 0.015700691589861673, 'max_depth': 3, 'subsample': 0.6697511480357885, 'colsample_bytree': 0.868446800821784, 'gamma': 0.7223802020220083, 'alpha': 1.5744437160482638, 'lambda': 4.772072296811465, 'n_estimators': 594}. Best is trial 1 with value: 0.9115010240695722.
[I 2025-09-04 20:28:50,383] Trial 2 finished with value: 0.9135192900359135 and parameters: {'eta': 0.08818353132740343, 'max_depth': 3, 'subsample': 0.5

[I 2025-09-04 20:30:05,390] Trial 23 finished with value: 0.9121297621465179 and parameters: {'eta': 0.010468287096777575, 'max_depth': 3, 'subsample': 0.8025285882963433, 'colsample_bytree': 0.8716181694985727, 'gamma': 3.282265542379947, 'alpha': 4.213515165980959, 'lambda': 4.980958496094629, 'n_estimators': 427}. Best is trial 1 with value: 0.9115010240695722.
[I 2025-09-04 20:30:09,650] Trial 24 finished with value: 0.9169100597066316 and parameters: {'eta': 0.019760375277894017, 'max_depth': 6, 'subsample': 0.6152239499211808, 'colsample_bytree': 0.6946102611412481, 'gamma': 4.112566513405932, 'alpha': 4.643337794155991, 'lambda': 4.302656687328093, 'n_estimators': 633}. Best is trial 1 with value: 0.9115010240695722.
[I 2025-09-04 20:30:14,493] Trial 25 finished with value: 0.9122263218571682 and parameters: {'eta': 0.013595577907837846, 'max_depth': 4, 'subsample': 0.7816850680157579, 'colsample_bytree': 0.936545075700218, 'gamma': 2.867158739006119, 'alpha': 4.6073965321283525

Best params: {'eta': 0.015700691589861673, 'max_depth': 3, 'subsample': 0.6697511480357885, 'colsample_bytree': 0.868446800821784, 'gamma': 0.7223802020220083, 'alpha': 1.5744437160482638, 'lambda': 4.772072296811465, 'n_estimators': 594}


In [69]:
the_200_trail_best_params = {'eta': 0.012680711502557862, 'max_depth': 3, 'subsample': 0.8841954778507377, 'colsample_bytree': 0.7994575520433568, 'gamma': 4.424146226872127, 'alpha': 3.391444881619674, 'lambda': 1.4097101241365753, 'n_estimators': 473}
# loss value was 0.9100287748362601

#### Training M1 advanced on optimal parameters

In [70]:
n = len(X)
train_sz = int(n * 0.80)

X_train, y_train = X.iloc[:train_sz], labels.iloc[:train_sz]
X_test,  y_test  = X.iloc[train_sz:], labels.iloc[train_sz:]

In [71]:
label_map = {-1: 0, 0: 1, 1: 2}
y_train_mapped = y_train.map(label_map)
y_test_mapped  = y_test.map(label_map)

In [72]:
dtrain = xgb.DMatrix(X_train, label=y_train_mapped)
dtest = xgb.DMatrix(X_test, label=y_test_mapped)

In [73]:
the_200_trail_best_params.update({
    "objective": "multi:softprob",
    "num_class": 3,
    "eval_metric": "mlogloss",
    "tree_method": "gpu_hist",
    "predictor": "gpu_predictor",
    "verbosity": 1,
    "seed": 42
})

In [74]:
the_200_trail_best_params

{'eta': 0.012680711502557862,
 'max_depth': 3,
 'subsample': 0.8841954778507377,
 'colsample_bytree': 0.7994575520433568,
 'gamma': 4.424146226872127,
 'alpha': 3.391444881619674,
 'lambda': 1.4097101241365753,
 'n_estimators': 473,
 'objective': 'multi:softprob',
 'num_class': 3,
 'eval_metric': 'mlogloss',
 'tree_method': 'gpu_hist',
 'predictor': 'gpu_predictor',
 'verbosity': 1,
 'seed': 42}

In [75]:
final_model = xgb.train(
    params= the_200_trail_best_params,
    dtrain=dtrain,
    num_boost_round=800,
    evals=[(dtest, "test")],
    early_stopping_rounds=20,
    verbose_eval=50
)

[0]	test-mlogloss:1.09555



    E.g. tree_method = "hist", device = "cuda"

Parameters: { "n_estimators", "predictor" } are not used.



[50]	test-mlogloss:0.99563
[100]	test-mlogloss:0.95118
[150]	test-mlogloss:0.93054
[200]	test-mlogloss:0.92054
[250]	test-mlogloss:0.91530
[300]	test-mlogloss:0.91228
[350]	test-mlogloss:0.91142
[400]	test-mlogloss:0.91098
[417]	test-mlogloss:0.91109


In [76]:
y_prob = model.predict(dtest)
y_pred_mapped = np.argmax(y_prob, axis=1)
inverse_map = {0: -1, 1: 0, 2: 1}
y_pred = pd.Series(y_pred_mapped).map(inverse_map)

In [77]:
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred, labels=[-1, 0, 1]))

Confusion Matrix:
[[15073    71  9389]
 [ 2989    37  3205]
 [ 7589    75 15502]]


In [78]:
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=["down", "neutral", "up"]))


Classification Report:
              precision    recall  f1-score   support

        down       0.59      0.61      0.60     24533
     neutral       0.20      0.01      0.01      6231
          up       0.55      0.67      0.60     23166

    accuracy                           0.57     53930
   macro avg       0.45      0.43      0.41     53930
weighted avg       0.53      0.57      0.53     53930



In [79]:
print("Multiclass Log Loss:", log_loss(y_test_mapped, y_prob))

Multiclass Log Loss: 0.9319435890314033


## Class weighting of the data

In [80]:
from collections import Counter

In [81]:
freq = Counter(y_train)
total = len(y_train)

In [82]:
weights = {c: total/(len(freq)*n) for c, n in freq.items()}

In [83]:
weights

{1: 0.8001735975874387, 0: 2.5105090426646184, -1: 0.7396747348605639}

In [84]:
tr_weights = pd.Series(y_train).map(weights)

In [85]:
tr_weights

0         0.800174
1         0.800174
2         0.800174
3         0.800174
4         0.800174
            ...   
215713    0.800174
215714    0.739675
215715    0.739675
215716    0.739675
215717    0.739675
Length: 215718, dtype: float64

In [86]:
d2_train = xgb.DMatrix(X_train, label=y_train_mapped, weight=tr_weights)
d2_test = xgb.DMatrix(X_test, label=y_test_mapped)

In [87]:
the_200_trail_best_params = {'eta': 0.012680711502557862, 'max_depth': 3, 'subsample': 0.8841954778507377, 'colsample_bytree': 0.7994575520433568, 'gamma': 4.424146226872127, 'alpha': 3.391444881619674, 'lambda': 1.4097101241365753}

In [88]:
the_200_trail_best_params.update({
    "objective": "multi:softprob",
    "num_class" : 3,
    "eval_metric": "mlogloss",
    "tree_method": "hist",
    "device"     : "cuda",
    "seed"       : 42,
    "verbosity"  : 1
})

In [89]:
the_200_trail_best_params

{'eta': 0.012680711502557862,
 'max_depth': 3,
 'subsample': 0.8841954778507377,
 'colsample_bytree': 0.7994575520433568,
 'gamma': 4.424146226872127,
 'alpha': 3.391444881619674,
 'lambda': 1.4097101241365753,
 'objective': 'multi:softprob',
 'num_class': 3,
 'eval_metric': 'mlogloss',
 'tree_method': 'hist',
 'device': 'cuda',
 'seed': 42,
 'verbosity': 1}

In [90]:
final = xgb.train(
    the_200_trail_best_params,
    dtrain = d2_train,
    num_boost_round=800,
    evals=[(d2_test, "test")],
    early_stopping_rounds=20,
    verbose_eval=50
)

[0]	test-mlogloss:1.09766
[50]	test-mlogloss:1.06270
[100]	test-mlogloss:1.04560
[150]	test-mlogloss:1.03707
[200]	test-mlogloss:1.03115
[250]	test-mlogloss:1.02715
[300]	test-mlogloss:1.02370
[350]	test-mlogloss:1.02050
[400]	test-mlogloss:1.01919
[432]	test-mlogloss:1.01893


In [91]:
proba = final.predict(d2_test)
pred  = proba.argmax(axis=1)
print(confusion_matrix(y_test_mapped, pred))
print(classification_report(y_test_mapped, pred, target_names=["down","neutral","up"]))
print("Log-loss:", log_loss(y_test_mapped, proba))

[[12416  5100  7017]
 [ 2385  1427  2419]
 [ 5743  4560 12863]]
              precision    recall  f1-score   support

        down       0.60      0.51      0.55     24533
     neutral       0.13      0.23      0.16      6231
          up       0.58      0.56      0.57     23166

    accuracy                           0.50     53930
   macro avg       0.44      0.43      0.43     53930
weighted avg       0.54      0.50      0.51     53930

Log-loss: 1.0189309972012417


### Confidence thresholding for predictions of up or down

In [92]:
from sklearn.metrics import f1_score

In [93]:
proba

array([[0.27123365, 0.34325206, 0.3855143 ],
       [0.27002156, 0.34382296, 0.38615546],
       [0.27002156, 0.34382296, 0.38615546],
       ...,
       [0.24887885, 0.28123146, 0.4698897 ],
       [0.24428135, 0.28295285, 0.47276586],
       [0.24428135, 0.28295285, 0.47276586]], dtype=float32)

In [94]:
def apply_threshold(prob_mat, neutral_thresh):
    maxprob = prob_mat.max(axis=1)
    argmax  = prob_mat.argmax(axis=1)
    pred    = np.where(maxprob < neutral_thresh, 1, argmax)
    return pred

In [95]:
th_grid   = np.linspace(0.33, 0.80, 20)
best_f1   = -1
best_th   = None

In [96]:
for th in th_grid:
    pred = apply_threshold(proba, th)
    f1   = f1_score(y_test_mapped, pred, average="macro")
    if f1 > best_f1:
        best_f1, best_th = f1, th

In [97]:
print(f"Best neutral threshold = {best_th:.2f}  → macro-F1 = {best_f1:.3f}")

Best neutral threshold = 0.35  → macro-F1 = 0.429


In [98]:
pred_final = apply_threshold(proba, best_th)
print(confusion_matrix(y_test_mapped, pred_final))
print(classification_report(y_test_mapped, pred_final,
                            target_names=["down","neutral","up"]))

[[12024  5764  6745]
 [ 2207  1703  2321]
 [ 5432  5130 12604]]
              precision    recall  f1-score   support

        down       0.61      0.49      0.54     24533
     neutral       0.14      0.27      0.18      6231
          up       0.58      0.54      0.56     23166

    accuracy                           0.49     53930
   macro avg       0.44      0.44      0.43     53930
weighted avg       0.54      0.49      0.51     53930



#### Changing thresholding to be down risk averse and represent real world financial market needs

In [99]:
def realistic_thresholding(pmatrix, t_down, t_up, margin):
    preds = []
    for p in pmatrix:
        p_d, p_n, p_u = p
        if p_d > t_down and p_d > p_u + margin:
            preds.append(0)
        elif p_u > t_up and p_u > p_d + margin:
            preds.append(2)
        else:
            preds.append(1)
    
    return np.array(preds)

In [100]:
pred_fin = realistic_thresholding(proba, t_down=0.35, t_up=0.4, margin=0.05)

print(confusion_matrix(y_test_mapped, pred_fin))
print(classification_report(y_test_mapped, pred_fin, target_names=["down", "neutral", "up"]))

[[12703  6477  5353]
 [ 2336  1998  1897]
 [ 5573  6645 10948]]
              precision    recall  f1-score   support

        down       0.62      0.52      0.56     24533
     neutral       0.13      0.32      0.19      6231
          up       0.60      0.47      0.53     23166

    accuracy                           0.48     53930
   macro avg       0.45      0.44      0.43     53930
weighted avg       0.55      0.48      0.51     53930



## Changing to binary classification for better accuracy

In [101]:
binary_labels = labels.map({-1:0, 0:1, 1:1})

In [102]:
binary_labels

0         1
1         1
2         1
3         1
4         1
         ..
269643    0
269644    0
269645    0
269646    0
269647    0
Length: 269648, dtype: int64

In [103]:
X_train_bin, X_test_bin, y_train_bin, y_test_bin = train_test_split(
    X, binary_labels, test_size=0.2, shuffle=False)

In [104]:
from collections import Counter
class_counts = Counter(y_train_bin)
total = len(y_train_bin)
weights = {cls: total / (2 * count) for cls, count in class_counts.items()}
sample_weights = y_train_bin.map(weights)

In [105]:
dtrain = xgb.DMatrix(X_train_bin, label=y_train_bin, weight=sample_weights)
dtest  = xgb.DMatrix(X_test_bin,  label=y_test_bin)

In [106]:
params = {
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "tree_method": "hist",
    "device": "cuda",
    "eta": 0.05,
    "max_depth": 4,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "seed": 42
}

In [107]:
model_bin = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=500,
    evals=[(dtest, "test")],
    early_stopping_rounds=20,
    verbose_eval=25
)

[0]	test-logloss:0.68843
[25]	test-logloss:0.64163
[50]	test-logloss:0.63874
[59]	test-logloss:0.63922


In [108]:
proba_bin = model_bin.predict(dtest)
pred_bin  = (proba_bin > 0.5).astype(int)

In [109]:
from sklearn.metrics import roc_auc_score

In [110]:
print(confusion_matrix(y_test_bin, pred_bin))
print(classification_report(y_test_bin, pred_bin, target_names=["down", "not-down"]))
print("ROC AUC:", roc_auc_score(y_test_bin, proba_bin))

[[15583  8950]
 [11138 18259]]
              precision    recall  f1-score   support

        down       0.58      0.64      0.61     24533
    not-down       0.67      0.62      0.65     29397

    accuracy                           0.63     53930
   macro avg       0.63      0.63      0.63     53930
weighted avg       0.63      0.63      0.63     53930

ROC AUC: 0.6799769158923143


In [111]:
def objective2(trial):
    param = {
        "objective":      "binary:logistic",
        "eval_metric":    "logloss",
        "tree_method":    "gpu_hist",
        "device":      "cuda",
        "verbosity":      0,
        "eta":            trial.suggest_float("eta", 0.01, 0.3, log=True),
        "max_depth":      trial.suggest_int("max_depth", 3, 10),
        "subsample":      trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "gamma":          trial.suggest_float("gamma", 0, 5),
        "alpha":          trial.suggest_float("alpha", 0, 5),
        "lambda":         trial.suggest_float("lambda", 0, 5),
        "seed": 42,
        "verbosity": 0
    }

    num_round = trial.suggest_int("n_round", 300, 1000)

    bst = xgb.train(
        param,
        dtrain,
        num_boost_round = num_round,
        evals=[(dtest, "test")],
        early_stopping_rounds=30,
        verbose_eval=False
    )

    preds = bst.predict(dtest)
    loss  = log_loss(y_test_bin, preds)
    return loss

In [112]:
study = optuna.create_study(direction="minimize")
study.optimize(objective2, n_trials=30, show_progress_bar=True)

[I 2025-09-04 20:30:39,949] A new study created in memory with name: no-name-87f8688c-2daa-400f-8c50-173216e5a744


  0%|          | 0/30 [00:00<?, ?it/s]

[I 2025-09-04 20:30:41,555] Trial 0 finished with value: 0.6402117975449318 and parameters: {'eta': 0.013521035985330597, 'max_depth': 6, 'subsample': 0.9355804751649287, 'colsample_bytree': 0.569250743621021, 'gamma': 3.5413415056219932, 'alpha': 4.102059382800994, 'lambda': 0.49898538015993255, 'n_round': 727}. Best is trial 0 with value: 0.6402117975449318.
[I 2025-09-04 20:30:42,159] Trial 1 finished with value: 0.6503546583606544 and parameters: {'eta': 0.0609283805867355, 'max_depth': 7, 'subsample': 0.8081370080098623, 'colsample_bytree': 0.6429378661308778, 'gamma': 3.841805161470775, 'alpha': 1.907948430447965, 'lambda': 2.4823498607864143, 'n_round': 938}. Best is trial 0 with value: 0.6402117975449318.
[I 2025-09-04 20:30:42,723] Trial 2 finished with value: 0.6402437886333137 and parameters: {'eta': 0.05946869423476747, 'max_depth': 3, 'subsample': 0.6626713691298736, 'colsample_bytree': 0.5044634724833748, 'gamma': 2.287675548446341, 'alpha': 4.233097878082666, 'lambda': 1

[I 2025-09-04 20:31:05,040] Trial 23 finished with value: 0.6375972428301095 and parameters: {'eta': 0.014809714950835258, 'max_depth': 5, 'subsample': 0.6383796954564989, 'colsample_bytree': 0.7715489427106733, 'gamma': 1.3586000332974693, 'alpha': 4.499646115953045, 'lambda': 2.7499373643906724, 'n_round': 825}. Best is trial 19 with value: 0.6373315865081488.
[I 2025-09-04 20:31:06,480] Trial 24 finished with value: 0.6413859887608827 and parameters: {'eta': 0.014265281648451637, 'max_depth': 6, 'subsample': 0.6393518272748404, 'colsample_bytree': 0.6778578694328775, 'gamma': 1.412979321133482, 'alpha': 4.942456262668065, 'lambda': 2.816296957955309, 'n_round': 832}. Best is trial 19 with value: 0.6373315865081488.
[I 2025-09-04 20:31:08,280] Trial 25 finished with value: 0.6403545369780013 and parameters: {'eta': 0.010327898204240994, 'max_depth': 7, 'subsample': 0.6854099353821528, 'colsample_bytree': 0.7533510124330236, 'gamma': 1.8014766718268942, 'alpha': 4.441342705312228, 'la

In [113]:
print("Best params →", study.best_trial.params, "   best log-loss =", study.best_value)

Best params → {'eta': 0.010097761180568468, 'max_depth': 5, 'subsample': 0.613007752530996, 'colsample_bytree': 0.7771926315046294, 'gamma': 0.7959247710366713, 'alpha': 3.7941771732867844, 'lambda': 2.0807653630931897, 'n_round': 991}    best log-loss = 0.6373315865081488


In [114]:
bin_200_trial_best_params = {'eta': 0.015307161753471939, 'max_depth': 4, 'subsample': 0.5065237812985154, 'colsample_bytree': 0.7123402346361045, 'gamma': 3.199607407360512, 'alpha': 2.17738828063744, 'lambda': 1.154777261655405, 'num_boost_round': 459}

In [115]:
bin_200_trial_best_params.update({
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "tree_method": "gpu_hist",
    "device": "cuda",
    "seed": 42,
    "verbosity": 0,
})

In [116]:
final_bin = xgb.train(
    params=bin_200_trial_best_params,
    dtrain=dtrain,
    num_boost_round=bin_200_trial_best_params["num_boost_round"],
    evals=[(dtest, "test")],
    early_stopping_rounds=30,
    verbose_eval=50,
)

[0]	test-logloss:0.69152
[50]	test-logloss:0.65207
[100]	test-logloss:0.64001
[150]	test-logloss:0.63706
[200]	test-logloss:0.63676
[204]	test-logloss:0.63680


In [117]:
proba_bin = final_bin.predict(dtest)
pred_bin  = (proba_bin > 0.5).astype(int)

In [118]:
print(confusion_matrix(y_test_bin, pred_bin))
print(classification_report(y_test_bin, pred_bin, target_names=["down", "not-down"]))
print("ROC AUC:", roc_auc_score(y_test_bin, proba_bin))

[[15219  9314]
 [10605 18792]]
              precision    recall  f1-score   support

        down       0.59      0.62      0.60     24533
    not-down       0.67      0.64      0.65     29397

    accuracy                           0.63     53930
   macro avg       0.63      0.63      0.63     53930
weighted avg       0.63      0.63      0.63     53930

ROC AUC: 0.6819077714982188
