# Auto Bayesian Neural Networks

In [2]:
import pandas as pd
dfo = pd.read_csv("EURUSD_Candlestick_1_Hour_BID_01.07.2020-15.07.2023.csv")
dfo=dfo[dfo['Volume']!=0]
dfo.reset_index(drop=True, inplace=True)
dfo.head(10)

Unnamed: 0,Gmt time,Open,High,Low,Close,Volume
0,01.07.2020 00:00:00.000,1.12336,1.12336,1.12275,1.12306,4148.0298
1,01.07.2020 01:00:00.000,1.12306,1.12395,1.12288,1.12385,5375.5801
2,01.07.2020 02:00:00.000,1.12386,1.12406,1.12363,1.12382,4131.6099
3,01.07.2020 03:00:00.000,1.12382,1.12388,1.12221,1.12265,4440.6001
4,01.07.2020 04:00:00.000,1.12265,1.12272,1.12151,1.12179,4833.1001
5,01.07.2020 05:00:00.000,1.12179,1.12261,1.12156,1.1224,6689.5601
6,01.07.2020 06:00:00.000,1.1224,1.12343,1.12202,1.12333,7562.75
7,01.07.2020 07:00:00.000,1.12331,1.12331,1.12231,1.12315,8641.75
8,01.07.2020 08:00:00.000,1.12315,1.12448,1.1229,1.12311,10042.7695
9,01.07.2020 09:00:00.000,1.12313,1.12337,1.12076,1.12076,9587.4004


In [3]:
dfo['Gmt time'] = pd.to_datetime(dfo['Gmt time'], format='%d.%m.%Y %H:%M:%S.%f')

In [4]:
def label_data(df, lookahead=5, threshold=0.002):
    """
    Labels each candle based on future closing price percentage change.

    Parameters:
    -----------
    df : pd.DataFrame
        Data containing at least a 'Close' column.
    lookahead : int, optional
        Number of candles to look ahead (default is 5).
    threshold : float, optional
        Percentage change threshold for classification (default is 0.002 or 0.2%).
    """
    
    # Compute future percentage change in closing price
    df["future_return"] = df["Close"].pct_change(lookahead).shift(-lookahead)

    # Assign labels
    df["label"] = 0  # Default: Neutral
    df.loc[df["future_return"] > threshold, "label"] = 2  # Up
    df.loc[df["future_return"] < -threshold, "label"] = 1  # Down

    # Drop future_return column (not needed in final output)
    df.drop(columns=["future_return"], inplace=True)

In [5]:
label_data(dfo)

In [6]:
dfo[dfo["label"]!=0]

Unnamed: 0,Gmt time,Open,High,Low,Close,Volume,label
6,2020-07-01 06:00:00,1.12240,1.12343,1.12202,1.12333,7562.7500,1
9,2020-07-01 09:00:00,1.12313,1.12337,1.12076,1.12076,9587.4004,2
10,2020-07-01 10:00:00,1.12076,1.12113,1.12002,1.12050,11767.5898,2
11,2020-07-01 11:00:00,1.12050,1.12067,1.11848,1.12036,14733.7998,2
12,2020-07-01 12:00:00,1.12036,1.12209,1.11980,1.12177,13410.0596,2
...,...,...,...,...,...,...,...
17735,2023-07-13 12:00:00,1.11742,1.11897,1.11615,1.11795,39235.5900,2
17736,2023-07-13 13:00:00,1.11797,1.11949,1.11761,1.11942,35069.8900,2
17737,2023-07-13 14:00:00,1.11943,1.11959,1.11806,1.11957,26614.8000,2
17738,2023-07-13 15:00:00,1.11959,1.11965,1.11858,1.11927,20519.5900,2


In [8]:
import numpy as np
import pandas as pd
import autobnn as ab
import jax
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

def create_sliding_window_dataset(df, window_size=50, lookahead=5, threshold=0.002):
    if "Close" not in df.columns or "label" not in df.columns:
        raise ValueError("DataFrame must contain 'Close' and 'label' columns.")

    label_data(df=df, lookahead=lookahead, threshold=threshold)

    close_prices = df["Close"].values
    labels = df["label"].values

    n_samples = len(df) - window_size
    if n_samples <= 0:
        raise ValueError("Not enough data to create even one window."
                         "Increase your dataset or decrease window_size.")

    X_list = []
    y_list = []

    for i in range(n_samples):
        window_data = close_prices[i : i + window_size]
        last_candle_label = labels[i + window_size - 1] # label of the last candle in window_size
        X_list.append(window_data)
        y_list.append(last_candle_label)

    X = np.array(X_list)
    y = np.array(y_list)

    return X, y

In [9]:
X, y = create_sliding_window_dataset(dfo)
len(X[0])

50

In [None]:
import jax
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import autobnn as ab
from autobnn import estimators, operators, kernels

def walk_forward_autobnn_ovr(
    df, 
    window_size=100,
    train_size=50,
    step_size=5,
    threshold=0.002,
    n_classes=3  # e.g., if labels are {0,1,2}
):
    """
    Multi-class classification via One-vs-Rest using AutoBNN,
    specifically using 'normal_likelihood_logistic_noise' for each binary classifier.
    
    Steps:
      1) create_sliding_window_dataset(...) -> (X, y)
         X: (n_samples, n_features), y: (n_samples,) with integer classes
      2) For each walk-forward iteration:
         - Train n_classes binary models, each "Does y == c?" -> 0/1
         - At prediction time, get each model's 'logit' => logistic transform => probability
         - Argmax over classes => final predicted class
      3) Return single-sample accuracies for each step.
    """

    # 1. Build your dataset
    X, y = create_sliding_window_dataset(df=df, window_size=window_size, threshold=threshold)
    n_total = len(X)
    # print(X, n_total)
    if n_total < train_size + 1:
        raise ValueError("Not enough samples for walk-forward analysis.")

    accuracies = []

    # 2. Walk-forward loop
    for i in range(0, n_total - train_size, step_size):
        X_train = X[i : i + train_size]
        y_train = y[i : i + train_size]

        test_index = i + train_size
        if test_index >= n_total:
            break

        X_test = X[test_index : test_index + 1]  # shape (1, features)
        y_test = y[test_index : test_index + 1]  # shape (1,)

        # Scale
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled  = scaler.transform(X_test)

        # 3. Train one binary model per class
        estimators = []
        for c in range(n_classes):
            # Make binary labels: 1 if y == c, else 0
            y_train_c = (y_train == c).astype(int)

            model_c = ab.operators.Add(
                bnns=(
                    ab.kernels.PeriodicBNN(width=20, period=12.0),
                    ab.kernels.LinearBNN(width=20),
                    ab.kernels.MaternBNN(width=20),
                )
            )
            # Use the likelihood your autobnn version actually recognizes:
            # 'normal_likelihood_logistic_noise' is a hacky approach but workable if recognized.
            estimator_c = ab.estimators.AutoBnnMapEstimator(
                model_c,
                likelihood_model="normal_likelihood_logistic_noise",  
                seed=jax.random.PRNGKey(42),
                periods=[12],
                # We do not set k=2 because it's not a pure classification wrapper,
                # but you can pass k=2 if your version allows it. 
            )

            estimator_c.fit(X_train_scaled, y_train_c)
            estimators.append(estimator_c)

        # 4. Predict probabilities for each class on the single test sample
        #    'normal_likelihood_logistic_noise' typically returns shape (1,2): [mean, var]
        #    We'll interpret the 'mean' as a *logit*, i.e. log-odds, and do logistic transform.
        class_probs = []
        for c in range(n_classes):
            y_pred_c = estimators[c].predict(X_test_scaled)  # shape (1,2) or (1,)?
            
            # If it returns shape (1,2) => [ [mean, var] ],
            # we take the 0th column as mean (the "logit").
            # In some versions, it might be shape (1,) -> just the mean. 
            if y_pred_c.ndim == 2:
                logit = y_pred_c[0, 0]  # first row, mean col
            else:
                # shape (1,) => just the mean
                logit = y_pred_c[0]
            
            # Convert logit -> probability: prob = 1 / (1 + exp(-logit))
            prob_c = 1.0 / (1.0 + np.exp(-logit))
            class_probs.append(prob_c)

        # 5. Pick the class with the highest probability
        y_pred_class = np.argmax(class_probs)  # integer c in [0..n_classes-1]

        # 6. Single-sample accuracy (0 or 1)
        acc = accuracy_score(y_test, [y_pred_class])
        accuracies.append(acc)

    return accuracies

dfsample = dfo[:1000].copy()

# Example usage:
results = walk_forward_autobnn_ovr(
    df=dfsample,
    window_size=40,
    train_size=20,
    step_size=5,
    threshold=0.002,
    n_classes=3  # e.g. for classes {0,1,2}
)

print("Accuracies:", results)
if results:
    print("Mean Accuracy:", np.mean(results))


Accuracies: [1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
Mean Accuracy: 0.6223404255319149


In [13]:
len(results)

188

In [9]:
dfo[:60]

Unnamed: 0,Gmt time,Open,High,Low,Close,Volume,label
0,2020-07-01 00:00:00,1.12336,1.12336,1.12275,1.12306,4148.0298,0
1,2020-07-01 01:00:00,1.12306,1.12395,1.12288,1.12385,5375.5801,0
2,2020-07-01 02:00:00,1.12386,1.12406,1.12363,1.12382,4131.6099,0
3,2020-07-01 03:00:00,1.12382,1.12388,1.12221,1.12265,4440.6001,0
4,2020-07-01 04:00:00,1.12265,1.12272,1.12151,1.12179,4833.1001,0
5,2020-07-01 05:00:00,1.12179,1.12261,1.12156,1.1224,6689.5601,0
6,2020-07-01 06:00:00,1.1224,1.12343,1.12202,1.12333,7562.75,1
7,2020-07-01 07:00:00,1.12331,1.12331,1.12231,1.12315,8641.75,0
8,2020-07-01 08:00:00,1.12315,1.12448,1.1229,1.12311,10042.7695,0
9,2020-07-01 09:00:00,1.12313,1.12337,1.12076,1.12076,9587.4004,2
