# Read Data

In [19]:
import pandas as pd

# Read the CSV file into a DataFrame
df = pd.read_csv('NEAR_USDT_10h_avg_volatility_base.csv')

# Display the first few rows of the DataFrame
df

Unnamed: 0,opentime,openprice,highprice,lowprice,closeprice,volume,quotevolume,trades,taker_buy_volume,taker_buy_quote,volatility,next_10h_avg_volatility
0,1602748800000,1.0625,1.1872,1.0625,1.1169,1449407,1.655530e+06,9968,766784,8.779915e+05,0.117365,0.045121
1,1602752400000,1.1169,1.1301,1.0778,1.1179,1616432,1.789316e+06,8743,873713,9.684362e+05,0.048525,0.042047
2,1602756000000,1.1176,1.1650,1.1136,1.1570,1270406,1.441663e+06,8310,721173,8.190750e+05,0.046157,0.039637
3,1602759600000,1.1572,1.1575,1.1210,1.1279,481575,5.503787e+05,4446,227307,2.600841e+05,0.032560,0.038093
4,1602763200000,1.1286,1.1812,1.1025,1.1536,1835160,2.107977e+06,9266,934671,1.074642e+06,0.071383,0.032983
...,...,...,...,...,...,...,...,...,...,...,...,...
36446,1733954400000,6.8760,6.9060,6.8450,6.8760,939922,6.464904e+06,14640,349725,2.405712e+06,0.008912,0.017125
36447,1733958000000,6.8750,6.9030,6.8250,6.8450,593093,4.070402e+06,15168,279329,1.917813e+06,0.011429,0.017017
36448,1733961600000,6.8450,6.8780,6.7750,6.8030,1428245,9.753268e+06,32331,604490,4.129761e+06,0.015203,0.017420
36449,1733965200000,6.8030,6.9040,6.7840,6.8130,1238149,8.472227e+06,28296,680113,4.655383e+06,0.017689,0.017177


# Features

In [20]:
df_features = df.copy()

# Calculate Maker Buy Volume
df_features['maker_buy_volume'] = df_features['volume'] - df_features['taker_buy_volume']
df_features['maker_buy_quote'] = df_features['quotevolume'] - df_features['taker_buy_quote']

# Get Rolling 4h data (helped)
df_features['openprice_10h'] = df_features['openprice'].shift(9)
df_features['highprice_10h'] = df_features['highprice'].rolling(window=10).max()
df_features['lowprice_10h'] = df_features['lowprice'].rolling(window=10).min()
df_features['volume_10h'] = df_features['volume'].rolling(window=10).sum()
df_features['quotevolume_10h'] = df_features['quotevolume'].rolling(window=10).sum()
df_features['trades_10h'] = df_features['trades'].rolling(window=10).sum()
df_features['taker_buy_volume_10h'] = df_features['taker_buy_volume'].rolling(window=10).sum()
df_features['taker_buy_quote_10h'] = df_features['taker_buy_quote'].rolling(window=10).sum()
df_features['maker_buy_volume_10h'] = df_features['maker_buy_volume'].rolling(window=10).sum()
df_features['maker_buy_quote_10h'] = df_features['maker_buy_quote'].rolling(window=10).sum()

df_features['10h_volatility'] = (df_features['highprice_10h'] - df_features['lowprice_10h']) / df_features['lowprice_10h']
df_features['current_10h_avg_volatility'] = df_features['volatility'].rolling(window=10).mean()

# lagged (kinda help)
df_features['std_10_volume'] = df_features['volume'].rolling(window=10).std()
df_features['std_20_volume'] = df_features['volume'].rolling(window=20).std()
df_features['std_50_volume'] = df_features['volume'].rolling(window=50).std()

# lagged (kinda help)
df_features['lag_price_change_1h'] = df_features['closeprice'] - df_features['closeprice'].shift(1)
df_features['lag_price_change_2h'] = df_features['closeprice'] - df_features['closeprice'].shift(2)
df_features['lag_price_change_4h'] = df_features['closeprice'] - df_features['closeprice'].shift(4)
df_features['lag_price_change_8h'] = df_features['closeprice'] - df_features['closeprice'].shift(8)

# Rolling ma std max (help a lot)
df_features['ma_50_volatility'] = df_features['volatility'].rolling(window=50).mean()
df_features['ma_100_volatility'] = df_features['volatility'].rolling(window=100).mean()
df_features['ma_200_volatility'] = df_features['volatility'].rolling(window=200).mean()

df_features['std_50_volatility'] = df_features['volatility'].rolling(window=50).std()
df_features['std_100_volatility'] = df_features['volatility'].rolling(window=100).std()
df_features['std_200_volatility'] = df_features['volatility'].rolling(window=200).std()

df_features['max_50_volatility'] = df_features['volatility'].rolling(window=50).max()
df_features['max_100_volatility'] = df_features['volatility'].rolling(window=100).max()
df_features['max_200_volatility'] = df_features['volatility'].rolling(window=200).max()

# Lag volatility change (kinda help)
df_features['lag_volatility_change_10h'] = df_features['volatility'] - df_features['volatility'].shift(10)
df_features['lag_volatility_change_20h'] = df_features['volatility'] - df_features['volatility'].shift(20)
df_features['lag_volatility_change_40h'] = df_features['volatility'] - df_features['volatility'].shift(40)
df_features['lag_volatility_change_80h'] = df_features['volatility'] - df_features['volatility'].shift(80)

# Max Min Rolling Window (kinda help)
df_features['max_10_open'] = df_features['openprice'].rolling(window=10).max()
df_features['min_10_open'] = df_features['openprice'].rolling(window=10).min()
df_features['max_20_open'] = df_features['openprice'].rolling(window=20).max()
df_features['min_20_open'] = df_features['openprice'].rolling(window=20).min()
df_features['max_50_open'] = df_features['openprice'].rolling(window=50).max()
df_features['min_50_open'] = df_features['openprice'].rolling(window=50).min()

df_features['max_10_close'] = df_features['closeprice'].rolling(window=10).max()
df_features['min_10_close'] = df_features['closeprice'].rolling(window=10).min()
df_features['max_20_close'] = df_features['closeprice'].rolling(window=20).max()
df_features['min_20_close'] = df_features['closeprice'].rolling(window=20).min()
df_features['max_50_close'] = df_features['closeprice'].rolling(window=50).max()
df_features['min_50_close'] = df_features['closeprice'].rolling(window=50).min()

# Trades Features
df_features['ma_10_trades'] = df_features['trades'].rolling(window=10).mean()
df_features['ma_20_trades'] = df_features['trades'].rolling(window=20).mean()
df_features['ma_50_trades'] = df_features['trades'].rolling(window=50).mean()

df_features['sum_10_trades'] = df_features['trades'].rolling(window=10).sum()
df_features['sum_20_trades'] = df_features['trades'].rolling(window=20).sum()
df_features['sum_50_trades'] = df_features['trades'].rolling(window=50).sum()

# Shadow Length (kinda help)
df_features['high_open_dif'] = df_features['highprice'] - df_features['openprice']
df_features['high_close_dif'] = df_features['highprice'] - df_features['closeprice']
df_features['low_open_dif'] = df_features['openprice'] - df_features['lowprice']
df_features['low_close_dif'] = df_features['closeprice'] - df_features['lowprice']

# Price-to-Volume Ratios (kinda help)
df_features['price_volume_ratio'] = (df_features['closeprice'] - df_features['openprice']) / df_features['volume']
df_features['quoteprice_volume_ratio'] = (df_features['quotevolume']) / (df_features['highprice'] - df_features['lowprice'])

# Time specific (kinda help)
df_features['day'] = pd.to_datetime(df_features['opentime'], unit='ms').dt.day
df_features['hour'] = pd.to_datetime(df_features['opentime'], unit='ms').dt.hour
df_features['hour_of_day'] = pd.to_datetime(df_features['opentime'], unit='ms').dt.hour
df_features['is_peak_hour'] = df_features['hour_of_day'].apply(lambda x: 1 if x in [8, 12, 16] else 0)

# Volume Volatility Interaction (kinda help)
df_features['volatility_volume'] = df_features['volatility'] * df_features['volume']
df_features['volatility_quotevolume_ratio'] = df_features['volatility'] / df_features['quotevolume']
df_features['ma_10_volatility_volume'] = df_features['volatility_volume'].rolling(window=10).mean()
df_features['ma_20_volatility_volume'] = df_features['volatility_volume'].rolling(window=20).mean()

# Hourly Seasonality Volatility Ratio (kinda help)
df_features['hourly_volatility_ratio'] = df_features['volatility'] / df_features.groupby('day')['volatility'].transform('mean')

# Maker Taker ratio
df_features['taker_maker_buy_volume_ratio'] = df_features['taker_buy_volume'] / df_features['maker_buy_volume']
df_features['taker_maker_buy_quote_ratio'] = df_features['taker_buy_quote'] / df_features['maker_buy_quote']

df_features['sum_10_taker_maker_buy_volume_ratio'] = df_features['taker_buy_volume'].rolling(window=10).sum() / df_features['maker_buy_volume'].rolling(window=10).sum()
df_features['sum_10_taker_maker_buy_quote_ratio'] = df_features['taker_buy_quote'].rolling(window=10).sum() / df_features['maker_buy_quote'].rolling(window=10).sum()

df_features['ma_10_taker_maker_buy_volume_ratio'] = df_features['taker_buy_volume'].rolling(window=10).mean() / df_features['maker_buy_volume'].rolling(window=10).mean()
df_features['ma_10_taker_maker_buy_quote_ratio'] = df_features['taker_buy_quote'].rolling(window=10).mean() / df_features['maker_buy_quote'].rolling(window=10).mean()

df_features['ma_20_taker_maker_buy_volume_ratio'] = df_features['taker_buy_volume'].rolling(window=20).mean() / df_features['maker_buy_volume'].rolling(window=20).mean()
df_features['ma_20_taker_maker_buy_quote_ratio'] = df_features['taker_buy_quote'].rolling(window=20).mean() / df_features['maker_buy_quote'].rolling(window=20).mean()

# ==============================
df_features.dropna(inplace=True)
df_features

Unnamed: 0,opentime,openprice,highprice,lowprice,closeprice,volume,quotevolume,trades,taker_buy_volume,taker_buy_quote,...,ma_20_volatility_volume,hourly_volatility_ratio,taker_maker_buy_volume_ratio,taker_maker_buy_quote_ratio,sum_10_taker_maker_buy_volume_ratio,sum_10_taker_maker_buy_quote_ratio,ma_10_taker_maker_buy_volume_ratio,ma_10_taker_maker_buy_quote_ratio,ma_20_taker_maker_buy_volume_ratio,ma_20_taker_maker_buy_quote_ratio
199,1603465200000,0.7228,0.7272,0.7107,0.7187,777147,5.584407e+05,7140,409710,2.948779e+05,...,11111.409765,1.070801,1.115048,1.118815,1.013976,1.018148,1.013976,1.018148,0.945311,0.948184
200,1603468800000,0.7185,0.7205,0.6882,0.7004,1128090,7.955156e+05,7397,509055,3.588446e+05,...,13695.412890,2.164707,0.822336,0.821773,1.050453,1.055538,1.050453,1.055538,0.939595,0.942890
201,1603472400000,0.7004,0.7145,0.6952,0.7049,490411,3.463863e+05,3883,288789,2.042947e+05,...,14078.311697,1.280439,1.432329,1.437767,1.063045,1.067658,1.063045,1.067658,0.947394,0.949967
202,1603476000000,0.7043,0.7065,0.7023,0.7036,58612,4.125851e+04,770,21842,1.539406e+04,...,13994.771505,0.275828,0.594017,0.595182,1.083471,1.088635,1.083471,1.088635,0.950674,0.953501
203,1603479600000,0.7036,0.7164,0.7031,0.7149,111609,7.921972e+04,900,90443,6.421380e+04,...,13980.195480,0.872461,4.273032,4.279231,1.002243,1.005082,1.002243,1.005082,0.953609,0.955848
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36446,1733954400000,6.8760,6.9060,6.8450,6.8760,939922,6.464904e+06,14640,349725,2.405712e+06,...,27053.927875,0.398684,0.592556,0.592658,0.838499,0.838528,0.838499,0.838528,0.915435,0.913495
36447,1733958000000,6.8750,6.9030,6.8250,6.8450,593093,4.070402e+06,15168,279329,1.917813e+06,...,26125.215226,0.511287,0.890252,0.890933,0.803300,0.803824,0.803300,0.803824,0.901274,0.899749
36448,1733961600000,6.8450,6.8780,6.7750,6.8030,1428245,9.753268e+06,32331,604490,4.129761e+06,...,26657.730170,0.656808,0.733823,0.734375,0.778457,0.778963,0.778457,0.778963,0.887434,0.886019
36449,1733965200000,6.8030,6.9040,6.7840,6.8130,1238149,8.472227e+06,28296,680113,4.655383e+06,...,27250.755198,0.764198,1.218762,1.219694,0.822338,0.822888,0.822338,0.822888,0.900848,0.899617


# Expanding Window

In [None]:
import time
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from datetime import datetime, timezone

# Function to convert human-readable date to Unix timestamp in milliseconds
def to_unix_timestamp(date_str):
    return int(datetime.strptime(date_str, "%Y-%m-%d %H:%M:%S").timestamp() * 1000)

# Parameters
epsilon = 1e-8  # A small value to replace zero std
window_size = 10  # Number of rows to use for prediction

# Separate df into features and target
features_df = df_features.drop(['opentime', 'next_10h_avg_volatility'], axis=1)
target_df = df_features['next_10h_avg_volatility']

# Convert to NumPy arrays
X = features_df.values
y = target_df.values

# Extract opentime column for chronological ordering
opentime = df_features['opentime'].values

# Initialize storage for results
expanding_results = []

# Loop through each row starting from row 10 (index 9)
for i in range(window_size, len(opentime)):  # Start from row 9 to use previous 9 for training
    # Define the training data as all previous rows up to row i-9
    X_train, y_train = X[:i-window_size], y[:i-window_size]

    # Check if the training data is not empty
    if len(X_train) == 0:
        continue  # Skip if no training data available

    # Define the test data as the next row
    X_test, y_test = X[i:i+1], y[i:i+1]

    # Normalize training data
    X_train_mean = np.mean(X_train, axis=0)
    X_train_std = np.std(X_train, axis=0)

    # Replace zero std with epsilon
    X_train_std = np.where(X_train_std == 0, epsilon, X_train_std)

    X_train_normalized = (X_train - X_train_mean) / X_train_std

    # Get the column index for 'volatility' from features_df
    close_index = features_df.columns.get_loc('volatility')

    # Normalize y_train using the mean and std of the 'volatility' column
    close_mean = X_train[:, close_index].mean()
    close_std = X_train[:, close_index].std()

    # Replace zero std with epsilon for y_train
    close_std = max(close_std, epsilon)

    y_train_normalized = (y_train - close_mean) / close_std

    # Normalize test data using the training set's statistics
    X_test_normalized = (X_test - X_train_mean) / X_train_std

    # Initialize and fit the model
    model = LinearRegression()
    model.fit(X_train_normalized, y_train_normalized)

    # Predict on the test set
    y_pred_test = model.predict(X_test_normalized)
    # Denormalize predictions
    y_pred_test_denorm = y_pred_test * close_std + close_mean

    # Record results
    expanding_results.append({
        'date': datetime.fromtimestamp(opentime[i] / 1000, tz=timezone.utc).strftime("%Y-%m-%d %H:%M:%S"),
        'actual': y[i],
        'prediction': y_pred_test_denorm[0]
    })

# Convert results to a DataFrame for analysis
expanding_results = pd.DataFrame(expanding_results)

# Print the results DataFrame
expanding_results

# Sliding Window

In [21]:
import time
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from datetime import datetime, timezone

# Function to convert human-readable date to Unix timestamp in milliseconds
def to_unix_timestamp(date_str):
    return int(datetime.strptime(date_str, "%Y-%m-%d %H:%M:%S").timestamp() * 1000)

# Parameters
epsilon = 1e-8  # A small value to replace zero std
window_size = 1500  # Number of rows to use for training
gap_size = 10  # Gap size between training and prediction

# Separate df into features and target
features_df = df_features.drop(['opentime', 'next_10h_avg_volatility'], axis=1)
target_df = df_features['next_10h_avg_volatility']

# Convert to NumPy arrays
X = features_df.values
y = target_df.values

# Extract opentime column for chronological ordering
opentime = df_features['opentime'].values

# Initialize storage for results
sliding_window_results = []

# Loop through each row starting from row 1510 (index 1509)
for i in range(window_size + gap_size, len(opentime)):  # Start after the first 1500 rows + gap
    # Define the training data as the last 1500 rows before the gap
    X_train, y_train = X[i-window_size-gap_size:i-gap_size], y[i-window_size-gap_size:i-gap_size]

    # Check if the training data is not empty
    if len(X_train) == 0:
        continue  # Skip if no training data available

    # Define the test data as the current row
    X_test, y_test = X[i:i+1], y[i:i+1]

    # Normalize training data
    X_train_mean = np.mean(X_train, axis=0)
    X_train_std = np.std(X_train, axis=0)

    # Replace zero std with epsilon
    X_train_std = np.where(X_train_std == 0, epsilon, X_train_std)

    X_train_normalized = (X_train - X_train_mean) / X_train_std

    # Get the column index for 'volatility' from features_df
    close_index = features_df.columns.get_loc('volatility')

    # Normalize y_train using the mean and std of the 'volatility' column
    close_mean = X_train[:, close_index].mean()
    close_std = X_train[:, close_index].std()

    # Replace zero std with epsilon for y_train
    close_std = max(close_std, epsilon)

    y_train_normalized = (y_train - close_mean) / close_std

    # Normalize test data using the training set's statistics
    X_test_normalized = (X_test - X_train_mean) / X_train_std

    # Initialize and fit the model
    model = LinearRegression()
    model.fit(X_train_normalized, y_train_normalized)

    # Predict on the test set
    y_pred_test = model.predict(X_test_normalized)
    # Denormalize predictions
    y_pred_test_denorm = y_pred_test * close_std + close_mean

    # Record results
    sliding_window_results.append({
        'date': datetime.fromtimestamp(opentime[i] / 1000, tz=timezone.utc).strftime("%Y-%m-%d %H:%M:%S"),
        'actual': y[i],
        'prediction': y_pred_test_denorm[0]
    })

# Convert results to a DataFrame for analysis
sliding_window_results = pd.DataFrame(sliding_window_results)

# Print the results DataFrame
sliding_window_results

Unnamed: 0,date,actual,prediction
0,2020-12-25 13:00:00,0.020292,0.013124
1,2020-12-25 14:00:00,0.021900,0.020878
2,2020-12-25 15:00:00,0.021152,0.016404
3,2020-12-25 16:00:00,0.019744,0.017086
4,2020-12-25 17:00:00,0.020212,0.018607
...,...,...,...
34737,2024-12-11 22:00:00,0.017125,0.018843
34738,2024-12-11 23:00:00,0.017017,0.016348
34739,2024-12-12 00:00:00,0.017420,0.015940
34740,2024-12-12 01:00:00,0.017177,0.016045


# Chronological Split from latest data

In [37]:
import time
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

# Separate df into features and target
features_df = df_features.drop(['opentime', 'next_10h_avg_volatility'], axis=1)
target_df = df_features['next_10h_avg_volatility']

# Convert to NumPy arrays
X = features_df.values
y = target_df.values

# Define train-test split proportion
train_ratio = 0.93  # Example proportion for training set

# Split data chronologically
split_index = int(len(X) * train_ratio)
X_train, X_test = X[:split_index], X[split_index:]
y_train, y_test = y[:split_index], y[split_index:]

# Normalize training data
X_train_mean = np.mean(X_train, axis=0)
X_train_std = np.std(X_train, axis=0)
X_train_normalized = (X_train - X_train_mean) / X_train_std

# Get the column index for 'volatility' from features_df
close_index = features_df.columns.get_loc('volatility')

# Normalize y_train using the mean and std of the 'volatility' column
close_mean = X_train[:, close_index].mean()
close_std = X_train[:, close_index].std()
y_train_normalized = (y_train - close_mean) / close_std

# Normalize the entire dataset using the training set's statistics
X_normalized = (X - X_train_mean) / X_train_std

# Initialize and fit the model
start_time = time.time()
model = LinearRegression()
model.fit(X_train_normalized, y_train_normalized)

# Predict on the entire dataset (train + test)
y_pred_all = model.predict(X_normalized)

# Denormalize predictions
y_pred_all_denorm = y_pred_all * close_std + close_mean

# Create output DataFrame
chronological_df = pd.DataFrame({
    "date": df_features['opentime'],  # Assuming `opentime` is in the original DataFrame
    "actual": y,
    "prediction": y_pred_all_denorm
})

# Print the output DataFrame
chronological_df


Unnamed: 0,date,actual,prediction
199,1603465200000,0.021196,0.023287
200,1603468800000,0.017842,0.026006
201,1603472400000,0.017031,0.025786
202,1603476000000,0.017752,0.021653
203,1603479600000,0.016741,0.024075
...,...,...,...
36446,1733954400000,0.017125,0.017821
36447,1733958000000,0.017017,0.018356
36448,1733961600000,0.017420,0.019999
36449,1733965200000,0.017177,0.020232


# Evaluation Functions

In [33]:
import numpy as np
import polars as pl
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.metrics import accuracy_score, f1_score

def evaluate_baseline(df_class):
    # Extract predictions and actuals
    y_pred = df_class['prediction']
    y_test = df_class['actual']

    # Convert to NumPy arrays
    pred = np.array(y_pred)
    test = np.array(y_test)

    # Regression Metrics
    me = np.mean(pred - test)  # Mean Error
    mae = mean_absolute_error(test, pred)  # Mean Absolute Error
    mape = np.mean(np.abs((test - pred) / test)) * 100  # Mean Absolute Percentage Error
    mpe = np.mean((test - pred) / test) * 100  # Mean Percentage Error
    rmse = np.sqrt(mean_squared_error(test, pred))  # Root Mean Squared Error
    r = r2_score(test, pred)  # Coefficient of Determination (R-squared)
    min_max_error = np.abs((np.min(pred) - np.min(test)) + (np.max(pred) - np.max(test)))  # Min-Max Error

    # Create Polars DataFrame for additional metrics
    df = pl.DataFrame({
        'volatility_prediction': pred,
        'volatility_actual': test
    })

    df = df.with_columns(
        abs(pl.col('volatility_prediction') - pl.col('volatility_actual')).alias('dif_volatility')
    )

    median_abs_err = df['dif_volatility'].median()  # Median Absolute Error
    var_abs_err = df['dif_volatility'].var()  # Variance of Absolute Errors

    # Results as Polars DataFrame
    results = pl.DataFrame({
        "Metric": [
            "Overall Median Absolute Error",
            "Overall Variance Absolute Error",
            "Mean Error",
            "Mean Absolute Error",
            "Mean Absolute Percentage Error",
            "Mean Percentage Error",
            "Root Mean Squared Error",
            "R-squared",
            "Min-Max Error"
        ],
        "Value": [
            median_abs_err,
            var_abs_err,
            me,
            mae,
            mape,
            mpe,
            rmse,
            r,
            min_max_error
        ]
    })

    # Classification Metrics
    # Define binary classes
    df_class['actual_class'] = (df_class['actual'] >= 0.02).astype(int)
    df_class['prediction_class'] = (df_class['prediction'] >= 0.02).astype(int)

    # Class percentages
    total_count = len(df_class)
    class_counts = df_class['actual_class'].value_counts()
    class_percentages = (class_counts / total_count) * 100

    print("Class Percentages:")
    print(class_percentages)

    # Classification Accuracy and F1-Score
    accuracy = accuracy_score(df_class['actual_class'], df_class['prediction_class'])
    f1 = f1_score(df_class['actual_class'], df_class['prediction_class'])

    print("\nAccuracy:", accuracy)
    print("F1 Score:", f1)

    return results

In [None]:
import time
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

# Separate df into features and target
features_df = df_features.drop(['opentime', 'next_10h_avg_volatility'], axis=1)
target_df = df_features['next_10h_avg_volatility']

# Convert to NumPy arrays
X = features_df.values
y = target_df.values

# Define list of train-test split proportions
train_test_ratios = [0.93]  # Example proportions for training set

# List to store results
all_results = []

# Loop through each train-test ratio
for train_ratio in train_test_ratios:
    print(f'Processing train-test split ratio: {train_ratio}')

    # Split data chronologically
    split_index = int(len(X) * train_ratio)
    X_train, X_test = X[:split_index], X[split_index:]
    y_train, y_test = y[:split_index], y[split_index:]

    # Normalize training data
    X_train_mean = np.mean(X_train, axis=0)
    X_train_std = np.std(X_train, axis=0)
    X_train_normalized = (X_train - X_train_mean) / X_train_std

    # Get the column index for 'volatility' from features_df
    close_index = features_df.columns.get_loc('volatility')

    # Normalize y_train using the mean and std of the 'volatility' column
    close_mean = X_train[:, close_index].mean()
    close_std = X_train[:, close_index].std()
    y_train_normalized = (y_train - close_mean) / close_std

    # Normalize test data using the training set's statistics
    X_test_normalized = (X_test - X_train_mean) / X_train_std

    # Initialize and fit the model
    start_time = time.time()
    model = LinearRegression()
    model.fit(X_train_normalized, y_train_normalized)

    # Predict on the test set
    y_pred_test = model.predict(X_test_normalized)
    # Denormalize predictions
    y_pred_test_denorm = y_pred_test * close_std + close_mean

    # Track the end time of the process
    end_time = time.time()

    # Evaluate the model
    eval_results, _ = evaluate_baseline(y_pred_test_denorm, y_test)


    # Append results to the list
    all_results.append({
        'train_ratio': train_ratio,
        **eval_results,

    })

# Convert results to a DataFrame for further analysis
results_df = pd.DataFrame(all_results)

# Print the results DataFrame
results_df

# Evaluations

In [35]:
evaluate_baseline(sliding_window_results)

Class Percentages:
actual_class
0    60.186518
1    39.813482
Name: count, dtype: float64

Accuracy: 0.7575844798802602
F1 Score: 0.7028857687151626


Metric,Value
str,f64
"""Overall Median Absolute Error""",0.004255
"""Overall Variance Absolute Erro…",0.000129
"""Mean Error""",0.000611
"""Mean Absolute Error""",0.00671
"""Mean Absolute Percentage Error""",33.310123
"""Mean Percentage Error""",-11.422127
"""Root Mean Squared Error""",0.013207
"""R-squared""",-0.208971
"""Min-Max Error""",0.00802


In [38]:
evaluate_baseline(chronological_df)

Class Percentages:
actual_class
0    59.665674
1    40.334326
Name: count, dtype: float64

Accuracy: 0.8074313141343926
F1 Score: 0.7679265981849007


Metric,Value
str,f64
"""Overall Median Absolute Error""",0.003282
"""Overall Variance Absolute Erro…",3.9e-05
"""Mean Error""",3.7e-05
"""Mean Absolute Error""",0.004678
"""Mean Absolute Percentage Error""",24.238317
"""Mean Percentage Error""",-9.362439
"""Root Mean Squared Error""",0.007775
"""R-squared""",0.57797
"""Min-Max Error""",0.059745


In [16]:
# # Write the DataFrame to a CSV file
# sliding_results_df.to_csv('backtest_volatility_prediction_2021.csv', index=False)

# Read Predictions

In [1]:
import pandas as pd

# Read the CSV file into a DataFrame
df = pd.read_csv('backtest_volatility_prediction_2021.csv')

# Display the first few rows of the DataFrame
df

Unnamed: 0,date,actual,prediction,min_prediction,max_prediction
0,2020-10-24 01:00:00,0.017018,0.021196,0.014012,0.028379
1,2020-10-24 02:00:00,0.016921,0.033499,0.022146,0.044852
2,2020-10-24 03:00:00,0.017476,0.029189,0.019297,0.039081
3,2020-10-24 04:00:00,0.019363,0.023867,0.015779,0.031956
4,2020-10-24 05:00:00,0.019680,0.020584,0.013608,0.027559
...,...,...,...,...,...
36237,2024-12-11 22:00:00,0.017125,0.017800,0.011768,0.023832
36238,2024-12-11 23:00:00,0.017017,0.018298,0.012097,0.024499
36239,2024-12-12 00:00:00,0.017420,0.019878,0.013141,0.026615
36240,2024-12-12 01:00:00,0.017177,0.020115,0.013298,0.026933


In [2]:
df_class = df.copy()

# Create the actual_class column
df_class['actual_class'] = (df_class['actual'] >= 0.02).astype(int)

# Create the prediction_class column
df_class['prediction_class'] = (df_class['prediction'] >= 0.02).astype(int)

# df_class.drop(columns=(['actual','prediction','min_prediction','max_prediction']), inplace=True)

df_class

Unnamed: 0,date,actual,prediction,min_prediction,max_prediction,actual_class,prediction_class
0,2020-10-24 01:00:00,0.017018,0.021196,0.014012,0.028379,0,1
1,2020-10-24 02:00:00,0.016921,0.033499,0.022146,0.044852,0,1
2,2020-10-24 03:00:00,0.017476,0.029189,0.019297,0.039081,0,1
3,2020-10-24 04:00:00,0.019363,0.023867,0.015779,0.031956,0,1
4,2020-10-24 05:00:00,0.019680,0.020584,0.013608,0.027559,0,1
...,...,...,...,...,...,...,...
36237,2024-12-11 22:00:00,0.017125,0.017800,0.011768,0.023832,0,0
36238,2024-12-11 23:00:00,0.017017,0.018298,0.012097,0.024499,0,0
36239,2024-12-12 00:00:00,0.017420,0.019878,0.013141,0.026615,0,0
36240,2024-12-12 01:00:00,0.017177,0.020115,0.013298,0.026933,0,1


In [3]:
from sklearn.metrics import accuracy_score, f1_score

def calculate_metrics(df_class):
    # Count percentages for actual_class
    total_count = len(df_class)
    class_counts = df_class['actual_class'].value_counts()
    class_percentages = (class_counts / total_count) * 100

    print("Class Percentages:")
    print(class_percentages)

    # Calculate accuracy and F1 score
    accuracy = accuracy_score(df_class['actual_class'], df_class['prediction_class'])
    f1 = f1_score(df_class['actual_class'], df_class['prediction_class'])

    print("\nAccuracy:", accuracy)
    print("F1 Score:", f1)

In [4]:
import numpy as np
import polars as pl
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def evaluate_baseline(y_pred, y_test):
    # Convert lists to Numpy arrays
    pred = np.array(y_pred)
    test = np.array(y_test)

    # Evaluation Metrics
    me = np.mean(pred - test)  # Mean Error
    mae = mean_absolute_error(test, pred)  # Mean Absolute Error
    mape = np.mean(np.abs((test - pred) / test)) * 100  # Mean Absolute Percentage Error
    mpe = np.mean((test - pred) / test) * 100  # Mean Percentage Error
    rmse = np.sqrt(mean_squared_error(test, pred))  # Root Mean Squared Error

    r = r2_score(test, pred)  # Coefficient of Determination (R-squared)
    min_max_error = np.abs((np.min(pred) - np.min(test)) + 
                            (np.max(pred) - np.max(test)))  # Min-Max Error

    df = pl.DataFrame({
        f'volatility_prediction': pred,
        f'volatility_actual': test
    })

    df = df.with_columns(
        abs(pl.col(f'volatility_prediction') - pl.col(f'volatility_actual')).alias(f'dif_volatility')
    )

    # overall median abs error
    median_abs_err = df[f'dif_volatility'].median()

    # overall var abs error
    var_abs_err = df[f'dif_volatility'].var()

    # Results
    results = pl.DataFrame({
        "Metric": [
            "Overall Median Absolute Error",
            "Overall Variance Absolute Error",
            "Mean Error",
            "Mean Absolute Error",
            "Mean Absolute Percentage Error",
            "Mean Percentage Error",
            "Root Mean Squared Error",
            "R-squared",
            "Min-Max Error"
        ],
        "Value": [
            median_abs_err,
            var_abs_err,
            me,
            mae,
            mape,
            mpe,
            rmse,
            r,
            min_max_error
        ]
    })

    return results

In [5]:
calculate_metrics(df_class)

Class Percentages:
actual_class
0    59.657304
1    40.342696
Name: count, dtype: float64

Accuracy: 0.7929198167871531
F1 Score: 0.7529380781512328


In [6]:
evaluate_baseline(df_class['prediction'], df_class['actual'])

Metric,Value
str,f64
"""Overall Median Absolute Error""",0.003536
"""Overall Variance Absolute Erro…",7e-05
"""Mean Error""",0.000536
"""Mean Absolute Error""",0.005395
"""Mean Absolute Percentage Error""",27.626052
"""Mean Percentage Error""",-12.08397
"""Root Mean Squared Error""",0.00997
"""R-squared""",0.306279
"""Min-Max Error""",0.023177


In [7]:
df_class_2021_end = df_class[df_class['date'] >= '2021-01-01 00:00:00']
df_class_2021_end

Unnamed: 0,date,actual,prediction,min_prediction,max_prediction,actual_class,prediction_class
1655,2021-01-01 00:00:00,0.025835,0.042180,0.027885,0.056474,1,1
1656,2021-01-01 01:00:00,0.024852,0.031988,0.021147,0.042829,1,1
1657,2021-01-01 02:00:00,0.028416,0.026450,0.017486,0.035414,1,1
1658,2021-01-01 03:00:00,0.027184,0.026298,0.017386,0.035210,1,1
1659,2021-01-01 04:00:00,0.028700,0.020679,0.013671,0.027688,1,1
...,...,...,...,...,...,...,...
36237,2024-12-11 22:00:00,0.017125,0.017800,0.011768,0.023832,0,0
36238,2024-12-11 23:00:00,0.017017,0.018298,0.012097,0.024499,0,0
36239,2024-12-12 00:00:00,0.017420,0.019878,0.013141,0.026615,0,0
36240,2024-12-12 01:00:00,0.017177,0.020115,0.013298,0.026933,0,1


In [8]:
calculate_metrics(df_class_2021_end)

Class Percentages:
actual_class
0    60.375286
1    39.624714
Name: count, dtype: float64

Accuracy: 0.8009656807471015
F1 Score: 0.7573493126542122


In [9]:
evaluate_baseline(df_class_2021_end['prediction'], df_class_2021_end['actual'])

Metric,Value
str,f64
"""Overall Median Absolute Error""",0.003433
"""Overall Variance Absolute Erro…",5.1e-05
"""Mean Error""",0.000415
"""Mean Absolute Error""",0.005102
"""Mean Absolute Percentage Error""",26.275081
"""Mean Percentage Error""",-11.305572
"""Root Mean Squared Error""",0.008789
"""R-squared""",0.465839
"""Min-Max Error""",0.017356


In [10]:
df_class_2022_end = df_class[df_class['date'] >= '2022-01-01 00:00:00']
df_class_2022_end

Unnamed: 0,date,actual,prediction,min_prediction,max_prediction,actual_class,prediction_class
10415,2022-01-01 00:00:00,0.011851,0.016457,0.010879,0.022034,0,0
10416,2022-01-01 01:00:00,0.011393,0.017015,0.011249,0.022782,0,0
10417,2022-01-01 02:00:00,0.012243,0.016336,0.010800,0.021872,0,0
10418,2022-01-01 03:00:00,0.013240,0.014324,0.009469,0.019178,0,0
10419,2022-01-01 04:00:00,0.012971,0.017110,0.011312,0.022909,0,0
...,...,...,...,...,...,...,...
36237,2024-12-11 22:00:00,0.017125,0.017800,0.011768,0.023832,0,0
36238,2024-12-11 23:00:00,0.017017,0.018298,0.012097,0.024499,0,0
36239,2024-12-12 00:00:00,0.017420,0.019878,0.013141,0.026615,0,0
36240,2024-12-12 01:00:00,0.017177,0.020115,0.013298,0.026933,0,1


In [11]:
calculate_metrics(df_class_2022_end)

Class Percentages:
actual_class
0    71.161962
1    28.838038
Name: count, dtype: float64

Accuracy: 0.8170132032369226
F1 Score: 0.6817936978184757


In [12]:
evaluate_baseline(df_class_2022_end['prediction'], df_class_2022_end['actual'])

Metric,Value
str,f64
"""Overall Median Absolute Error""",0.003033
"""Overall Variance Absolute Erro…",2.5e-05
"""Mean Error""",0.000433
"""Mean Absolute Error""",0.004141
"""Mean Absolute Percentage Error""",25.576479
"""Mean Percentage Error""",-11.410952
"""Root Mean Squared Error""",0.006517
"""R-squared""",0.535859
"""Min-Max Error""",0.056036


In [13]:
df_class_2023_end = df_class[df_class['date'] >= '2023-01-01 00:00:00']
df_class_2023_end

Unnamed: 0,date,actual,prediction,min_prediction,max_prediction,actual_class,prediction_class
19175,2023-01-01 00:00:00,0.006287,0.009168,0.006061,0.012275,0,0
19176,2023-01-01 01:00:00,0.006043,0.008684,0.005741,0.011627,0,0
19177,2023-01-01 02:00:00,0.005878,0.009588,0.006338,0.012837,0,0
19178,2023-01-01 03:00:00,0.005955,0.009454,0.006250,0.012658,0,0
19179,2023-01-01 04:00:00,0.005384,0.010014,0.006620,0.013407,0,0
...,...,...,...,...,...,...,...
36237,2024-12-11 22:00:00,0.017125,0.017800,0.011768,0.023832,0,0
36238,2024-12-11 23:00:00,0.017017,0.018298,0.012097,0.024499,0,0
36239,2024-12-12 00:00:00,0.017420,0.019878,0.013141,0.026615,0,0
36240,2024-12-12 01:00:00,0.017177,0.020115,0.013298,0.026933,0,1


In [14]:
calculate_metrics(df_class_2023_end)

Class Percentages:
actual_class
0    76.14695
1    23.85305
Name: count, dtype: float64

Accuracy: 0.8328938887912345
F1 Score: 0.6339835728952772


In [15]:
evaluate_baseline(df_class_2023_end['prediction'], df_class_2023_end['actual'])

Metric,Value
str,f64
"""Overall Median Absolute Error""",0.002897
"""Overall Variance Absolute Erro…",1.9e-05
"""Mean Error""",0.000355
"""Mean Absolute Error""",0.003918
"""Mean Absolute Percentage Error""",26.094892
"""Mean Percentage Error""",-11.724029
"""Root Mean Squared Error""",0.005822
"""R-squared""",0.500069
"""Min-Max Error""",0.020654


In [16]:
df_class_2024_end = df_class[df_class['date'] >= '2024-01-01 00:00:00']
df_class_2024_end

Unnamed: 0,date,actual,prediction,min_prediction,max_prediction,actual_class,prediction_class
27935,2024-01-01 00:00:00,0.015917,0.024569,0.016243,0.032896,0,1
27936,2024-01-01 01:00:00,0.014760,0.023588,0.015594,0.031582,0,1
27937,2024-01-01 02:00:00,0.014472,0.022743,0.015035,0.030450,0,1
27938,2024-01-01 03:00:00,0.013723,0.023594,0.015598,0.031590,0,1
27939,2024-01-01 04:00:00,0.012984,0.024845,0.016425,0.033265,0,1
...,...,...,...,...,...,...,...
36237,2024-12-11 22:00:00,0.017125,0.017800,0.011768,0.023832,0,0
36238,2024-12-11 23:00:00,0.017017,0.018298,0.012097,0.024499,0,0
36239,2024-12-12 00:00:00,0.017420,0.019878,0.013141,0.026615,0,0
36240,2024-12-12 01:00:00,0.017177,0.020115,0.013298,0.026933,0,1


In [17]:
calculate_metrics(df_class_2024_end)

Class Percentages:
actual_class
0    68.171422
1    31.828578
Name: count, dtype: float64

Accuracy: 0.7992054893463344
F1 Score: 0.6858757062146893


In [18]:
evaluate_baseline(df_class_2024_end['prediction'], df_class_2024_end['actual'])

Metric,Value
str,f64
"""Overall Median Absolute Error""",0.003258
"""Overall Variance Absolute Erro…",2.4e-05
"""Mean Error""",0.000432
"""Mean Absolute Error""",0.004372
"""Mean Absolute Percentage Error""",24.529882
"""Mean Percentage Error""",-10.426463
"""Root Mean Squared Error""",0.006572
"""R-squared""",0.417816
"""Min-Max Error""",0.015924
