# Read Data

In [2]:
# Spreads
import pandas as pd

df_spread = pd.read_csv('spread.csv')

# Filter columns
df_spread = df_spread[['timestamp', 'LINAUSDT-GTCUSDT']]

df_spread


Unnamed: 0,timestamp,LINAUSDT-GTCUSDT
0,2024-01-11,-4.968798
1,2024-01-12,-4.955794
2,2024-01-13,-4.957159
3,2024-01-14,-4.936558
4,2024-01-15,-4.938028
...,...,...
360,2025-01-05,-5.000997
361,2025-01-06,-5.005235
362,2025-01-07,-5.028604
363,2025-01-08,-5.049686


In [8]:
# Historical data
df_LINAUSDT = pd.read_csv('LINAUSDT.csv')

df_LINAUSDT.drop(columns=['Unnamed: 0', 'ignore', 'close_time'], inplace=True)

df_LINAUSDT

Unnamed: 0,timestamp,open,high,low,close,volume,quote_volume,trades,taker_buy_base,taker_buy_quote
0,2022-12-20 07:00:00,0.00523,0.00524,0.00518,0.00522,95953549.0,499752.47011,1454.0,39012818.0,203029.92707
1,2022-12-20 08:00:00,0.00522,0.00524,0.00520,0.00523,78064433.0,407240.17370,1450.0,37530600.0,195841.86391
2,2022-12-20 09:00:00,0.00523,0.00529,0.00523,0.00527,78634735.0,414112.52759,1637.0,39830057.0,209635.32397
3,2022-12-20 10:00:00,0.00527,0.00529,0.00525,0.00527,46100004.0,242907.95562,1045.0,19837958.0,104538.88716
4,2022-12-20 11:00:00,0.00527,0.00529,0.00526,0.00529,55923004.0,294952.16752,1030.0,24203601.0,127729.36851
...,...,...,...,...,...,...,...,...,...,...
17995,2025-01-08 02:00:00,0.00480,0.00484,0.00476,0.00476,64642495.0,310348.10802,2911.0,29425644.0,141363.65081
17996,2025-01-08 03:00:00,0.00477,0.00479,0.00470,0.00475,172447279.0,817394.40629,4814.0,83137452.0,394301.22048
17997,2025-01-08 04:00:00,0.00475,0.00478,0.00469,0.00470,118707872.0,560994.75155,3123.0,58510516.0,276787.96469
17998,2025-01-08 05:00:00,0.00471,0.00476,0.00469,0.00469,53914485.0,254739.07108,2380.0,27635185.0,130570.67079


In [9]:
df_GTCUSDT = pd.read_csv('GTCUSDT.csv')

df_GTCUSDT.drop(columns=['Unnamed: 0', 'ignore', 'close_time'], inplace=True)

df_GTCUSDT

Unnamed: 0,timestamp,open,high,low,close,volume,quote_volume,trades,taker_buy_base,taker_buy_quote
0,2022-12-20 07:00:00,1.381,1.385,1.371,1.384,219078.4,301789.7780,2698.0,119518.1,164683.3273
1,2022-12-20 08:00:00,1.383,1.385,1.374,1.378,146202.4,201534.9789,1804.0,80843.1,111452.7083
2,2022-12-20 09:00:00,1.378,1.382,1.370,1.376,147367.3,202854.9822,1521.0,69334.0,95480.8024
3,2022-12-20 10:00:00,1.376,1.379,1.370,1.372,110286.1,151686.0204,1298.0,54221.1,74610.2700
4,2022-12-20 11:00:00,1.371,1.381,1.369,1.376,137681.5,189355.7775,1531.0,77341.8,106407.4510
...,...,...,...,...,...,...,...,...,...,...
17995,2025-01-08 02:00:00,0.733,0.737,0.726,0.726,380304.8,278738.7397,1825.0,165776.1,121610.3918
17996,2025-01-08 03:00:00,0.726,0.727,0.715,0.720,1046747.3,754248.8342,3525.0,414410.3,298756.0311
17997,2025-01-08 04:00:00,0.720,0.724,0.710,0.712,721025.1,516095.3021,2709.0,340663.1,243975.4652
17998,2025-01-08 05:00:00,0.712,0.719,0.708,0.710,439378.6,313663.5621,1777.0,182565.3,130433.8889


# Eval Function

In [12]:
import numpy as np
import polars as pl
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Adjust the evaluate_baseline function to accept only y_pred and y_test
def evaluate_baseline(y_pred, y_test):
    # Convert lists to Numpy arrays
    pred = np.array(y_pred)
    test = np.array(y_test)

    # Evaluation Metrics
    me = np.mean(pred - test)  # Mean Error
    mae = mean_absolute_error(test, pred)  # Mean Absolute Error
    mape = np.mean(np.abs((test - pred) / test)) * 100  # Mean Absolute Percentage Error
    mpe = np.mean((test - pred) / test) * 100  # Mean Percentage Error
    rmse = np.sqrt(mean_squared_error(test, pred))  # Root Mean Squared Error

    r = r2_score(test, pred)  # Coefficient of Determination (R-squared)
    min_max_error = np.abs((np.min(pred) - np.min(test)) + 
                            (np.max(pred) - np.max(test)))  # Min-Max Error

    df = pl.DataFrame({
        f'volatility_prediction': pred,
        f'volatility_actual': test
    })

    df = df.with_columns(
        abs(pl.col(f'volatility_prediction') - pl.col(f'volatility_actual')).alias(f'dif_volatility')
    )

    # overall median abs error
    median_abs_err =  df[f'dif_volatility'].median()

    # overall var abs error
    var_abs_err =  df[f'dif_volatility'].var()

    # Results
    results = {
        "Overall Median Absolute Error" : median_abs_err,
        "Overall Variance Absolute Error" : var_abs_err,
        "Mean Error": me,
        "Mean Absolute Error ": mae,
        "Mean Absolute Percentage Error": mape,
        "Mean Percentage Error": mpe,
        "Root Mean Squared Error": rmse,
        "R-squared": r, 
        "Min-Max Error": min_max_error
    }
    
    return results, df

# Predict Next Hour Close Price

## Shift 1

In [25]:
df_base = df_GTCUSDT.copy()

df_base['next_close'] = df_base['close'].shift(-1)

df_base.dropna(inplace=True)

df_base

Unnamed: 0,timestamp,open,high,low,close,volume,quote_volume,trades,taker_buy_base,taker_buy_quote,next_close
0,2022-12-20 07:00:00,1.381,1.385,1.371,1.384,219078.4,301789.7780,2698.0,119518.1,164683.3273,1.378
1,2022-12-20 08:00:00,1.383,1.385,1.374,1.378,146202.4,201534.9789,1804.0,80843.1,111452.7083,1.376
2,2022-12-20 09:00:00,1.378,1.382,1.370,1.376,147367.3,202854.9822,1521.0,69334.0,95480.8024,1.372
3,2022-12-20 10:00:00,1.376,1.379,1.370,1.372,110286.1,151686.0204,1298.0,54221.1,74610.2700,1.376
4,2022-12-20 11:00:00,1.371,1.381,1.369,1.376,137681.5,189355.7775,1531.0,77341.8,106407.4510,1.375
...,...,...,...,...,...,...,...,...,...,...,...
17994,2025-01-08 01:00:00,0.739,0.740,0.728,0.733,520481.6,381889.9848,2098.0,189144.9,138698.2345,0.726
17995,2025-01-08 02:00:00,0.733,0.737,0.726,0.726,380304.8,278738.7397,1825.0,165776.1,121610.3918,0.720
17996,2025-01-08 03:00:00,0.726,0.727,0.715,0.720,1046747.3,754248.8342,3525.0,414410.3,298756.0311,0.712
17997,2025-01-08 04:00:00,0.720,0.724,0.710,0.712,721025.1,516095.3021,2709.0,340663.1,243975.4652,0.710


In [26]:
evaluate_baseline(df_base['close'], df_base['next_close'])

({'Overall Median Absolute Error': 0.006000000000000005,
  'Overall Variance Absolute Error': 0.0001770462297602302,
  'Mean Error': np.float64(3.739096616478693e-05),
  'Mean Absolute Error ': 0.00943957997666537,
  'Mean Absolute Percentage Error': np.float64(0.7644664778344074),
  'Mean Percentage Error': np.float64(-0.010747176761084283),
  'Root Mean Squared Error': np.float64(0.016313861083384562),
  'R-squared': 0.9985982003265816,
  'Min-Max Error': np.float64(0.0)},
 shape: (17_999, 3)
 ┌───────────────────────┬───────────────────┬────────────────┐
 │ volatility_prediction ┆ volatility_actual ┆ dif_volatility │
 │ ---                   ┆ ---               ┆ ---            │
 │ f64                   ┆ f64               ┆ f64            │
 ╞═══════════════════════╪═══════════════════╪════════════════╡
 │ 1.384                 ┆ 1.378             ┆ 0.006          │
 │ 1.378                 ┆ 1.376             ┆ 0.002          │
 │ 1.376                 ┆ 1.372             ┆ 0.004

In [22]:
df_base = df_LINAUSDT.copy()

df_base['next_close'] = df_base['close'].shift(-1)

df_base.dropna(inplace=True)

df_base

Unnamed: 0,timestamp,open,high,low,close,volume,quote_volume,trades,taker_buy_base,taker_buy_quote,next_close
0,2022-12-20 07:00:00,0.00523,0.00524,0.00518,0.00522,95953549.0,499752.47011,1454.0,39012818.0,203029.92707,0.00523
1,2022-12-20 08:00:00,0.00522,0.00524,0.00520,0.00523,78064433.0,407240.17370,1450.0,37530600.0,195841.86391,0.00527
2,2022-12-20 09:00:00,0.00523,0.00529,0.00523,0.00527,78634735.0,414112.52759,1637.0,39830057.0,209635.32397,0.00527
3,2022-12-20 10:00:00,0.00527,0.00529,0.00525,0.00527,46100004.0,242907.95562,1045.0,19837958.0,104538.88716,0.00529
4,2022-12-20 11:00:00,0.00527,0.00529,0.00526,0.00529,55923004.0,294952.16752,1030.0,24203601.0,127729.36851,0.00530
...,...,...,...,...,...,...,...,...,...,...,...
17994,2025-01-08 01:00:00,0.00485,0.00486,0.00477,0.00480,82059874.0,394745.79691,2948.0,34822299.0,167621.71018,0.00476
17995,2025-01-08 02:00:00,0.00480,0.00484,0.00476,0.00476,64642495.0,310348.10802,2911.0,29425644.0,141363.65081,0.00475
17996,2025-01-08 03:00:00,0.00477,0.00479,0.00470,0.00475,172447279.0,817394.40629,4814.0,83137452.0,394301.22048,0.00470
17997,2025-01-08 04:00:00,0.00475,0.00478,0.00469,0.00470,118707872.0,560994.75155,3123.0,58510516.0,276787.96469,0.00469


In [23]:
evaluate_baseline(df_base['close'], df_base['next_close'])

({'Overall Median Absolute Error': 4.99999999999997e-05,
  'Overall Variance Absolute Error': 1.482328941537609e-08,
  'Mean Error': np.float64(2.8334907494860827e-08),
  'Mean Absolute Error ': 7.660925606978164e-05,
  'Mean Absolute Percentage Error': np.float64(0.8188468960307557),
  'Mean Percentage Error': np.float64(-0.008809926458098435),
  'Root Mean Squared Error': np.float64(0.00014384520836339802),
  'R-squared': 0.9982952015619053,
  'Min-Max Error': np.float64(0.0)},
 shape: (17_999, 3)
 ┌───────────────────────┬───────────────────┬────────────────┐
 │ volatility_prediction ┆ volatility_actual ┆ dif_volatility │
 │ ---                   ┆ ---               ┆ ---            │
 │ f64                   ┆ f64               ┆ f64            │
 ╞═══════════════════════╪═══════════════════╪════════════════╡
 │ 0.00522               ┆ 0.00523           ┆ 0.00001        │
 │ 0.00523               ┆ 0.00527           ┆ 0.00004        │
 │ 0.00527               ┆ 0.00527           ┆ 

## Base ML

In [None]:
import time
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

# Separate df into features and target
features_df = df_base.drop(['timestamp', 'next_close'], axis=1)
target_df = df_base['next_close']

# Convert to NumPy arrays
X = features_df.values
y = target_df.values

# Specify parameters for the sliding window approach
num_predictions = 1   # Number of rows to predict
gap = 1               # Gap (number of rows to skip after each window)
max_windows = [100]    # Maximum number of windows to process
set_limit = True      # Set this to False to process all windows

# Define list of window sizes
# window_sizes = [1000]
window_sizes = list(range(500, 15001, 500))

# List to store results
all_results = []

# List to store resultsAggregate
for max_window in max_windows:
# Loop through each window size
    for window_size in window_sizes:
        print(f'Processing window size: {window_size}')

        # Calculate the number of windows based on dataset size
        num_windows = len(X) - window_size - num_predictions

        # Adjust the starting point if set_limit is True
        start_index = 0
        if set_limit:
            # Only process the latest max_window
            start_index = max(0, len(X) - window_size - num_predictions - max_window)
            num_windows = min(num_windows, max_window)

        # Initialize lists to store actual and predicted values for this window size
        y_vals = []
        y_preds = []

        # Loop through each sliding window with the gap applied
        for window_number in range(start_index, start_index + num_windows, gap):
            
            start = window_number
            end = start + window_size
            X_train = X[start:end]
            y_train = y[start:end]

            X_train_mean = np.mean(X_train, axis=0)
            X_train_std = np.std(X_train, axis=0)
            X_train_normalized = (X_train - X_train_mean) / X_train_std

            # Get the column index for 'cur_avg_vol_4h' from features_df
            close_index = features_df.columns.get_loc('close')

            # Normalize y_train using the mean and std of cur_avg_vol_4h
            close_mean = X_train[:, close_index].mean()
            close_std = X_train[:, close_index].std()
            y_train_normalized = (y_train - close_mean) / close_std

            # Prepare validation data for prediction
            X_val = X[end:end + num_predictions]
            y_val = y[end:end + num_predictions]

            # Normalize validation data using the statistics from the training set
            X_val_normalized = (X_val - X_train_mean) / X_train_std

            # Track the start time of the window processing
            start_time = time.time()

            # Initialize and fit the model
            model = LinearRegression()
            model.fit(X_train_normalized, y_train_normalized)

            # Predict on validation data
            y_pred_val = model.predict(X_val_normalized)
            # Denormalize y_val and y_pred_val using the mean and std of cur_avg_vol_4h
            y_pred_val_denorm = y_pred_val * close_std + close_mean

            # Track the end time of the window processing
            end_time = time.time()

            # Append actual and predicted values to the lists
            y_vals.append(y_val[0])  # Assuming a single prediction per window
            y_preds.append(y_pred_val_denorm[0])  # Assuming a single prediction per window

        # Now that all windows for this window_size are processed, evaluate the baseline
        eval_results, _ = evaluate_baseline(y_preds, y_vals)

        # Append the window size and evaluation metrics to the results list
        all_results.append({
            'eval_size' : max_window,
            'window_size': window_size,
            **eval_results
        })

# Convert the results to a DataFrame for further analysis
results_df = pd.DataFrame(all_results)

# Print the results DataFrame
results_df

Processing window size: 1000


Unnamed: 0,eval_size,window_size,Overall Median Absolute Error,Overall Variance Absolute Error,Mean Error,Mean Absolute Error,Mean Absolute Percentage Error,Mean Percentage Error,Root Mean Squared Error,R-squared,Min-Max Error
0,100,1000,0.003914,2.3e-05,0.00126,0.005085,0.638223,-0.169592,0.006981,0.951388,0.004356
