# USD JPY Hour Data

In [2]:
import polars as pl

# Read the CSV file into a Polars DataFrame
file_path = "USDJPY60.csv"  # Replace with your CSV file path
df = pl.read_csv(file_path, separator="\t", has_header=False)

# Rename columns to match the expected structure
df.columns = ["date", "open", "high", "low", "close", "volume"]

# Display the first few rows
df

date,open,high,low,close,volume
str,f64,f64,f64,f64,i64
"""2008-11-11 06:00""",97.97,98.06,97.85,97.92,7479
"""2008-11-11 07:00""",97.92,97.96,97.62,97.64,33880
"""2008-11-11 08:00""",97.64,97.82,97.555,97.785,35566
"""2008-11-11 09:00""",97.785,97.98,97.67,97.76,36609
"""2008-11-11 10:00""",97.755,97.9,97.605,97.795,34606
…,…,…,…,…,…
"""2024-12-04 22:00""",150.525,150.555,150.445,150.505,972
"""2024-12-04 23:00""",150.518,150.774,150.509,150.547,10980
"""2024-12-05 00:00""",150.547,150.547,150.191,150.221,28875
"""2024-12-05 01:00""",150.221,150.68,150.185,150.536,30694


# Clean + Features Engineering

In [3]:
import polars as pl

# Load the DataFrame (assuming it is already loaded as df)

df_features = df.clone()

# Ensure 'date' is in datetime format
df_features = df_features.with_columns(
    pl.col("date").str.strptime(pl.Datetime).alias("date")
)

# Create lagged features
df_features = df_features.with_columns(
    pl.col('open').shift(1).alias('prev_open'),
    pl.col('high').shift(1).alias('prev_high'),
    pl.col('low').shift(1).alias('prev_low'),
    pl.col('close').shift(1).alias('prev_close'),
    pl.col('volume').shift(1).alias('prev_volume')
)

# Calculate changes
df_features = df_features.with_columns(
    (pl.col('open') - pl.col('prev_open')).alias('open_change'),
    (pl.col('high') - pl.col('prev_high')).alias('high_change'),
    (pl.col('low') - pl.col('prev_low')).alias('low_change'),
    (pl.col('close') - pl.col('prev_close')).alias('close_change'),
    ((pl.col('open') - pl.col('prev_open')) / pl.col('prev_open') * 100).alias('open_pct_change'),
    ((pl.col('close') - pl.col('prev_close')) / pl.col('prev_close') * 100).alias('close_pct_change')
)

# Calculate rolling statistics
df_features = df_features.with_columns(
    pl.col('open').rolling_mean(3).alias('rolling_avg_open_3h'),
    pl.col('close').rolling_mean(5).alias('rolling_avg_close_5h'),
    pl.col('high').rolling_max(3).alias('rolling_max_high_3h'),
    pl.col('low').rolling_min(3).alias('rolling_min_low_3h')
)

# Calculate volume features
df_features = df_features.with_columns(
    (pl.col('volume') - pl.col('prev_volume')).alias('volume_change'),
    pl.col('volume').rolling_mean(3).alias('volume_ma_3h')
)

# Extract time-related features
df_features = df_features.with_columns(
    pl.col('date').dt.hour().alias('hour_of_day'),
    pl.col('date').dt.weekday().alias('day_of_week')  # Corrected method
)

# Calculate ratios
df_features = df_features.with_columns(
    (pl.col('high') / pl.col('low')).alias('high_low_ratio'),
    (pl.col('close') / pl.col('open')).alias('close_open_ratio')
)

# Calculate the highest close in the last 4, 8, 12, and 24 hours using rolling windows
df_features = df_features.with_columns(
    pl.col('close').rolling_max(window_size=4).alias('max_close_4h'),
    pl.col('close').rolling_max(window_size=8).alias('max_close_8h'),
    pl.col('close').rolling_max(window_size=12).alias('max_close_12h'),
    pl.col('close').rolling_max(window_size=24).alias('max_close_24h')
)

# Calculate the min and max of the entire 'close' column
max_close_all = df_features['close'].max()
min_close_all = df_features['close'].min()

# Calculate the difference from the current close to the max and min close
df_features = df_features.with_columns(
    (pl.col('close') - max_close_all).alias('diff_to_max_close'),
    (pl.col('close') - min_close_all).alias('diff_to_min_close')
)

# Drop rows with any null values
df_features = df_features.drop_nulls()

df_features = df_features.with_columns(
    pl.col('close').shift(-1).alias('next_close'),
)

# Print the updated DataFrame schema
df_features

date,open,high,low,close,volume,prev_open,prev_high,prev_low,prev_close,prev_volume,open_change,high_change,low_change,close_change,open_pct_change,close_pct_change,rolling_avg_open_3h,rolling_avg_close_5h,rolling_max_high_3h,rolling_min_low_3h,volume_change,volume_ma_3h,hour_of_day,day_of_week,high_low_ratio,close_open_ratio,max_close_4h,max_close_8h,max_close_12h,max_close_24h,diff_to_max_close,diff_to_min_close,next_close
datetime[μs],f64,f64,f64,f64,i64,f64,f64,f64,f64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,f64,i8,i8,f64,f64,f64,f64,f64,f64,f64,f64,f64
2008-11-12 05:00:00,97.685,97.74,97.555,97.7,18396,97.57,97.765,97.57,97.685,21204,0.115,-0.025,-0.015,0.015,0.117864,0.015355,97.68,97.714,97.84,97.545,-2808,21443.333333,5,3,1.001896,1.000154,97.795,97.825,97.855,97.975,-64.236,22.021,97.77
2008-11-12 06:00:00,97.7,97.815,97.67,97.77,16598,97.685,97.74,97.555,97.7,18396,0.015,0.075,0.115,0.07,0.015355,0.071648,97.651667,97.703,97.815,97.555,-1798,18732.666667,6,3,1.001485,1.000716,97.77,97.825,97.855,97.975,-64.166,22.091,97.945
2008-11-12 07:00:00,97.755,98.07,97.72,97.945,38507,97.7,97.815,97.67,97.77,16598,0.055,0.255,0.05,0.175,0.056295,0.178992,97.713333,97.733,98.07,97.555,21909,24500.333333,7,3,1.003582,1.001944,97.945,97.945,97.945,97.975,-63.991,22.266,97.565
2008-11-12 08:00:00,97.955,97.995,97.51,97.565,41159,97.755,98.07,97.72,97.945,38507,0.2,-0.075,-0.21,-0.38,0.204593,-0.387973,97.803333,97.733,98.07,97.51,2652,32088.0,8,3,1.004974,0.996019,97.945,97.945,97.945,97.975,-64.371,21.886,97.31
2008-11-12 09:00:00,97.56,97.75,97.195,97.31,41104,97.955,97.995,97.51,97.565,41159,-0.395,-0.245,-0.315,-0.255,-0.403246,-0.261364,97.756667,97.658,98.07,97.195,-55,40256.666667,9,3,1.00571,0.997437,97.945,97.945,97.945,97.975,-64.626,21.631,97.26
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
2024-12-04 22:00:00,150.525,150.555,150.445,150.505,972,150.519,150.659,150.484,150.539,8303,0.006,-0.104,-0.039,-0.034,0.003986,-0.022586,150.494,150.4822,150.67,150.436,-7331,11390.666667,22,3,1.000731,0.999867,150.539,150.539,151.17,151.17,-11.431,74.826,150.547
2024-12-04 23:00:00,150.518,150.774,150.509,150.547,10980,150.525,150.555,150.445,150.505,972,-0.007,0.219,0.064,0.042,-0.00465,0.027906,150.520667,150.51,150.774,150.445,10008,6751.666667,23,3,1.001761,1.000193,150.547,150.547,151.17,151.17,-11.389,74.868,150.221
2024-12-05 00:00:00,150.547,150.547,150.191,150.221,28875,150.518,150.774,150.509,150.547,10980,0.029,-0.227,-0.318,-0.326,0.019267,-0.216544,150.53,150.4666,150.774,150.191,17895,13609.0,0,4,1.00237,0.997835,150.547,150.547,151.071,151.17,-11.715,74.542,150.536
2024-12-05 01:00:00,150.221,150.68,150.185,150.536,30694,150.547,150.547,150.191,150.221,28875,-0.326,0.133,-0.006,0.315,-0.216544,0.209691,150.428667,150.4696,150.774,150.185,1819,23516.333333,1,4,1.003296,1.002097,150.547,150.547,150.848,151.17,-11.4,74.857,150.333


# Linear Regression

In [None]:
import time
import numpy as np
import polars as pl
import pandas as pd
from sklearn.linear_model import LinearRegression

# Separate df into features and target using Polars' select method
features_df = df_features.select([col for col in df_features.columns if col not in ['datetime', 'next_close']])
target_df = df_features['next_close']

# Convert Polars DataFrame to NumPy arrays for processing
X = features_df.to_numpy()
y = target_df.to_numpy()

# Specify parameters for the sliding window approach
num_predictions = 1   # Number of rows to predict
gap = 1               # Gap (number of rows to skip after each window)
max_windows = 50      # Maximum number of windows to process
set_limit = False     # Set this to False to process all windows

# Define list of window sizes
window_sizes = list(range(10000, 95001, 10000))
# window_sizes = [90000]

# List to store results
results = []

# Loop through each window size
for window_size in window_sizes:
    # Initiate lists to store RMSEs and percentages
    all_val_rmse = []
    all_val_rmse_perc = []
    all_train_rmse = []
    all_train_rmse_perc = []
    total_window_times = 0  # Variable to store total time for all windows

    # Counters and variables for prediction comparison to actual values
    lower_count = 0
    higher_count = 0
    max_rmse_perc_lower = 0  # To store max RMSE% when prediction is lower than actual
    max_rmse_perc_higher = 0  # To store max RMSE% when prediction is higher than actual

    # Separate lists to store RMSE percentage for lower and higher predictions
    lower_rmse_percs = []
    higher_rmse_percs = []

    # Separate lists to store val_rmse for lower and higher predictions
    rmse_lower_perc = []  # To store RMSE when prediction is lower than actual
    rmse_higher_perc = []  # To store RMSE when prediction is higher than actual

    # Calculate the number of windows based on dataset size
    num_windows = len(X) - window_size - num_predictions

    # Apply maximum window limit if set
    if set_limit:
        num_windows = min(num_windows, max_windows)

    # Loop through each sliding window with the gap applied
    window_number = 0
    while window_number < num_windows:
        
        if window_number % 20000 == 0:
            print(f'Processing window {window_number} of {num_windows}')

        start = window_number
        end = start + window_size
        X_train = X[start:end]
        y_train = y[start:end]

        # Normalize the training features
        X_train_mean = np.mean(X_train, axis=0)
        X_train_std = np.std(X_train, axis=0)
        X_train_normalized = (X_train - X_train_mean) / X_train_std

        # Get the column index for 'close' from features_df
        close_index = features_df.columns.index('close')

        # Normalize y_train using the mean and std of the 'close' column
        close_mean = X_train[:, close_index].mean()
        close_std = X_train[:, close_index].std()
        y_train_normalized = (y_train - close_mean) / close_std

        # Prepare validation data for prediction
        X_val = X[end:end + num_predictions]
        y_val = y[end:end + num_predictions]

        # Normalize validation data using the statistics from the training set
        X_val_normalized = (X_val - X_train_mean) / X_train_std

        # Track the start time of the window processing
        start_time = time.time()

        # Initialize and fit the model
        model = LinearRegression()
        model.fit(X_train_normalized, y_train_normalized)

        # Predict on validation data
        y_pred_val = model.predict(X_val_normalized)
        # Predict on training data
        y_pred_train = model.predict(X_train_normalized)

        # Denormalize y_val, y_pred_train, and y_pred_val using the mean and std of 'close'
        y_train_denorm = y_train * close_std + close_mean
        y_pred_train_denorm = y_pred_train * close_std + close_mean
        y_pred_val_denorm = y_pred_val * close_std + close_mean

        # Calculate RMSE and RMSE percentage for validation
        mse_val = np.mean((y_val - y_pred_val_denorm) ** 2)
        rmse_val = np.sqrt(mse_val)
        rmse_val_perc = (rmse_val / y_val)[0] * 100  # Convert to percentage

        # Calculate RMSE for training
        mse_train = np.mean((y_train - y_pred_train_denorm) ** 2)
        rmse_train = np.sqrt(mse_train)
        rmse_train_perc = ((y_train - y_pred_train_denorm) ** 2 / y_train).mean() * 100  # Convert to percentage

        # Track the end time of the window processing
        end_time = time.time()

        # Calculate the time taken for this window
        window_time = end_time - start_time
        total_window_times += window_time  # Add the window time to the total time

        # Append RMSEs and percentage errors
        all_val_rmse.append(rmse_val)
        all_val_rmse_perc.append(rmse_val_perc)
        all_train_rmse.append(rmse_train)
        all_train_rmse_perc.append(rmse_train_perc)

        # Count predictions relative to actual values and update max RMSE percentage
        if y_pred_val_denorm < y_val:
            lower_count += 1
            lower_rmse_percs.append(rmse_val_perc)  # Store RMSE percentage for lower predictions
            max_rmse_perc_lower = max(max_rmse_perc_lower, rmse_val_perc)
        elif y_pred_val_denorm > y_val:
            higher_count += 1
            higher_rmse_percs.append(rmse_val_perc)  # Store RMSE percentage for higher predictions
            max_rmse_perc_higher = max(max_rmse_perc_higher, rmse_val_perc)

        # Move to the next window based on the gap
        window_number += gap

    # Calculate percentage for lower and higher counts
    lower_count_perc = (lower_count / num_windows) * 100
    higher_count_perc = (higher_count / num_windows) * 100

    # Calculate average RMSE percentage errors for lower and higher predictions
    avg_rmse_perc_lower = np.mean(lower_rmse_percs) if lower_rmse_percs else 0
    avg_rmse_perc_higher = np.mean(higher_rmse_percs) if higher_rmse_percs else 0

    # Calculate average, max, min, and variance for validation and training RMSEs, percentages
    avg_val_rmse = np.mean(all_val_rmse)
    var_val_rmse = np.var(all_val_rmse)

    avg_val_rmse_perc = np.mean(all_val_rmse_perc)
    var_val_rmse_perc = np.var(all_val_rmse_perc)
    max_avg_val_rmse_perc = np.max(all_val_rmse_perc)

    avg_train_rmse = np.mean(all_train_rmse)
    var_train_rmse = np.var(all_train_rmse)

    avg_train_rmse_perc = np.mean(all_train_rmse_perc)
    var_train_rmse_perc = np.var(all_train_rmse_perc)

    # Calculate percentage of times RMSE is less than or equal to 0.0776
    rmse_less_equal_avg_change = (np.sum(np.array(all_val_rmse) <= 0.0776) / len(all_val_rmse)) * 100

    # Calculate percentage of times prediction is lower or equal to average RMSE
    percentage_lower_equal_avg_rmse = (np.sum(np.array(all_val_rmse) <= avg_val_rmse) / len(all_val_rmse)) * 100

    # Append results to the list with updated metric name
    results.append({
        'window_size': window_size,
        'avg_val_rmse': avg_val_rmse,
        'windowed_confidence_level': percentage_lower_equal_avg_rmse,  # Renamed metric
        'var_val_rmse': var_val_rmse,
        'avg_val_rmse_perc': avg_val_rmse_perc,
        'var_val_rmse_perc': var_val_rmse_perc,
        'avg_train_rmse': avg_train_rmse,
        'var_train_rmse': var_train_rmse,
        'avg_train_rmse_perc': avg_train_rmse_perc,
        'var_train_rmse_perc': var_train_rmse_perc,
        'window_time': total_window_times,
        'max_avg_val_rmse_perc': max_avg_val_rmse_perc,
        'lower_count_perc': lower_count_perc,
        'higher_count_perc': higher_count_perc,
        'max_rmse_perc_lower': max_rmse_perc_lower,
        'max_rmse_perc_higher': max_rmse_perc_higher,
        'avg_rmse_perc_lower': avg_rmse_perc_lower,
        'avg_rmse_perc_higher': avg_rmse_perc_higher,
        'rmse_less_equal_60_perc': rmse_less_equal_avg_change
    })

    # Print results for the current window size with the new name
    print(f'Window size [{window_size}] | Time Elapsed: {total_window_times:.3f} seconds')
    print(f'Average Prediction Error: {avg_val_rmse:.3f} JPY | {avg_val_rmse_perc:.3f} % | Confidence Level: {percentage_lower_equal_avg_rmse:.3f} % ')
    print(f'Average Prediction Error that less than avg change: {rmse_less_equal_avg_change:.3f} %')
# Optionally, you could convert the results to a DataFrame or CSV for further analysis
results_summary = pd.DataFrame(results)
results_summary

Processing window 0 of 9987
Window size [90000] | Time Elapsed: 604.600 seconds
Average Prediction Error: 0.112 JPY | 0.076 % | Confidence Level: 66.807 % 
Average Prediction Error that less than avg change: 53.429 %


Unnamed: 0,window_size,avg_val_rmse,windowed_confidence_level,var_val_rmse,avg_val_rmse_perc,var_val_rmse_perc,avg_train_rmse,var_train_rmse,avg_train_rmse_perc,var_train_rmse_perc,window_time,max_avg_val_rmse_perc,lower_count_perc,higher_count_perc,max_rmse_perc_lower,max_rmse_perc_higher,avg_rmse_perc_lower,avg_rmse_perc_higher,rmse_less_equal_60_perc
0,90000,0.112434,66.806849,0.021617,0.076139,0.009891,0.130075,3e-06,0.015261,8.184805e-08,604.600202,2.15428,50.705918,49.294082,1.380629,2.15428,0.07562,0.076673,53.429458
