In [1]:
import pandas as pd
import polars as pl

# Read Data

In [2]:
import pandas as pd

# Read the CSV file into a DataFrame
df = pd.read_csv('NEAR_USDT_futures_data_2020.csv')

# Display the first few rows of the DataFrame
df

Unnamed: 0,coin,opentime,openprice,highprice,lowprice,closeprice,volume,closetime,quotevolume,trades,taker_buy_volume,taker_buy_quote,unused
0,NEAR,1602748800000,1.0625,1.1872,1.0625,1.1169,1449407,1602752399999,1.655530e+06,9968,766784,8.779915e+05,0
1,NEAR,1602752400000,1.1169,1.1301,1.0778,1.1179,1616432,1602755999999,1.789316e+06,8743,873713,9.684362e+05,0
2,NEAR,1602756000000,1.1176,1.1650,1.1136,1.1570,1270406,1602759599999,1.441663e+06,8310,721173,8.190750e+05,0
3,NEAR,1602759600000,1.1572,1.1575,1.1210,1.1279,481575,1602763199999,5.503787e+05,4446,227307,2.600841e+05,0
4,NEAR,1602763200000,1.1286,1.1812,1.1025,1.1536,1835160,1602766799999,2.107977e+06,9266,934671,1.074642e+06,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
36456,NEAR,1733990400000,7.0790,7.1800,7.0780,7.1800,1252786,1733993999999,8.950482e+06,20675,658195,4.702462e+06,0
36457,NEAR,1733994000000,7.1790,7.2200,7.1460,7.1490,1535789,1733997599999,1.103534e+07,23363,709728,5.101661e+06,0
36458,NEAR,1733997600000,7.1490,7.1560,7.0210,7.0380,2065246,1734001199999,1.461731e+07,35347,875286,6.193253e+06,0
36459,NEAR,1734001200000,7.0380,7.0510,6.9450,7.0070,1712135,1734004799999,1.197202e+07,32840,797708,5.579540e+06,0


# Features Engineering

## non-aggregate

In [3]:
df_m = df.copy()

# Calculate Volatility
df_m.drop(columns=['coin','unused', 'closetime'], inplace=True)

df_m['volatility'] = (df_m['highprice'] - df_m['lowprice']) / df_m['lowprice']

df_m['next_4h_avg_volatility'] = (df_m['volatility'].shift(-1) + df_m['volatility'].shift(-2) + df_m['volatility'].shift(-3) + df_m['volatility'].shift(-4)) / 4

df_m

Unnamed: 0,opentime,openprice,highprice,lowprice,closeprice,volume,quotevolume,trades,taker_buy_volume,taker_buy_quote,volatility,next_4h_avg_volatility
0,1602748800000,1.0625,1.1872,1.0625,1.1169,1449407,1.655530e+06,9968,766784,8.779915e+05,0.117365,0.049656
1,1602752400000,1.1169,1.1301,1.0778,1.1179,1616432,1.789316e+06,8743,873713,9.684362e+05,0.048525,0.052932
2,1602756000000,1.1176,1.1650,1.1136,1.1570,1270406,1.441663e+06,8310,721173,8.190750e+05,0.046157,0.053795
3,1602759600000,1.1572,1.1575,1.1210,1.1279,481575,5.503787e+05,4446,227307,2.600841e+05,0.032560,0.050324
4,1602763200000,1.1286,1.1812,1.1025,1.1536,1835160,2.107977e+06,9266,934671,1.074642e+06,0.071383,0.047671
...,...,...,...,...,...,...,...,...,...,...,...,...
36456,1733990400000,7.0790,7.1800,7.0780,7.1800,1252786,8.950482e+06,20675,658195,4.702462e+06,0.014411,0.014886
36457,1733994000000,7.1790,7.2200,7.1460,7.1490,1535789,1.103534e+07,23363,709728,5.101661e+06,0.010355,
36458,1733997600000,7.1490,7.1560,7.0210,7.0380,2065246,1.461731e+07,35347,875286,6.193253e+06,0.019228,
36459,1734001200000,7.0380,7.0510,6.9450,7.0070,1712135,1.197202e+07,32840,797708,5.579540e+06,0.015263,


In [None]:
df_features = df_m.copy()

# Get Rolling 4h data
df_features['openprice_4h'] = df_features['openprice'].shift(3)
df_features['highprice_4h'] = df_features['highprice'].rolling(window=4).max()
df_features['lowprice_4h'] = df_features['lowprice'].rolling(window=4).min()
df_features['volume_4h'] = df_features['volume'].rolling(window=4).sum()
df_features['quotevolume_4h'] = df_features['quotevolume'].rolling(window=4).sum()
df_features['trades_4h'] = df_features['trades'].rolling(window=4).sum()
df_features['taker_buy_volume_4h'] = df_features['taker_buy_volume'].rolling(window=4).sum()
df_features['taker_buy_quote_4h'] = df_features['taker_buy_quote'].rolling(window=4).sum()

df_features['4h_volatility'] = (df_features['highprice_4h'] - df_features['lowprice_4h']) / df_features['lowprice_4h']

df_features['current_4h_avg_volatility'] = df_features['volatility'].rolling(window=4).mean()

# Volume features
df_features['volume_change'] = df_features['volume'].diff()
df_features['quotevolume_change'] = df_features['quotevolume'].diff()
df_features['relative_volume'] = df_features['volume'] / df_features['quotevolume']
df_features['taker_buy_volume_ratio'] = df_features['taker_buy_volume'] / df_features['volume']
df_features['taker_buy_quotevolume_ratio'] = df_features['taker_buy_quote'] / df_features['quotevolume']

df_features['std_10_volume'] = df_features['volume'].rolling(window=10).std()
df_features['std_20_volume'] = df_features['volume'].rolling(window=20).std()
df_features['std_50_volume'] = df_features['volume'].rolling(window=50).std()

df_features['sum_10_volume'] = df_features['volume'].rolling(window=10).sum()
df_features['sum_20_volume'] = df_features['volume'].rolling(window=20).sum()
df_features['sum_50_volume'] = df_features['volume'].rolling(window=50).sum()

# df_features['sum_10_quotevolume'] = df_features['quotevolume'].rolling(window=10).sum()
# df_features['sum_20_quotevolume'] = df_features['quotevolume'].rolling(window=20).sum()
# df_features['sum_50_quotevolume'] = df_features['quotevolume'].rolling(window=50).sum()

# # ema
# df_features['ema_closeprice'] = df_features['closeprice'].ewm(alpha=0.1, adjust=False).mean()
# df_features['ema_volume'] = df_features['volume'].ewm(alpha=0.1, adjust=False).mean() 

# lagged
df_features['lag_price_change_1h'] = df_features['closeprice'] - df_features['closeprice'].shift(1)
df_features['lag_price_change_2h'] = df_features['closeprice'] - df_features['closeprice'].shift(2)
df_features['lag_price_change_4h'] = df_features['closeprice'] - df_features['closeprice'].shift(4)
df_features['lag_price_change_8h'] = df_features['closeprice'] - df_features['closeprice'].shift(8)

df_features['lag_volume_change_1h'] = df_features['volume'] - df_features['volume'].shift(1)
df_features['lag_volume_change_2h'] = df_features['volume'] - df_features['volume'].shift(2)
df_features['lag_volume_change_4h'] = df_features['volume'] - df_features['volume'].shift(4)
df_features['lag_volume_change_8h'] = df_features['volume'] - df_features['volume'].shift(8)

# Rolling
df_features['ma_10_volatility'] = df_features['volatility'].rolling(window=10).mean()
df_features['ma_20_volatility'] = df_features['volatility'].rolling(window=20).mean()
df_features['ma_50_volatility'] = df_features['volatility'].rolling(window=50).mean()

df_features['max_10_volatility'] = df_features['volatility'].rolling(window=10).max()
df_features['max_20_volatility'] = df_features['volatility'].rolling(window=20).max()
df_features['max_50_volatility'] = df_features['volatility'].rolling(window=50).max()

df_features['min_10_volatility'] = df_features['volatility'].rolling(window=10).min()
df_features['min_20_volatility'] = df_features['volatility'].rolling(window=20).min()
df_features['min_50_volatility'] = df_features['volatility'].rolling(window=50).min()

df_features['max_10_open'] = df_features['openprice'].rolling(window=10).max()
df_features['min_10_open'] = df_features['openprice'].rolling(window=10).min()
df_features['max_20_open'] = df_features['openprice'].rolling(window=20).max()
df_features['min_20_open'] = df_features['openprice'].rolling(window=20).min()
df_features['max_50_open'] = df_features['openprice'].rolling(window=50).max()
df_features['min_50_open'] = df_features['openprice'].rolling(window=50).min()

df_features['max_10_close'] = df_features['closeprice'].rolling(window=10).max()
df_features['min_10_close'] = df_features['closeprice'].rolling(window=10).min()
df_features['max_20_close'] = df_features['closeprice'].rolling(window=20).max()
df_features['min_20_close'] = df_features['closeprice'].rolling(window=20).min()
df_features['max_50_close'] = df_features['closeprice'].rolling(window=50).max()
df_features['min_50_close'] = df_features['closeprice'].rolling(window=50).min()

df_features['ma_10_high'] = df_features['highprice'].rolling(window=10).mean()
df_features['ma_20_high'] = df_features['highprice'].rolling(window=20).mean()
df_features['ma_50_high'] = df_features['highprice'].rolling(window=50).mean()

df_features['ma_10_low'] = df_features['lowprice'].rolling(window=10).mean()
df_features['ma_20_low'] = df_features['lowprice'].rolling(window=20).mean()
df_features['ma_50_low'] = df_features['lowprice'].rolling(window=50).mean()

#Seasonality
df_features['year'] = pd.to_datetime(df_features['opentime'], unit='ms').dt.year
df_features['month'] = pd.to_datetime(df_features['opentime'], unit='ms').dt.month
df_features['day'] = pd.to_datetime(df_features['opentime'], unit='ms').dt.day
df_features['hour'] = pd.to_datetime(df_features['opentime'], unit='ms').dt.hour

df_features.dropna(inplace=True)

df_features

Unnamed: 0,opentime,openprice,highprice,lowprice,closeprice,volume,quotevolume,trades,taker_buy_volume,taker_buy_quote,...,ma_10_high,ma_20_high,ma_50_high,ma_10_low,ma_20_low,ma_50_low,year,month,day,hour
49,1602925200000,0.8135,0.8200,0.7732,0.7786,1767918,1.400896e+06,10835,915548,7.252089e+05,...,0.82514,0.854545,1.003818,0.77800,0.815300,0.958682,2020,10,17,9
50,1602928800000,0.7785,0.8070,0.7756,0.7983,1585840,1.255008e+06,10357,834632,6.612113e+05,...,0.82400,0.847655,0.996214,0.78361,0.809405,0.952944,2020,10,17,10
51,1602932400000,0.7982,0.8230,0.7970,0.7995,742794,6.015695e+05,5050,389137,3.154854e+05,...,0.82981,0.843325,0.990072,0.78874,0.805755,0.947328,2020,10,17,11
52,1602936000000,0.7994,0.8073,0.7857,0.7935,783374,6.238367e+05,6806,388355,3.097979e+05,...,0.83331,0.837980,0.982918,0.79260,0.801795,0.940770,2020,10,17,12
53,1602939600000,0.7935,0.8040,0.7744,0.7841,1139354,9.015986e+05,9553,612633,4.856943e+05,...,0.82711,0.833305,0.975848,0.79474,0.796740,0.933838,2020,10,17,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36452,1733976000000,7.0530,7.1250,7.0400,7.0810,1549828,1.097683e+07,29710,682310,4.834212e+06,...,6.95370,6.857500,6.645040,6.85340,6.747900,6.489080,2024,12,12,4
36453,1733979600000,7.0820,7.2150,7.0710,7.1270,2039678,1.460077e+07,34191,1225453,8.771590e+06,...,6.98960,6.890500,6.662900,6.88130,6.778450,6.509500,2024,12,12,5
36454,1733983200000,7.1270,7.1680,7.0640,7.1220,1633077,1.162963e+07,26984,677472,4.826473e+06,...,7.01690,6.920900,6.678180,6.90420,6.806650,6.527960,2024,12,12,6
36455,1733986800000,7.1210,7.1440,7.0670,7.0800,792438,5.630026e+06,17852,347334,2.468128e+06,...,7.04300,6.946150,6.691860,6.92830,6.834450,6.542220,2024,12,12,7


## aggregate

In [102]:
df_m = df.copy()

# Calculate Volatility
df_m.drop(columns=['coin','unused', 'closetime'], inplace=True)

df_m['volatility'] = (df_m['highprice'] - df_m['lowprice']) / df_m['lowprice']

# Example: Assuming your DataFrame is named `df_m`
df_m['opentime'] = pd.to_datetime(df_m['opentime'], unit='ms')  # Convert 'opentime' to datetime

# Step 1: Drop initial rows to align with 4-hour intervals
start_time = df_m['opentime'].iloc[0]
start_offset = (start_time.hour % 4) * 3600000 + start_time.minute * 60000 + start_time.second * 1000 + start_time.microsecond // 1000
if start_offset != 0:
    df_m = df_m.iloc[(4 - start_time.hour % 4):]

# Step 2: Set 'opentime' as index
df_m.set_index('opentime', inplace=True)

# Step 3: Aggregate the data every 4 hours
agg_dict = {
    'openprice': 'first',
    'highprice': 'max',
    'lowprice': 'min',
    'closeprice': 'last',
    'volume': 'sum',
    'quotevolume': 'sum',
    'trades': 'sum',
    'taker_buy_volume': 'sum',
    'taker_buy_quote': 'sum',
    'volatility' : 'mean'
}

df_agg = df_m.resample('4h').agg(agg_dict).dropna().reset_index()

# Target
df_agg['next_volatility'] = df_agg['volatility'].shift(-1)

df_agg.rename(columns={'volatility' : '4h_avg_volatility', 'next_volatility' : 'next_4h_avg_volatility'}, inplace=True)

df_agg.dropna(inplace=True)

df_agg


Unnamed: 0,opentime,openprice,highprice,lowprice,closeprice,volume,quotevolume,trades,taker_buy_volume,taker_buy_quote,4h_avg_volatility,next_4h_avg_volatility
0,2020-10-15 08:00:00,1.0625,1.1872,1.0625,1.1279,4817820,5.436888e+06,31467,2588977,2.925587e+06,0.061152,0.050324
1,2020-10-15 12:00:00,1.1286,1.2231,1.1025,1.1777,7234110,8.514717e+06,44982,3741813,4.409185e+06,0.050324,0.035115
2,2020-10-15 16:00:00,1.1772,1.1800,1.1124,1.1609,3082573,3.527578e+06,15450,1317154,1.509746e+06,0.035115,0.018301
3,2020-10-15 20:00:00,1.1614,1.1676,1.1143,1.1220,1350979,1.536829e+06,7032,614980,7.007536e+05,0.018301,0.028176
4,2020-10-16 00:00:00,1.1210,1.1585,1.1000,1.1082,1678252,1.900205e+06,8564,702786,7.972310e+05,0.028176,0.081511
...,...,...,...,...,...,...,...,...,...,...,...,...
9110,2024-12-11 16:00:00,6.8760,6.9250,6.7410,6.8520,5710584,3.899074e+07,103733,2551115,1.742565e+07,0.013953,0.009367
9111,2024-12-11 20:00:00,6.8510,6.9060,6.8250,6.8450,3149226,2.162340e+07,63129,1365533,9.378979e+06,0.009367,0.021838
9112,2024-12-12 00:00:00,6.8450,7.1500,6.7750,7.0540,7133997,4.963662e+07,137048,3651954,2.542302e+07,0.021838,0.014514
9113,2024-12-12 04:00:00,7.0530,7.2150,7.0400,7.0800,6015021,4.283726e+07,108737,2932569,2.090040e+07,0.014514,0.014814


In [103]:
df_features = df_agg.copy()

# Volume features
df_features['volume_change'] = df_features['volume'].diff()
df_features['quotevolume_change'] = df_features['quotevolume'].diff()
df_features['relative_volume'] = df_features['volume'] / df_features['quotevolume']
df_features['taker_buy_volume_ratio'] = df_features['taker_buy_volume'] / df_features['volume']
df_features['taker_buy_quotevolume_ratio'] = df_features['taker_buy_quote'] / df_features['quotevolume']

# ema
df_features['ema_closeprice'] = df_features['closeprice'].ewm(alpha=0.1, adjust=False).mean()
df_features['ema_volume'] = df_features['volume'].ewm(alpha=0.1, adjust=False).mean() 

# lagged
df_features['lag_price_change_1h'] = df_features['closeprice'] - df_features['closeprice'].shift(1)
df_features['lag_price_change_2h'] = df_features['closeprice'] - df_features['closeprice'].shift(2)
df_features['lag_volume_change_1h'] = df_features['volume'] - df_features['volume'].shift(1)
df_features['lag_volume_change_2h'] = df_features['volume'] - df_features['volume'].shift(2)

df_features.dropna(inplace=True)

df_features

Unnamed: 0,opentime,openprice,highprice,lowprice,closeprice,volume,quotevolume,trades,taker_buy_volume,taker_buy_quote,...,quotevolume_change,relative_volume,taker_buy_volume_ratio,taker_buy_quotevolume_ratio,ema_closeprice,ema_volume,lag_price_change_1h,lag_price_change_2h,lag_volume_change_1h,lag_volume_change_2h
2,2020-10-15 16:00:00,1.1772,1.1800,1.1124,1.1609,3082573,3.527578e+06,15450,1317154,1.509746e+06,...,-4.987139e+06,0.873850,0.427290,0.427984,1.135682,4.861761e+06,-0.0168,0.0330,-4151537.0,-1735247.0
3,2020-10-15 20:00:00,1.1614,1.1676,1.1143,1.1220,1350979,1.536829e+06,7032,614980,7.007536e+05,...,-1.990749e+06,0.879069,0.455211,0.455974,1.134314,4.510683e+06,-0.0389,-0.0557,-1731594.0,-5883131.0
4,2020-10-16 00:00:00,1.1210,1.1585,1.1000,1.1082,1678252,1.900205e+06,8564,702786,7.972310e+05,...,3.633753e+05,0.883195,0.418761,0.419550,1.131702,4.227440e+06,-0.0138,-0.0527,327273.0,-1404321.0
5,2020-10-16 04:00:00,1.1086,1.1087,0.9274,0.9359,6440520,6.439931e+06,31286,2614438,2.599064e+06,...,4.539727e+06,1.000091,0.405936,0.403586,1.112122,4.448748e+06,-0.1723,-0.1861,4762268.0,5089541.0
6,2020-10-16 08:00:00,0.9352,0.9770,0.8600,0.9213,7946880,7.273910e+06,59926,3903522,3.578668e+06,...,8.339790e+05,1.092518,0.491202,0.491987,1.093040,4.798561e+06,-0.0146,-0.1869,1506360.0,6268628.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9110,2024-12-11 16:00:00,6.8760,6.9250,6.7410,6.8520,5710584,3.899074e+07,103733,2551115,1.742565e+07,...,-2.788801e+07,0.146460,0.446735,0.446918,6.816768,1.032557e+07,-0.0240,0.2790,-4193301.0,-275364.0
9111,2024-12-11 20:00:00,6.8510,6.9060,6.8250,6.8450,3149226,2.162340e+07,63129,1365533,9.378979e+06,...,-1.736733e+07,0.145640,0.433609,0.433742,6.819591,9.607936e+06,-0.0070,-0.0310,-2561358.0,-6754659.0
9112,2024-12-12 00:00:00,6.8450,7.1500,6.7750,7.0540,7133997,4.963662e+07,137048,3651954,2.542302e+07,...,2.801321e+07,0.143724,0.511909,0.512183,6.843032,9.360542e+06,0.2090,0.2020,3984771.0,1423413.0
9113,2024-12-12 04:00:00,7.0530,7.2150,7.0400,7.0800,6015021,4.283726e+07,108737,2932569,2.090040e+07,...,-6.799359e+06,0.140416,0.487541,0.487902,6.866729,9.025990e+06,0.0260,0.2350,-1118976.0,2865795.0


# Import CSV

In [28]:
df_test = pd.read_csv("/home/ubuntu/Rheza/local-share/02_NEAR_USDT/near_test_1.csv")
df_test

Unnamed: 0,opentime,openprice,highprice,lowprice,closeprice,volume,quotevolume,trades,taker_buy_volume,taker_buy_quote,...,rolling_price_spread,moving_avg_crossover,day_of_week,hour_of_day,return_to_volatility_ratio,historical_volatility,uptrend_volume,downtrend_volume,volume_volatility_interaction,quote_momentum_interaction
0,1602936000000,0.7994,0.8073,0.7857,0.7935,783374,6.238367e+05,6806,388355,3.097979e+05,...,0.2274,-0.146228,6,12,-0.268467,1.695386,0,783374,2.153605e+06,-1.129768e+04
1,1602939600000,0.7935,0.8040,0.7744,0.7841,1139354,9.015986e+05,9553,612633,4.856943e+05,...,0.2253,-0.145322,6,13,-0.309924,0.686394,0,1139354,4.354969e+06,-1.942043e+04
2,1602943200000,0.7840,0.8036,0.7752,0.7978,1905833,1.506094e+06,13371,1010611,7.994994e+05,...,0.1947,-0.141476,6,14,0.480461,1.587069,0,1905833,6.982154e+06,-6.882850e+03
3,1602946800000,0.7967,0.8130,0.7887,0.7900,984310,7.858549e+05,5701,529272,4.236381e+05,...,0.1947,-0.135062,6,15,-0.272952,1.610187,0,984310,3.032678e+06,-8.015720e+03
4,1602950400000,0.7899,0.7977,0.7560,0.7609,1452344,1.125279e+06,10245,605373,4.696621e+05,...,0.1805,-0.131576,6,16,-0.665597,2.716583,0,1452344,8.010945e+06,-3.896842e+04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36400,1733976000000,7.0530,7.1250,7.0400,7.0810,1549828,1.097683e+07,29710,682310,4.834212e+06,...,0.7860,0.331720,4,4,0.328805,1.528993,1549828,0,1.871241e+06,1.891308e+06
36401,1733979600000,7.0820,7.2150,7.0710,7.1270,2039678,1.460077e+07,34191,1225453,8.771590e+06,...,0.8510,0.342840,4,5,0.312015,0.126791,2039678,0,4.153778e+06,2.785827e+06
36402,1733983200000,7.1270,7.1680,7.0640,7.1220,1633077,1.162963e+07,26984,677472,4.826473e+06,...,0.8510,0.354240,4,6,-0.047652,0.358911,1633077,0,2.404304e+06,1.859578e+06
36403,1733986800000,7.1210,7.1440,7.0670,7.0800,792438,5.630026e+06,17852,347334,2.468128e+06,...,0.8500,0.361680,4,7,-0.528430,0.608333,792438,0,8.634177e+05,5.494905e+05


In [29]:
df_test.columns

Index(['opentime', 'openprice', 'highprice', 'lowprice', 'closeprice',
       'volume', 'quotevolume', 'trades', 'taker_buy_volume',
       'taker_buy_quote', 'volatility', 'avg_volatility_4h',
       'next_avg_volatility_4h', 'next_volatility_4h', 'date', 'month',
       'datetime', 'volume_change', 'quote_volume_change', 'relative_volume',
       'taker_buy_volume_ratio', 'taker_buy_quote_volume_ratio',
       'rolling_avg_volume', 'rolling_std_volume', 'rolling_low_volatility',
       'normalized_price_range', 'rolling_avg_volatility',
       'rolling_std_volatility', 'rolling_avg_volatility_4h',
       'rolling_std_volatility_4h', 'percentage_price_change',
       'intraday_volatility', 'ema_closeprice', 'ema_volume',
       'bollinger_upper', 'bollinger_lower', 'lag_price_change_1h',
       'lag_price_change_2h', 'lag_volume_change_1h', 'lag_volume_change_2h',
       'vwap', 'uptrend_indicator', 'downtrend_indicator',
       'momentum_close_10h', 'momentum_volume_10h', 'relative_h

In [None]:
X = near_df.select([col for col in near_df.columns if col not in ['opentime', 'datetime', 'next_avg_volatility_4h', 'date', 'next_volatility_4h']])
y = near_df['next_avg_volatility_4h']

# ML Baseline (Linear Regression)

## Evaluation Function

In [10]:
import numpy as np
import polars as pl
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Adjust the evaluate_baseline function to accept only y_pred and y_test
def evaluate_baseline(y_pred, y_test):
    # Convert lists to Numpy arrays
    pred = np.array(y_pred)
    test = np.array(y_test)

    # Evaluation Metrics
    me = np.mean(pred - test)  # Mean Error
    mae = mean_absolute_error(test, pred)  # Mean Absolute Error
    mape = np.mean(np.abs((test - pred) / test)) * 100  # Mean Absolute Percentage Error
    mpe = np.mean((test - pred) / test) * 100  # Mean Percentage Error
    rmse = np.sqrt(mean_squared_error(test, pred))  # Root Mean Squared Error

    r = r2_score(test, pred)  # Coefficient of Determination (R-squared)
    min_max_error = np.abs((np.min(pred) - np.min(test)) + 
                            (np.max(pred) - np.max(test)))  # Min-Max Error

    df = pl.DataFrame({
        f'volatility_prediction': pred,
        f'volatility_actual': test
    })

    df = df.with_columns(
        abs(pl.col(f'volatility_prediction') - pl.col(f'volatility_actual')).alias(f'dif_volatility')
    )

    # overall median abs error
    median_abs_err =  df[f'dif_volatility'].median()

    # overall var abs error
    var_abs_err =  df[f'dif_volatility'].var()

    # Results
    results = {
        "Overall Median Absolute Error" : median_abs_err,
        "Overall Variance Absolute Error" : var_abs_err,
        "Mean Error": me,
        "Mean Absolute Error ": mae,
        "Mean Absolute Percentage Error": mape,
        "Mean Percentage Error": mpe,
        "Root Mean Squared Error": rmse,
        "R-squared": r, 
        "Min-Max Error": min_max_error
    }
    
    return results, df

## Sliding Window

In [97]:
import time
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

# Separate df into features and target
features_df = df_features.drop(['opentime', 'next_4h_avg_volatility'], axis=1)
target_df = df_features['next_4h_avg_volatility']

# Convert to NumPy arrays
X = features_df.values
y = target_df.values

# Specify parameters for the sliding window approach
num_predictions = 1   # Number of rows to predict
gap = 1               # Gap (number of rows to skip after each window)
max_windows = [500]    # Maximum number of windows to process
set_limit = True      # Set this to False to process all windows

# Define list of window sizes
window_sizes = [5000]

# List to store results
all_results = []

# Loop through each evaluation size
for max_window in max_windows:
# Loop through each window size
    for window_size in window_sizes:
        print(f'Processing window size: {window_size}')

        # Calculate the number of windows based on dataset size
        num_windows = len(X) - window_size - num_predictions

        # Adjust the starting point if set_limit is True
        start_index = 0
        if set_limit:
            # Only process the latest max_window
            start_index = max(0, len(X) - window_size - num_predictions - max_window)
            num_windows = min(num_windows, max_window)

        # Initialize lists to store actual and predicted values for this window size
        y_vals = []
        y_preds = []

        # Loop through each sliding window with the gap applied
        for window_number in range(start_index, start_index + num_windows, gap):
            
            start = window_number
            end = start + window_size
            X_train = X[start:end]
            y_train = y[start:end]

            X_train_mean = np.mean(X_train, axis=0)
            X_train_std = np.std(X_train, axis=0)
            X_train_normalized = (X_train - X_train_mean) / X_train_std

            # Get the column index for 'cur_avg_vol_4h' from features_df
            close_index = features_df.columns.get_loc('4h_avg_volatility')

            # Normalize y_train using the mean and std of cur_avg_vol_4h
            close_mean = X_train[:, close_index].mean()
            close_std = X_train[:, close_index].std()
            y_train_normalized = (y_train - close_mean) / close_std

            # Prepare validation data for prediction
            X_val = X[end:end + num_predictions]
            y_val = y[end:end + num_predictions]

            # Normalize validation data using the statistics from the training set
            X_val_normalized = (X_val - X_train_mean) / X_train_std

            # Track the start time of the window processing
            start_time = time.time()

            # Initialize and fit the model
            model = LinearRegression()
            model.fit(X_train_normalized, y_train_normalized)

            # Predict on validation data
            y_pred_val = model.predict(X_val_normalized)
            # Denormalize y_val and y_pred_val using the mean and std of cur_avg_vol_4h
            y_pred_val_denorm = y_pred_val * close_std + close_mean

            # Track the end time of the window processing
            end_time = time.time()

            # Append actual and predicted values to the lists
            y_vals.append(y_val[0])  # Assuming a single prediction per window
            y_preds.append(y_pred_val_denorm[0])  # Assuming a single prediction per window

        # Now that all windows for this window_size are processed, evaluate the baseline
        eval_results, _ = evaluate_baseline(y_preds, y_vals)

        # Append the window size and evaluation metrics to the results list
        all_results.append({
            'eval_size' : max_window,
            'window_size': window_size,
            **eval_results
        })

# Convert the results to a DataFrame for further analysis
results_df = pd.DataFrame(all_results)

# Print the results DataFrame
results_df

Processing window size: 5000


Unnamed: 0,eval_size,window_size,Overall Median Absolute Error,Overall Variance Absolute Error,Mean Error,Mean Absolute Error,Mean Absolute Percentage Error,Mean Percentage Error,Root Mean Squared Error,R-squared,Min-Max Error
0,500,5000,0.003892,3.3e-05,-0.000623,0.005132,28.513307,-9.211112,0.007713,0.327034,0.04431


## Direct Random Split

In [98]:
import time
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Separate df into features and target
features_df = df_features.drop(['opentime', 'next_4h_avg_volatility'], axis=1)
target_df = df_features['next_4h_avg_volatility']

# Convert to NumPy arrays
X = features_df.values
y = target_df.values

# Define list of train-test split proportions
train_test_ratios = [0.10]  # Example proportions for training set

# List to store results
all_results = []

# Loop through each train-test ratio
for train_ratio in train_test_ratios:
    print(f'Processing train-test split ratio: {train_ratio}')

    # Perform train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=train_ratio, random_state=42
    )

    # Normalize training data
    X_train_mean = np.mean(X_train, axis=0)
    X_train_std = np.std(X_train, axis=0)
    X_train_normalized = (X_train - X_train_mean) / X_train_std

    # Get the column index for 'volatility' from features_df
    close_index = features_df.columns.get_loc('4h_avg_volatility')

    # Normalize y_train using the mean and std of the 'volatility' column
    close_mean = X_train[:, close_index].mean()
    close_std = X_train[:, close_index].std()
    y_train_normalized = (y_train - close_mean) / close_std

    # Normalize test data using the training set's statistics
    X_test_normalized = (X_test - X_train_mean) / X_train_std

    # Initialize and fit the model
    start_time = time.time()
    model = LinearRegression()
    model.fit(X_train_normalized, y_train_normalized)

    # Predict on the test set
    y_pred_test = model.predict(X_test_normalized)
    # Denormalize predictions
    y_pred_test_denorm = y_pred_test * close_std + close_mean

    # Track the end time of the process
    end_time = time.time()

    # Evaluate the model
    eval_results, _ = evaluate_baseline(y_pred_test_denorm, y_test)

    # Append results to the list
    all_results.append({
        'train_ratio': train_ratio,
        **eval_results
    })

# Convert results to a DataFrame for further analysis
results_df = pd.DataFrame(all_results)

# Print the results DataFrame
results_df


Processing train-test split ratio: 0.1


Unnamed: 0,train_ratio,Overall Median Absolute Error,Overall Variance Absolute Error,Mean Error,Mean Absolute Error,Mean Absolute Percentage Error,Mean Percentage Error,Root Mean Squared Error,R-squared,Min-Max Error
0,0.1,0.004605,9.6e-05,-8.6e-05,0.006823,36.002128,-14.088654,0.011944,0.207818,0.0613


## Chronological Split

In [54]:
import time
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

# Separate df into features and target
features_df = df_features.drop(['opentime', 'next_4h_avg_volatility'], axis=1)
target_df = df_features['next_4h_avg_volatility']

# Convert to NumPy arrays
X = features_df.values
y = target_df.values

# Define list of train-test split proportions
train_test_ratios = [0.93]  # Example proportions for training set

# List to store results
all_results = []

# Loop through each train-test ratio
for train_ratio in train_test_ratios:
    print(f'Processing train-test split ratio: {train_ratio}')

    # Split data chronologically
    split_index = int(len(X) * train_ratio)
    X_train, X_test = X[:split_index], X[split_index:]
    y_train, y_test = y[:split_index], y[split_index:]

    # Normalize training data
    X_train_mean = np.mean(X_train, axis=0)
    X_train_std = np.std(X_train, axis=0)
    X_train_normalized = (X_train - X_train_mean) / X_train_std

    # Get the column index for 'volatility' from features_df
    close_index = features_df.columns.get_loc('volatility')

    # Normalize y_train using the mean and std of the 'volatility' column
    close_mean = X_train[:, close_index].mean()
    close_std = X_train[:, close_index].std()
    y_train_normalized = (y_train - close_mean) / close_std

    # Normalize test data using the training set's statistics
    X_test_normalized = (X_test - X_train_mean) / X_train_std

    # Initialize and fit the model
    start_time = time.time()
    model = LinearRegression()
    model.fit(X_train_normalized, y_train_normalized)

    # Predict on the test set
    y_pred_test = model.predict(X_test_normalized)
    # Denormalize predictions
    y_pred_test_denorm = y_pred_test * close_std + close_mean

    # Track the end time of the process
    end_time = time.time()

    # Evaluate the model
    eval_results, _ = evaluate_baseline(y_pred_test_denorm, y_test)

    # Append results to the list
    all_results.append({
        'train_ratio': train_ratio,
        **eval_results
    })

# Convert results to a DataFrame for further analysis
results_df = pd.DataFrame(all_results)

# Print the results DataFrame
results_df

Processing train-test split ratio: 0.93


Unnamed: 0,train_ratio,Overall Median Absolute Error,Overall Variance Absolute Error,Mean Error,Mean Absolute Error,Mean Absolute Percentage Error,Mean Percentage Error,Root Mean Squared Error,R-squared,Min-Max Error
0,0.93,0.003191,2.5e-05,-0.000763,0.004494,25.182982,-4.964374,0.006712,0.418205,0.049216
