### Preparation

and Time Window Selection

In [1]:
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import numpy as np
import pandas as pd

In [17]:
dex = pd.read_pickle("datasets/subsets/dex_WETH_USDT_2022_wk1_allblocks.pkl")
cex = pd.read_pickle("datasets/cex_ETH_USDT_2022.pkl")

# merged = cex.merge(dex, left_index=True, right_index=True, suffixes=('_cex', '_dex'))
start_time = '2022-01-01 00:00:00'
end_time = '2022-01-08 23:59:59'
cex = cex.loc[start_time:end_time]
dex = dex.loc[start_time:end_time]

### Linear Regression considering only effects of Price shocks

- only consider close prices
- filter the cex data for the 5% of highest candles (in both directions)
- for those price shocks add the dex price 0-12 **blocks** after the CEX shock, which we are trying to predict
- as benchmark we predict DEX Close price by its own DEX Price at the time of the shock, also lag 0 can be considered as benchmark
- 80/20 Train/Test split

In [19]:
df = cex.copy()
# Calculate the absolute difference between open and close prices for cex
df['shock_height'] = abs(df['open'] - df['close'])
df['price_diff'] = df['close'] - df['open']

# Calculate the 95th percentile of the absolute difference
threshold = np.percentile(df['shock_height'], 95)

# Filter rows where the absolute difference is greater than or equal to the threshold
df_shock = df[df['shock_height'] >= threshold]

# Drop the 'cex_candle' column as it's no longer needed
# df_shock = df_shock.drop(columns=['cex_candle'])

# Create a DataFrame to store the new dataset
df_new = pd.DataFrame(columns=['shock_timestamp', 'price_diff', 'close_cex'] + [f'lag_{i}_dex' for i in range(1, 12)])

# For each row in the df_shock DataFrame
for row_num, (index, row) in enumerate(df_shock.iterrows()):
    # If there are at least 6 blocks after the current index
    if row_num <= len(df_shock) - 12:
        # Create a new row
        new_row = {'shock_timestamp': index, 'price_diff': row['price_diff'], 'close_cex': row['close'],}
        # For each lag from 1 to 12
        df_dex_after = dex.loc[index:]
        for i in range(1, 12):
            # Add the lagged dex close price to the new row
            new_row[f'lag_{i}_dex'] = df_dex_after.iloc[i]['price']
        # Append the new row to the new DataFrame
        df_new = pd.concat([df_new, pd.DataFrame([new_row])], ignore_index=True)


df_new = df_new.set_index('shock_timestamp')
df_new

Unnamed: 0_level_0,price_diff,close_cex,lag_1_dex,lag_2_dex,lag_3_dex,lag_4_dex,lag_5_dex,lag_6_dex,lag_7_dex,lag_8_dex,lag_9_dex,lag_10_dex,lag_11_dex
shock_timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2022-01-01 00:00:00,8.62,3684.84,3674.629392,3674.629392,3674.629392,3674.629392,3674.629392,3674.629392,3674.629392,3685.919262,3687.793268,3687.793268,3687.793268
2022-01-01 00:01:00,6.70,3691.55,3674.629392,3685.919262,3687.793268,3687.793268,3687.793268,3687.793268,3687.793268,3686.599758,3686.599758,3686.599758,3686.599758
2022-01-01 00:05:00,8.52,3698.64,3693.416131,3693.416131,3693.416131,3693.416131,3693.416131,3693.416131,3693.024931,3702.250195,3702.250195,3699.082659,3699.082659
2022-01-01 00:07:00,-6.79,3698.34,3699.082659,3697.802559,3697.802559,3696.862109,3696.659493,3696.659493,3696.563672,3696.563672,3696.563672,3696.193301,3696.193301
2022-01-01 00:11:00,-7.99,3691.19,3696.896951,3696.896951,3696.896951,3696.896951,3696.896951,3696.896951,3696.896951,3687.708950,3687.708950,3687.708950,3687.708950
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-01-08 20:26:00,10.68,3033.73,3027.546162,3027.140239,3027.140239,3027.140239,3027.140239,3027.140239,3027.140239,3026.670816,3026.670816,3030.040498,3030.040498
2022-01-08 20:46:00,-8.03,3032.89,3041.585154,3041.585154,3042.116250,3042.887478,3042.887478,3043.803386,3043.803386,3043.803386,3043.803386,3043.803386,3043.803386
2022-01-08 20:57:00,9.54,3034.45,3027.888278,3027.888278,3027.983319,3027.983319,3028.976509,3028.976509,3028.976509,3028.976509,3028.976509,3028.621602,3028.621602
2022-01-08 21:01:00,7.50,3037.42,3028.771472,3028.771472,3029.012887,3029.012887,3028.301614,3028.301614,3028.301614,3038.171818,3038.171818,3038.171818,3038.171818


In [20]:
df = df_new.copy()
# Define the models
models = [
    ('Linear Regression', LinearRegression()),
    # ('SVM', SVR()),
    # ('KNN', KNeighborsRegressor())
]

# Split the data into train and test sets
train_size = int(len(df) * 0.8)
train, test = df[:train_size], df[train_size:]

# Create a DataFrame to store the results
results = pd.DataFrame(columns=['Model', 'Lag', 'Test MSE', 'Test R2', 'Test MSE Benchmark', 'Test R2 Benchmark'])

# For each number of lags
for l in range(1, 12):

  # For each model
  for name, model in models:

    # Train on cex to predict dex
    X_train, y_train = train[['close_cex']], train[f'lag_{l}_dex']
    X_test, y_test = test[['close_cex']], test[f'lag_{l}_dex']
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    mse = mean_squared_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)
    
    # Train on dex to predict dex
    X_train_dex, y_train_dex = train[['lag_1_dex']], train[f'lag_{l}_dex']
    X_test_dex, y_test_dex = test[['lag_1_dex']], test[f'lag_{l}_dex']
    model.fit(X_train_dex, y_train_dex)
    predictions_dex = model.predict(X_test_dex)
    mse_dex = mean_squared_error(y_test_dex, predictions_dex)
    r2_dex = r2_score(y_test_dex, predictions_dex)
    new_row = pd.DataFrame({'Model': [name], 'Lag': [l], 'Test MSE': [mse],'Test R2': [r2], 'Test MSE Benchmark': [mse_dex],'Test R2 Benchmark': [r2_dex]})
    results = pd.concat([results, new_row], ignore_index=True)

# Display the results
results

Unnamed: 0,Model,Lag,Test MSE,Test R2,Test MSE Benchmark,Test R2 Benchmark
0,Linear Regression,1,112.384207,0.978031,0.0,1.0
1,Linear Regression,2,80.904173,0.984224,22.646502,0.995584
2,Linear Regression,3,68.277709,0.98673,42.557128,0.991729
3,Linear Regression,4,75.414537,0.98522,53.769277,0.989462
4,Linear Regression,5,46.850335,0.990872,99.047132,0.980703
5,Linear Regression,6,34.433073,0.993298,89.866655,0.982508
6,Linear Regression,7,48.903082,0.990518,103.522706,0.979928
7,Linear Regression,8,53.618298,0.989604,119.593765,0.976811
8,Linear Regression,9,79.744651,0.984569,140.880604,0.972738
9,Linear Regression,10,83.36518,0.983797,152.296031,0.970399


### Same with Cross Validation

In [21]:
df = df_new.copy()
# Define the models
models = [
    ('Linear Regression', LinearRegression()),
    # ('SVM', SVR()),
    # ('KNN', KNeighborsRegressor())
]

# Create a DataFrame to store the results
results = pd.DataFrame(columns=['Model', 'Lag', 'Test MSE', 'Test R2', 'Test MSE Benchmark', 'Test R2 Benchmark'])
cv = KFold(n_splits=10, random_state=1, shuffle=True)

# For each number of lags
for l in range(1, 12):

  # For each model
  for name, model in models:

    # Train on cex to predict dex
    X = df[['close_cex']]
    y = df[f'lag_{l}_dex']
    mse_scores = cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=cv, n_jobs=-1)
    mse_scores = -mse_scores  # flip the sign to make MSE positive
    r2_scores = cross_val_score(model, X, y, scoring='r2', cv=cv, n_jobs=-1)

    # Train on dex to predict dex
    X_dex = df[['lag_1_dex']]
    y_dex = df[f'lag_{l}_dex']
    dex_mse_scores = cross_val_score(model, X_dex, y_dex, scoring='neg_mean_squared_error', cv=cv, n_jobs=-1)
    dex_mse_scores = -dex_mse_scores  # flip the sign to make MSE positive
    dex_r2_scores = cross_val_score(model, X_dex, y_dex, scoring='r2', cv=cv, n_jobs=-1)

    new_row = pd.DataFrame({'Model': [name], 'Lag': [l], 'Test MSE': ['%.3f' % (np.mean(mse_scores))],'Test R2': ['%.3f' % (np.mean(r2_scores))], 'Test MSE Benchmark': ['%.3f' % (np.mean(dex_mse_scores))],'Test R2 Benchmark': ['%.3f' % (np.mean(dex_r2_scores))]})
    results = pd.concat([results, new_row], ignore_index=True)

# Display the results
display(results)

Unnamed: 0,Model,Lag,Test MSE,Test R2,Test MSE Benchmark,Test R2 Benchmark
0,Linear Regression,1,120.073,0.998,0.0,1.0
1,Linear Regression,2,98.181,0.999,39.748,0.999
2,Linear Regression,3,87.336,0.999,60.14,0.999
3,Linear Regression,4,60.89,0.999,83.425,0.999
4,Linear Regression,5,54.251,0.999,100.38,0.999
5,Linear Regression,6,65.698,0.999,100.93,0.998
6,Linear Regression,7,73.605,0.999,121.222,0.998
7,Linear Regression,8,67.262,0.999,127.144,0.998
8,Linear Regression,9,81.533,0.999,131.894,0.998
9,Linear Regression,10,92.15,0.999,142.958,0.998
