### Preparation

and Time Window Selection

In [2]:
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import numpy as np
import pandas as pd

In [3]:
dex = pd.read_pickle("datasets/dex_WETH_USDT_2022.pkl")
cex = pd.read_pickle("datasets/cex_ETH_USDT_2022.pkl")

merged = cex.merge(dex, left_index=True, right_index=True, suffixes=('_cex', '_dex'))
start_time = '2022-01-01 00:00:00'
end_time = '2022-01-08 23:59:59'
window = merged.loc[start_time:end_time]

### Linear Regression considering all Candles 

- we only consider close prices
- independent of price shocks (including times of sideways movement, the whole given time frame)
- we try to predict DEX Close price by the CEX Price 1-5 minutes before
- as benchmark we predict DEX Close price by its own CEX Price 1-5 minutes before
- 80/20 Train/Test split

In [14]:
# Load the pickle file
#%ntbl pull datasets subsets
# df = pd.read_pickle('datasets/subsets/2022-01-09_2-eth.pkl')

# Create lags of cex and dex
df = window.copy()
for i in range(0, 6):
	df.loc[:, f'lag_{i}_cex'] = df['close_cex'].shift(i)
	df.loc[:, f'lag_{i}_dex'] = df['close_dex'].shift(i)

# Drop the initial rows which have NaN values due to lag
df = df.dropna()

# Define the models
models = [
	('Linear Regression', LinearRegression()),
	# ('SVM', SVR()),
	# ('KNN', KNeighborsRegressor())
]

# Define the lag columns
lag_columns_cex = [f'lag_{i}_cex' for i in range(0, 6)]
lag_columns_dex = [f'lag_{i}_dex' for i in range(0, 6)]

# Split the data into train and test sets
train_size = int(len(df) * 0.8)
train, test = df[:train_size], df[train_size:]

# Create a DataFrame to store the results
results = pd.DataFrame(columns=['Model', 'Lag', 'Test MSE', 'Test R2', 'Test MSE Benchmark', 'Test R2 Benchmark'])

# For each number of lags
for num_lags in range(0, 6):
	
	# For each model
	for name, model in models:
		
		# Train on cex to predict dex
		X_train, y_train = train[[lag_columns_cex[num_lags]]], train['close_dex']
		X_test, y_test = test[[lag_columns_cex[num_lags]]], test['close_dex']
		model.fit(X_train, y_train)
		predictions = model.predict(X_test)
		mse = mean_squared_error(y_test, predictions)
		r2 = r2_score(y_test, predictions)
		
		# Train on dex to predict dex
		X_train_dex, y_train_dex = train[[lag_columns_dex[num_lags]]], train['close_dex']
		X_test_dex, y_test_dex = test[[lag_columns_dex[num_lags]]], test['close_dex']
		model.fit(X_train_dex, y_train_dex)
		predictions_dex = model.predict(X_test_dex)
		mse_dex = mean_squared_error(y_test_dex, predictions_dex)
		r2_dex = r2_score(y_test_dex, predictions_dex)
		new_row = pd.DataFrame({'Model': [name], 'Lag': [num_lags], 'Test MSE': [mse],'Test R2': [r2], 'Test MSE Benchmark': [mse_dex],'Test R2 Benchmark': [r2_dex]})
		results = pd.concat([results, new_row], ignore_index=True)
        
display(results)

Unnamed: 0,Model,Lag,Test MSE,Test R2,Test MSE Benchmark,Test R2 Benchmark
0,Linear Regression,0,27.563777,0.992446,0.0,1.0
1,Linear Regression,1,27.590598,0.992439,22.455155,0.993846
2,Linear Regression,2,34.85618,0.990448,38.775641,0.989374
3,Linear Regression,3,49.235656,0.986507,55.288581,0.984849
4,Linear Regression,4,63.189801,0.982683,72.974152,0.980002
5,Linear Regression,5,81.263265,0.97773,93.113999,0.974483


### Linear Regression considering only effects of Price shocks

- only consider close prices
- filter the cex data for the 5% of highest candles (in both directions)
- for those price shocks add the dex price 0-6 minutes after the CEX shock, which we are trying to predict
- as benchmark we predict DEX Close price by its own DEX Price at the time of the shock, also lag 0 can be considered as benchmark
- 80/20 Train/Test split

In [70]:
df = window.copy()
# Calculate the absolute difference between open and close prices for cex
df['shock_height'] = abs(df['open_cex'] - df['close_cex'])
df['price_diff'] = df['close_cex'] - df['open_cex']

# Calculate the 95th percentile of the absolute difference
threshold = np.percentile(df['shock_height'], 95)

# Filter rows where the absolute difference is greater than or equal to the threshold
df_shock = df[df['shock_height'] >= threshold]

# Drop the 'cex_candle' column as it's no longer needed
# df_shock = df_shock.drop(columns=['cex_candle'])

# Create a DataFrame to store the new dataset
df_new = pd.DataFrame(columns=['shock_timestamp', 'price_diff', 'close_cex'] + [f'lag_{i}_dex' for i in range(0, 7)] + [f'lag_{i}_dex_diff' for i in range(0, 7)])

# For each row in the df_shock DataFrame
for row_num, (index, row) in enumerate(df_shock.iterrows()):
    # If there are at least 6 rows after the current row
    if row_num <= len(df_shock) - 7:
        # Create a new row
        new_row = {'shock_timestamp': index, 'price_diff': row['price_diff'], 'close_cex': row['close_cex'],}
        # For each lag from 1 to 6
        for i in range(0, 7):
            # Add the lagged dex close price to the new row
            new_row[f'lag_{i}_dex'] = df_shock.iloc[row_num + i]['close_dex']
            new_row[f'lag_{i}_dex_diff'] = df_shock.iloc[row_num + i]['close_dex'] - df_shock.iloc[row_num + i]['open_dex']
        # Append the new row to the new DataFrame
        df_new = pd.concat([df_new, pd.DataFrame([new_row])], ignore_index=True)


df_new = df_new.set_index('shock_timestamp')
df_new

Unnamed: 0_level_0,price_diff,close_cex,lag_0_dex,lag_1_dex,lag_2_dex,lag_3_dex,lag_4_dex,lag_5_dex,lag_6_dex,lag_0_dex_diff,lag_1_dex_diff,lag_2_dex_diff,lag_3_dex_diff,lag_4_dex_diff,lag_5_dex_diff,lag_6_dex_diff
shock_timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2022-01-01 00:00:00,8.62,3684.84,3674.629392,3687.793268,3693.416131,3696.862109,3696.896951,3715.269059,3729.505163,0.000000,13.163876,7.339010,-2.220550,-0.002827,1.232823,-0.126271
2022-01-01 00:01:00,6.70,3691.55,3687.793268,3693.416131,3696.862109,3696.896951,3715.269059,3729.505163,3734.195455,13.163876,7.339010,-2.220550,-0.002827,1.232823,-0.126271,-1.521239
2022-01-01 00:05:00,8.52,3698.64,3693.416131,3696.862109,3696.896951,3715.269059,3729.505163,3734.195455,3730.622342,7.339010,-2.220550,-0.002827,1.232823,-0.126271,-1.521239,0.000000
2022-01-01 00:07:00,-6.79,3698.34,3696.862109,3696.896951,3715.269059,3729.505163,3734.195455,3730.622342,3750.854552,-2.220550,-0.002827,1.232823,-0.126271,-1.521239,0.000000,17.851972
2022-01-01 00:11:00,-7.99,3691.19,3696.896951,3715.269059,3729.505163,3734.195455,3730.622342,3750.854552,3763.386656,-0.002827,1.232823,-0.126271,-1.521239,0.000000,17.851972,17.916450
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-01-08 21:30:00,8.20,3074.10,3070.736291,3076.431407,3093.248194,3083.361747,3092.071459,3107.049544,3105.284496,9.947947,8.196256,10.974102,-7.574501,4.739351,8.551221,0.596345
2022-01-08 21:40:00,17.63,3079.80,3076.431407,3093.248194,3083.361747,3092.071459,3107.049544,3105.284496,3113.434079,8.196256,10.974102,-7.574501,4.739351,8.551221,0.596345,-8.374288
2022-01-08 21:42:00,7.65,3091.23,3093.248194,3083.361747,3092.071459,3107.049544,3105.284496,3113.434079,3121.209598,10.974102,-7.574501,4.739351,8.551221,0.596345,-8.374288,6.204991
2022-01-08 21:46:00,-12.50,3079.06,3083.361747,3092.071459,3107.049544,3105.284496,3113.434079,3121.209598,3123.275213,-7.574501,4.739351,8.551221,0.596345,-8.374288,6.204991,-6.488012


In [57]:
df = df_new.copy()
# Define the models
models = [
    ('Linear Regression', LinearRegression()),
    # ('SVM', SVR()),
    # ('KNN', KNeighborsRegressor())
]

# Split the data into train and test sets
train_size = int(len(df) * 0.8)
train, test = df[:train_size], df[train_size:]

# Create a DataFrame to store the results
results = pd.DataFrame(columns=['Model', 'Lag', 'Test MSE', 'Test R2', 'Test MSE Benchmark', 'Test R2 Benchmark'])

# For each number of lags
for l in range(1, 7):

  # For each model
  for name, model in models:

    # Train on cex to predict dex
    X_train, y_train = train[['close_cex']], train[f'lag_{l}_dex']
    X_test, y_test = test[['close_cex']], test[f'lag_{l}_dex']
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    mse = mean_squared_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)
    
    # Train on dex to predict dex
    X_train_dex, y_train_dex = train[['lag_0_dex']], train[f'lag_{l}_dex']
    X_test_dex, y_test_dex = test[['lag_0_dex']], test[f'lag_{l}_dex']
    model.fit(X_train_dex, y_train_dex)
    predictions_dex = model.predict(X_test_dex)
    mse_dex = mean_squared_error(y_test_dex, predictions_dex)
    r2_dex = r2_score(y_test_dex, predictions_dex)
    new_row = pd.DataFrame({'Model': [name], 'Lag': [l], 'Test MSE': [mse],'Test R2': [r2], 'Test MSE Benchmark': [mse_dex],'Test R2 Benchmark': [r2_dex]})
    results = pd.concat([results, new_row], ignore_index=True)

# Display the results
results

Unnamed: 0,Model,Lag,Test MSE,Test R2,Test MSE Benchmark,Test R2 Benchmark
0,Linear Regression,0,60.030859,0.988207,0.0,1.0
1,Linear Regression,1,166.291764,0.96728,232.537326,0.954245
2,Linear Regression,2,422.428297,0.91688,506.60061,0.900317
3,Linear Regression,3,698.595522,0.862414,801.70159,0.842108
4,Linear Regression,4,1024.667696,0.79809,1113.033331,0.780678
5,Linear Regression,5,1299.991426,0.743905,1368.380051,0.730433
6,Linear Regression,6,1569.438002,0.690839,1614.029601,0.682055


### Same with Cross Validation

In [73]:
df = df_new.copy()
# Define the models
models = [
    ('Linear Regression', LinearRegression()),
    # ('SVM', SVR()),
    # ('KNN', KNeighborsRegressor())
]

# Create a DataFrame to store the results
results = pd.DataFrame(columns=['Model', 'Lag', 'Test MSE', 'Test R2', 'Test MSE Benchmark', 'Test R2 Benchmark'])
cv = KFold(n_splits=10, random_state=1, shuffle=True)

# For each number of lags
for l in range(0, 7):

  # For each model
  for name, model in models:

    # Train on cex to predict dex
    X = df[['close_cex']]
    y = df[f'lag_{l}_dex']
    mse_scores = cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=cv, n_jobs=-1)
    #mse_scores = -mse_scores  # flip the sign to make MSE positive
    r2_scores = cross_val_score(model, X, y, scoring='r2', cv=cv, n_jobs=-1)

    # Train on dex to predict dex
    X_dex = df[['lag_0_dex']]
    y_dex = df[f'lag_{l}_dex']
    dex_mse_scores = cross_val_score(model, X_dex, y_dex, scoring='neg_mean_squared_error', cv=cv, n_jobs=-1)
    #dex_mse_scores = -dex_mse_scores  # flip the sign to make MSE positive
    dex_r2_scores = cross_val_score(model, X_dex, y_dex, scoring='r2', cv=cv, n_jobs=-1)

    new_row = pd.DataFrame({'Model': [name], 'Lag': [l], 'Test MSE': ['%.3f' % (np.mean(mse_scores))],'Test R2': ['%.3f' % (np.mean(r2_scores))], 'Test MSE Benchmark': ['%.3f' % (np.mean(dex_mse_scores))],'Test R2 Benchmark': ['%.3f' % (np.mean(dex_r2_scores))]})
    results = pd.concat([results, new_row], ignore_index=True)

# Display the results
display(results)

Unnamed: 0,Model,Lag,Test MSE,Test R2,Test MSE Benchmark,Test R2 Benchmark
0,Linear Regression,0,-66.148,0.999,-0.0,1.0
1,Linear Regression,1,-209.369,0.997,-223.654,0.997
2,Linear Regression,2,-425.787,0.994,-462.055,0.993
3,Linear Regression,3,-680.449,0.99,-717.36,0.989
4,Linear Regression,4,-951.138,0.986,-989.414,0.985
5,Linear Regression,5,-1218.409,0.982,-1266.717,0.981
6,Linear Regression,6,-1506.824,0.978,-1559.395,0.977


In [78]:
df = df_new.copy()
# Define the models
models = [
    ('Linear Regression', LinearRegression()),
    # ('SVM', SVR()),
    # ('KNN', KNeighborsRegressor())
]

# Create a DataFrame to store the results
results = pd.DataFrame(columns=['Model', 'Lag', 'Test MSE', 'Test R2', 'Test MSE Benchmark', 'Test R2 Benchmark'])
cv = KFold(n_splits=10, random_state=1, shuffle=True)

# For each number of lags
for l in range(0, 7):

  # For each model
  for name, model in models:

    # Train on cex to predict dex
    X = df[['price_diff']]
    y = df[[f'lag_{l}_dex_diff']]
    mse_scores = cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=cv, n_jobs=-1)
    #mse_scores = -mse_scores  # flip the sign to make MSE positive
    r2_scores = cross_val_score(model, X, y, scoring='r2', cv=cv, n_jobs=-1)

    # Train on dex to predict dex
    X_dex = df[['lag_0_dex_diff']]
    y_dex = df[f'lag_{l}_dex_diff']
    dex_mse_scores = cross_val_score(model, X_dex, y_dex, scoring='neg_mean_squared_error', cv=cv, n_jobs=-1)
    #dex_mse_scores = -dex_mse_scores  # flip the sign to make MSE positive
    dex_r2_scores = cross_val_score(model, X_dex, y_dex, scoring='r2', cv=cv, n_jobs=-1)

    new_row = pd.DataFrame({'Model': [name], 'Lag': [l], 'Test MSE': ['%.3f' % (np.mean(mse_scores))],'Test R2': ['%.3f' % (np.mean(r2_scores))], 'Test MSE Benchmark': ['%.3f' % (np.mean(dex_mse_scores))],'Test R2 Benchmark': ['%.3f' % (np.mean(dex_r2_scores))]})
    results = pd.concat([results, new_row], ignore_index=True)

# Display the results
display(results)

Unnamed: 0,Model,Lag,Test MSE,Test R2,Test MSE Benchmark,Test R2 Benchmark
0,Linear Regression,0,-62.493,0.441,-0.0,1.0
1,Linear Regression,1,-115.256,-0.014,-115.928,-0.019
2,Linear Regression,2,-115.878,-0.027,-115.621,-0.023
3,Linear Regression,3,-116.308,-0.025,-115.463,-0.022
4,Linear Regression,4,-115.737,-0.022,-115.794,-0.019
5,Linear Regression,5,-116.296,-0.033,-116.292,-0.034
6,Linear Regression,6,-115.999,-0.019,-115.959,-0.02
