In [9]:
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import TimeSeriesSplit, train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import pandas as pd
import pandas as pd

# Load the pickle file
#%ntbl pull datasets subsets
df = pd.read_pickle('datasets/subsets/2022-01-09_2-eth.pkl')

# Create lags of cex and dex
for i in range(1, 6):
	df[f'lag_{i}_cex'] = df['close_cex'].shift(i)
	df[f'lag_{i}_dex'] = df['close_dex'].shift(i)

# Drop the initial rows which have NaN values due to lag
df = df.dropna()

# Define the models
models = [
	('Linear Regression', LinearRegression()),
	('SVM', SVR()),
	('KNN', KNeighborsRegressor())
]

# Define the lag columns
lag_columns_cex = [f'lag_{i}_cex' for i in range(1, 6)]
lag_columns_dex = [f'lag_{i}_dex' for i in range(1, 6)]

# Split the data into train and test sets
train_size = int(len(df) * 0.8)
train, test = df[:train_size], df[train_size:]

# Create a DataFrame to store the results
results = pd.DataFrame(columns=['Model', 'Lag', 'Train On', 'Test MSE', 'Test R2', 'Test MSE Benchmark', 'Test R2 Benchmark'])

# For each number of lags
for num_lags in range(1, 6):
	
	# For each model
	for name, model in models:
		
		# Train on cex to predict dex
		X_train, y_train = train[lag_columns_cex[:num_lags]], train['close_dex']
		X_test, y_test = test[lag_columns_cex[:num_lags]], test['close_dex']
		model.fit(X_train, y_train)
		predictions = model.predict(X_test)
		mse = mean_squared_error(y_test, predictions)
		r2 = r2_score(y_test, predictions)
		
		# Train on dex to predict dex
		X_train_dex, y_train_dex = train[lag_columns_dex[:num_lags]], train['close_dex']
		X_test_dex, y_test_dex = test[lag_columns_dex[:num_lags]], test['close_dex']
		model.fit(X_train_dex, y_train_dex)
		predictions_dex = model.predict(X_test_dex)
		mse_dex = mean_squared_error(y_test_dex, predictions_dex)
		r2_dex = r2_score(y_test_dex, predictions_dex)
		results = results.append({'Model': name, 'Lag': num_lags, 'Test MSE': mse, 'Test R2': r2, 'Test MSE Benchmark': mse_dex, 'Test R2 Benchmark': r2_dex}, ignore_index=True)
        
print(results)

  results = results.append({'Model': name, 'Lag': num_lags, 'Test MSE': mse, 'Test R2': r2, 'Test MSE Benchmark': mse_dex, 'Test R2 Benchmark': r2_dex}, ignore_index=True)
  results = results.append({'Model': name, 'Lag': num_lags, 'Test MSE': mse, 'Test R2': r2, 'Test MSE Benchmark': mse_dex, 'Test R2 Benchmark': r2_dex}, ignore_index=True)
  results = results.append({'Model': name, 'Lag': num_lags, 'Test MSE': mse, 'Test R2': r2, 'Test MSE Benchmark': mse_dex, 'Test R2 Benchmark': r2_dex}, ignore_index=True)
  results = results.append({'Model': name, 'Lag': num_lags, 'Test MSE': mse, 'Test R2': r2, 'Test MSE Benchmark': mse_dex, 'Test R2 Benchmark': r2_dex}, ignore_index=True)
  results = results.append({'Model': name, 'Lag': num_lags, 'Test MSE': mse, 'Test R2': r2, 'Test MSE Benchmark': mse_dex, 'Test R2 Benchmark': r2_dex}, ignore_index=True)
  results = results.append({'Model': name, 'Lag': num_lags, 'Test MSE': mse, 'Test R2': r2, 'Test MSE Benchmark': mse_dex, 'Test R2 Benchmar

                Model Lag Train On   Test MSE   Test R2  Test MSE Benchmark  \
0   Linear Regression   1      NaN  10.003530  0.964234            6.202274   
1                 SVM   1      NaN   9.892608  0.964630            8.364482   
2                 KNN   1      NaN  25.335645  0.909416           12.863554   
3   Linear Regression   2      NaN   9.193433  0.967130            6.243194   
4                 SVM   2      NaN   9.297010  0.966760            9.481342   
5                 KNN   2      NaN  24.655019  0.911850           23.522659   
6   Linear Regression   3      NaN   8.728787  0.968792            6.263796   
7                 SVM   3      NaN   9.027016  0.967725           10.616460   
8                 KNN   3      NaN  23.636293  0.915492           30.964255   
9   Linear Regression   4      NaN   8.418912  0.969899            6.323468   
10                SVM   4      NaN   9.031262  0.967710           11.713427   
11                KNN   4      NaN  29.785837  0.893

  results = results.append({'Model': name, 'Lag': num_lags, 'Test MSE': mse, 'Test R2': r2, 'Test MSE Benchmark': mse_dex, 'Test R2 Benchmark': r2_dex}, ignore_index=True)
  results = results.append({'Model': name, 'Lag': num_lags, 'Test MSE': mse, 'Test R2': r2, 'Test MSE Benchmark': mse_dex, 'Test R2 Benchmark': r2_dex}, ignore_index=True)


In [10]:
display(results)

Unnamed: 0,Model,Lag,Train On,Test MSE,Test R2,Test MSE Benchmark,Test R2 Benchmark
0,Linear Regression,1,,10.00353,0.964234,6.202274,0.977825
1,SVM,1,,9.892608,0.96463,8.364482,0.970094
2,KNN,1,,25.335645,0.909416,12.863554,0.954008
3,Linear Regression,2,,9.193433,0.96713,6.243194,0.977678
4,SVM,2,,9.29701,0.96676,9.481342,0.966101
5,KNN,2,,24.655019,0.91185,23.522659,0.915898
6,Linear Regression,3,,8.728787,0.968792,6.263796,0.977605
7,SVM,3,,9.027016,0.967725,10.61646,0.962042
8,KNN,3,,23.636293,0.915492,30.964255,0.889292
9,Linear Regression,4,,8.418912,0.969899,6.323468,0.977391


In [31]:
df = pd.read_pickle('datasets/subsets/2022-01-09_2-eth.pkl')
# Calculate the absolute difference between open and close prices for cex
df['shock_height'] = abs(df['open_cex'] - df['close_cex'])

# Calculate the 95th percentile of the absolute difference
threshold = np.percentile(df['shock_height'], 95)

# Filter rows where the absolute difference is greater than or equal to the threshold
df_shock = df[df['shock_height'] >= threshold]

# Drop the 'cex_candle' column as it's no longer needed
# df_shock = df_shock.drop(columns=['cex_candle'])

# Create a DataFrame to store the new dataset
df_new = pd.DataFrame(columns=['shock_timestamp', 'shock_height', 'close_cex', 'close_dex'] + [f'lag_{i}_dex' for i in range(1, 7)])

# For each row in the df_shock DataFrame
for row_num, (index, row) in enumerate(df_shock.iterrows()):
    # If there are at least 6 rows after the current row
    if row_num <= len(df_shock) - 7:
        # Create a new row
        new_row = {'shock_timestamp': index, 'shock_height': row['shock_height'], 'close_cex': row['close_cex'], 'close_dex': row['close_dex'] }
        # For each lag from 1 to 6
        for i in range(1, 7):
            # Add the lagged dex close price to the new row
            new_row[f'lag_{i}_dex'] = df_shock.iloc[row_num + i]['close_dex']
        # Append the new row to the new DataFrame
        df_new = pd.concat([df_new, pd.DataFrame([new_row])], ignore_index=True)

df_new

In [24]:
df = pd.read_pickle('datasets/subsets/2022-01-09_2-eth.pkl')
df

df.index

DatetimeIndex(['2022-01-09 12:00:00', '2022-01-09 12:01:00',
               '2022-01-09 12:02:00', '2022-01-09 12:03:00',
               '2022-01-09 12:04:00', '2022-01-09 12:05:00',
               '2022-01-09 12:06:00', '2022-01-09 12:07:00',
               '2022-01-09 12:08:00', '2022-01-09 12:09:00',
               ...
               '2022-01-09 23:50:00', '2022-01-09 23:51:00',
               '2022-01-09 23:52:00', '2022-01-09 23:53:00',
               '2022-01-09 23:54:00', '2022-01-09 23:55:00',
               '2022-01-09 23:56:00', '2022-01-09 23:57:00',
               '2022-01-09 23:58:00', '2022-01-09 23:59:00'],
              dtype='datetime64[ns]', length=722, freq=None)