# Beta-Based Price Movement Prediction

In [18]:
# import dependencies

import pandas as pd
import numpy as np
import os
from typing import List
from sklearn.model_selection import train_test_split


In [2]:
# Directory holding one `<symbol>.parquet` file per symbol.
# Set the SIDEQUANT_DATA_PATH environment variable to point the notebook at
# another location; the fallback is the original hard-coded local path, which
# will not exist on other machines.
DATA_PATH = os.environ.get(
    "SIDEQUANT_DATA_PATH",
    "/Users/chriskang/Desktop/Projects/SideQuant/PxDataDownloader/data",
)

In [12]:
# function for data loading

def list_universe(data_path: "str | None" = None) -> List[str]:
    """Return the sorted symbols that have a parquet file in the data directory.

    A file counts as a symbol file when its name ends with ``.parquet``,
    contains ``USDT`` and contains no underscore.

    Args:
        data_path: Directory to scan. Defaults to the module-level
            ``DATA_PATH`` when omitted, which keeps ``list_universe()``
            working exactly as before.

    Returns:
        Alphabetically sorted file stems (file name minus ``.parquet``).
    """
    if data_path is None:
        data_path = DATA_PATH

    suffix = '.parquet'
    # Strip only the extension. Cutting at the first '.' would truncate a
    # dotted stem into a symbol whose file `load_data` could not find.
    symbols = [
        f[:-len(suffix)]
        for f in os.listdir(data_path)
        if f.endswith(suffix) and '_' not in f and 'USDT' in f
    ]
    return sorted(symbols)

def load_data(symbol: str, interval: str):
    """Read one symbol's parquet file and keep only the rows for `interval`.

    Args:
        symbol: File stem; the file read is ``<DATA_PATH>/<symbol>.parquet``.
        interval: Value matched against the frame's ``interval`` column.

    Returns:
        The matching rows as a DataFrame with a fresh 0..n-1 index.
    """
    parquet_file = os.path.join(DATA_PATH, f'{symbol}.parquet')
    candles = pd.read_parquet(parquet_file)
    interval_mask = candles['interval'] == interval
    return candles[interval_mask].reset_index(drop=True)

def prep_many_data(symbols: List[str], interval: str, min_rows: int = 2000):
    """Load several symbols and keep those with enough history.

    Args:
        symbols: Symbols to load; each is passed to ``load_data``.
        interval: Interval passed through to ``load_data``.
        min_rows: Minimum number of rows a symbol must have to be kept.
            Defaults to 2000, the threshold previously hard-coded here.

    Returns:
        Dict mapping symbol -> DataFrame for the symbols that passed the
        row-count filter. Symbols with fewer rows are silently omitted.
    """
    data = {}
    for symbol in symbols:
        df = load_data(symbol, interval)
        # Skip short histories.
        if len(df) < min_rows:
            continue
        data[symbol] = df
    return data

In [13]:
# Load the universe into the kernel: discover the available symbols,
# then read the 4h rows for each of them.
all_symbols = list_universe()
raw_data = prep_many_data(symbols=all_symbols, interval='4h')

In [14]:
# Sanity check: preview the first rows loaded for one symbol.
raw_data['TRXUSDT'].head()

Unnamed: 0,symbol,interval,open_ts,open,high,low,close,volume,close_ts,quote_volume,num_trades,taker_buy_base_vol,taker_buy_quote_vol
0,TRXUSDT,4h,2020-01-15 08:00:00,0.01684,0.01847,0.0167,0.01762,300336826.0,2020-01-15 11:59:59,5277016.0,8455,151314067.0,2655620.0
1,TRXUSDT,4h,2020-01-15 12:00:00,0.01762,0.0182,0.01688,0.01693,243584647.0,2020-01-15 15:59:59,4303606.0,5868,61747969.0,1093694.0
2,TRXUSDT,4h,2020-01-15 16:00:00,0.01693,0.01765,0.01674,0.01755,265138686.0,2020-01-15 19:59:59,4578507.0,6832,170289809.0,2945830.0
3,TRXUSDT,4h,2020-01-15 20:00:00,0.01755,0.0178,0.01706,0.01734,283733684.0,2020-01-15 23:59:59,4958437.0,7116,164881261.0,2885921.0
4,TRXUSDT,4h,2020-01-16 00:00:00,0.01731,0.01745,0.0159,0.01639,262536833.0,2020-01-16 03:59:59,4345176.0,7656,93485030.0,1563261.0


# Data Processing
## Returns

In [77]:
# Build one return series per symbol, indexed by bar open time, then combine
# them once. (Merging into a growing frame inside the loop re-copies every
# previously merged column on each iteration.)
return_series = []
for symbol, df in raw_data.items():
    # Average of open/high/low/close, used as the bar's price.
    new_price = (df['close'] + df['open'] + df['high'] + df['low']) / 4
    # Attach timestamps BEFORE differencing so each return keeps its own
    # timestamp even if dropna() removes more than the first row.
    new_price.index = pd.Index(df['open_ts'], name='open_ts')
    return_series.append(new_price.pct_change().dropna().rename(symbol))

# Outer-align on open_ts, order rows by time and columns alphabetically.
if return_series:
    returns = pd.concat(return_series, axis=1).sort_index().sort_index(axis=1)
else:
    returns = pd.DataFrame()
# Keep only timestamps where every symbol has a return.
returns = returns.dropna()
# Drop columns where the values are all 0.
returns = returns.loc[:, (returns != 0).any(axis=0)]
returns.head()

Unnamed: 0_level_0,1000FLOKIUSDT,1000LUNCUSDT,1000PEPEUSDT,1000SHIBUSDT,1000XECUSDT,1INCHUSDT,AAVEUSDT,ACHUSDT,ADAUSDT,AGIXUSDT,...,XMRUSDT,XRPUSDT,XTZUSDT,XVGUSDT,XVSUSDT,YFIUSDT,ZECUSDT,ZENUSDT,ZILUSDT,ZRXUSDT
open_ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-07-05 16:00:00,0.001682,-0.005376,0.001595,-0.000941,0.002144,-0.003034,-0.00034,-0.005145,-0.001941,0.000105,...,0.018274,5.3e-05,-0.002807,0.012167,-0.000883,-0.000585,-0.015419,-0.017567,-0.013023,-0.022803
2023-07-05 20:00:00,0.001679,0.001283,0.00574,0.001513,0.022134,0.002643,0.015288,0.0,0.001061,0.004948,...,0.000315,0.005747,0.002502,-0.047897,0.006544,0.005122,0.000395,0.000822,0.010069,0.012749
2023-07-06 00:00:00,0.001282,0.008905,0.004627,0.001578,0.014249,0.004472,-0.002113,0.003491,0.001766,-0.001676,...,0.000704,-0.000367,-0.000312,-0.023033,-0.007145,0.004477,-0.002214,0.029799,-0.004011,-0.019108
2023-07-06 04:00:00,0.010046,0.008041,0.019423,0.008951,0.064846,0.011847,0.002588,0.011725,0.011192,0.007031,...,0.004224,0.00451,0.007803,0.019101,0.011267,0.014131,0.015135,0.022641,0.00069,-0.023147
2023-07-06 08:00:00,0.01687,0.007227,0.024106,0.004519,0.087433,0.012494,0.014715,0.017066,0.006885,0.010629,...,0.004355,-0.002506,0.007123,-0.00462,0.013825,0.015257,0.021154,0.017255,-0.003794,0.004575


In [78]:
# 70/30 split with shuffle=False: row order is preserved, so the test rows
# are the block that follows the train rows in time.
train, test = train_test_split(returns, test_size=0.3, shuffle=False)

## Rolling Beta

In [114]:
# Rolling 30-bar beta of every symbol against BTC on the train set:
#   beta_i = cov(r_i, r_btc) / var(r_btc)
# Passing the BTC series to .cov() yields one covariance column per symbol,
# rather than a full symbol-by-symbol covariance matrix for every window
# of which only the BTC column would be used.
btc_returns = train['BTCUSDT']
rolling_cov_btc = train.rolling(window=30).cov(btc_returns)
var_btc = btc_returns.rolling(window=30).var()

# Wide (open_ts x symbol) -> long frame indexed by (open_ts, symbol) with a
# single 'Beta' column; the incomplete leading windows are NaN and dropped.
rolling_beta = (
    rolling_cov_btc.div(var_btc, axis=0)
    .melt(ignore_index=False, var_name='symbol', value_name='Beta')
    .set_index('symbol', append=True)
    .dropna()
    .sort_index()
)
rolling_beta_train = rolling_beta

In [115]:
# Inspect the training betas: one 'Beta' value per (open_ts, symbol).
rolling_beta_train

Unnamed: 0_level_0,Unnamed: 1_level_0,Beta
open_ts,symbol,Unnamed: 2_level_1
2023-07-10 12:00:00,1000FLOKIUSDT,1.746961
2023-07-10 12:00:00,1000LUNCUSDT,0.984463
2023-07-10 12:00:00,1000PEPEUSDT,2.768366
2023-07-10 12:00:00,1000SHIBUSDT,1.270154
2023-07-10 12:00:00,1000XECUSDT,2.851916
...,...,...
2024-03-03 12:00:00,YFIUSDT,0.248271
2024-03-03 12:00:00,ZECUSDT,0.013075
2024-03-03 12:00:00,ZENUSDT,0.205646
2024-03-03 12:00:00,ZILUSDT,0.536887


In [116]:
# Rolling 30-bar beta of every symbol against BTC on the test set:
#   beta_i = cov(r_i, r_btc) / var(r_btc)
# Only test rows feed the windows, so the first 29 test bars have no beta.
# Passing the BTC series to .cov() yields one covariance column per symbol,
# rather than a full symbol-by-symbol covariance matrix for every window
# of which only the BTC column would be used.
btc_returns = test['BTCUSDT']
rolling_cov_btc = test.rolling(window=30).cov(btc_returns)
var_btc = btc_returns.rolling(window=30).var()

# Wide (open_ts x symbol) -> long frame indexed by (open_ts, symbol) with a
# single 'Beta' column; the incomplete leading windows are NaN and dropped.
rolling_beta = (
    rolling_cov_btc.div(var_btc, axis=0)
    .melt(ignore_index=False, var_name='symbol', value_name='Beta')
    .set_index('symbol', append=True)
    .dropna()
    .sort_index()
)
rolling_beta_test = rolling_beta


### Data massaging to add return to rolling beta

In [117]:
# Reshape the wide train returns (one column per symbol) into a long frame
# with a single 'return' column, indexed by (open_ts, symbol) to match the
# index layout of the rolling-beta frame.
proc_train = (
    train
    .melt(ignore_index=False, var_name='symbol', value_name='return')
    .reset_index()
    .set_index(['open_ts', 'symbol'])
    .sort_index(level=0)
)
proc_train

Unnamed: 0_level_0,Unnamed: 1_level_0,return
open_ts,symbol,Unnamed: 2_level_1
2023-07-05 16:00:00,1000FLOKIUSDT,0.001682
2023-07-05 16:00:00,1000LUNCUSDT,-0.005376
2023-07-05 16:00:00,1000PEPEUSDT,0.001595
2023-07-05 16:00:00,1000SHIBUSDT,-0.000941
2023-07-05 16:00:00,1000XECUSDT,0.002144
...,...,...
2024-03-03 12:00:00,YFIUSDT,0.016163
2024-03-03 12:00:00,ZECUSDT,0.001015
2024-03-03 12:00:00,ZENUSDT,0.030753
2024-03-03 12:00:00,ZILUSDT,0.015449


In [118]:
# Reshape the wide test returns (one column per symbol) into a long frame
# with a single 'return' column, indexed by (open_ts, symbol) to match the
# index layout of the rolling-beta frame.
proc_test = (
    test
    .melt(ignore_index=False, var_name='symbol', value_name='return')
    .reset_index()
    .set_index(['open_ts', 'symbol'])
    .sort_index(level=0)
)
proc_test


Unnamed: 0_level_0,Unnamed: 1_level_0,return
open_ts,symbol,Unnamed: 2_level_1
2024-03-03 16:00:00,1000FLOKIUSDT,-0.018423
2024-03-03 16:00:00,1000LUNCUSDT,-0.003934
2024-03-03 16:00:00,1000PEPEUSDT,0.032971
2024-03-03 16:00:00,1000SHIBUSDT,-0.003160
2024-03-03 16:00:00,1000XECUSDT,-0.004646
...,...,...
2024-06-15 08:00:00,YFIUSDT,0.001013
2024-06-15 08:00:00,ZECUSDT,-0.003173
2024-06-15 08:00:00,ZENUSDT,0.000720
2024-06-15 08:00:00,ZILUSDT,0.003180


## Final data to play with
### Train Set

In [119]:
# Pair each training beta with the same (open_ts, symbol) return. The inner
# join keeps only index entries present in both frames.
final_df_train = rolling_beta_train.join(proc_train, how='inner').sort_index(level=1)
# Rows are now ordered by symbol, then time, so shifting by -1 inside each
# symbol group pulls in the following bar's return as the target.
grouped_df = final_df_train.groupby('symbol')
final_df_train['return_t+1'] = grouped_df['return'].shift(-1)
# The last bar of every symbol has no following return; drop it.
final_df_train = final_df_train.dropna()
final_df_train

Unnamed: 0_level_0,Unnamed: 1_level_0,Beta,return,return_t+1
open_ts,symbol,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2023-07-10 12:00:00,1000FLOKIUSDT,1.746961,0.010539,0.018507
2023-07-10 16:00:00,1000FLOKIUSDT,1.787655,0.018507,0.010140
2023-07-10 20:00:00,1000FLOKIUSDT,1.802581,0.010140,-0.006261
2023-07-11 00:00:00,1000FLOKIUSDT,1.787221,-0.006261,-0.003900
2023-07-11 04:00:00,1000FLOKIUSDT,1.763373,-0.003900,-0.001506
...,...,...,...,...
2024-03-02 16:00:00,ZRXUSDT,-0.208242,0.006047,0.013554
2024-03-02 20:00:00,ZRXUSDT,-0.253806,0.013554,-0.004419
2024-03-03 00:00:00,ZRXUSDT,-0.227003,-0.004419,-0.046838
2024-03-03 04:00:00,ZRXUSDT,-0.047220,-0.046838,-0.003247


### Test Set

In [120]:
# Pair each test beta with the same (open_ts, symbol) return. The inner
# join keeps only index entries present in both frames.
final_df_test = rolling_beta_test.join(proc_test, how='inner').sort_index(level=1)
# Rows are now ordered by symbol, then time, so shifting by -1 inside each
# symbol group pulls in the following bar's return as the target.
grouped_test_df = final_df_test.groupby('symbol')
final_df_test['return_t+1'] = grouped_test_df['return'].shift(-1)
# The last bar of every symbol has no following return; drop it.
final_df_test = final_df_test.dropna()
final_df_test

Unnamed: 0_level_0,Unnamed: 1_level_0,Beta,return,return_t+1
open_ts,symbol,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2024-03-08 12:00:00,1000FLOKIUSDT,2.820797,0.150556,0.047015
2024-03-08 16:00:00,1000FLOKIUSDT,2.773809,0.047015,0.018564
2024-03-08 20:00:00,1000FLOKIUSDT,2.769165,0.018564,0.087527
2024-03-09 00:00:00,1000FLOKIUSDT,2.663586,0.087527,0.040608
2024-03-09 04:00:00,1000FLOKIUSDT,2.658388,0.040608,0.000222
...,...,...,...,...
2024-06-14 12:00:00,ZRXUSDT,1.163365,-0.014823,-0.035659
2024-06-14 16:00:00,ZRXUSDT,1.323532,-0.035659,-0.008047
2024-06-14 20:00:00,ZRXUSDT,1.290934,-0.008047,0.010465
2024-06-15 00:00:00,ZRXUSDT,1.320647,0.010465,0.009499


# Linear Regression

In [83]:
import statsmodels.api as sm

In [124]:
# Per-symbol OLS: regress the next-bar return (return_t+1) on the rolling Beta
# plus an intercept, fit on the train set, then predict on the test set.
grouped_train_df = final_df_train.groupby('symbol')
grouped_test_df = final_df_test.groupby('symbol')
test_symbols = set(grouped_test_df.groups)

pred_results = {}
for symbol, df in grouped_train_df:
    # get_group raises KeyError for a symbol with no test rows; skip those.
    if symbol not in test_symbols:
        continue
    test_df = grouped_test_df.get_group(symbol)

    # has_constant='add' always appends the intercept column. The default
    # ('skip') leaves it out when Beta is constant over the sample, which can
    # give the train and test design matrices different widths and make
    # predict() fail.
    x = sm.add_constant(df['Beta'], has_constant='add')
    y = df['return_t+1']
    model = sm.OLS(y, x)
    results = model.fit()

    x_test = sm.add_constant(test_df['Beta'], has_constant='add')
    y_pred = results.predict(x_test)
    y_actual = test_df['return_t+1']
    # symbol -> (actual next-bar returns, predicted next-bar returns)
    pred_results[symbol] = (y_actual, y_pred)

pred_results


{'1000FLOKIUSDT': (open_ts              symbol       
  2024-03-08 12:00:00  1000FLOKIUSDT    0.047015
  2024-03-08 16:00:00  1000FLOKIUSDT    0.018564
  2024-03-08 20:00:00  1000FLOKIUSDT    0.087527
  2024-03-09 00:00:00  1000FLOKIUSDT    0.040608
  2024-03-09 04:00:00  1000FLOKIUSDT    0.000222
                                          ...   
  2024-06-14 12:00:00  1000FLOKIUSDT   -0.050927
  2024-06-14 16:00:00  1000FLOKIUSDT   -0.002771
  2024-06-14 20:00:00  1000FLOKIUSDT    0.022643
  2024-06-15 00:00:00  1000FLOKIUSDT    0.012571
  2024-06-15 04:00:00  1000FLOKIUSDT   -0.004732
  Name: return_t+1, Length: 593, dtype: float64,
  open_ts              symbol       
  2024-03-08 12:00:00  1000FLOKIUSDT   -0.001912
  2024-03-08 16:00:00  1000FLOKIUSDT   -0.001803
  2024-03-08 20:00:00  1000FLOKIUSDT   -0.001792
  2024-03-09 00:00:00  1000FLOKIUSDT   -0.001545
  2024-03-09 04:00:00  1000FLOKIUSDT   -0.001533
                                          ...   
  2024-06-14 12:00:00  1000

In [None]:
# Plot actual vs. predicted next-bar returns, one figure per symbol.
import matplotlib.pyplot as plt

for symbol, (y_actual, y_pred) in pred_results.items():
    result = pd.DataFrame({'Actual': y_actual, 'Predicted': y_pred})
    # Each group holds a single symbol, so drop that index level and let the
    # x axis be the timestamp instead of (timestamp, symbol) tuples.
    result = result.droplevel('symbol')

    fig, ax = plt.subplots()
    result.plot(ax=ax, title=symbol)
    ax.set_xlabel('open_ts')
    ax.set_ylabel('return_t+1')
    plt.show()
    # Close each figure once shown so they do not pile up across the loop
    # and trigger matplotlib's open-figure warning.
    plt.close(fig)