In [1]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from joblib import Parallel, delayed
from tqdm import tqdm
import matplotlib.pyplot as plt
import re

## Data Preprocessing

In [7]:
# Daily returns are stored in percent: a value of 0.747 means 0.747%, not 74.7%.
ret_df = pd.read_feather('data/nasdaq_etfs.feather')

# Symbol -> security-name lookup, used to screen return columns by name.
symmap = pd.read_csv("data/symbol2name.csv")
sym2name = dict(zip(symmap["Symbol"], symmap["Security Name"]))

# Drop every symbol whose security name contains the standalone word "bill"
# (presumably Treasury-bill funds -- TODO confirm against symbol2name.csv).
# Symbols without a usable name are kept. The isinstance() guard matters
# because read_csv loads blank "Security Name" cells as float NaN, and calling
# .lower() on a float would raise AttributeError.
bill_word = re.compile(r"\b(bill)\b")
syms2drop = [
    sym
    for sym in ret_df.columns
    if isinstance(sym2name.get(sym), str)
    and bill_word.search(sym2name[sym].lower())
]

# Rebind rather than mutate in place, so the frame returned by read_feather is
# never modified behind the back of anything else that references it.
ret_df = ret_df.drop(columns=syms2drop)

In [9]:
# Rebuild a price path per symbol from the percentage returns. Shifting the
# cumulative growth down by one row (filling the gap with 1) means the price on
# row t compounds the returns of rows 0..t-1, so every series starts at 1.
growth = ret_df.div(100).add(1)
price_df = growth.cumprod().shift(1, fill_value=1)
log_price_df = np.log(price_df)

# The leading 40% of rows form the training window for the cointegration test.
train_size = int(0.40 * len(log_price_df))

price_df_train = price_df.head(train_size)
log_price_df_train = log_price_df.head(train_size)
# One row shorter than the price window: because of the one-row shift above,
# the last training price only embeds returns up to row train_size - 2.
ret_df_train = ret_df.head(train_size - 1)

In [12]:
# Load the candidate pairs table (first CSV column is the index).
pairs_raw = pd.read_csv('data/pairs_df.csv', index_col=0)

# Keep a pair only when 'lev' and 'cor' carry the same sign ...
same_sign = np.sign(pairs_raw['lev']) == np.sign(pairs_raw['cor'])
# ... and its 'svm1' value is at least 1.1.
svm1_ok = pairs_raw['svm1'] >= 1.1
# Disabled alternative/additional filter, kept for reference:
# pairs_df = pairs_df[pairs_df.svm2 >= 0.3]

pairs_df = pairs_raw[same_sign & svm1_ok]
pairs_df.shape

(5054, 15)

## Kalman Filter Formulation

Let $x_t$ and $y_t$ denote the prices of the two assets in a pair at time $t$.

#### Model
$$ y_t = x_t \beta_t + \alpha_t + \epsilon_t,\; \epsilon_t \sim N(0, R) $$
$$  $$