# Pair trading using Correlation

In [13]:
# import dependencies

import pandas as pd
import numpy as np
import os
from typing import List
from sklearn.model_selection import train_test_split

# import data manager from local directory ../src/DataManger
import sys
sys.path.append(os.path.abspath(os.path.join('..')))
from src.DataManager import data_manager


In [14]:
DATA_PATH = "/Users/chriskang/Desktop/Projects/SideQuant/PxDataDownloader/data"

In [15]:
# initialize data manager
dm = data_manager.DataManager(DATA_PATH)

In [22]:
# load universe data in the kernal
all_symbols = dm.list_universe()
raw_data = dm.prep_ohlcv_data(all_symbols, '15m')
len(raw_data)

280

In [27]:
# filter the symbols with less than 50000 data points
key_to_delete = []
for symbol, df in raw_data.items():
    if len(df) < 50000:
        key_to_delete.append(symbol)
for key in key_to_delete:
    del raw_data[key]

len(raw_data)

152

In [28]:
# prep returns
returns = dm.prep_return_data(ohlcv_dict=raw_data,style='close')
returns.head()

Unnamed: 0_level_0,1000LUNCUSDT,1000SHIBUSDT,1000XECUSDT,1INCHUSDT,AAVEUSDT,ADAUSDT,ALGOUSDT,ALICEUSDT,ALPHAUSDT,ANKRUSDT,...,XEMUSDT,XLMUSDT,XMRUSDT,XRPUSDT,XTZUSDT,YFIUSDT,ZECUSDT,ZENUSDT,ZILUSDT,ZRXUSDT
open_ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-11-02 12:15:00,-0.005009,-0.003326,-0.001899,0.000337,-0.00231,-0.00153,-0.002297,-0.001876,0.000889,-0.004224,...,-0.005249,-0.000463,0.000201,0.002427,0.0,-0.000507,-0.001596,-7.9e-05,-0.001349,-0.002781
2022-11-02 12:30:00,-0.000915,0.002586,0.000543,-0.000505,-0.000366,-0.000766,0.001151,-0.00188,-0.002664,-0.002121,...,0.002639,-0.000649,-0.001473,-0.00088,-0.002163,0.000888,0.0008,-0.000629,0.0,0.000398
2022-11-02 12:45:00,-0.000458,-0.001831,-0.002173,-0.002527,-0.002194,0.000256,0.000575,-0.001255,0.0,0.000354,...,-0.002632,-0.000371,-0.000805,0.002203,0.0,-0.00038,-0.000799,-0.00063,-0.000675,-0.002788
2022-11-02 13:00:00,0.005958,0.003585,0.003266,0.005236,0.003909,0.00281,0.002298,0.0044,0.003562,0.003541,...,0.002639,0.004825,0.0,0.0,0.000723,0.000888,0.004397,0.004331,0.00338,0.003195
2022-11-02 13:15:00,0.000456,0.000166,0.001085,0.000168,-0.001825,0.001783,0.003153,-0.000626,-0.000887,-0.000706,...,0.002632,0.001847,-0.000738,0.001319,0.002166,-0.00038,0.000199,-0.000157,0.0,0.0


In [30]:
# split data into train and test
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(returns, test_size=0.3)

In [36]:
# select pairs based on correlation for each symbol where correlation is the lowest and corr < 0.1

from dataclasses import dataclass

@dataclass
class Pair:
    symbol1: str
    symbol2: str
    correlation: float

# compute correlation matrix
corr_matrix = train_data.corr()

# build the pairs
pairs = []
for symbol in corr_matrix.columns:
    # filter the symbols with correlation < 0.5
    filtered_symbols = corr_matrix[corr_matrix[symbol] < 0.1][symbol]
    # choose the one with the lowest correlation
    if len(filtered_symbols) > 0:
        lowest_corr_symbol = filtered_symbols.idxmin()
        # check if the pair already exists
        if not any(pair.symbol1 == lowest_corr_symbol and pair.symbol2 == symbol for pair in pairs):
            pairs.append(Pair(symbol1=symbol, symbol2=lowest_corr_symbol, correlation=float(filtered_symbols.min())))

# choose the top 10 pairs
top_pairs = sorted(pairs, key=lambda x: x.correlation)[:10]
top_pairs


[Pair(symbol1='BTCDOMUSDT', symbol2='DEFIUSDT', correlation=-0.4905962841024761),
 Pair(symbol1='BLUEBIRDUSDT', symbol2='BTCDOMUSDT', correlation=-0.4839128241968404),
 Pair(symbol1='ETHUSDT', symbol2='BTCDOMUSDT', correlation=-0.4701430132786881),
 Pair(symbol1='MATICUSDT', symbol2='BTCDOMUSDT', correlation=-0.46004758053313083),
 Pair(symbol1='LINKUSDT', symbol2='BTCDOMUSDT', correlation=-0.45188893354448756),
 Pair(symbol1='BNBUSDT', symbol2='BTCDOMUSDT', correlation=-0.451201295773365),
 Pair(symbol1='ADAUSDT', symbol2='BTCDOMUSDT', correlation=-0.4418768236127394),
 Pair(symbol1='ETCUSDT', symbol2='BTCDOMUSDT', correlation=-0.4380065232115677),
 Pair(symbol1='DASHUSDT', symbol2='BTCDOMUSDT', correlation=-0.4348553571110476),
 Pair(symbol1='BALUSDT', symbol2='BTCDOMUSDT', correlation=-0.4337191947703332)]

Now that pairs are chose, we need to find:
1. when to rebalance these pairs
    - fixed interval when the current weights deviates from target weights by x %
2. the ratio between pairs
    - Use beta