This notebook is used to explore Statistical Arbitrage

We first explain on words what Statistical Arbitrage is, and then explain the stages in implement such strategy. 

Statistical Arbitrage is a mean-reversion pair-trading strategy that exploits the statistical relationship between two assets. 

This two assets should have follow certain statistical relationship, for example highly correlated, and we assume such relationship is norm. This relationship, once disrupted, will supposedly revert to its norm (mean-reversion). 

Here we implement the most basical stat arb, with two highly correlated & co-integrated assets. The implementation step is as follow:

0. load timeseries dataset for asset X and asset Y w/ necessary cleaning

1. compute correlation coefficients - confirm that two assets are indeed highly correlated

2. test the co-integration, using dicky-fuller test (will talk more on selection of test)

3. generate signal when the spread is off by one standard deviation

Note*

a. By step 2, we will be sure whether the selected pairs are ideal, or that the seires do not cointegrated, thus not being an ideal pair trade

b. after we passed the test, we will start generate & monitor trades in step 3

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns

from statsmodels.tsa.stattools import adfuller
import statsmodels.api as sm
from scipy.stats import chisquare
import datetime as DT

In [3]:
#huobi api
from huobi.client.account import AccountClient
from huobi.client.market import MarketClient
from huobi.constant import *
from huobi.privateconfig import g_api_key, g_secret_key
from huobi.utils import *

account_client = AccountClient(api_key=g_api_key,
                               secret_key=g_secret_key)

account_type = "spot"
asset_valuation = account_client.get_account_asset_valuation(account_type=account_type, valuation_currency="usd")
asset_valuation.print_object()

#for market data
market_client = MarketClient()

balance : 1.74
timestamp : 1656264063341


In [1]:
class StatsArb():
    
    def __init__(self, dataseriesX, dataseriesY, alpha = 0.05, assetX, assetY): #two pandas dataframe (Price; Date)
        self.df_x = dataseriesX
        self.df_y = dataseriesY
        self.alpha = alpha
        self.asset_x = assetX
        self.asset_y = assetY

        #check if the current shape matches each other
        if self.df_x.shape != self.df_y.shape:
            print('imported two data series do not match their shape, please check!')
            exit()

        self.logging = f'asset_x: {self.asset_x}, asset_y: {self.asset_y}\n'
    
    def get_price(self):
        result = (0, 0)
        '''
        return the price pair of assetX and assetY (assetX, assetY)
        Note: this function would be different w/ different assets (since we 
        might use different api)
        '''
        list_obj = market_client.get_market_trade(symbol = self.asset_x + 'usdt')
        price_x = list_obj[0].__dict__['price']

        list_obj = market_client.get_market_trade(symbol = self.asset_y + 'usdt')
        price_y = list_obj[0].__dict__['price']

        return (price_x, price_y)

    def correlation(self)->float:
        '''
        Compute the correlation between two stock prices
        '''
        x_serie = self.df_x['Price']
        y_serie = self.df_y['Price']

        result = x_serie.corr(y_serie)
        assert (-1 <= result <= 1)
        return result

    def regression(self):
        '''
        get the regression coefficient and initialize portfolio val series
        '''
        price_x = self.df_x['Price']
        price_y = self.df_y['Price']
        
        #price_x = sm.add_constant(price_x)
        model = sm.OLS(price_x, price_y).fit()
        summary = model.summary()

        price_x = self.df_x['Price']
        self.reg_coeff = model.params[0]
        self.portfolio = price_x - self.reg_coeff * price_y

    def cointegration(self)->bool:
        '''
        return whether there exists a co-integration relationship given the
        alpha level, using augmented dickyfuller's test
        '''
        #print(self.portfolio)
        adftest = adfuller(self.portfolio)
        p_value = adftest[1]
        return p_value > self.alpha

    def sd_cross(self):
        '''
        return whether the standard deviation has crossed the mean 
        '''
        result = False

        #check if it is normal
        stats, pvalue = chisquare(self.portfolio)

        if pvalue <= self.alpha:
            print("Warning: the current spread is not of normally distributed")

        portfolio_mean = self.portfolio.mean()
        portfolio_sd = self.portfolio.std()

        print(f'series mean is {portfolio_mean}')
        print(f'series sd is {portfolio_sd}')
        print(f'current portfolio spread is {self.portfolio[len(self.portfolio) - 1]}\n')
        
        self.logging = self.logging + f'portfolio mean: {portfolio_mean}, portfolio sd: {portfolio_sd}\n'
        
        #check if the last element crosses 1 standard deviation of our portfolio
        cross_index = 0
        if self.portfolio[len(self.portfolio) - 1] > portfolio_mean + 2 * portfolio_sd:
            cross_index = 1
            result = True
        if self.portfolio[len(self.portfolio) - 1] < portfolio_mean - 2 * portfolio_sd:
            cross_index = -1
            result = True

        return result, cross_index

    def main(self)->bool:
        '''
        once called, run 
        0. run regression & test for co-integration & normal to proceed
        1. get_price, to get current price
        2. with the latest price adding to the timeseries, test if we are 1 sd across the mean
        3. if so, take proper position to trade
        '''
        #we proceed to get current price
        price_x, price_y = self.get_price()
        self.logging = self.logging + f'price of x: {price_x}, price of y: {price_y}\n'

        #add price to time series
        curr_time = pd.to_datetime("today").strftime("%m/%d/%Y")
        curr_time = pd.to_datetime(curr_time)
        self.df_x.loc[len(self.df_x.index)] = [curr_time, price_x]
        self.df_y.loc[len(self.df_y.index)] = [curr_time, price_y]
        
        #run regression to initialize the regression coefficient
        self.regression()
        self.logging = self.logging + f'regression coefficient is {self.reg_coeff}\n'
        self.logging = self.logging + f'latest portfolio spread is {self.portfolio[len(self.portfolio) - 1]}\n'

        #get correlation
        corr_co = self.correlation()
        is_cointegrate = self.cointegration()

        #output the correlation coefficient
        print('current correlation coefficient is: ' + str(corr_co))
        self.logging = self.logging + f'current correlation coefficient is: {corr_co}\n'

        #terminate if the series do not cointegrate
        if not is_cointegrate:
            print('the selected trade pairs do not cointegrate!')
            self.logging = self.logging + 'current pair DO NOT cointegrate!!\n'
            #exit()
        
        #check if signal generated
        is_trade, trade_direction = self.sd_cross()
        self.logging = self.logging + f'signal: {is_trade}, direction: {trade_direction}\n'
        
        return is_trade, trade_direction     

SyntaxError: non-default argument follows default argument (1893083731.py, line 3)

In [82]:
def generatedf(asset:str):
    list_obj = market_client.get_candlestick(asset + 'usdt', '1day', size = 200 )
    today = DT.date.today()
    week_ago = today - DT.timedelta(days=len(list_obj))
    date = pd.Series(pd.date_range(week_ago, freq="D", periods=len(list_obj)))
    price = reversed([elem.__dict__['close'] for elem in list_obj])
    
    return pd.DataFrame({'Date': date, 'Price':price})

def trade_pair(direction, asset_x, asset_y):
    '''
    when direction is 1, that means we need to short the portfolio, i.e. 
    short 1 share of x, long regression coefficient many shares of y

    else
    we long the portfolio, i.e.
    long 1 share of x, short regression coefficient many shares of y
    '''

def runtime_main(asset_x, asset_y):
    #first, get two asset & their price dataset
    #asset_x = 'eth'
    #asset_y = 'bch'

    df_x = generatedf(asset_x)
    df_y = generatedf(asset_y)
    alpha = 0.05 #standard 0.95 confidence interval

    strategy = StatsArb(df_x, df_y, alpha, asset_x, asset_y)
    signal, direction = strategy.main()
    
    if signal:
        trade_pair(direction, asset_x, asset_y)
   
    return strategy.logging

    

In [87]:
crypto_lis = ['btc', 'eth', 'ht', 'dot', 'xrp', 'link', 'bch']
crypto_pair = []
signal_res = ''

#get a combination of pairs
for i in range(len(crypto_lis) - 1):
    for elem in crypto_lis[i+1:]:
        crypto_pair.append((crypto_lis[i], elem))

for assetx, assety in crypto_pair:
    log = runtime_main(assetx, assety)
    signal_res = signal_res + log + '\n'

text_file = open("result.txt", "w")
n = text_file.write(signal_res)
text_file.close()

#runtime_main()

current correlation coefficient is: 0.9786453110910817
series mean is 805.3794495370112
series sd is 3315.490053052638
current portfolio spread is 4565.29883181876

asset_x: btc, asset_y: eth
price of x: 21379.28, price of y: 1229.1
regression coefficient is 13.679913081263722
latest portfolio spread is 4565.29883181876
current correlation coefficient is: 0.9786453110910817
portfolio mean: 805.3794495370112, portfolio sd: 3315.490053052638
signal: False, direction: 0

current correlation coefficient is: 0.9354945061248909
series mean is -179.4139626721468
series sd is 2944.0358060607473
current portfolio spread is -2370.754091715051

asset_x: btc, asset_y: ht
price of x: 21379.29, price of y: 5.3557
regression coefficient is 4434.535932131197
latest portfolio spread is -2370.754091715051
current correlation coefficient is: 0.9354945061248909
portfolio mean: -179.4139626721468, portfolio sd: 2944.0358060607473
signal: False, direction: 0

current correlation coefficient is: 0.9267961776