In this Jupyter notebook we created an algorithm that takes a set of public securities of our choosing and finds correlations or cointegrations between them. Using this, we then mark buy or sell signals accordingly allowing us to profit using pairwise trading.  

In [3]:
#imports
import numpy as np
import pandas as pd
import statsmodels
import statsmodels.api as sm
import yfinance as yf
from statsmodels.tsa.stattools import coint, adfuller
from pandas_datareader import data as pdr
pd.core.common.is_list_like = pd.api.types.is_list_like
import datetime
from scipy.stats.stats import pearsonr

import matplotlib.pyplot as plt
import seaborn as sns; sns.set(style="whitegrid")

  from scipy.stats.stats import pearsonr


Area to choose what Stocks to track:

In [4]:
yf.pdr_override()
start = datetime.datetime(2015, 1, 1)
end = datetime.datetime.now()
tickers = ['TSLA', 'NKE', 'AMZN', 'WMT', 'GOOG', 'JPM', 'META', 'NVDA', 'ABBV', 'AAPL', 'ADBE', 'ORCL', 'EBAY', 'MSFT', 'QCOM', 'HPQ', 'JNPR', 'AMD', 'IBM', 'VOO']


df = pdr.get_data_yahoo(tickers, start, end)['Close']
df.tail()

[*********************100%***********************]  20 of 20 completed


Unnamed: 0_level_0,AAPL,ABBV,ADBE,AMD,AMZN,EBAY,GOOG,HPQ,IBM,JNPR,JPM,META,MSFT,NKE,NVDA,ORCL,QCOM,TSLA,VOO,WMT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2023-03-01 00:00:00-05:00,145.309998,155.270004,323.380005,78.290001,92.169998,45.75,90.510002,28.860001,128.190002,30.530001,142.550003,173.419998,246.270004,118.580002,226.979996,86.339996,123.68,202.770004,362.779999,140.149994
2023-03-02 00:00:00-05:00,145.910004,154.380005,333.5,80.440002,92.129997,45.580002,92.309998,28.809999,128.929993,30.73,141.070007,174.529999,251.110001,119.580002,233.139999,86.970001,123.779999,190.899994,365.48999,140.5
2023-03-03 00:00:00-05:00,151.029999,156.059998,344.040009,81.519997,94.900002,46.040001,94.019997,28.610001,129.639999,30.68,143.660004,185.25,255.289993,120.940002,238.899994,89.25,123.599998,197.789993,371.279999,140.669998
2023-03-06 00:00:00-05:00,153.830002,155.279999,347.019989,81.160004,93.75,44.75,95.580002,28.57,130.190002,31.33,142.820007,184.899994,256.869995,120.169998,235.539993,89.739998,122.410004,193.809998,371.730011,140.649994
2023-03-07 00:00:00-05:00,151.600006,152.929993,344.799988,82.110001,93.550003,43.98,94.169998,27.66,128.25,31.200001,138.619995,184.509995,254.149994,119.589996,232.880005,88.360001,119.190002,187.710007,365.980011,139.25


First we need to test for stationarity using the Dickey-Fuller Test; we obviously only want to look at stocks that are not following stationarity.

In [5]:
def stationarity_test(X, cutoff=0.01):
    pvalue = adfuller(X)[1]
    if pvalue < cutoff:
        print('p-value = ' + str(pvalue) + ' The series ' + X.name +' is likely stationary.')
    else:
        print('p-value = ' + str(pvalue) + ' The series ' + X.name +' is likely non-stationary.')

This is to test if the Stationarity Test actually works:

In [6]:
ticker = yf.Ticker('AAPL')
Stock_history = ticker.history(period = "60d")
Stock_close = Stock_history["Close"]
stationarity_test(Stock_close)

p-value = 0.8490290797012403 The series Close is likely non-stationary.


Test for Cointegration/Correlated Pairs:

In [7]:
pairs = []
def find_cointegrated_pairs(data):
    n = data.shape[1]
    score_matrix = np.zeros((n, n))
    pvalue_matrix = np.ones((n, n))
    keys = data.keys()
    for i in range(n):
        for j in range(i+1, n):
            S1 = data[keys[i]]
            S2 = data[keys[j]]
            result = coint(S1, S2)
            score = result[0]
            pvalue = result[1]
            score_matrix[i, j] = score
            pvalue_matrix[i, j] = pvalue
            if pvalue < 0.05:
                pairs.append((keys[i], keys[j]))
    return pvalue_matrix, score_matrix, pairs

In [8]:
def add_row(S1, S2, df, p_value_testing = True):
    s1_name = S1.name
    s2_name = S2.name
    result = coint(S1, S2)
    score = result[0]
    p_value = result[1]
    corr = pearsonr(S1, S2)[0]
    temp_df = pd.DataFrame([s1_name, s2_name, score, p_value, corr])
    temp_df = temp_df.transpose()
    temp_df.columns = 'Stock 1', 'Stock 2', 'CoInt', 'P CoInt', 'Correlation'
    if df is not None:
        if p_value_testing == True:
            if p_value < 0.05:
                df = pd.concat([df, temp_df], ignore_index=True)
        if p_value_testing == False:
            df = pd.concat([df, temp_df], ignore_index=True)
    else:
        if p_value_testing == True:
            if p_value <0.05:
                df = temp_df
        else:
            df = temp_df
    return df


In [9]:
def make_table(data):
    temp_df = None
    n = data.shape[1]
    for i in range(n):
        for j in range(i+1, n):
            S1 = data.iloc[:,i]
            S2 = data.iloc[:,j]
            temp_df = add_row(S1, S2, temp_df)
    return temp_df



Now we need to make a function to test a new stock with all of the other stocks we already were testing and store it inside the dataframe

In [10]:
def new_stock_test(df, new_stock, original_stocks):
    temp_new_stock = pdr.get_data_yahoo(new_stock, start, end)['Close']
    temp_df = None
    n = df.shape[1]
    for i in range(n):
        for j in range(i+1, n):
            S1 = df.iloc[:,i]
            S2 = temp_new_stock.iloc[j]
            temp_df = add_row(S1, S2, temp_df)
    return temp_df

Now we will test the Cointegrated Pairs Function and the make table function

In [11]:
find_cointegrated_pairs(df)
make_table(df)


Unnamed: 0,Stock 1,Stock 2,CoInt,P CoInt,Correlation
0,ADBE,AMZN,-4.517504,0.001146,0.978123
1,ADBE,EBAY,-3.564569,0.027085,0.937079
2,AMD,GOOG,-3.404778,0.041823,0.957776
3,AMD,MSFT,-4.405088,0.001746,0.978362
4,AMD,NKE,-3.628679,0.022569,0.953327
5,AMD,QCOM,-3.887207,0.01033,0.959038
6,AMD,WMT,-3.413241,0.040901,0.90806
7,EBAY,META,-4.026399,0.006583,0.920491
8,GOOG,TSLA,-3.40032,0.042315,0.932379
9,IBM,JPM,-3.975534,0.007779,-0.347839


In [12]:
new_stock_test(df, 'UBER', tickers)

[*********************100%***********************]  1 of 1 completed


AttributeError: 'numpy.float64' object has no attribute 'name'

Now we will create a Pandas Dataframe to store all of the Cointegration values and the Correlations values. We have the option to store each pair twice or once in order to create iteration through the information easier for post-processing. 

: 