In [38]:
import pandas as pd
import numpy as np
import datetime as dt
import yfinance as yf

from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.svm import LinearSVC
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, AgglomerativeClustering,AffinityPropagation, DBSCAN

from pairs_identification import cointegration_pairs

In [2]:
# Scrape the S&P 500 constituents from Wikipedia. read_html returns a list of
# every table on the page; the first one carries the `Symbol` column used below.
# NOTE(review): symbols are kept exactly as Wikipedia writes them; the download
# cell's output reports failures for 'BRK.B' and 'BF.B', which are dropped later.
constituents_url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
wiki_tables = pd.read_html(constituents_url)
list_of_tickers_data = wiki_tables[0]

# Plain Python list of ticker symbols, in table order.
list_of_tickers = list_of_tickers_data["Symbol"].tolist()

list_of_tickers

['MMM',
 'AOS',
 'ABT',
 'ABBV',
 'ACN',
 'ADBE',
 'AMD',
 'AES',
 'AFL',
 'A',
 'APD',
 'ABNB',
 'AKAM',
 'ALB',
 'ARE',
 'ALGN',
 'ALLE',
 'LNT',
 'ALL',
 'GOOGL',
 'GOOG',
 'MO',
 'AMZN',
 'AMCR',
 'AEE',
 'AEP',
 'AXP',
 'AIG',
 'AMT',
 'AWK',
 'AMP',
 'AME',
 'AMGN',
 'APH',
 'ADI',
 'ANSS',
 'AON',
 'APA',
 'APO',
 'AAPL',
 'AMAT',
 'APTV',
 'ACGL',
 'ADM',
 'ANET',
 'AJG',
 'AIZ',
 'T',
 'ATO',
 'ADSK',
 'ADP',
 'AZO',
 'AVB',
 'AVY',
 'AXON',
 'BKR',
 'BALL',
 'BAC',
 'BAX',
 'BDX',
 'BRK.B',
 'BBY',
 'TECH',
 'BIIB',
 'BLK',
 'BX',
 'BK',
 'BA',
 'BKNG',
 'BSX',
 'BMY',
 'AVGO',
 'BR',
 'BRO',
 'BF.B',
 'BLDR',
 'BG',
 'BXP',
 'CHRW',
 'CDNS',
 'CZR',
 'CPT',
 'CPB',
 'COF',
 'CAH',
 'KMX',
 'CCL',
 'CARR',
 'CAT',
 'CBOE',
 'CBRE',
 'CDW',
 'COR',
 'CNC',
 'CNP',
 'CF',
 'CRL',
 'SCHW',
 'CHTR',
 'CVX',
 'CMG',
 'CB',
 'CHD',
 'CI',
 'CINF',
 'CTAS',
 'CSCO',
 'C',
 'CFG',
 'CLX',
 'CME',
 'CMS',
 'KO',
 'CTSH',
 'CL',
 'CMCSA',
 'CAG',
 'COP',
 'ED',
 'STZ',
 'CEG',
 'COO',


In [3]:
# Date bounds for the price history: June 2024 through November 2024
# (roughly six months of data).
start = dt.datetime(2024, 6, 1)
end = dt.datetime(2024, 12, 1)

# auto_adjust is passed explicitly: yfinance prints a notice that this
# argument's default changed to True, so pinning it keeps the downloaded
# prices the same whichever default the installed version uses and silences
# the notice. start/end are passed by keyword for readability.
snp500 = yf.download(list_of_tickers, start=start, end=end, auto_adjust=True)

YF.download() has changed argument auto_adjust default to True


[*********************100%***********************]  503 of 503 completed

2 Failed downloads:
['BF.B']: YFPricesMissingError('possibly delisted; no price data found  (1d 2024-06-01 00:00:00 -> 2024-12-01 00:00:00)')
['BRK.B']: YFTzMissingError('possibly delisted; no timezone found')


Save a copy of S&P500

In [4]:
# Write the downloaded frame to snp_500.csv in the working directory. This
# runs before the next cell narrows `snp500` to its "Close" columns, so the
# file holds the frame as yf.download returned it.
snp500.to_csv("snp_500.csv")

In [5]:
# Keep only the "Close" prices, leaving one column per ticker.
# This cell rebinds `snp500`, so it is guarded on the column depth: once the
# frame has been narrowed its columns are single-level, and re-running the
# cell would otherwise raise a KeyError on "Close".
if snp500.columns.nlevels > 1:
    snp500 = snp500["Close"]

In [6]:
# Working frame without the two tickers whose downloads failed (see the
# yf.download output above). DataFrame.drop returns a new frame, so `snp500`
# itself is left untouched.
failed_tickers = ['BRK.B', 'BF.B']
snp500_1 = snp500.drop(columns=failed_tickers)

### Cluster by Sector

In [17]:
def getSector(ticker):
    """Look up the sector that yfinance reports for ``ticker``.

    Returns the value stored under the ``'sector'`` key of
    ``yf.Ticker(ticker).info``. If any step of that lookup raises, the error
    is printed and ``None`` is returned, so a bulk ``apply`` over many
    tickers keeps going instead of aborting on the first failure.
    """
    try:
        info = yf.Ticker(ticker).info
        return info['sector']
    except Exception as e:
        print(f"Error Fetching sector for {ticker}: {e}")
    # Reached only when the lookup failed.
    return None

In [18]:
# One row per price column (ticker) of snp500_1, plus the sector looked up
# through getSector. Tickers whose lookup fails get None as their sector.
snp500_sector = pd.DataFrame({"tickers": snp500_1.columns}).assign(
    sector=lambda frame: frame["tickers"].apply(getSector)
)

In [19]:
# Exploratory check: a bare groupby only displays the GroupBy object's repr;
# nothing is aggregated or stored by this cell.
snp500_sector.groupby("sector")

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001B0F9146E90>

In [24]:
sector_groupby = snp500_sector.groupby("sector")

# Iterating a GroupBy yields (group key, sub-frame) pairs, so each sector's
# rows can be written straight to "<sector>_tickers.csv" in the working
# directory. Rows without a sector belong to no group and are not exported.
for key, sector_tickers in sector_groupby:
    sector_tickers.to_csv(f"{key}_tickers.csv")

In [37]:
# Sector names are the keys of the GroupBy's group mapping; iterating the
# mapping yields those keys directly.
sector_list = [sector_name for sector_name in sector_groupby.groups]
sector_list

['Basic Materials',
 'Communication Services',
 'Consumer Cyclical',
 'Consumer Defensive',
 'Energy',
 'Financial Services',
 'Healthcare',
 'Industrials',
 'Real Estate',
 'Technology',
 'Utilities']

### Loop through all Sectors and Get All Cointegrated Pairs

In [58]:
# For every sector: reload its ticker list from the CSV written earlier, run
# the cointegration search on those price columns, and show the pairs found.
for sector in sector_list:
    print(f"Sector:{sector}")
    tickers = pd.read_csv(f"{sector}_tickers.csv")["tickers"].to_list()
    find_pairs = cointegration_pairs(snp500_1[tickers], p_val_cutoff=0.05)
    find_pairs.identify_pairs()
    find_pairs.get_top_pairs(n=len(tickers))
    # A bare expression is only auto-displayed when it is the last statement
    # of a cell, never from inside a loop body, so print the result explicitly.
    print(find_pairs.pairs)

In [65]:
from pprint import pprint

In [66]:
def sector_coint(df, tickers, p_val_cutoff=0.05, n=None):
    """Run the cointegration pair search on one group of tickers.

    Parameters
    ----------
    df : DataFrame-like
        Price data; it is indexed as ``df[tickers]`` and that selection is
        handed to ``cointegration_pairs``.
    tickers : list
        Column labels to include in the search.
    p_val_cutoff : float, default 0.05
        Forwarded unchanged to ``cointegration_pairs``.
    n : int, optional
        Forwarded to ``get_top_pairs``. Defaults to ``len(tickers)``, the
        same value the earlier per-sector loop in this notebook passes.

    Returns
    -------
    The ``pairs`` attribute of the ``cointegration_pairs`` object after
    ``identify_pairs()`` and ``get_top_pairs()`` have run. In this
    notebook's output it prints as a dict keyed by ticker pairs.
    """
    if n is None:
        # len(df[tickers]) would count the rows (dates) of the selection, not
        # the tickers, so the default is taken from the ticker list itself.
        n = len(tickers)
    find_pairs = cointegration_pairs(df[tickers], p_val_cutoff=p_val_cutoff)
    find_pairs.identify_pairs()
    find_pairs.get_top_pairs(n=n)
    return find_pairs.pairs

# Report the cointegrated pairs sector by sector, reloading each sector's
# ticker list from the CSV written earlier in the notebook.
for sector in sector_list:
    print(f"Sector:{sector}")
    sector_frame = pd.read_csv(f"{sector}_tickers.csv")
    tickers = sector_frame["tickers"].to_list()
    sector_pairs = sector_coint(snp500_1, tickers)
    pprint(sector_pairs)

Sector:Basic Materials
{('CF', 'SHW'): -3.379813040769222,
 ('DD', 'ECL'): -3.405461513457286,
 ('DD', 'EMN'): -3.415500899147951,
 ('ECL', 'EMN'): -3.9136456163224973,
 ('IFF', 'NEM'): -4.179872263924113,
 ('LYB', 'VMC'): -3.4867000908083097,
 ('MOS', 'SHW'): -3.6427537276621136,
 ('NEM', 'VMC'): -3.526067648399258}
Sector:Communication Services
{('CHTR', 'EA'): -3.6594916162145132,
 ('CHTR', 'NWS'): -3.964557094461437,
 ('CHTR', 'NWSA'): -3.8969052046598716,
 ('CMCSA', 'DASH'): -3.5865331930076523,
 ('CMCSA', 'LYV'): -3.6420575025474524,
 ('CMCSA', 'NFLX'): -3.550117685032766,
 ('CMCSA', 'TMUS'): -3.602331606976676,
 ('DASH', 'FOXA'): -3.941163929262343,
 ('DASH', 'TMUS'): -4.339564815595704,
 ('EA', 'NWS'): -4.10204425019672,
 ('FOX', 'NFLX'): -4.466103614188965,
 ('FOX', 'TTWO'): -3.421909027337632,
 ('FOXA', 'NFLX'): -4.2970939374047274,
 ('FOXA', 'TMUS'): -3.7032061523080935,
 ('GOOG', 'PARA'): -4.6616547076112615,
 ('GOOG', 'WBD'): -3.4606559460524466,
 ('LYV', 'NFLX'): -3.40615