In [112]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from sqlalchemy import create_engine
import os


#### Using [Wikipedia to get current list of S&P 500 companies](https://en.wikipedia.org/wiki/List_of_S%26P_500_companies)

In [3]:
def get_response():

    return(response)


<Response [200]>


In [43]:
def get_content():
    wiki_url = r'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
    response = requests.get(wiki_url)
    soup = BeautifulSoup(response.content)
    return(soup)


In [56]:
s = get_content()

In [59]:
def get_constituents(soup, name):
    table = soup.find('table', id=name)
    header = []
    rows = []
    for i, row in enumerate(table.find_all('tr')):
        if i == 0:
            header = [el.text.strip() for el in row.find_all('th')]
        else:
            rows.append([el.text.strip() for el in row.find_all('td')])
    
    df = pd.DataFrame(rows, columns = header)
    return(df)


In [98]:
constituents = get_constituents(soup, 'constituents')
base = constituents[['Symbol', 'Date added']].rename(columns ={ 'Symbol':'ticker', 'Date added':'date_added'}).sort_values(by = 'ticker').reset_index(drop = True)
base['date_added'] = pd.to_datetime(base['date_added'])
base['currently_listed'] = True
base.head()


Unnamed: 0,ticker,date_added,currently_listed
0,A,2000-06-05,True
1,AAL,2015-03-23,True
2,AAPL,1982-11-30,True
3,ABBV,2012-12-31,True
4,ABNB,2023-09-18,True


In [99]:
rows = []
change_html = s.find('table', id='changes')
for i, row in enumerate(change_html.find_all('tr')):
    if i > 1:
        rows.append([el.text.strip() for el in row.find_all('td')])

df = pd.DataFrame(rows, columns = ['Date', 'ADD_Symbol', 'ADD_Security', 'RMV_Symbol', 'RMV_Security', 'Reason'])       
df['Date'] = pd.to_datetime(df['Date'])
df = df.replace('', None)

add_dates = df.loc[df['ADD_Symbol'].isnull() == False][['Date', 'ADD_Symbol']].\
    rename(columns = {'ADD_Symbol':'ticker', 'Date':'date_added'})
rmv_dates = df.loc[df['RMV_Symbol'].isnull() == False][['Date', 'RMV_Symbol']].\
    rename(columns = {'RMV_Symbol':'ticker', 'Date':'date_removed'})

In [103]:
all_tickers = add_dates.merge(rmv_dates, on = 'ticker', how = 'outer')[['ticker', 'date_added', 'date_removed']]
all_tickers.head()


Unnamed: 0,ticker,date_added,date_removed
0,AA,NaT,2016-11-01
1,AAL,2015-03-23,NaT
2,AAP,2015-07-08,2023-08-25
3,ABBV,2013-01-02,NaT
4,ABK,2000-12-05,2008-06-10


In [104]:
base.head()

Unnamed: 0,ticker,date_added,currently_listed
0,A,2000-06-05,True
1,AAL,2015-03-23,True
2,AAPL,1982-11-30,True
3,ABBV,2012-12-31,True
4,ABNB,2023-09-18,True


In [115]:
tickers_total = base.merge(all_tickers, on = ['ticker', 'date_added'], how = 'outer')
tickers_total['currently_listed'] = tickers_total['currently_listed'].fillna(False)
#Classify anything without a date added as 1957-01-01
tickers_total['date_added'] = [pd.to_datetime('1957-01-01') if pd.isnull(d) == True else d for d in tickers_total['date_added']]
tickers_total = tickers_total.sort_values(by = ['date_added', 'ticker']).reset_index(drop = True)
#Create an int ticker_id that will be used as a primary key for tickers
tickers_total['ticker_id'] = [i + 10000 for i in tickers_total.index]

col_order = [
    'ticker_id', 'ticker', 'date_added', 'date_removed', 'currently_listed'
]
tickers_total = tickers_total[col_order]

conn = os.getenv('STOCK_DB_CONN')
engine = create_engine(conn)
tickers_total.to_sql(name = 's_and_p_500_history', con = engine, schema = 'NASDAQ', if_exists= 'replace', index = False, method = 'multi')


tickers_total.head(10)

  tickers_total['currently_listed'] = tickers_total['currently_listed'].fillna(False)


Unnamed: 0,ticker_id,ticker,date_added,date_removed,currently_listed
0,10000,AA,1957-01-01,2016-11-01,False
1,10001,ABS,1957-01-01,2006-06-02,False
2,10002,ACAS,1957-01-01,2009-03-03,False
3,10003,ACE,1957-01-01,2016-01-19,False
4,10004,AET,1957-01-01,2018-12-03,False
5,10005,AIV,1957-01-01,2020-12-21,False
6,10006,ALTR,1957-01-01,2015-12-29,False
7,10007,AN,1957-01-01,2017-08-08,False
8,10008,ANDV,1957-01-01,2018-10-01,False
9,10009,ANF,1957-01-01,2013-12-23,False


In [108]:
tickers_total.dtypes

ticker                      object
date_added          datetime64[ns]
currently_listed              bool
date_removed        datetime64[ns]
ticker_id                    int64
dtype: object

344