In [None]:
import pandas as pd
import os

#### S&P 500 Historical Listings are sourced from [this](https://github.com/fja05680/sp500) GitHub repository
> S&P 500 Historical Components & Changes(12-30-2023).csv

In [None]:
#Build the path to the raw historical components file
source_folder = 'source-data'
source_file = r'S&P 500 Historical Components & Changes(12-30-2023).csv'
read_string = os.path.join(source_folder, source_file)

#Read the file and convert the 'date' column to a datetime dtype in one step
historical_data = pd.read_csv(read_string).assign(
    date = lambda frame: pd.to_datetime(frame['date'])
)

#The analysis only covers the year 2000 onwards, so keep just those rows
is_post_y2k = historical_data['date'].dt.year >= 2000
post_y2k = historical_data.loc[is_post_y2k].reset_index(drop = True)

#Print the first and last dates that survived the filter as a sanity check
first_date = post_y2k['date'].min()
last_date = post_y2k['date'].max()
print(first_date, last_date)

### Format all stock tickers that have been in the S&P 500 from the start of 2000 until the end of 2023 into a clean dataframe

- ticker: stock ticker symbol
- start_date: date stock was included in S&P 500
    - If stock was already in S&P 500 at the start of 2000, then its start_date is 2000-01-03 (Monday)
- end_date: date stock was delisted from S&P 500
    - If stock was still listed in S&P 500 as of 2023-12-31 then end_date = 2023-12-31
- type: S (stock)

In [None]:
#Format data so we can get min and max dates for each ticker - needed for screen scraping
#Each row of post_y2k pairs a date with a comma-separated string of tickers.
#Split that string and explode it so there is one (date, ticker) row per ticker.
formatted_df = (
    post_y2k[['date', 'tickers']]
    .assign(ticker = lambda frame: frame['tickers'].str.split(','))
    .drop(columns = 'tickers')
    .explode('ticker')
    .reset_index(drop = True)
)
#Strip stray whitespace so ' ABC' and 'ABC' are treated as the same ticker,
#and drop blank tokens (e.g. produced by a trailing comma)
formatted_df['ticker'] = formatted_df['ticker'].str.strip()
has_ticker = formatted_df['ticker'].fillna('') != ''
formatted_df = formatted_df.loc[has_ticker].reset_index(drop = True)
#No-op when 'date' is already datetime; kept so this cell does not rely on the upstream dtype
formatted_df['date'] = pd.to_datetime(formatted_df['date'])

#Get min and max dates for each ticker
s_and_p_stocks = (
    formatted_df
    .groupby('ticker')['date']
    .agg(start_date = 'min', end_date = 'max')
    .reset_index()
)

#Format end dates to go to end of 2023 where necessary (end_date == end of data range).
#The end of the data range is taken from the data itself rather than a hard-coded date,
#so tickers present in the last snapshot are detected whichever file version is loaded.
last_snapshot_date = formatted_df['date'].max()
period_end = pd.Timestamp('2023-12-31')
in_last_snapshot = s_and_p_stocks['end_date'] == last_snapshot_date
s_and_p_stocks['end_date'] = s_and_p_stocks['end_date'].mask(in_last_snapshot, period_end)

#Classify all of these as stocks
s_and_p_stocks['type'] = 'S'
s_and_p_stocks.head()

#### I want to include some popular ETFs to compare against in the analysis

NOTE: Start Dates sourced from [yahoo finance historical data](https://finance.yahoo.com/)

- SPY: SPDR S&P 500 ETF Trust for S&P 500 (aka "The Market") - Start Date: 2000-01-03
- QQQ: Invesco QQQ Trust ETF for Nasdaq 100 - Start Date: 2009-01-02
- IWM: iShares Russell 2000 ETF that includes mid-cap stocks - Start Date: 2000-05-26
- GLD: SPDR Gold Shares ETF - Start Date: 2008-01-02
- VTI: Vanguard Total Stock Market Index Fund ETF Shares - Start Date: 2008-01-02
- BND: Vanguard Total Bond Market Index Fund - Start Date: 2008-01-02


In [None]:
#ETFs to compare against: ticker -> start/end dates (as ISO date strings) and type 'E'
etf_dict = {
    'SPY':{'start_date':'2000-01-03', 'end_date':'2023-12-31', 'type':'E'},
    'QQQ':{'start_date':'2009-01-02', 'end_date':'2023-12-31', 'type':'E'},
    'IWM':{'start_date':'2000-05-26', 'end_date':'2023-12-31', 'type':'E'},
    'GLD':{'start_date':'2008-01-02', 'end_date':'2023-12-31', 'type':'E'},
    'VTI':{'start_date':'2008-01-02', 'end_date':'2023-12-31', 'type':'E'},
    'BND':{'start_date':'2008-01-02', 'end_date':'2023-12-31', 'type':'E'}
}

#One row per ETF with columns ticker, start_date, end_date, type.
#The date strings are parsed to datetime so these columns have the same dtype as the
#date columns of s_and_p_stocks; otherwise combining the two frames mixes Timestamps
#with plain strings in one column.
popular_etf = (
    pd.DataFrame.from_dict(etf_dict, orient = 'index')
    .reset_index()
    .rename(columns = {'index':'ticker'})
    .assign(
        start_date = lambda frame: pd.to_datetime(frame['start_date']),
        end_date = lambda frame: pd.to_datetime(frame['end_date']),
    )
)
popular_etf.head()

#### Combine dataframes into one dataset and save

In [None]:
#Folder that receives the combined ticker list; create it if it is not there yet
output_folder = 'data'
folder_missing = not os.path.exists(output_folder)
if folder_missing:
    os.mkdir(output_folder)

#Full path of the CSV file to write
output_filename = 'post_Y2K_tickers_for_webscraping.csv'
output_path = os.path.join(output_folder, output_filename)

#Stack the stock and ETF dataframes into one dataset and save it without the index column
frames_to_combine = [s_and_p_stocks, popular_etf]
all_data = pd.concat(frames_to_combine, ignore_index= True)
all_data.to_csv(output_path, index = False)