In [255]:
import yfinance as yf
import pandas as pd
from pathlib import Path
import csv
import requests
import numpy as np

In [256]:
# Download the S&P 500 constituents table from Wikipedia and save it as a CSV
# Wikipedia page that lists the index members
url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'

# read_html gives back a list of the tables found on the page;
# the first entry is the constituents table
sp500_html = pd.read_html(url)[0]

# Wrap the parsed table in its own DataFrame object
sp500_df = pd.DataFrame(data=sp500_html)

# Write the table to disk, then show it as the cell output
sp500_df.to_csv(path_or_buf="sp500_wiki_table.csv")
sp500_df

Unnamed: 0,Symbol,Security,SEC filings,GICS Sector,GICS Sub-Industry,Headquarters Location,Date first added,CIK,Founded
0,MMM,3M,reports,Industrials,Industrial Conglomerates,"Saint Paul, Minnesota",1976-08-09,66740,1902
1,ABT,Abbott Laboratories,reports,Health Care,Health Care Equipment,"North Chicago, Illinois",1964-03-31,1800,1888
2,ABBV,AbbVie,reports,Health Care,Pharmaceuticals,"North Chicago, Illinois",2012-12-31,1551152,2013 (1888)
3,ABMD,Abiomed,reports,Health Care,Health Care Equipment,"Danvers, Massachusetts",2018-05-31,815094,1981
4,ACN,Accenture,reports,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",2011-07-06,1467373,1989
...,...,...,...,...,...,...,...,...,...
500,YUM,Yum! Brands,reports,Consumer Discretionary,Restaurants,"Louisville, Kentucky",1997-10-06,1041061,1997
501,ZBRA,Zebra Technologies,reports,Information Technology,Electronic Equipment & Instruments,"Lincolnshire, Illinois",2019-12-23,877212,1969
502,ZBH,Zimmer Biomet,reports,Health Care,Health Care Equipment,"Warsaw, Indiana",2001-08-07,1136869,1927
503,ZION,Zions Bancorp,reports,Financials,Regional Banks,"Salt Lake City, Utah",2001-06-22,109380,1873


In [262]:
# Keep only the sector and ticker columns of the constituents table.
# Explicit column selection raises a KeyError if either header is missing
# (e.g. the Wikipedia table gets renamed), whereas passing `columns=` to the
# DataFrame constructor would reindex and silently yield an all-NaN column.
# .copy() makes the result independent of sp500_df.
sp500_all_sectors_df = sp500_df[['GICS Sector', 'Symbol']].copy()
sp500_all_sectors_df

Unnamed: 0,GICS Sector,Symbol
0,Industrials,MMM
1,Health Care,ABT
2,Health Care,ABBV
3,Health Care,ABMD
4,Information Technology,ACN
...,...,...
500,Consumer Discretionary,YUM
501,Information Technology,ZBRA
502,Health Care,ZBH
503,Financials,ZION


In [263]:
# Distinct sector names, in the order they first appear in the table
sp500_sectors_list = sp500_all_sectors_df['GICS Sector'].unique().tolist()
print(sp500_sectors_list)

['Industrials', 'Health Care', 'Information Technology', 'Communication Services', 'Consumer Discretionary', 'Utilities', 'Financials', 'Materials', 'Real Estate', 'Consumer Staples', 'Energy']


In [264]:
# Replace the default integer index with the sector name, so that the rows of
# a sector can be looked up by label; only the Symbol column remains as data.
sp500_df_wo_index = sp500_all_sectors_df.set_index(keys="GICS Sector")
sp500_df_wo_index

Unnamed: 0_level_0,Symbol
GICS Sector,Unnamed: 1_level_1
Industrials,MMM
Health Care,ABT
Health Care,ABBV
Health Care,ABMD
Information Technology,ACN
...,...
Consumer Discretionary,YUM
Information Technology,ZBRA
Health Care,ZBH
Financials,ZION


In [267]:
# Split the sector-indexed table into one DataFrame per S&P 500 sector:
# ['Industrials', 'Health Care', 'Information Technology', 'Communication Services',
# 'Consumer Discretionary', 'Utilities', 'Financials', 'Materials', 'Real Estate',
# 'Consumer Staples', 'Energy']

def _sector_frame(sector: str) -> pd.DataFrame:
    """Return the rows of ``sp500_df_wo_index`` whose index label is ``sector``.

    The label is wrapped in a list so ``.loc`` always returns a DataFrame.
    With a bare label (``.loc["Energy"]``) pandas returns a Series when the
    label matches exactly one row, and a subsequent ``["Symbol"]`` lookup
    would then give a single string instead of a column of tickers.

    Raises:
        KeyError: if ``sector`` is not present in the index.
    """
    return sp500_df_wo_index.loc[[sector]]


industrials_sp500 = _sector_frame("Industrials")
health_care_sp500 = _sector_frame("Health Care")
information_technology_sp500 = _sector_frame("Information Technology")
communication_services_sp500 = _sector_frame("Communication Services")
consumer_discretionary_sp500 = _sector_frame("Consumer Discretionary")
utilities_sp500 = _sector_frame("Utilities")
financials_sp500 = _sector_frame("Financials")
materials_sp500 = _sector_frame("Materials")
real_estate_sp500 = _sector_frame("Real Estate")
consumer_staples_sp500 = _sector_frame('Consumer Staples')
energy_sp500 = _sector_frame("Energy")

In [271]:
# # Display the first 5 rows of each sector frame (currently disabled)
# display(
#     industrials_sp500.head(),
#     health_care_sp500.head(),
#     information_technology_sp500.head(),
#     communication_services_sp500.head(),
#     consumer_discretionary_sp500.head(),
#     utilities_sp500.head(),
#     financials_sp500.head(),
#     materials_sp500.head(),
#     real_estate_sp500.head(),
#     consumer_staples_sp500.head(),
#     energy_sp500.head()
# )


In [294]:
# Quick type check: selecting the column gives a pandas Series, and
# .values.tolist() converts it into a plain Python list.
print(
    type(industrials_sp500['Symbol']),
    type(industrials_sp500['Symbol'].values.tolist()),
    sep="\n",
)

<class 'pandas.core.series.Series'>
<class 'list'>


In [303]:
# Plain Python list of ticker symbols for each S&P 500 sector
industrials_list = industrials_sp500["Symbol"].to_list()
health_care_list = health_care_sp500["Symbol"].to_list()
information_technology_list = information_technology_sp500["Symbol"].to_list()
communication_services_list = communication_services_sp500["Symbol"].to_list()
consumer_discretionary_list = consumer_discretionary_sp500["Symbol"].to_list()
utilities_list = utilities_sp500["Symbol"].to_list()
financials_list = financials_sp500["Symbol"].to_list()
materials_list = materials_sp500["Symbol"].to_list()
real_estate_list = real_estate_sp500["Symbol"].to_list()
consumer_staples_list = consumer_staples_sp500["Symbol"].to_list()
energy_list = energy_sp500["Symbol"].to_list()

In [312]:
# yf.Tickers is called here with the symbols as one space-separated string:
#     'msft aapl goog'
# whereas the sector lists built above are Python lists of separate strings:
#     ['msft', 'aapl', 'goog']
# Joining a list with single spaces converts it into that string form, so a
# sector list can be passed as, e.g., yf.Tickers(' '.join(industrials_list)).
# NOTE(review): Wikipedia writes share classes with a dot (e.g. 'BRK.B');
# Yahoo Finance presumably expects a dash ('BRK-B') - confirm before passing
# full sector lists.
tickers = yf.Tickers(' '.join(['msft', 'aapl', 'goog']))