In [1]:
import yfinance as yf
import pandas as pd
from pathlib import Path
import csv
import requests
import numpy as np

In [2]:
# Pulling S&P Data from wiki and outputing html
# Sepecify URL
url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'

# Read html
sp500_html = pd.read_html(url)

# Obtain first table
sp500_html = sp500_html[0]

# Create dataframe
sp500_df = pd.DataFrame(sp500_html)

# Save file to CSV
sp500_df.to_csv("sp500_wiki_table.csv")
sp500_df

Unnamed: 0,Symbol,Security,SEC filings,GICS Sector,GICS Sub-Industry,Headquarters Location,Date first added,CIK,Founded
0,MMM,3M,reports,Industrials,Industrial Conglomerates,"Saint Paul, Minnesota",1976-08-09,66740,1902
1,ABT,Abbott Laboratories,reports,Health Care,Health Care Equipment,"North Chicago, Illinois",1964-03-31,1800,1888
2,ABBV,AbbVie,reports,Health Care,Pharmaceuticals,"North Chicago, Illinois",2012-12-31,1551152,2013 (1888)
3,ABMD,Abiomed,reports,Health Care,Health Care Equipment,"Danvers, Massachusetts",2018-05-31,815094,1981
4,ACN,Accenture,reports,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",2011-07-06,1467373,1989
...,...,...,...,...,...,...,...,...,...
500,YUM,Yum! Brands,reports,Consumer Discretionary,Restaurants,"Louisville, Kentucky",1997-10-06,1041061,1997
501,ZBRA,Zebra Technologies,reports,Information Technology,Electronic Equipment & Instruments,"Lincolnshire, Illinois",2019-12-23,877212,1969
502,ZBH,Zimmer Biomet,reports,Health Care,Health Care Equipment,"Warsaw, Indiana",2001-08-07,1136869,1927
503,ZION,Zions Bancorp,reports,Financials,Regional Banks,"Salt Lake City, Utah",2001-06-22,109380,1873


In [3]:
sp500_all_sectors_df = pd.DataFrame(
    columns=['GICS Sector', 'Symbol'],
    data=sp500_df
    )
sp500_all_sectors_df

Unnamed: 0,GICS Sector,Symbol
0,Industrials,MMM
1,Health Care,ABT
2,Health Care,ABBV
3,Health Care,ABMD
4,Information Technology,ACN
...,...,...
500,Consumer Discretionary,YUM
501,Information Technology,ZBRA
502,Health Care,ZBH
503,Financials,ZION


In [4]:
sp500_sectors_list = sp500_all_sectors_df['GICS Sector'].drop_duplicates().to_list()
print(sp500_sectors_list)

['Industrials', 'Health Care', 'Information Technology', 'Communication Services', 'Consumer Discretionary', 'Utilities', 'Financials', 'Materials', 'Real Estate', 'Consumer Staples', 'Energy']


In [5]:
# Delete index
sp500_df_wo_index = sp500_all_sectors_df.set_index("GICS Sector")
sp500_df_wo_index

Unnamed: 0_level_0,Symbol
GICS Sector,Unnamed: 1_level_1
Industrials,MMM
Health Care,ABT
Health Care,ABBV
Health Care,ABMD
Information Technology,ACN
...,...
Consumer Discretionary,YUM
Information Technology,ZBRA
Health Care,ZBH
Financials,ZION


In [6]:
# Separating out each sector in the S&P 500 and the stocks within each:
# ['Industrials', 'Health Care', 'Information Technology', 'Communication Services', 
# 'Consumer Discretionary', 'Utilities', 'Financials', 'Materials', 'Real Estate', 
# 'Consumer Staples', 'Energy']

industrials_sp500 = sp500_df_wo_index.loc["Industrials"]
health_care_sp500 = sp500_df_wo_index.loc["Health Care"]
information_technology_sp500 = sp500_df_wo_index.loc["Information Technology"]
communication_services_sp500 = sp500_df_wo_index.loc["Communication Services"]
consumer_discretionary_sp500 = sp500_df_wo_index.loc["Consumer Discretionary"]
utilities_sp500 = sp500_df_wo_index.loc["Utilities"]
financials_sp500 = sp500_df_wo_index.loc["Financials"]
materials_sp500 = sp500_df_wo_index.loc["Materials"]
real_estate_sp500 = sp500_df_wo_index.loc["Real Estate"]
consumer_staples_sp500 = sp500_df_wo_index.loc['Consumer Staples']
energy_sp500 = sp500_df_wo_index.loc["Energy"]

In [7]:
# # Displaying top 5 on each list
# display(
#     industrials_sp500.head(),
#     health_care_sp500.head(),
#     information_technology_sp500.head(),
#     communication_services_sp500.head(),
#     consumer_discretionary_sp500.head(),
#     utilities_sp500.head(),
#     financials_sp500.head(),
#     materials_sp500.head(),
#     real_estate_sp500.head(),
#     consumer_staples_sp500.head(),
#     energy_sp500.head()
# )


In [8]:
print(type(industrials_sp500['Symbol']))
print(type(industrials_sp500['Symbol'].values.tolist()))

<class 'pandas.core.series.Series'>
<class 'list'>


In [9]:
# Created list of stocks in each sector from the S&P 500
industrials_list = industrials_sp500["Symbol"].values.tolist()
health_care_list = health_care_sp500["Symbol"].values.tolist()
information_technology_list = information_technology_sp500["Symbol"].values.tolist()
communication_services_list = communication_services_sp500["Symbol"].values.tolist()
consumer_discretionary_list = consumer_discretionary_sp500["Symbol"].values.tolist()
utilities_list = utilities_sp500["Symbol"].values.tolist()
financials_list = financials_sp500["Symbol"].values.tolist()
materials_list = materials_sp500["Symbol"].values.tolist()
real_estate_list = real_estate_sp500["Symbol"].values.tolist()
consumer_staples_list = consumer_staples_sp500["Symbol"].values.tolist()
energy_list = energy_sp500["Symbol"].values.tolist()

In [25]:
fang = ['FB', 'AMZN', 'AAPL', 'NFLX', 'GOOG', 'WKHS']

def market_cap(sector):

    market_cap = {}

    for stock in sector:
        ticker = yf.Ticker(stock)
        market_cap[stock] = ticker.info['marketCap']
      
    # we want to return a sorted Pandas DataFrame based on market cap and filtered to the top 5
    # since the columns will originally be the ticker we us ".T" to transpose the table
    # then we use .sort_values to sort by the "first column" [0] and sort in decending order
    # then we only call the top 5 by using "[0:5]"
    # on average this takes 320 seconds (5 minutes 20 seconds) to run per sector
    return pd.DataFrame(market_cap, index=[0]).T.sort_values(by=[0], ascending=False)[0:5]

market_cap(industrials_list)

Unnamed: 0,0
GE,903409893376
UPS,167212351488
HON,159463112704
UNP,144565616640
BA,132535492608


In [28]:
# Narrow down each list of stocks in the S&P 500 sectors to the top 5 by market cap
# this took 2229 seconds (37+ minutes)
industrials_list_top5 = market_cap(industrials_list)
health_care_list_top5 = market_cap(health_care_list)
information_technology_list_top5 = market_cap(information_technology_list)
communication_services_list_top5 = market_cap(communication_services_list)
consumer_discretionary_list_top5 = market_cap(consumer_discretionary_list)
utilities_list_top5 = market_cap(utilities_list)
financials_list_top5 = market_cap(financials_list)
materials_list_top5 = market_cap(materials_list)
real_estate_list_top5 = market_cap(real_estate_list)
consumer_staples_list_top5 = market_cap(consumer_staples_list)
energy_list_top5 = market_cap(energy_list)

KeyError: 'marketCap'

In [29]:
# # Displaying top 5 on each list
display(
    industrials_list_top5,
    health_care_list_top5,
    information_technology_list_top5,
    communication_services_list_top5,
    consumer_discretionary_list_top5,
    utilities_list_top5,
    financials_list_top5,
    materials_list_top5,
    real_estate_list_top5,
    consumer_staples_list_top5,
    energy_list_top5
)

Unnamed: 0,0
GE,903409893376
UPS,167212351488
HON,159463112704
UNP,144565616640
BA,132535492608


Unnamed: 0,0
JNJ,456526233600
UNH,397808140288
PFE,252959604736
LLY,251715305472
DHR,217992314880


Unnamed: 0,0
AAPL,2452242825216
MSFT,2157870186496
V,520761540608
MA,364715212800
PYPL,323638099968


Unnamed: 0,0
GOOGL,1829099274240
GOOG,1788562636800
FB,1017703366656
DIS,313565806592
CMCSA,266331013120


Unnamed: 0,0
AMZN,1691865841664
TSLA,684850544640
HD,351205425152
NKE,271612657664
MCD,175224045568


Unnamed: 0,0
NEE,156661022720
DUK,82098536448
SO,67784077312
D,60868370432
EXC,45702475776


Unnamed: 0,0
JPM,457818603520
BAC,327947878400
WFC,191112691712
MS,179156205568
C,141634256896


Unnamed: 0,0
LIN,158931566592
SHW,78782373888
APD,65237229568
ECL,62277079040
FCX,52246147072


Unnamed: 0,0
AMT,129529249792
PLD,95356731392
CCI,84337557504
EQIX,74202619904
PSA,53632507904


Unnamed: 0,0
WMT,400259088384
PG,348701360128
KO,241885249536
PEP,212840103936
COST,192519995392


Unnamed: 0,0
XOM,240634413056
CVX,193384431616
COP,74123640832
EOG,41287184384
KMI,38814900224


In [None]:
# yfinance will only let you input tickers like:
# 'msft aapl goog' 
# but the list we have is like:
# 'msft', 'aapl', 'goog'
# I will need to figure out how to change this
# tickers = yf.Tickers('msft aapl goog')

In [None]:
sp500_all_symbols = sp500_all_sectors_df['Symbol'].values.tolist()


In [None]:
# one issue with how the wikipedia symbols come is that they come with a "." instead of a "-"
# yahoo finance needs to have the "-" in order to pull the data
# this step might need to go in front of the part where we break the sectors out individually
stocks = []

for stock_ticker in industrials_list:
    ticker = stock_ticker.replace(".","-")
    stocks.append(ticker)

In [None]:
data = yf.download(
    #tickers list or string as well
    tickers = industrials_list,

    # use "period" instead of start/end
    # valid periods: 1d, 5d, 1mo, 3mo, 6mo, 1y, 2y, 5y, 10y, ytd, max
    # (optional, default is "1mo")
    period = "10y",

    # fetch data by interval (including intraday if period < 60 days)
    # valid intervals: 1m,2m,5m,15m,30m,60m,90m,1h,1d,5d,1wk,1mo,3mo
    # (optional, default is '1d')
    interval = '1d',

    # adjust all OHLC automatically
    # (optional, default is False)
    auto_adjust = True,

    # download pre/post regular market hours data
    # (optional, default is False)
    prepost = True,

    # use threads for mass downloading? (True/False/Integre)
    # (optional, default is True)
    threads = True,

    # proxy URL scheme use use when downloading?
    # (optional, default is None)
    proxy = None
)['Close']

data

In [None]:
data.columns = pd.MultiIndex.from_product([['Close'], data.columns])
data

In [None]:
# the issue is that there are varying null values that pull from yahoo finance
# for example ALLE wasn't founded until 2013, so if we are taking the null values
# then will be taking out everything prior to 2013
data.isnull().sum()

In [None]:
# so what we can do is replace null values with the average of the past 5 closing prices and 
# sub that price in for the null values. In the case of an ALLE where they are missing 2 years
# of data we can just take the first trading days close price and sub that for all null values before 2013

# since python goes in order of tasks
# Task 1: fill all NaN's with the average of the previous 5 values
# the formula is showing a rolling period of '6'. That is because it is counting the NaN value as the first value, 
# but since it doesn't exist it is not included in the calculation
data_cleaned = data.fillna(data.rolling(6, min_periods=1).mean())

# This still leaves all values that do not have "previous data"
# for these we are just using the 'bfill' method from fillna to backfill the first non-null value
data_cleaned = data_cleaned.fillna(method='bfill')
data_cleaned

# by doing these 2 steps in order it should fill all NaN values in our dataset regardless of what stock sector we pull


In [None]:
data_cleaned.isnull().sum()

In [None]:
data_pct_change = data_cleaned.pct_change().dropna()

In [None]:
data_pct_change

In [None]:
# next steps
# 1) need to figure out the CLI in order to have user select 3 sectors
# 2) need to dynamically filter the stocks by selection and market cap
#     a) need to calculate market cap
# 3) once CLI and dynamic selection are complete test the hell out of it and make sure its correct


In [None]:
msft = yf.Ticker("MSFT")
msft_sharesoutstanding = msft.info['sharesOutstanding']
msft_sharesoutstanding

In [None]:
market_cap = 286.51 * msft_sharesoutstanding
market_cap

In [None]:
msft.info