In [1]:
import pandas as pd
import yfinance as yf
import csv
import requests
import numpy as np
from pathlib import Path
import sqlalchemy as sql

In [2]:
# Scrape the S&P 500 constituents table from Wikipedia.
# Page that lists the index members
url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'

# pd.read_html parses every table on the page into a list of DataFrames;
# only the first table is kept
sp500_html = pd.read_html(url)[0]

# Wrap the table in its own DataFrame and preview the first rows
sp500_df = pd.DataFrame(sp500_html)
sp500_df.head()

Unnamed: 0,Symbol,Security,SEC filings,GICS Sector,GICS Sub-Industry,Headquarters Location,Date first added,CIK,Founded
0,MMM,3M,reports,Industrials,Industrial Conglomerates,"Saint Paul, Minnesota",1976-08-09,66740,1902
1,ABT,Abbott Laboratories,reports,Health Care,Health Care Equipment,"North Chicago, Illinois",1964-03-31,1800,1888
2,ABBV,AbbVie,reports,Health Care,Pharmaceuticals,"North Chicago, Illinois",2012-12-31,1551152,2013 (1888)
3,ABMD,Abiomed,reports,Health Care,Health Care Equipment,"Danvers, Massachusetts",2018-05-31,815094,1981
4,ACN,Accenture,reports,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",2011-07-06,1467373,1989


In [3]:
# Keep only the sector and ticker columns.
# Selecting by label raises a KeyError right here if Wikipedia renames either
# column; pd.DataFrame(columns=..., data=...) would instead reindex and silently
# hand back an all-NaN column, leaving the problem to surface downstream.
sp500_all_sectors_df = sp500_df[['GICS Sector', 'Symbol']].copy()
sp500_all_sectors_df.head()

Unnamed: 0,GICS Sector,Symbol
0,Industrials,MMM
1,Health Care,ABT
2,Health Care,ABBV
3,Health Care,ABMD
4,Information Technology,ACN


In [4]:
# Re-key the sector table by ticker symbol, replacing the default integer index
sp500_df_wo_index = sp500_all_sectors_df.set_index(keys="Symbol")
sp500_df_wo_index

Unnamed: 0_level_0,GICS Sector
Symbol,Unnamed: 1_level_1
MMM,Industrials
ABT,Health Care
ABBV,Health Care
ABMD,Health Care
ACN,Information Technology
...,...
YUM,Consumer Discretionary
ZBRA,Information Technology
ZBH,Health Care
ZION,Financials


In [5]:
# Collect the ticker symbols into a plain list that can be handed to yfinance
# for the market-cap lookup
sp500_all_symbols = sp500_all_sectors_df['Symbol'].tolist()
sp500_all_symbols

['MMM',
 'ABT',
 'ABBV',
 'ABMD',
 'ACN',
 'ATVI',
 'ADBE',
 'AMD',
 'AAP',
 'AES',
 'AFL',
 'A',
 'APD',
 'AKAM',
 'ALK',
 'ALB',
 'ARE',
 'ALGN',
 'ALLE',
 'LNT',
 'ALL',
 'GOOGL',
 'GOOG',
 'MO',
 'AMZN',
 'AMCR',
 'AEE',
 'AAL',
 'AEP',
 'AXP',
 'AIG',
 'AMT',
 'AWK',
 'AMP',
 'ABC',
 'AME',
 'AMGN',
 'APH',
 'ADI',
 'ANSS',
 'ANTM',
 'AON',
 'AOS',
 'APA',
 'AAPL',
 'AMAT',
 'APTV',
 'ADM',
 'ANET',
 'AJG',
 'AIZ',
 'T',
 'ATO',
 'ADSK',
 'ADP',
 'AZO',
 'AVB',
 'AVY',
 'BKR',
 'BLL',
 'BAC',
 'BK',
 'BAX',
 'BDX',
 'BRK.B',
 'BBY',
 'BIO',
 'BIIB',
 'BLK',
 'BA',
 'BKNG',
 'BWA',
 'BXP',
 'BSX',
 'BMY',
 'AVGO',
 'BR',
 'BF.B',
 'CHRW',
 'COG',
 'CDNS',
 'CZR',
 'CPB',
 'COF',
 'CAH',
 'KMX',
 'CCL',
 'CARR',
 'CTLT',
 'CAT',
 'CBOE',
 'CBRE',
 'CDW',
 'CE',
 'CNC',
 'CNP',
 'CERN',
 'CF',
 'CRL',
 'SCHW',
 'CHTR',
 'CVX',
 'CMG',
 'CB',
 'CHD',
 'CI',
 'CINF',
 'CTAS',
 'CSCO',
 'C',
 'CFG',
 'CTXS',
 'CLX',
 'CME',
 'CMS',
 'KO',
 'CTSH',
 'CL',
 'CMCSA',
 'CMA',
 'CAG',
 'COP'

In [6]:
# Wikipedia writes share-class tickers with a "." (e.g. BRK.B), whereas
# Yahoo Finance needs the "-" form (BRK-B) in order to pull the data,
# so every symbol is converted before any request is made.
# NOTE: this step might need to move ahead of the part where the sectors are
# broken out individually.
stocks = [stock_ticker.replace(".", "-") for stock_ticker in sp500_all_symbols]

In [7]:
def market_cap(stocks, info_lookup=None):
    """Return the market capitalisation of each ticker, largest first.

    Parameters
    ----------
    stocks : iterable of str
        Ticker symbols, already in the form the data source expects
        (Yahoo Finance needs "-" rather than "." in share-class tickers).
    info_lookup : callable, optional
        Function mapping a ticker symbol to a dict-like of company details.
        When omitted, ``yf.Ticker(symbol).info`` is used. Passing another
        callable allows cached or alternative data to be used instead.

    Returns
    -------
    pandas.DataFrame
        One row per ticker (the index) and a single column labelled ``0``
        holding the ``'marketCap'`` value, sorted in descending order.
        A ticker whose details have no ``'marketCap'`` entry gets a missing
        value and is sorted to the end.

    Notes
    -----
    With the default lookup the full S&P 500 is slow: the original author
    measured about 2400 seconds (roughly 40 minutes).
    """
    if info_lookup is None:
        fetch_info = lambda symbol: yf.Ticker(symbol).info
    else:
        fetch_info = info_lookup

    caps = {}
    for symbol in stocks:
        # .get() leaves a gap for a ticker without a 'marketCap' entry instead
        # of raising KeyError and discarding everything fetched so far
        caps[symbol] = fetch_info(symbol).get('marketCap')

    # The dict becomes a one-row frame whose columns are the tickers; .T turns
    # the tickers into the index, leaving one column labelled 0 to sort on
    return pd.DataFrame(caps, index=[0]).T.sort_values(by=[0], ascending=False)

# Look up the market cap of every converted ticker in `stocks`; the result is
# indexed by ticker with a single column labelled 0, largest market cap first
market_cap_df = market_cap(stocks)
market_cap_df

Unnamed: 0,0
AAPL,2415723282432
MSFT,2180088201216
GOOG,1818393706496
GOOGL,1818393575424
AMZN,1694014767104
...,...
LEG,6419166208
UNM,5438937600
NOV,5254148608
BBWI,


In [8]:
# Inspect the current column labels of market_cap_df so they can be replaced;
# the result shows a single column, labelled 0
list(market_cap_df.columns.to_numpy())

[0]

In [9]:
# Give the index and the lone column descriptive names ahead of the merge
# with the sector table
market_cap_df.index.name = 'Symbol'
market_cap_df.columns = ['Market_Cap']
market_cap_df

Unnamed: 0_level_0,Market_Cap
Symbol,Unnamed: 1_level_1
AAPL,2415723282432
MSFT,2180088201216
GOOG,1818393706496
GOOGL,1818393575424
AMZN,1694014767104
...,...
LEG,6419166208
UNM,5438937600
NOV,5254148608
BBWI,


In [10]:
# Merge sp500_df_wo_index and market_cap_df into one complete data frame to be
# sliced for analysis.
# sp500_df_wo_index is keyed by the Wikipedia spelling of each ticker ("BRK.B",
# "BF.B") while market_cap_df is keyed by the Yahoo spelling ("BRK-B", "BF-B").
# Without aligning the keys the inner join silently drops those companies.
sector_by_yahoo_symbol = sp500_df_wo_index.rename(index=lambda symbol: symbol.replace(".", "-"))
stock_industry_marketcap = pd.merge(sector_by_yahoo_symbol, market_cap_df, left_index=True, right_index=True)
stock_industry_marketcap.head()


Unnamed: 0_level_0,GICS Sector,Market_Cap
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1
MMM,Industrials,114645549056
ABT,Health Care,216585404416
ABBV,Health Care,202253746176
ABMD,Health Care,15696103424
ACN,Information Technology,204084314112


In [13]:
# Order by sector, then by market cap within each sector, both descending
stock_industry_marketcap.sort_values(by=['GICS Sector', 'Market_Cap'], ascending=[False, False])

Unnamed: 0_level_0,GICS Sector,Market_Cap
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1
NEE,Utilities,158112481280
DUK,Utilities,82275450880
SO,Utilities,68493357056
D,Utilities,61295824896
EXC,Utilities,46513528832
...,...,...
IPG,Communication Services,14527375360
DISCK,Communication Services,14322982912
NWSA,Communication Services,14227398656
DISCA,Communication Services,13658615808


In [17]:
# The sorted view above was only displayed, so the merged frame keeps its original order
stock_industry_marketcap.head(n=5)

Unnamed: 0_level_0,GICS Sector,Market_Cap
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1
MMM,Industrials,114645549056
ABT,Health Care,216585404416
ABBV,Health Care,202253746176
ABMD,Health Care,15696103424
ACN,Information Technology,204084314112


In [18]:
# Persist the merged sector / market-cap table as CSV so other code can load it
output_csv = Path("stock_industry_marketcap.csv")
stock_industry_marketcap.to_csv(output_csv)