In [3]:
import yfinance as yf
import pandas as pd
from pathlib import Path
import csv
import requests
import numpy as np

In [4]:
# Pulling S&P Data from wiki and outputing html
# Sepecify URL
url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'

# Read html
sp500_html = pd.read_html(url)

# Obtain first table
sp500_html = sp500_html[0]

# Create dataframe
sp500_df = pd.DataFrame(sp500_html)

# Save file to CSV
sp500_df.to_csv("sp500_wiki_table.csv")
sp500_df

Unnamed: 0,Symbol,Security,SEC filings,GICS Sector,GICS Sub-Industry,Headquarters Location,Date first added,CIK,Founded
0,MMM,3M,reports,Industrials,Industrial Conglomerates,"Saint Paul, Minnesota",1976-08-09,66740,1902
1,ABT,Abbott Laboratories,reports,Health Care,Health Care Equipment,"North Chicago, Illinois",1964-03-31,1800,1888
2,ABBV,AbbVie,reports,Health Care,Pharmaceuticals,"North Chicago, Illinois",2012-12-31,1551152,2013 (1888)
3,ABMD,Abiomed,reports,Health Care,Health Care Equipment,"Danvers, Massachusetts",2018-05-31,815094,1981
4,ACN,Accenture,reports,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",2011-07-06,1467373,1989
...,...,...,...,...,...,...,...,...,...
500,YUM,Yum! Brands,reports,Consumer Discretionary,Restaurants,"Louisville, Kentucky",1997-10-06,1041061,1997
501,ZBRA,Zebra Technologies,reports,Information Technology,Electronic Equipment & Instruments,"Lincolnshire, Illinois",2019-12-23,877212,1969
502,ZBH,Zimmer Biomet,reports,Health Care,Health Care Equipment,"Warsaw, Indiana",2001-08-07,1136869,1927
503,ZION,Zions Bancorp,reports,Financials,Regional Banks,"Salt Lake City, Utah",2001-06-22,109380,1873


In [5]:
sp500_all_sectors_df = pd.DataFrame(
    columns=['GICS Sector', 'Symbol'],
    data=sp500_df
    )
sp500_all_sectors_df

Unnamed: 0,GICS Sector,Symbol
0,Industrials,MMM
1,Health Care,ABT
2,Health Care,ABBV
3,Health Care,ABMD
4,Information Technology,ACN
...,...,...
500,Consumer Discretionary,YUM
501,Information Technology,ZBRA
502,Health Care,ZBH
503,Financials,ZION


In [6]:
sp500_sectors_list = sp500_all_sectors_df['GICS Sector'].drop_duplicates().to_list()
print(sp500_sectors_list)

['Industrials', 'Health Care', 'Information Technology', 'Communication Services', 'Consumer Discretionary', 'Utilities', 'Financials', 'Materials', 'Real Estate', 'Consumer Staples', 'Energy']


In [7]:
# Delete index
sp500_df_wo_index = sp500_all_sectors_df.set_index("GICS Sector")
sp500_df_wo_index

Unnamed: 0_level_0,Symbol
GICS Sector,Unnamed: 1_level_1
Industrials,MMM
Health Care,ABT
Health Care,ABBV
Health Care,ABMD
Information Technology,ACN
...,...
Consumer Discretionary,YUM
Information Technology,ZBRA
Health Care,ZBH
Financials,ZION


In [15]:
industrials_sp500 = sp500_df_wo_index.loc["Industrials"]
type(industrials_list)

list

In [9]:
# Separating out each sector in the S&P 500 and the stocks within each:
# ['Industrials', 'Health Care', 'Information Technology', 'Communication Services', 
# 'Consumer Discretionary', 'Utilities', 'Financials', 'Materials', 'Real Estate', 
# 'Consumer Staples', 'Energy']

industrials_sp500 = sp500_df_wo_index.loc["Industrials"]
health_care_sp500 = sp500_df_wo_index.loc["Health Care"]
information_technology_sp500 = sp500_df_wo_index.loc["Information Technology"]
communication_services_sp500 = sp500_df_wo_index.loc["Communication Services"]
consumer_discretionary_sp500 = sp500_df_wo_index.loc["Consumer Discretionary"]
utilities_sp500 = sp500_df_wo_index.loc["Utilities"]
financials_sp500 = sp500_df_wo_index.loc["Financials"]
materials_sp500 = sp500_df_wo_index.loc["Materials"]
real_estate_sp500 = sp500_df_wo_index.loc["Real Estate"]
consumer_staples_sp500 = sp500_df_wo_index.loc['Consumer Staples']
energy_sp500 = sp500_df_wo_index.loc["Energy"]

In [7]:
# # Displaying top 5 on each list
# display(
#     industrials_sp500.head(),
#     health_care_sp500.head(),
#     information_technology_sp500.head(),
#     communication_services_sp500.head(),
#     consumer_discretionary_sp500.head(),
#     utilities_sp500.head(),
#     financials_sp500.head(),
#     materials_sp500.head(),
#     real_estate_sp500.head(),
#     consumer_staples_sp500.head(),
#     energy_sp500.head()
# )


In [8]:
print(type(industrials_sp500['Symbol']))
print(type(industrials_sp500['Symbol'].values.tolist()))

<class 'pandas.core.series.Series'>
<class 'list'>


In [16]:
# Created list of stocks in each sector from the S&P 500
industrials_list = industrials_sp500["Symbol"].values.tolist()
health_care_list = health_care_sp500["Symbol"].values.tolist()
information_technology_list = information_technology_sp500["Symbol"].values.tolist()
communication_services_list = communication_services_sp500["Symbol"].values.tolist()
consumer_discretionary_list = consumer_discretionary_sp500["Symbol"].values.tolist()
utilities_list = utilities_sp500["Symbol"].values.tolist()
financials_list = financials_sp500["Symbol"].values.tolist()
materials_list = materials_sp500["Symbol"].values.tolist()
real_estate_list = real_estate_sp500["Symbol"].values.tolist()
consumer_staples_list = consumer_staples_sp500["Symbol"].values.tolist()
energy_list = energy_sp500["Symbol"].values.tolist()

In [17]:
fang = ['FB', 'AMZN', 'AAPL', 'NFLX', 'GOOG', 'WKHS']

def market_cap(sector):

    market_cap = {}

    for stock in sector:
        ticker = yf.Ticker(stock)
        market_cap[stock] = ticker.info['marketCap']
      
    # we want to return a sorted Pandas DataFrame based on market cap and filtered to the top 5
    # since the columns will originally be the ticker we us ".T" to transpose the table
    # then we use .sort_values to sort by the "first column" [0] and sort in decending order
    # then we only call the top 5 by using "[0:5]"
    # on average this takes 320 seconds (5 minutes 20 seconds) to run per sector
    return pd.DataFrame(market_cap, index=[0]).T.sort_values(by=[0], ascending=False)[0:5]

market_cap(industrials_list)

Unnamed: 0,0
GE,904375500800
UPS,166888079360
HON,159664537600
UNP,145588649984
BA,134779789312


In [19]:
msft = yf.Ticker("MSFT")
msft.info

{'zip': '98052-6399',
 'sector': 'Technology',
 'fullTimeEmployees': 181000,
 'longBusinessSummary': 'Microsoft Corporation develops, licenses, and supports software, services, devices, and solutions worldwide. Its Productivity and Business Processes segment offers Office, Exchange, SharePoint, Microsoft Teams, Office 365 Security and Compliance, and Skype for Business, as well as related Client Access Licenses (CAL); Skype, Outlook.com, OneDrive, and LinkedIn; and Dynamics 365, a set of cloud-based and on-premises business solutions for small and medium businesses, organizations, and enterprise divisions. Its Intelligent Cloud segment licenses SQL, Windows Servers, Visual Studio, System Center, and related CALs; GitHub that provides a collaboration platform and code hosting service for developers; and Azure, a cloud platform. It also offers support services and Microsoft consulting services to assist customers in developing, deploying, and managing Microsoft server and desktop solutio

In [11]:
# Narrow down each list of stocks in the S&P 500 sectors to the top 5 by market cap
# this took 2229 seconds (37+ minutes)
industrials_list_top5 = market_cap(industrials_list)
# health_care_list_top5 = market_cap(health_care_list)
# information_technology_list_top5 = market_cap(information_technology_list)
# communication_services_list_top5 = market_cap(communication_services_list)
# consumer_discretionary_list_top5 = market_cap(consumer_discretionary_list)
# utilities_list_top5 = market_cap(utilities_list)
# financials_list_top5 = market_cap(financials_list)
# materials_list_top5 = market_cap(materials_list)
# real_estate_list_top5 = market_cap(real_estate_list)
# consumer_staples_list_top5 = market_cap(consumer_staples_list)
# energy_list_top5 = market_cap(energy_list)

KeyboardInterrupt: 

In [13]:
# # Displaying top 5 on each list
display(
    industrials_list_top5,
    health_care_list_top5,
    information_technology_list_top5,
    communication_services_list_top5,
    consumer_discretionary_list_top5,
    utilities_list_top5,
    financials_list_top5,
    materials_list_top5,
    real_estate_list_top5,
    consumer_staples_list_top5,
    energy_list_top5
)

NameError: name 'industrials_list_top5' is not defined

In [None]:
# yfinance will only let you input tickers like:
# 'msft aapl goog' 
# but the list we have is like:
# 'msft', 'aapl', 'goog'
# I will need to figure out how to change this
# tickers = yf.Tickers('msft aapl goog')

In [14]:
sp500_all_symbols = sp500_all_sectors_df['Symbol'].values.tolist()


In [15]:
# one issue with how the wikipedia symbols come is that they come with a "." instead of a "-"
# yahoo finance needs to have the "-" in order to pull the data
# this step might need to go in front of the part where we break the sectors out individually
stocks = []

for stock_ticker in industrials_list:
    ticker = stock_ticker.replace(".","-")
    stocks.append(ticker)

In [16]:
data = yf.download(
    #tickers list or string as well
    tickers = industrials_list,

    # use "period" instead of start/end
    # valid periods: 1d, 5d, 1mo, 3mo, 6mo, 1y, 2y, 5y, 10y, ytd, max
    # (optional, default is "1mo")
    period = "10y",

    # fetch data by interval (including intraday if period < 60 days)
    # valid intervals: 1m,2m,5m,15m,30m,60m,90m,1h,1d,5d,1wk,1mo,3mo
    # (optional, default is '1d')
    interval = '1d',

    # adjust all OHLC automatically
    # (optional, default is False)
    auto_adjust = True,

    # download pre/post regular market hours data
    # (optional, default is False)
    prepost = True,

    # use threads for mass downloading? (True/False/Integre)
    # (optional, default is True)
    threads = True,

    # proxy URL scheme use use when downloading?
    # (optional, default is None)
    proxy = None
)['Close']

data

[*********************100%***********************]  74 of 74 completed


Unnamed: 0_level_0,AAL,ALK,ALLE,AME,AOS,BA,CARR,CAT,CHRW,CMI,...,TT,TXT,UAL,UNP,UPS,URI,VRSK,WAB,WM,XYL
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2011-08-08,4.685425,11.640026,,21.068756,7.578634,47.050499,,61.175251,50.750435,65.057259,...,15.326816,16.024958,16.610001,35.257996,46.257812,14.750000,30.979580,24.774513,21.312069,
2011-08-09,5.458474,12.522892,,23.029505,7.976838,49.959602,,64.789497,51.334846,70.842178,...,15.622852,17.071768,18.459999,37.048103,47.518040,15.740000,31.402473,26.738699,22.242628,
2011-08-10,5.015384,11.671236,,22.117819,7.482300,46.320740,,61.849236,49.715828,67.063347,...,14.651621,15.887983,17.000000,35.774071,45.953892,14.640000,30.468170,25.830681,21.713039,
2011-08-11,5.166223,12.217452,,23.697659,7.904048,47.482601,,64.619148,52.187786,71.433128,...,15.113871,16.582603,17.820000,37.084415,47.977661,16.129999,31.392643,27.211823,23.248846,
2011-08-12,5.213360,12.480530,,23.897488,7.998246,49.822433,,66.515129,54.020054,73.454765,...,15.783862,16.680429,17.620001,37.382744,48.333488,16.900000,31.687687,27.230936,23.294233,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-07-29,21.170000,58.849998,135.500000,138.600006,70.430000,231.630005,53.939999,212.559998,89.230003,233.149994,...,202.610001,69.290001,48.490002,217.320007,191.809998,322.459991,187.809998,85.010002,146.210007,125.059998
2021-07-30,20.379999,58.029999,136.600006,139.050003,70.330002,226.479996,55.250000,206.750000,89.169998,232.100006,...,203.610001,69.010002,46.720001,218.759995,191.360001,329.549988,189.940002,84.870003,148.259995,125.849998
2021-08-02,20.059999,,,,,225.339996,,205.160004,89.650002,,...,,,46.070000,,191.940002,,190.000000,,,
2021-08-03,19.950001,57.189999,137.809998,138.770004,71.639999,229.089996,57.180000,208.500000,91.410004,233.050003,...,206.559998,71.669998,46.540001,220.059998,194.149994,338.230011,188.490005,85.550003,149.410004,127.930000


In [17]:
data.columns = pd.MultiIndex.from_product([['Close'], data.columns])
data

Unnamed: 0_level_0,Close,Close,Close,Close,Close,Close,Close,Close,Close,Close,Close,Close,Close,Close,Close,Close,Close,Close,Close,Close,Close
Unnamed: 0_level_1,AAL,ALK,ALLE,AME,AOS,BA,CARR,CAT,CHRW,CMI,...,TT,TXT,UAL,UNP,UPS,URI,VRSK,WAB,WM,XYL
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2011-08-08,4.685425,11.640026,,21.068756,7.578634,47.050499,,61.175251,50.750435,65.057259,...,15.326816,16.024958,16.610001,35.257996,46.257812,14.750000,30.979580,24.774513,21.312069,
2011-08-09,5.458474,12.522892,,23.029505,7.976838,49.959602,,64.789497,51.334846,70.842178,...,15.622852,17.071768,18.459999,37.048103,47.518040,15.740000,31.402473,26.738699,22.242628,
2011-08-10,5.015384,11.671236,,22.117819,7.482300,46.320740,,61.849236,49.715828,67.063347,...,14.651621,15.887983,17.000000,35.774071,45.953892,14.640000,30.468170,25.830681,21.713039,
2011-08-11,5.166223,12.217452,,23.697659,7.904048,47.482601,,64.619148,52.187786,71.433128,...,15.113871,16.582603,17.820000,37.084415,47.977661,16.129999,31.392643,27.211823,23.248846,
2011-08-12,5.213360,12.480530,,23.897488,7.998246,49.822433,,66.515129,54.020054,73.454765,...,15.783862,16.680429,17.620001,37.382744,48.333488,16.900000,31.687687,27.230936,23.294233,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-07-29,21.170000,58.849998,135.500000,138.600006,70.430000,231.630005,53.939999,212.559998,89.230003,233.149994,...,202.610001,69.290001,48.490002,217.320007,191.809998,322.459991,187.809998,85.010002,146.210007,125.059998
2021-07-30,20.379999,58.029999,136.600006,139.050003,70.330002,226.479996,55.250000,206.750000,89.169998,232.100006,...,203.610001,69.010002,46.720001,218.759995,191.360001,329.549988,189.940002,84.870003,148.259995,125.849998
2021-08-02,20.059999,,,,,225.339996,,205.160004,89.650002,,...,,,46.070000,,191.940002,,190.000000,,,
2021-08-03,19.950001,57.189999,137.809998,138.770004,71.639999,229.089996,57.180000,208.500000,91.410004,233.050003,...,206.559998,71.669998,46.540001,220.059998,194.149994,338.230011,188.490005,85.550003,149.410004,127.930000


In [19]:
# the issue is that there are varying null values that pull from yahoo finance
# for example ALLE wasn't founded until 2013, so if we are taking the null values
# then will be taking out everything prior to 2013
data.isnull().sum()

Close  AAL       3
       ALK       4
       ALLE    578
       AME       4
       AOS       4
              ... 
       URI       4
       VRSK      3
       WAB       4
       WM        4
       XYL      51
Length: 74, dtype: int64

In [20]:
# so what we can do is replace null values with the average of the past 5 closing prices and 
# sub that price in for the null values. In the case of an ALLE where they are missing 2 years
# of data we can just take the first trading days close price and sub that for all null values before 2013

# since python goes in order of tasks
# Task 1: fill all NaN's with the average of the previous 5 values
# the formula is showing a rolling period of '6'. That is because it is counting the NaN value as the first value, 
# but since it doesn't exist it is not included in the calculation
data_cleaned = data.fillna(data.rolling(6, min_periods=1).mean())

# This still leaves all values that do not have "previous data"
# for these we are just using the 'bfill' method from fillna to backfill the first non-null value
data_cleaned = data_cleaned.fillna(method='bfill')
data_cleaned

# by doing these 2 steps in order it should fill all NaN values in our dataset regardless of what stock sector we pull


Unnamed: 0_level_0,Close,Close,Close,Close,Close,Close,Close,Close,Close,Close,Close,Close,Close,Close,Close,Close,Close,Close,Close,Close,Close
Unnamed: 0_level_1,AAL,ALK,ALLE,AME,AOS,BA,CARR,CAT,CHRW,CMI,...,TT,TXT,UAL,UNP,UPS,URI,VRSK,WAB,WM,XYL
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2011-08-08,4.685425,11.640026,45.387146,21.068756,7.578634,47.050499,11.826947,61.175251,50.750435,65.057259,...,15.326816,16.024958,16.610001,35.257996,46.257812,14.750000,30.979580,24.774513,21.312069,21.150721
2011-08-09,5.458474,12.522892,45.387146,23.029505,7.976838,49.959602,11.826947,64.789497,51.334846,70.842178,...,15.622852,17.071768,18.459999,37.048103,47.518040,15.740000,31.402473,26.738699,22.242628,21.150721
2011-08-10,5.015384,11.671236,45.387146,22.117819,7.482300,46.320740,11.826947,61.849236,49.715828,67.063347,...,14.651621,15.887983,17.000000,35.774071,45.953892,14.640000,30.468170,25.830681,21.713039,21.150721
2011-08-11,5.166223,12.217452,45.387146,23.697659,7.904048,47.482601,11.826947,64.619148,52.187786,71.433128,...,15.113871,16.582603,17.820000,37.084415,47.977661,16.129999,31.392643,27.211823,23.248846,21.150721
2011-08-12,5.213360,12.480530,45.387146,23.897488,7.998246,49.822433,11.826947,66.515129,54.020054,73.454765,...,15.783862,16.680429,17.620001,37.382744,48.333488,16.900000,31.687687,27.230936,23.294233,21.150721
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-07-29,21.170000,58.849998,135.500000,138.600006,70.430000,231.630005,53.939999,212.559998,89.230003,233.149994,...,202.610001,69.290001,48.490002,217.320007,191.809998,322.459991,187.809998,85.010002,146.210007,125.059998
2021-07-30,20.379999,58.029999,136.600006,139.050003,70.330002,226.479996,55.250000,206.750000,89.169998,232.100006,...,203.610001,69.010002,46.720001,218.759995,191.360001,329.549988,189.940002,84.870003,148.259995,125.849998
2021-08-02,20.059999,59.284000,136.488000,137.760001,69.859161,225.339996,52.845999,205.160004,89.650002,233.514001,...,200.994000,68.840001,46.070000,218.172000,191.940002,326.512000,190.000000,83.450002,146.758002,124.373999
2021-08-03,19.950001,57.189999,137.809998,138.770004,71.639999,229.089996,57.180000,208.500000,91.410004,233.050003,...,206.559998,71.669998,46.540001,220.059998,194.149994,338.230011,188.490005,85.550003,149.410004,127.930000


In [21]:
data_cleaned.isnull().sum()

Close  AAL     0
       ALK     0
       ALLE    0
       AME     0
       AOS     0
              ..
       URI     0
       VRSK    0
       WAB     0
       WM      0
       XYL     0
Length: 74, dtype: int64

In [22]:
data_pct_change = data_cleaned.pct_change().dropna()

In [23]:
data_pct_change

Unnamed: 0_level_0,Close,Close,Close,Close,Close,Close,Close,Close,Close,Close,Close,Close,Close,Close,Close,Close,Close,Close,Close,Close,Close
Unnamed: 0_level_1,AAL,ALK,ALLE,AME,AOS,BA,CARR,CAT,CHRW,CMI,...,TT,TXT,UAL,UNP,UPS,URI,VRSK,WAB,WM,XYL
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2011-08-09,0.164990,0.075847,0.000000,0.093064,0.052543,0.061829,0.000000,0.059080,0.011515,0.088920,...,0.019315,0.065324,0.111379,0.050772,0.027244,0.067119,0.013651,0.079283,0.043663,0.000000
2011-08-10,-0.081175,-0.068008,0.000000,-0.039588,-0.061997,-0.072836,0.000000,-0.045382,-0.031538,-0.053342,...,-0.062167,-0.069342,-0.079090,-0.034389,-0.032917,-0.069886,-0.029753,-0.033959,-0.023810,0.000000
2011-08-11,0.030075,0.046800,0.000000,0.071428,0.056366,0.025083,0.000000,0.044785,0.049722,0.065159,...,0.031549,0.043720,0.048235,0.036628,0.044039,0.101776,0.030342,0.053469,0.070732,0.000000
2011-08-12,0.009124,0.021533,0.000000,0.008432,0.011918,0.049278,0.000000,0.029341,0.035109,0.028301,...,0.044330,0.005899,-0.011223,0.008045,0.007417,0.047737,0.009399,0.000702,0.001952,0.000000
2011-08-15,0.063291,0.026438,0.000000,0.015155,0.016595,0.015385,0.000000,0.017370,0.001316,0.018101,...,0.033235,0.005279,0.036322,0.012187,0.010429,0.030178,-0.000310,0.011589,0.005846,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-07-29,-0.021267,-0.013577,-0.000664,0.012640,0.018658,0.000259,0.045957,0.005297,0.008135,0.004870,...,0.022096,0.002750,-0.013227,0.003509,-0.001198,-0.010525,0.013710,0.021018,0.003707,0.017327
2021-07-30,-0.037317,-0.013934,0.008118,0.003247,-0.001420,-0.022234,0.024286,-0.027333,-0.000672,-0.004503,...,0.004936,-0.004041,-0.036502,0.006626,-0.002346,0.021987,0.011341,-0.001647,0.014021,0.006317
2021-08-02,-0.015702,0.021610,-0.000820,-0.009277,-0.006695,-0.005034,-0.043511,-0.007690,0.005383,0.006092,...,-0.012848,-0.002463,-0.013913,-0.002688,0.003031,-0.009219,0.000316,-0.016731,-0.010131,-0.011728
2021-08-03,-0.005483,-0.035322,0.009686,0.007332,0.025492,0.016642,0.082012,0.016280,0.019632,-0.001987,...,0.027692,0.041110,0.010202,0.008654,0.011514,0.035888,-0.007947,0.025165,0.018071,0.028591


In [24]:
# next steps
# 1) need to figure out the CLI in order to have user select 3 sectors
# 2) need to dynamically filter the stocks by selection and market cap
#     a) need to calculate market cap
# 3) once CLI and dynamic selection are complete test the hell out of it and make sure its correct


In [25]:
msft = yf.Ticker("MSFT")
msft_sharesoutstanding = msft.info['sharesOutstanding']
msft_sharesoutstanding

7531570176

In [26]:
market_cap = 286.51 * msft_sharesoutstanding
market_cap

2157870171125.76

In [None]:
msft.info

In [1]:
### WORKING ON THIS FILE, TESTING IF THIS IS WORTH DOING.

"""Industrials List

This file calculates the top 5 stocks in the the industrials list by marketcap

"""
import yfinance as yf

def top_5_industrial_stocks(industrials_sp500, sp500_df_wo_index, industrials_list):
    # list all stocks in the industrials sector of the SP500 
    industrials_sp500 = sp500_df_wo_index.loc["Industrials"]
    industrials_list = industrials_sp500["Symbol"].values.tolist()
        
def market_cap(industrials_list):

    market_cap = {}

    for stock in industrials_list:
        ticker = yf.Ticker(stock)
        market_cap[stock] = ticker.info['marketCap']
      
    # we want to return a sorted Pandas DataFrame based on market cap and filtered to the top 5
    # since the columns will originally be the ticker we us ".T" to transpose the table
    # then we use .sort_values to sort by the "first column" [0] and sort in decending order
    # then we only call the top 5 by using "[0:5]"
    # on average this takes 320 seconds (5 minutes 20 seconds) to run per sector
    return pd.DataFrame(market_cap, index=[0]).T.sort_values(by=[0], ascending=False)[0:5]



In [47]:
import pandas as pd
from pathlib import Path
import csv
sp500_w_marketcap = pd.read_csv(
    Path("../Resources/stock_industry_marketcap.csv"),
)
sp500_w_marketcap

Unnamed: 0,Symbol,GICS Sector,Market_Cap
0,MMM,Industrials,1.144546e+11
1,ABT,Health Care,2.166563e+11
2,ABBV,Health Care,2.034378e+11
3,ABMD,Health Care,1.581906e+10
4,ACN,Information Technology,2.031204e+11
...,...,...,...
498,YUM,Consumer Discretionary,3.981149e+10
499,ZBRA,Information Technology,3.045970e+10
500,ZBH,Health Care,3.130774e+10
501,ZION,Financials,8.701906e+09


In [36]:
sp500_w_marketcap = sp500_w_marketcap.set_index("GICS Sector")
sp500_w_marketcap

Unnamed: 0_level_0,Symbol,Market_Cap
GICS Sector,Unnamed: 1_level_1,Unnamed: 2_level_1
Industrials,MMM,1.144546e+11
Health Care,ABT,2.166563e+11
Health Care,ABBV,2.034378e+11
Health Care,ABMD,1.581906e+10
Information Technology,ACN,2.031204e+11
...,...,...
Consumer Discretionary,YUM,3.981149e+10
Information Technology,ZBRA,3.045970e+10
Health Care,ZBH,3.130774e+10
Financials,ZION,8.701906e+09


In [50]:
csvpath = Path("../Resources/stock_industry_marketcap.csv")
sector = "GICS Sector"

with open(csvpath, 'r', newline='') as csvfile:
    data = csv.reader(csvfile)
    industrials = 
    for row in data:
        print()

In [46]:
# sectors = "GICS Sector"
# for rows in sp500_w_marketcap:
#     industrials = sectors(["Industrials"])
#     print(industrials)

TypeError: 'str' object is not callable

In [41]:
industrials_sp500 = sp500_w_marketcap.loc["Industrials"]
industrials_sp500

Unnamed: 0_level_0,Symbol,Market_Cap
GICS Sector,Unnamed: 1_level_1,Unnamed: 2_level_1
Industrials,MMM,1.144546e+11
Industrials,ALK,7.208412e+09
Industrials,ALLE,1.231529e+10
Industrials,AAL,1.353835e+10
Industrials,AME,3.165210e+10
...,...,...
Industrials,URI,2.355111e+10
Industrials,VRSK,3.078270e+10
Industrials,WAB,1.627273e+10
Industrials,WM,6.261410e+10


In [42]:
industrials_list = industrials_sp500["Symbol"].values.tolist()
industrials_list

['MMM',
 'ALK',
 'ALLE',
 'AAL',
 'AME',
 'AOS',
 'BA',
 'CHRW',
 'CARR',
 'CAT',
 'CTAS',
 'CPRT',
 'CSX',
 'CMI',
 'DE',
 'DAL',
 'DOV',
 'ETN',
 'EMR',
 'EFX',
 'EXPD',
 'FAST',
 'FDX',
 'FTV',
 'FBHS',
 'GNRC',
 'GD',
 'GE',
 'GWW',
 'HON',
 'HWM',
 'HII',
 'IEX',
 'INFO',
 'ITW',
 'IR',
 'J',
 'JBHT',
 'JCI',
 'KSU',
 'LHX',
 'LDOS',
 'LMT',
 'MAS',
 'NLSN',
 'NSC',
 'NOC',
 'ODFL',
 'OTIS',
 'PCAR',
 'PH',
 'PNR',
 'PWR',
 'RTX',
 'RSG',
 'RHI',
 'ROK',
 'ROL',
 'ROP',
 'SNA',
 'LUV',
 'SWK',
 'TDY',
 'TXT',
 'TT',
 'TDG',
 'UNP',
 'UAL',
 'UPS',
 'URI',
 'VRSK',
 'WAB',
 'WM',
 'XYL']

In [19]:
# industrials_list_test = sp500_w_marketcap.loc(["GICS Sector"]["Industrials"])
# industrials_list_test
# sectors = "GICS Sector"
# symbol = "Symbol"
market_cap = "Market_Cap"
for rows in sp500_w_marketcap:
    symbol = rows[3]
    print(rows)


Symbol
GICS Sector
Market_Cap


In [None]:
def stock_data(csv_path,sp500_df):
    # labeling the file path
    csv_path = "../resources/stock_industry_marketcap.csv"
    sp500_df = pd.DataFrame(Path(csv_path))