# Project 9: Web Scraping, APIs & Wrappers (US Stocks)

## Web Scraping - the Dow Jones Constituents

In [1]:
import pandas as pd

In [2]:
# Parse every HTML table on the DJIA Wikipedia page and show the resulting
# list of DataFrames, to see which entry holds the constituents.
pd.read_html(io="https://en.wikipedia.org/wiki/Dow_Jones_Industrial_Average")

[                                                   0  \
 0  Historical logarithmic graph of the DJIA from ...   
 1                                         Foundation   
 2                                           Operator   
 3                                          Exchanges   
 4                                     Trading symbol   
 5                                       Constituents   
 6                                               Type   
 7                                         Market cap   
 8                                   Weighting method   
 9                                            Website   
 
                                                    1  
 0  Historical logarithmic graph of the DJIA from ...  
 1  February 16, 1885; 136 years ago[1]May 26, 189...  
 2                              S&P Dow Jones Indices  
 3                      New York Stock ExchangeNASDAQ  
 4                                               ^DJI  
 5                                 

In [3]:
# Select the constituents table by its content instead of a bare list position.
# `match` keeps only tables whose text matches the pattern, so a layout change
# on the page raises an error instead of silently handing back another table.
# NOTE(review): assumes the first table containing "Date added" is the
# components table -- confirm against the live page.
const = pd.read_html(
    "https://en.wikipedia.org/wiki/Dow_Jones_Industrial_Average",
    match="Date added",
)[0]

In [4]:
# Display the table selected from the scraped page.
const

Unnamed: 0,Company,Exchange,Symbol,Industry,Date added,Notes,Index weighting
0,3M,NYSE,MMM,Conglomerate,1976-08-09,As Minnesota Mining and Manufacturing,3.84%
1,American Express,NYSE,AXP,Financial services,1982-08-30,,2.88%
2,Amgen,NASDAQ,AMGN,Pharmaceutical industry,2020-08-31,,4.87%
3,Apple Inc.,NASDAQ,AAPL,Information technology,2015-03-19,,2.57%
4,Boeing,NYSE,BA,Aerospace and defense,1987-03-12,,4.92%
5,Caterpillar Inc.,NYSE,CAT,Construction and Mining,1991-05-06,,4.54%
6,Chevron Corporation,NYSE,CVX,Petroleum industry,2008-02-19,Also 1930-07-18 to 1999-11-01,2.03%
7,Cisco Systems,NASDAQ,CSCO,Information technology,2009-06-08,,1.00%
8,The Coca-Cola Company,NYSE,KO,Food industry,1987-03-12,Also 1932-05-26 to 1935-11-20,1.04%
9,Dow Inc.,NYSE,DOW,Chemical industry,2019-04-02,,1.25%


In [5]:
# Keep column positions 0-4 (Company, Exchange, Symbol, Industry, Date added)
# as an independent copy; the trailing "Notes" and "Index weighting" columns
# are dropped.
const = const.iloc[:, 0:5].copy(deep=True)
const

Unnamed: 0,Company,Exchange,Symbol,Industry,Date added
0,3M,NYSE,MMM,Conglomerate,1976-08-09
1,American Express,NYSE,AXP,Financial services,1982-08-30
2,Amgen,NASDAQ,AMGN,Pharmaceutical industry,2020-08-31
3,Apple Inc.,NASDAQ,AAPL,Information technology,2015-03-19
4,Boeing,NYSE,BA,Aerospace and defense,1987-03-12
5,Caterpillar Inc.,NYSE,CAT,Construction and Mining,1991-05-06
6,Chevron Corporation,NYSE,CVX,Petroleum industry,2008-02-19
7,Cisco Systems,NASDAQ,CSCO,Information technology,2009-06-08
8,The Coca-Cola Company,NYSE,KO,Food industry,1987-03-12
9,Dow Inc.,NYSE,DOW,Chemical industry,2019-04-02


In [8]:
# Give the date column an identifier-friendly name so it can be reached with
# attribute access (const.Date_Added).
const = const.rename(columns={"Date added": "Date_Added"})

In [9]:
# Convert the scraped date strings into pandas datetimes.
const["Date_Added"] = pd.to_datetime(const["Date_Added"])

In [10]:
# Summarise columns, non-null counts and dtypes after the datetime conversion.
const.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   Company     30 non-null     object        
 1   Exchange    30 non-null     object        
 2   Symbol      30 non-null     object        
 3   Industry    30 non-null     object        
 4   Date_Added  30 non-null     datetime64[ns]
dtypes: datetime64[ns](1), object(4)
memory usage: 1.3+ KB


## Normalizing Unicode Strings and Getting the Ticker Symbols

In [11]:
import unicodedata

In [12]:
# Inspect the raw Symbol column before normalising it.
const.Symbol

0      MMM
1      AXP
2     AMGN
3     AAPL
4       BA
5      CAT
6      CVX
7     CSCO
8       KO
9      DOW
10      GS
11      HD
12     HON
13     IBM
14    INTC
15     JNJ
16     JPM
17     MCD
18     MRK
19    MSFT
20     NKE
21      PG
22     CRM
23     TRV
24     UNH
25      VZ
26       V
27     WBA
28     WMT
29     DIS
Name: Symbol, dtype: object

In [13]:
# Same column as a plain Python list, which shows each string's repr.
const.Symbol.to_list()

['MMM',
 'AXP',
 'AMGN',
 'AAPL',
 'BA',
 'CAT',
 'CVX',
 'CSCO',
 'KO',
 'DOW',
 'GS',
 'HD',
 'HON',
 'IBM',
 'INTC',
 'JNJ',
 'JPM',
 'MCD',
 'MRK',
 'MSFT',
 'NKE',
 'PG',
 'CRM',
 'TRV',
 'UNH',
 'VZ',
 'V',
 'WBA',
 'WMT',
 'DIS']

In [14]:
# Re-check column dtypes and non-null counts before transforming Symbol.
const.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   Company     30 non-null     object        
 1   Exchange    30 non-null     object        
 2   Symbol      30 non-null     object        
 3   Industry    30 non-null     object        
 4   Date_Added  30 non-null     datetime64[ns]
dtypes: datetime64[ns](1), object(4)
memory usage: 1.3+ KB


In [15]:
# Preview only: NFKD-normalise every symbol without storing the result.
const["Symbol"].map(lambda raw: unicodedata.normalize("NFKD", raw))

0      MMM
1      AXP
2     AMGN
3     AAPL
4       BA
5      CAT
6      CVX
7     CSCO
8       KO
9      DOW
10      GS
11      HD
12     HON
13     IBM
14    INTC
15     JNJ
16     JPM
17     MCD
18     MRK
19    MSFT
20     NKE
21      PG
22     CRM
23     TRV
24     UNH
25      VZ
26       V
27     WBA
28     WMT
29     DIS
Name: Symbol, dtype: object

In [16]:
# Store the NFKD-normalised symbols back on the frame.
# The built-in string accessor replaces the per-element Python lambda: it is
# vectorised and passes missing values through as NaN, whereas
# unicodedata.normalize raises TypeError on a non-string.
const["Symbol"] = const["Symbol"].str.normalize("NFKD")

In [17]:
# Spot-check the first symbol after normalisation.
const.Symbol[0]

'MMM'

In [18]:
# Derive the bare ticker: split on ": " and keep the last piece, so a value
# without that separator is kept unchanged.
# The .str[-1] accessor replaces the Python-level lambda and yields NaN for
# a missing symbol, where indexing inside the lambda would raise.
const["Ticker"] = const["Symbol"].str.split(": ").str[-1]

In [19]:
# Display the frame with the new Ticker column.
const

Unnamed: 0,Company,Exchange,Symbol,Industry,Date_Added,Ticker
0,3M,NYSE,MMM,Conglomerate,1976-08-09,MMM
1,American Express,NYSE,AXP,Financial services,1982-08-30,AXP
2,Amgen,NASDAQ,AMGN,Pharmaceutical industry,2020-08-31,AMGN
3,Apple Inc.,NASDAQ,AAPL,Information technology,2015-03-19,AAPL
4,Boeing,NYSE,BA,Aerospace and defense,1987-03-12,BA
5,Caterpillar Inc.,NYSE,CAT,Construction and Mining,1991-05-06,CAT
6,Chevron Corporation,NYSE,CVX,Petroleum industry,2008-02-19,CVX
7,Cisco Systems,NASDAQ,CSCO,Information technology,2009-06-08,CSCO
8,The Coca-Cola Company,NYSE,KO,Food industry,1987-03-12,KO
9,Dow Inc.,NYSE,DOW,Chemical industry,2019-04-02,DOW


In [20]:
# Plain Python list of tickers, used later as the download universe.
ticker_list = const["Ticker"].tolist()

In [21]:
# Show the collected tickers.
ticker_list

['MMM',
 'AXP',
 'AMGN',
 'AAPL',
 'BA',
 'CAT',
 'CVX',
 'CSCO',
 'KO',
 'DOW',
 'GS',
 'HD',
 'HON',
 'IBM',
 'INTC',
 'JNJ',
 'JPM',
 'MCD',
 'MRK',
 'MSFT',
 'NKE',
 'PG',
 'CRM',
 'TRV',
 'UNH',
 'VZ',
 'V',
 'WBA',
 'WMT',
 'DIS']

In [22]:
# Persist the cleaned constituents table; the RangeIndex is not written.
const.to_csv(path_or_buf="const.csv", index=False)

## Loading and Saving Historical Stock Prices

In [23]:
import pandas as pd
import yfinance as yf

In [24]:
# Tickers built from the constituents table; these are passed to the download.
ticker_list

['MMM',
 'AXP',
 'AMGN',
 'AAPL',
 'BA',
 'CAT',
 'CVX',
 'CSCO',
 'KO',
 'DOW',
 'GS',
 'HD',
 'HON',
 'IBM',
 'INTC',
 'JNJ',
 'JPM',
 'MCD',
 'MRK',
 'MSFT',
 'NKE',
 'PG',
 'CRM',
 'TRV',
 'UNH',
 'VZ',
 'V',
 'WBA',
 'WMT',
 'DIS']

In [25]:
# Download daily history for every constituent in one call.
# auto_adjust is pinned to False so the result keeps separate "Close" and
# "Adj Close" fields regardless of the installed yfinance version's default;
# the later `"Close"` selection then stays the unadjusted close.
# The captured output below ends on 2020-03-30 for end="2020-03-31".
prices = yf.download(
    tickers=ticker_list,
    start="2007-01-01",
    end="2020-03-31",
    auto_adjust=False,
)

[*********************100%***********************]  30 of 30 completed


In [26]:
# Display the raw download: columns are a two-level (field, ticker) index.
prices

Unnamed: 0_level_0,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,...,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume
Unnamed: 0_level_1,AAPL,AMGN,AXP,BA,CAT,CRM,CSCO,CVX,DIS,DOW,...,MRK,MSFT,NKE,PG,TRV,UNH,V,VZ,WBA,WMT
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2007-01-03,2.573566,53.748402,47.838795,64.405731,40.944595,9.017500,20.589767,41.455723,28.317101,,...,15019300,76935100,17299200,9717900,3432800,8360300,,21445850,6294500,35687300
2007-01-04,2.630688,56.050747,47.490051,64.665741,40.837482,9.470000,21.131788,41.052643,28.540649,,...,12515200,45774500,15085600,8711400,2068200,5152500,,19215860,3681800,17073000
2007-01-05,2.611954,56.184349,46.863956,64.391273,40.315292,9.880000,21.139219,41.210381,28.308815,,...,10656900,44607200,14996800,9907900,2104600,6215700,,19047041,3680900,13556900
2007-01-08,2.624853,55.736446,47.307781,64.239609,40.362164,9.982500,21.258024,41.736099,28.565489,,...,7046300,50220200,10109600,11068200,2440900,4344100,,20370917,4720800,16396400
2007-01-09,2.842900,56.003613,47.006588,63.560661,40.583073,9.990000,21.139219,41.257103,28.524090,,...,8623200,44636600,15167200,10823800,1319500,5483900,,16281352,3792500,14643200
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-03-24,61.141407,195.457443,82.325073,127.680000,98.098778,153.639999,37.024021,62.829350,98.120003,26.904692,...,18990400,82516700,18849500,14625500,4293300,8894400,24488300.0,40626900,9638400,14235000
2020-03-25,60.804588,186.570389,88.319473,158.729996,101.322266,147.059998,36.131992,65.397270,100.730003,28.709679,...,17472800,75638200,27053200,13789100,4151400,10014000,20619600.0,38551300,11040500,17762500
2020-03-26,64.004311,191.738419,91.375450,180.550003,106.965805,154.729996,38.923183,72.109764,105.360001,28.048168,...,17226000,64568100,17200800,15796000,4610600,7517600,17062900.0,32610900,11095700,19416900
2020-03-27,61.354385,191.525909,86.909035,162.000000,102.067635,146.000000,37.235043,64.934662,96.400002,26.989744,...,11896200,57042300,14183400,16073100,3085400,4879500,14950700.0,25363600,5898000,12053500


In [27]:
# Summarise the date range, column count and dtypes of the raw download.
prices.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3333 entries, 2007-01-03 to 2020-03-30
Columns: 180 entries, ('Adj Close', 'AAPL') to ('Volume', 'WMT')
dtypes: float64(152), int64(28)
memory usage: 4.6 MB


In [28]:
# Keep only the "Close" field; selecting the top-level key drops that column
# level, leaving one column per ticker. Copy so the result is independent of
# the full download.
prices = prices["Close"].copy()

In [29]:
# Check per-ticker non-null counts of the close prices.
prices.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3333 entries, 2007-01-03 to 2020-03-30
Data columns (total 30 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   AAPL    3333 non-null   float64
 1   AMGN    3333 non-null   float64
 2   AXP     3333 non-null   float64
 3   BA      3333 non-null   float64
 4   CAT     3333 non-null   float64
 5   CRM     3333 non-null   float64
 6   CSCO    3333 non-null   float64
 7   CVX     3333 non-null   float64
 8   DIS     3333 non-null   float64
 9   DOW     260 non-null    float64
 10  GS      3333 non-null   float64
 11  HD      3333 non-null   float64
 12  HON     3333 non-null   float64
 13  IBM     3333 non-null   float64
 14  INTC    3333 non-null   float64
 15  JNJ     3333 non-null   float64
 16  JPM     3333 non-null   float64
 17  KO      3333 non-null   float64
 18  MCD     3333 non-null   float64
 19  MMM     3333 non-null   float64
 20  MRK     3333 non-null   float64
 21  MSFT    3333 non-nu

In [30]:
# Display the close-price frame (one column per ticker).
prices

Unnamed: 0_level_0,AAPL,AMGN,AXP,BA,CAT,CRM,CSCO,CVX,DIS,DOW,...,MRK,MSFT,NKE,PG,TRV,UNH,V,VZ,WBA,WMT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2007-01-03,2.992857,68.400002,60.360001,89.169998,61.160000,9.017500,27.730000,70.970001,33.738300,,...,44.020000,29.860001,12.208750,64.540001,53.549999,52.570000,,35.306732,46.070000,47.549999
2007-01-04,3.059286,71.330002,59.919998,89.529999,61.000000,9.470000,28.459999,70.279999,34.004654,,...,45.110001,29.809999,12.333750,64.050003,53.099998,52.910000,,35.502777,46.160000,47.779999
2007-01-05,3.037500,71.500000,59.130001,89.150002,60.220001,9.880000,28.469999,70.550003,33.728436,,...,44.299999,29.639999,12.353750,63.500000,52.410000,52.549999,,34.895969,45.500000,47.389999
2007-01-08,3.052500,70.930000,59.689999,88.940002,60.290001,9.982500,28.629999,71.449997,34.034248,,...,44.290001,29.930000,12.316250,63.639999,52.020000,53.320000,,34.363850,45.689999,47.000000
2007-01-09,3.306071,71.269997,59.310001,88.000000,60.619999,9.990000,28.469999,70.629997,33.984924,,...,43.880001,29.959999,12.470000,63.480000,51.889999,52.680000,,34.503880,45.930000,47.389999
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-03-24,61.720001,202.339996,84.050003,127.680000,101.339996,153.639999,38.599998,66.550003,98.120003,28.469999,...,69.050003,148.339996,72.330002,103.269997,93.739998,219.800003,154.529999,49.990002,45.250000,115.029999
2020-03-25,61.380001,193.139999,90.169998,158.729996,104.669998,147.059998,37.669998,69.269997,100.730003,30.379999,...,68.220001,146.919998,79.010002,100.919998,92.180000,234.490005,161.779999,49.939999,41.439999,109.400002
2020-03-26,64.610001,198.490005,93.290001,180.550003,110.500000,154.729996,40.580002,76.379997,105.360001,29.680000,...,73.529999,156.110001,84.300003,107.379997,98.669998,255.389999,168.880005,53.540001,45.669998,109.820000
2020-03-27,61.935001,198.270004,88.730003,162.000000,105.440002,146.000000,38.820000,68.779999,96.400002,28.559999,...,71.730003,149.699997,83.230003,110.169998,99.949997,242.449997,161.559998,52.770000,44.000000,109.580002


In [31]:
# Persist the close prices; the Date index is written as the first column.
prices.to_csv(path_or_buf="const_prices.csv")

In [32]:
# Download the index itself over the same window as the constituents.
# auto_adjust is pinned to False so the frame keeps both "Close" and
# "Adj Close" regardless of the installed yfinance version's default.
dji = yf.download(
    tickers="^DJI",
    start="2007-01-01",
    end="2020-03-31",
    auto_adjust=False,
)

[*********************100%***********************]  1 of 1 completed


In [33]:
# Display the downloaded index history.
dji

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2007-01-03,12459.540039,12580.349609,12404.820312,12474.519531,12474.519531,3272000
2007-01-04,12473.160156,12510.410156,12403.860352,12480.690430,12480.690430,2590600
2007-01-05,12480.049805,12480.129883,12365.410156,12398.009766,12398.009766,2352200
2007-01-08,12392.009766,12445.919922,12337.370117,12423.490234,12423.490234,2235000
2007-01-09,12424.769531,12466.429688,12369.169922,12416.599609,12416.599609,2251900
...,...,...,...,...,...,...
2020-03-24,19722.189453,20737.699219,19649.250000,20704.910156,20704.910156,7993400
2020-03-25,21050.339844,22019.929688,20538.339844,21200.550781,21200.550781,7963200
2020-03-26,21468.380859,22595.060547,21427.099609,22552.169922,22552.169922,7051800
2020-03-27,21898.470703,22327.570312,21469.269531,21636.779297,21636.779297,5888300


In [34]:
# Persist the index history; the Date index is written as the first column.
dji.to_csv(path_or_buf="dji.csv")

In [35]:
# dji.csv is already written by the preceding cell; the identical second
# write that used to live here was redundant and has been dropped.