## Web Scraping

In [9]:
import pandas as pd

In [10]:
# Download the DJIA Wikipedia article and parse every HTML table on it.
# The table is picked by position (index 1); check the preview in the next
# cell to confirm it is still the components table if the page layout changes.
djia_wiki_url = "https://en.wikipedia.org/wiki/Dow_Jones_Industrial_Average"
const_raw = pd.read_html(djia_wiki_url)[1]

In [11]:
# Preview the first five scraped rows to verify the right table was picked.
const_raw.head(5)

Unnamed: 0,Company,Exchange,Symbol,Industry,Date added,Notes,Index weighting
0,3M,NYSE,MMM,Conglomerate,1976-08-09,As Minnesota Mining and Manufacturing,3.84%
1,American Express,NYSE,AXP,Financial services,1982-08-30,,2.88%
2,Amgen,NASDAQ,AMGN,Pharmaceutical industry,2020-08-31,,4.87%
3,Apple Inc.,NASDAQ,AAPL,Information technology,2015-03-19,,2.57%
4,Boeing,NYSE,BA,Aerospace and defense,1987-03-12,,4.92%


In [12]:
# Keep only the descriptive columns, selected by name rather than by position:
# if Wikipedia reorders or inserts a column, this raises a KeyError instead of
# silently keeping the wrong fields. .copy() detaches the result from const_raw
# so later column assignments do not touch the raw table.
const = const_raw.loc[
    :, ["Company", "Exchange", "Symbol", "Industry", "Date added"]
].copy()
const.head()

Unnamed: 0,Company,Exchange,Symbol,Industry,Date added
0,3M,NYSE,MMM,Conglomerate,1976-08-09
1,American Express,NYSE,AXP,Financial services,1982-08-30
2,Amgen,NASDAQ,AMGN,Pharmaceutical industry,2020-08-31
3,Apple Inc.,NASDAQ,AAPL,Information technology,2015-03-19
4,Boeing,NYSE,BA,Aerospace and defense,1987-03-12


In [13]:
# Give the date column a space-free label so it can be reached via attribute
# access (const.Date_Added). The result is rebound instead of using
# inplace=True, so the cell produces a new frame rather than mutating one an
# earlier cell already displayed.
const = const.rename(columns={"Date added": "Date_Added"})
const.head()

Unnamed: 0,Company,Exchange,Symbol,Industry,Date_Added
0,3M,NYSE,MMM,Conglomerate,1976-08-09
1,American Express,NYSE,AXP,Financial services,1982-08-30
2,Amgen,NASDAQ,AMGN,Pharmaceutical industry,2020-08-31
3,Apple Inc.,NASDAQ,AAPL,Information technology,2015-03-19
4,Boeing,NYSE,BA,Aerospace and defense,1987-03-12


In [15]:
# Parse the date strings into proper datetime values.
const["Date_Added"] = pd.to_datetime(const["Date_Added"])

In [16]:
# Check column dtypes and non-null counts after the datetime conversion.
const.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   Company     30 non-null     object        
 1   Exchange    30 non-null     object        
 2   Symbol      30 non-null     object        
 3   Industry    30 non-null     object        
 4   Date_Added  30 non-null     datetime64[ns]
dtypes: datetime64[ns](1), object(4)
memory usage: 1.3+ KB


## Normalizing Unicode Strings and Getting the Ticker Symbols

In [21]:
import unicodedata

In [23]:
# Spot-check a single raw Symbol value (row label 1).
const.loc[1, "Symbol"]

'AXP'

In [18]:
# Show all symbols as a plain Python list.
const["Symbol"].tolist()

['MMM',
 'AXP',
 'AMGN',
 'AAPL',
 'BA',
 'CAT',
 'CVX',
 'CSCO',
 'KO',
 'DOW',
 'GS',
 'HD',
 'HON',
 'IBM',
 'INTC',
 'JNJ',
 'JPM',
 'MCD',
 'MRK',
 'MSFT',
 'NKE',
 'PG',
 'CRM',
 'TRV',
 'UNH',
 'VZ',
 'V',
 'WBA',
 'WMT',
 'DIS']

In [19]:
# Re-inspect dtypes and non-null counts before normalizing the Symbol column.
const.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   Company     30 non-null     object        
 1   Exchange    30 non-null     object        
 2   Symbol      30 non-null     object        
 3   Industry    30 non-null     object        
 4   Date_Added  30 non-null     datetime64[ns]
dtypes: datetime64[ns](1), object(4)
memory usage: 1.3+ KB


In [22]:
# Preview the NFKD-normalized symbols without modifying the frame.
# Series.str.normalize is the built-in vectorized equivalent of applying
# unicodedata.normalize per element, and it passes missing values through
# instead of raising.
const["Symbol"].str.normalize("NFKD")

0      MMM
1      AXP
2     AMGN
3     AAPL
4       BA
5      CAT
6      CVX
7     CSCO
8       KO
9      DOW
10      GS
11      HD
12     HON
13     IBM
14    INTC
15     JNJ
16     JPM
17     MCD
18     MRK
19    MSFT
20     NKE
21      PG
22     CRM
23     TRV
24     UNH
25      VZ
26       V
27     WBA
28     WMT
29     DIS
Name: Symbol, dtype: object

In [24]:
# Store the NFKD-normalized symbols back into the frame. NFKD replaces
# compatibility characters (e.g. non-breaking spaces) with their plain
# equivalents, which matters for the ": " split performed next.
# Bracket assignment is used because attribute assignment only works for
# columns that already exist; .str.normalize tolerates missing values.
const["Symbol"] = const["Symbol"].str.normalize("NFKD")

In [25]:
# Derive the bare ticker: split each symbol on ": " and keep the last piece,
# so a value without the separator is kept unchanged. .str[-1] indexes each
# split list element-wise and yields NaN for missing values instead of raising
# like a Python-level lambda would.
const["Ticker"] = const["Symbol"].str.split(": ").str[-1]