In [1]:
import os
from datetime import datetime
from pprint import pprint

import pandas as pd
import pandas_datareader.data as web

## Download HTML table with S&P 500 constituents

In [2]:
# Wikipedia page listing the S&P 500 members; the first HTML table on the
# page is taken as the constituents table, with its first row as the header.
sp_url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
sp500_tables = pd.read_html(sp_url, header=0)
sp500_constituents = sp500_tables[0]

In [3]:
# Column overview of the scraped table: names, non-null counts and dtypes.
sp500_constituents.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 505 entries, 0 to 504
Data columns (total 9 columns):
Security                  505 non-null object
Symbol                    505 non-null object
SEC filings               505 non-null object
GICS Sector               505 non-null object
GICS Sub Industry         505 non-null object
Headquarters Location     505 non-null object
Date first added[3][4]    402 non-null object
CIK                       505 non-null int64
Founded                   171 non-null object
dtypes: int64(1), object(8)
memory usage: 35.6+ KB


In [4]:
# Preview the first rows of the constituents table.
sp500_constituents.head()

Unnamed: 0,Security,Symbol,SEC filings,GICS Sector,GICS Sub Industry,Headquarters Location,Date first added[3][4],CIK,Founded
0,3M Company,MMM,reports,Industrials,Industrial Conglomerates,"St. Paul, Minnesota",,66740,1902
1,Abbott Laboratories,ABT,reports,Health Care,Health Care Equipment,"North Chicago, Illinois",1964-03-31,1800,1888
2,AbbVie Inc.,ABBV,reports,Health Care,Pharmaceuticals,"North Chicago, Illinois",2012-12-31,1551152,2013 (1888)
3,ABIOMED Inc,ABMD,reports,Health Care,Health Care Equipment,"Danvers, Massachusetts",2018-05-31,815094,1981
4,Accenture plc,ACN,reports,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",2011-07-06,1467373,1989


## pandas-datareader for Market Data

See [documentation](https://pandas-datareader.readthedocs.io/en/latest/); functionality frequently changes as underlying provider APIs evolve.

In [5]:
# FB price/volume history from the 'yahoo' reader; `start` is given as a
# year string and `end` as a datetime.
start, end = '2014', datetime(2017, 5, 24)

yahoo = web.DataReader('FB', 'yahoo', start=start, end=end)
yahoo.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 856 entries, 2014-01-02 to 2017-05-25
Data columns (total 6 columns):
High         856 non-null float64
Low          856 non-null float64
Open         856 non-null float64
Close        856 non-null float64
Volume       856 non-null int64
Adj Close    856 non-null float64
dtypes: float64(5), int64(1)
memory usage: 46.8 KB


### IEX

In [6]:
# No end date is passed, so the 'iex' reader applies its own default.
start = datetime(2015, 2, 9)

iex = web.DataReader('FB', 'iex', start=start)
iex.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1024 entries, 2015-02-09 to 2019-03-05
Data columns (total 5 columns):
open      1024 non-null float64
high      1024 non-null float64
low       1024 non-null float64
close     1024 non-null float64
volume    1024 non-null int64
dtypes: float64(4), int64(1)
memory usage: 48.0+ KB


In [7]:
# Show the most recent rows to see how far the returned history extends.
iex.tail()

Unnamed: 0_level_0,open,high,low,close,volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-02-27,162.9,163.93,160.41,162.81,12697485
2019-02-28,162.37,163.5,160.86,161.45,11114185
2019-03-01,162.6,163.132,161.69,162.28,11097770
2019-03-04,163.9,167.5,163.83,167.37,18894689
2019-03-05,167.37,171.88,166.55,171.26,28187890


#### Book Data

DEEP is used to receive real-time depth of book quotations direct from IEX. The depth of book quotations received via DEEP provide an aggregated size of resting displayed orders at a price and side, and do not indicate the size or number of individual orders at any price level. Non-displayed orders and non-displayed portions of reserve orders are not represented in DEEP.

DEEP also provides last trade price and size information. Trades resulting from either displayed or non-displayed orders matching on IEX will be reported. Routed executions will not be reported.

Note: the book request below only works on trading days.

In [8]:
# Fetch the current IEX DEEP book for AAPL; the result is indexed by field
# name in the cells below.
book = web.get_iex_book('AAPL')

In [9]:
# Top-level fields of the DEEP response (iterating the mapping yields its keys).
list(book)

['symbol',
 'marketPercent',
 'volume',
 'lastSalePrice',
 'lastSaleSize',
 'lastSaleTime',
 'lastUpdated',
 'bids',
 'asks',
 'systemEvent',
 'tradingStatus',
 'opHaltStatus',
 'ssrStatus',
 'securityEvent',
 'trades',
 'tradeBreaks']

In [10]:
# Stack the bid and ask entries into one frame, tagging every row with the
# side of the book it came from.
side_frames = []
for side in ('bids', 'asks'):
    side_frames.append(pd.DataFrame(book[side]).assign(side=side))
orders = pd.concat(side_frames)
orders.head()

Unnamed: 0,side


In [11]:
# Print every top-level field of the DEEP response. Fields that pandas can
# tabulate are shown as DataFrames; scalars and flat dicts of scalars are
# rejected by the DataFrame constructor and are printed as-is instead.
for key, value in book.items():
    print(f'\n{key}')
    try:
        print(pd.DataFrame(value))
    except (ValueError, TypeError):
        # Only catch the constructor's own failures; a bare `except:` would
        # also hide KeyboardInterrupt and genuine bugs.
        print(value)


symbol
AAPL

marketPercent
0.02182

volume
451892

lastSalePrice
174.545

lastSaleSize
100

lastSaleTime
1551905988634

lastUpdated
1551906830343

bids
Empty DataFrame
Columns: []
Index: []

asks
Empty DataFrame
Columns: []
Index: []

systemEvent
{'systemEvent': 'C', 'timestamp': 1551910200000}

tradingStatus
{'status': 'T', 'reason': '    ', 'timestamp': 1551874921659}

opHaltStatus
{'isHalted': False, 'timestamp': 1551874921659}

ssrStatus
{'isSSR': False, 'detail': ' ', 'timestamp': 1551874921659}

securityEvent
{'securityEvent': 'MarketClose', 'timestamp': 1551906000000}

trades
    isISO  isOddLot  isOutsideRegularHours  isSinglePriceCross  \
0   False     False                  False               False   
1    True     False                  False               False   
2   False     False                  False               False   
3    True     False                  False               False   
4    True     False                  False               False   
5    True    

In [12]:
# Tabulate the reported trades and preview the first rows.
pd.DataFrame(book['trades']).head()

Unnamed: 0,isISO,isOddLot,isOutsideRegularHours,isSinglePriceCross,isTradeThroughExempt,price,size,timestamp,tradeId
0,False,False,False,False,False,174.545,100,1551905988634,967084383
1,True,False,False,False,True,174.54,100,1551905982812,965993229
2,False,False,False,False,False,174.525,100,1551905981473,965674472
3,True,False,False,False,False,174.53,300,1551905979913,965286850
4,True,False,False,False,False,174.58,100,1551905974422,964468588


### Quandl

In [13]:
# Symbol as passed to the 'quandl' reader (note the `.US` suffix).
symbol = 'FB.US'

quandl = web.DataReader(symbol, data_source='quandl', start='2015-01-01')
quandl.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 813 entries, 2018-03-27 to 2015-01-02
Data columns (total 12 columns):
Open          813 non-null float64
High          813 non-null float64
Low           813 non-null float64
Close         813 non-null float64
Volume        813 non-null float64
ExDividend    813 non-null float64
SplitRatio    813 non-null float64
AdjOpen       813 non-null float64
AdjHigh       813 non-null float64
AdjLow        813 non-null float64
AdjClose      813 non-null float64
AdjVolume     813 non-null float64
dtypes: float64(12)
memory usage: 82.6 KB


### FRED

In [14]:
# Observation window for the FRED request; `start` and `end` stay in the
# notebook namespace for later cells.
start, end = datetime(2010, 1, 1), datetime(2013, 1, 27)

gdp = web.DataReader('GDP', 'fred', start=start, end=end)
gdp.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 13 entries, 2010-01-01 to 2013-01-01
Data columns (total 1 columns):
GDP    13 non-null float64
dtypes: float64(1)
memory usage: 208.0 bytes


In [15]:
# Several FRED series can be requested at once by passing a list of codes;
# `start` and `end` are the ones set for the GDP request.
cpi_series = ['CPIAUCSL', 'CPILFESL']
inflation = web.DataReader(cpi_series, 'fred', start=start, end=end)
inflation.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 37 entries, 2010-01-01 to 2013-01-01
Freq: MS
Data columns (total 2 columns):
CPIAUCSL    37 non-null float64
CPILFESL    37 non-null float64
dtypes: float64(2)
memory usage: 888.0 bytes


### Fama/French

In [16]:
# List the dataset names that can be passed to the 'famafrench' reader.
from pandas_datareader.famafrench import get_available_datasets
get_available_datasets()

['F-F_Research_Data_Factors',
 'F-F_Research_Data_Factors_weekly',
 'F-F_Research_Data_Factors_daily',
 'F-F_Research_Data_5_Factors_2x3',
 'F-F_Research_Data_5_Factors_2x3_daily',
 'Portfolios_Formed_on_ME',
 'Portfolios_Formed_on_ME_Wout_Div',
 'Portfolios_Formed_on_ME_Daily',
 'Portfolios_Formed_on_BE-ME',
 'Portfolios_Formed_on_BE-ME_Wout_Div',
 'Portfolios_Formed_on_BE-ME_Daily',
 'Portfolios_Formed_on_OP',
 'Portfolios_Formed_on_OP_Wout_Div',
 'Portfolios_Formed_on_INV',
 'Portfolios_Formed_on_INV_Wout_Div',
 '6_Portfolios_2x3',
 '6_Portfolios_2x3_Wout_Div',
 '6_Portfolios_2x3_weekly',
 '6_Portfolios_2x3_daily',
 '25_Portfolios_5x5',
 '25_Portfolios_5x5_Wout_Div',
 '25_Portfolios_5x5_Daily',
 '100_Portfolios_10x10',
 '100_Portfolios_10x10_Wout_Div',
 '100_Portfolios_10x10_Daily',
 '6_Portfolios_ME_OP_2x3',
 '6_Portfolios_ME_OP_2x3_Wout_Div',
 '6_Portfolios_ME_OP_2x3_daily',
 '25_Portfolios_ME_OP_5x5',
 '25_Portfolios_ME_OP_5x5_Wout_Div',
 '25_Portfolios_ME_OP_5x5_daily',
 '100_Po

In [17]:
# The 'famafrench' reader returns a keyed collection; the 'DESCR' entry holds
# a text description of the tables it contains.
ds = web.DataReader('5_Industry_Portfolios', 'famafrench')
print(ds['DESCR'])

5 Industry Portfolios
---------------------

This file was created by CMPT_IND_RETS using the 201901 CRSP database. It contains value- and equal-weighted returns for 5 industry portfolios. The portfolios are constructed at the end of June. The annual returns are from January to December. Missing data are indicated by -99.99 or -999. Copyright 2019 Kenneth R. French

  0 : Average Value Weighted Returns -- Monthly (109 rows x 5 cols)
  1 : Average Equal Weighted Returns -- Monthly (109 rows x 5 cols)
  2 : Average Value Weighted Returns -- Annual (9 rows x 5 cols)
  3 : Average Equal Weighted Returns -- Annual (9 rows x 5 cols)
  4 : Number of Firms in Portfolios (109 rows x 5 cols)
  5 : Average Firm Size (109 rows x 5 cols)
  6 : Sum of BE / Sum of ME (9 rows x 5 cols)
  7 : Value-Weighted Average of BE/ME (9 rows x 5 cols)


### World Bank

### OECD

### EuroStat



### Stooq

Stooq provides S&P 500 data (possibly sourced from Shiller; to be confirmed).
The site offers lots of data with a focus on Eastern Europe, but it is hard to navigate.

In [18]:
# Scrape every HTML table from the Stooq overview page and count them.
index_url = 'https://stooq.com/t/'
ix = pd.read_html(index_url)
len(ix)

48

The Stooq reader is currently broken, awaiting a [fix](https://github.com/pydata/pandas-datareader/issues/594).

In [19]:
# Request S&P 500 index (^SPX) history from the 'stooq' reader.
f = web.DataReader('^SPX', 'stooq', start='20000101')
f.info()

<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Empty DataFrame

In [20]:
# Preview the first rows of the Stooq result.
f.head()

In [21]:
# Write the Stooq result to a CSV file in the current working directory.
f.to_csv('sp_test.csv')

In [22]:
%matplotlib inline
f.resample('M').Close.mean().plot()

TypeError: Only valid with DatetimeIndex, TimedeltaIndex or PeriodIndex, but got an instance of 'Index'

### NASDAQ Symbols

In [23]:
# Download the symbol directory from Nasdaq Trader and summarise its columns.
from pandas_datareader.nasdaq_trader import get_nasdaq_symbols
symbols = get_nasdaq_symbols()
symbols.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8701 entries, A to ZYXI
Data columns (total 11 columns):
Nasdaq Traded       8701 non-null bool
Security Name       8701 non-null object
Listing Exchange    8701 non-null category
Market Category     8701 non-null object
ETF                 8701 non-null bool
Round Lot Size      8701 non-null float64
Test Issue          8701 non-null bool
Financial Status    3411 non-null category
CQS Symbol          5290 non-null object
NASDAQ Symbol       8701 non-null object
NextShares          8701 non-null bool
dtypes: bool(4), category(2), float64(1), object(4)
memory usage: 459.2+ KB


In [24]:
# Scrape every HTML table from the NASDAQ company screener page and count them.
url = 'https://www.nasdaq.com/screening/companies-by-industry.aspx?exchange=NASDAQ'
res = pd.read_html(url)
len(res)

4

In [25]:
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only adds a stray "None" line after each table.
for r in res:
    r.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 2 columns):
0    1 non-null object
1    1 non-null object
dtypes: object(2)
memory usage: 96.0+ bytes
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101 entries, 0 to 100
Data columns (total 6 columns):
Name          101 non-null object
Symbol        51 non-null object
Market Cap    47 non-null object
Country       51 non-null object
IPO Year      28 non-null object
Subsector     51 non-null object
dtypes: object(6)
memory usage: 4.8+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 1 columns):
0    1 non-null object
dtypes: object(1)
memory usage: 88.0+ bytes
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 1 columns):
0    1 non-null object
dtypes: object(1)
memory usage: 88.0+ bytes
None


### Tiingo

Requires [signing up](https://api.tiingo.com/) and storing the API key in the `TIINGO_API_KEY` environment variable.

In [26]:
# The API key is read from the TIINGO_API_KEY environment variable rather than
# being hard-coded; os.getenv returns None when the variable is not set.
df = web.get_data_tiingo('GOOG', api_key=os.getenv('TIINGO_API_KEY'))

In [27]:
# Summarise the index, columns and dtypes of the Tiingo result.
df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 1244 entries, (GOOG, 2014-03-27 00:00:00) to (GOOG, 2019-03-06 00:00:00)
Data columns (total 12 columns):
adjClose       1244 non-null float64
adjHigh        1244 non-null float64
adjLow         1244 non-null float64
adjOpen        1244 non-null float64
adjVolume      1244 non-null int64
close          1244 non-null float64
divCash        1244 non-null float64
high           1244 non-null float64
low            1244 non-null float64
open           1244 non-null float64
splitFactor    1244 non-null float64
volume         1244 non-null int64
dtypes: float64(10), int64(2)
memory usage: 130.1+ KB
