# Download historical equity data for NASDAQ stocks from yahoo finance

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
from pathlib import Path
import pandas as pd

from pandas_datareader.nasdaq_trader import get_nasdaq_symbols
import yfinance as yf

In [3]:
idx = pd.IndexSlice

In [None]:
results_path = Path('results', 'asset_pricing')
if not results_path.exists():
    results_path.mkdir(parents=True)

In [4]:
def chunks(l, n): 
    for i in range(0, len(l), n):  
        yield l[i:i + n] 

## Get NASDAQ symbols

In [5]:
traded_symbols = get_nasdaq_symbols()

In [6]:
traded_symbols.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8895 entries, A to ZYXI
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   Nasdaq Traded     8895 non-null   bool    
 1   Security Name     8895 non-null   object  
 2   Listing Exchange  8895 non-null   category
 3   Market Category   8895 non-null   object  
 4   ETF               8895 non-null   bool    
 5   Round Lot Size    8895 non-null   float64 
 6   Test Issue        8895 non-null   bool    
 7   Financial Status  3566 non-null   category
 8   CQS Symbol        5329 non-null   object  
 9   NASDAQ Symbol     8895 non-null   object  
 10  NextShares        8895 non-null   bool    
dtypes: bool(4), category(2), float64(1), object(4)
memory usage: 469.5+ KB


## Download metadata from yahoo finance

### NASDAQ symbols

In [21]:
all_tickers = traded_symbols[~traded_symbols.ETF].index.unique().to_list()

In [7]:
yf_tickers = yf.Tickers(all_tickers)

Currently, there's a `yfinance` [bug](https://github.com/ranaroussi/yfinance/issues/208) that causes some stock info downloads to fail; apply the workaround described in the comments or wait for a new release to get the full dataset. Currently, we are losing a few hundred.

In [8]:
info = []
for ticker in yf_tickers.tickers:
    try:
        info.append(pd.Series(ticker.info).to_frame(ticker.ticker))
    except Exception as e:
        print(e, ticker.ticker)
info = pd.concat(info, axis=1).dropna(how='all').T
info = info.apply(pd.to_numeric, errors='ignore')
info.to_hdf(results_path / 'data.h5', 'stocks/info')

list index out of range AAN
list index out of range AAON
list index out of range AAU
list index out of range AB
list index out of range ABC
list index out of range ABCB
list index out of range ABIO
No tables found ABR$A
No tables found ABR$B
No tables found ABR$C
No tables found ACAMW
list index out of range ACB
No tables found ACEL.W
list index out of range ACGL
list index out of range ACHV
list index out of range ACLS
list index out of range ACN
list index out of range ACST
No tables found ACTTW
list index out of range ADCT
list index out of range ADES
No tables found ADILW
list index out of range ADMP
list index out of range ADXS
list index out of range AE
index 0 is out of bounds for axis 0 with size 0 AEB
No tables found AEFC
No tables found AEL$A
list index out of range AEMD
list index out of range AEO
No tables found AEP$B
list index out of range AEY
list index out of range AEYE
list index out of range AEZS
No tables found AFC
No tables found AFGB
list index out of range AFGC
li

list index out of range BSGM
list index out of range BSQR
list index out of range BSRR
No tables found BSX$A
list index out of range BTO
list index out of range BTT
list index out of range BUSE
list index out of range BW
list index out of range BWEN
list index out of range BWFG
list index out of range BWL.A
list index out of range BXC
No tables found BXP$B
No tables found BXS$A
list index out of range BZH
list index out of range C
list index out of range C$J
list index out of range C$K
list index out of range C$N
list index out of range C$S
list index out of range CACI
list index out of range CAH
list index out of range CAI$A
list index out of range CAI$B
list index out of range CAL
list index out of range CALM
list index out of range CALT
list index out of range CANF
list index out of range CAPR
list index out of range CARE
list index out of range CARV
list index out of range CASH
list index out of range CASS
list index out of range CATB
list index out of range CATO
list index out of 

No tables found DLR$K
No tables found DLR$L
list index out of range DLY
list index out of range DMAC
list index out of range DMPI
list index out of range DMYT
No tables found DMYT.U
No tables found DMYT.W
list index out of range DNI
list index out of range DNK
list index out of range DNR
No tables found DPHCW
list index out of range DPW
list index out of range DQ
list index out of range DRAD
list index out of range DRADP
list index out of range DRD
list index out of range DRE
list index out of range DRIO
No tables found DRIOW
index 0 is out of bounds for axis 0 with size 0 DRUA
list index out of range DS$B
list index out of range DS$C
list index out of range DS$D
No tables found DSKEW
list index out of range DSS
list index out of range DSU
No tables found DSX$B
index 0 is out of bounds for axis 0 with size 0 DTJ
list index out of range DTLA$
No tables found DTP
No tables found DTQ
list index out of range DTSS
index 0 is out of bounds for axis 0 with size 0 DTW
index 0 is out of bounds 

No tables found GNL$A
No tables found GNL$B
list index out of range GNRS
No tables found GNRSW
list index out of range GNSS
list index out of range GNT
No tables found GNT$A
list index out of range GNUS
list index out of range GOF
list index out of range GOGL
list index out of range GOL
list index out of range GOLD
list index out of range GOODN
No tables found GPAQW
index 0 is out of bounds for axis 0 with size 0 GPJA
list index out of range GPK
list index out of range GPL
list index out of range GPOR
list index out of range GRAF.U
list index out of range GRAF.W
list index out of range GRBK
list index out of range GRFS
HTTP Error 503: Service Unavailable GRIL
list index out of range GRMN
No tables found GRNVR
No tables found GRNVW
list index out of range GRP.U
list index out of range GRPN
list index out of range GRVY
No tables found GRX$B
list index out of range GS$A
list index out of range GS$C
list index out of range GS$D
list index out of range GS$J
list index out of range GS$K
list

list index out of range LACQU
No tables found LACQW
list index out of range LAMR
list index out of range LARK
No tables found LATNW
list index out of range LC
No tables found LCAHW
list index out of range LCII
list index out of range LCTX
list index out of range LCUT
list index out of range LEAF
list index out of range LECO
list index out of range LEDS
list index out of range LEGN
'regularMarketOpen' LEN.B
No tables found LFACW
list index out of range LFVN
list index out of range LGC.U
list index out of range LGC.W
list index out of range LGF.A
'regularMarketOpen' LGF.B
list index out of range LGHL
list index out of range LGHLW
list index out of range LGND
No tables found LGVW.U
list index out of range LH
list index out of range LHC.U
list index out of range LHC.W
list index out of range LHX
list index out of range LIFE
list index out of range LIQT
list index out of range LIVE
No tables found LIVKW
list index out of range LJPC
list index out of range LL
list index out of range LLEX
lis

list index out of range NTEST.C
list index out of range NTG
list index out of range NTIC
list index out of range NTN
list index out of range NTP
list index out of range NTRP
list index out of range NTWK
list index out of range NTZ
list index out of range NURO
list index out of range NUZE
list index out of range NVAX
list index out of range NVCN
list index out of range NVEC
list index out of range NVFY
list index out of range NVGS
list index out of range NVIV
list index out of range NWGI
list index out of range NWL
list index out of range NXTD
No tables found NYCB$A
No tables found NYCB$U
list index out of range NYMT
list index out of range OAC.U
list index out of range OAC.W
No tables found OAK$A
list index out of range OAK$B
list index out of range OBAS
list index out of range OBLG
list index out of range OBLN
list index out of range OCC
list index out of range OCCIP
list index out of range OCFC
list index out of range OCFCP
list index out of range OCFT
list index out of range OCN
lis

list index out of range RHP
list index out of range RIBT
list index out of range RIG
list index out of range RIGL
No tables found RILYG
No tables found RILYH
index 0 is out of bounds for axis 0 with size 0 RILYI
No tables found RILYM
No tables found RILYN
No tables found RILYO
list index out of range RILYP
No tables found RILYZ
list index out of range RIO
list index out of range RIOT
list index out of range RJF
list index out of range RKDA
list index out of range RL
list index out of range RLGT
list index out of range RLH
No tables found RLJ$A
list index out of range RLMD
list index out of range RMBL
list index out of range RMCF
list index out of range RMG.U
list index out of range RMG.W
list index out of range RMPL$
list index out of range RNA
No tables found RNR$E
list index out of range RNR$F
list index out of range RNST
list index out of range RNWK
list index out of range ROCH
list index out of range ROCHU
list index out of range ROCHW
list index out of range ROIC
list index out of

No tables found TCRW
index 0 is out of bounds for axis 0 with size 0 TCRZ
list index out of range TCX
No tables found TDA
No tables found TDACW
No tables found TDE
No tables found TDI
No tables found TDJ
list index out of range TDS
list index out of range TDW.A
'regularMarketOpen' TDW.B
list index out of range TDW.W
list index out of range TEF
list index out of range TEL
list index out of range TENX
list index out of range TERP
list index out of range TEUM
list index out of range TEX
list index out of range TFC
list index out of range TFC$F
No tables found TFC$G
No tables found TFC$H
No tables found TFC$I
list index out of range TFC$O
list index out of range TGA
list index out of range TGC
list index out of range TGLS
list index out of range TGNA
No tables found TGP$A
No tables found TGP$B
list index out of range TGT
list index out of range TGTX
No tables found THBRW
list index out of range THC
No tables found THCAW
No tables found THCBW
list index out of range THG
No tables found THGA

list index out of range YTRA
list index out of range YUM
list index out of range YVR
list index out of range ZAGG
No tables found ZAZZT
No tables found ZBZX
No tables found ZBZZT
No tables found ZCZZT
No tables found ZEXIT
list index out of range ZGNX
list index out of range ZGYH
No tables found ZGYHR
list index out of range ZGYHU
No tables found ZGYHW
list index out of range ZI
list index out of range ZIEXT
index 0 is out of bounds for axis 0 with size 0 ZIONL
list index out of range ZIOP
list index out of range ZIXI
No tables found ZJZZT
list index out of range ZNTL
No tables found ZNWAA
list index out of range ZOM
list index out of range ZSAN
No tables found ZTEST
list index out of range ZTO
list index out of range ZVO
No tables found ZVV
No tables found ZVZZC
No tables found ZVZZT
No tables found ZWZZT
No tables found ZXIET
list index out of range ZXYZ.A
No tables found ZXZZT
list index out of range ZYXI


## Download adjusted price data using yfinance

In [22]:
prices_adj = []
with pd.HDFStore('chunks.h5') as store:
    for i, chunk in enumerate(chunks(all_tickers, 100)):
        print(i, end=' ', flush=True)
        prices_adj.append(yf.download(chunk, period='max', auto_adjust=True).stack(-1))

[*********************100%***********************]  100 of 100 completed

7 Failed downloads:
- ABR$C: No data found, symbol may be delisted
- ADILW: 1d data not available for startTime=-2208988800 and endTime=1592829199. Only 100 years worth of day granularity data are allowed to be fetched per request.
- ABR$A: No data found, symbol may be delisted
- ACTTW: 1d data not available for startTime=-2208988800 and endTime=1592829200. Only 100 years worth of day granularity data are allowed to be fetched per request.
- ACAMW: 1d data not available for startTime=-2208988800 and endTime=1592829201. Only 100 years worth of day granularity data are allowed to be fetched per request.
- ABR$B: No data found, symbol may be delisted
- ACEL.W: No data found, symbol may be delisted
[*********************100%***********************]  100 of 100 completed

26 Failed downloads:
- AHT$F: No data found, symbol may be delisted
- AGO$B: No data found, symbol may be delisted
- AGBAR: 1d data not available fo

[*********************100%***********************]  100 of 100 completed

4 Failed downloads:
- DCP$C: No data found, symbol may be delisted
- DCP$B: No data found, symbol may be delisted
- CYRXW: 1d data not available for startTime=-2208988800 and endTime=1592829340. Only 100 years worth of day granularity data are allowed to be fetched per request.
- CWEN.A: No data found, symbol may be delisted
[*********************100%***********************]  100 of 100 completed

18 Failed downloads:
- DFPHW: 1d data not available for startTime=-2208988800 and endTime=1592829343. Only 100 years worth of day granularity data are allowed to be fetched per request.
- DMYT.W: No data found, symbol may be delisted
- DLR$G: No data found, symbol may be delisted
- DLNG$A: No data found, symbol may be delisted
- DMYT.U: No data found, symbol may be delisted
- DLR$L: No data found, symbol may be delisted
- DHR$A: No data found, symbol may be delisted
- DFNS.U: No data found, symbol may be delisted
- DLPN

[*********************100%***********************]  100 of 100 completed

17 Failed downloads:
- GLU$A: No data found, symbol may be delisted
- GLOG$A: No data found, symbol may be delisted
- GLOP$A: No data found, symbol may be delisted
- GNRSW: 1d data not available for startTime=-2208988800 and endTime=1592829415. Only 100 years worth of day granularity data are allowed to be fetched per request.
- GPAQW: 1d data not available for startTime=-2208988800 and endTime=1592829416. Only 100 years worth of day granularity data are allowed to be fetched per request.
- GMRE$A: No data found, symbol may be delisted
- GLP$A: No data found, symbol may be delisted
- GLU$B: No data found, symbol may be delisted
- GLEO.W: No data found, symbol may be delisted
- GMHIW: 1d data not available for startTime=-2208988800 and endTime=1592829416. Only 100 years worth of day granularity data are allowed to be fetched per request.
- GNT$A: No data found, symbol may be delisted
- GMBLW: 1d data not available

[*********************100%***********************]  100 of 100 completed

8 Failed downloads:
- KSU$: No data found, symbol may be delisted
- KIM$M: No data found, symbol may be delisted
- KLR.W: No data found, symbol may be delisted
- KKR$A: No data found, symbol may be delisted
- LACQW: 1d data not available for startTime=-2208988800 and endTime=1592829484. Only 100 years worth of day granularity data are allowed to be fetched per request.
- KKR$B: No data found, symbol may be delisted
- KTOVW: 1d data not available for startTime=-2208988800 and endTime=1592829484. Only 100 years worth of day granularity data are allowed to be fetched per request.
- KIM$L: No data found, symbol may be delisted
[*********************100%***********************]  100 of 100 completed

12 Failed downloads:
- LGC.U: No data found, symbol may be delisted
- LATNW: 1d data not available for startTime=-2208988800 and endTime=1592829488. Only 100 years worth of day granularity data are allowed to be fetched p

[*********************100%***********************]  100 of 100 completed

21 Failed downloads:
- SB$C: No data found, symbol may be delisted
- SBE.W: No data found, symbol may be delisted
- SCVX.W: No data found, symbol may be delisted
- SBE.U: No data found, symbol may be delisted
- SCPE.W: No data found, symbol may be delisted
- SCE$H: No data found, symbol may be delisted
- SCHW$D: No data found, symbol may be delisted
- SB$D: No data found, symbol may be delisted
- SCPE.U: No data found, symbol may be delisted
- SAN$B: No data found, symbol may be delisted
- SCE$D: No data found, symbol may be delisted
- SAQNW: 1d data not available for startTime=-2208988800 and endTime=1592829638. Only 100 years worth of day granularity data are allowed to be fetched per request.
- SCVX.U: No data found, symbol may be delisted
- SCE$G: No data found, symbol may be delisted
- SCHW$C: No data found, symbol may be delisted
- SCE$C: No data found, symbol may be delisted
- SCE$J: No data found, symbol 

[*********************100%***********************]  100 of 100 completed

10 Failed downloads:
- VERBW: 1d data not available for startTime=-2208988800 and endTime=1592829704. Only 100 years worth of day granularity data are allowed to be fetched per request.
- VER$F: No data found, symbol may be delisted
- VERT.U: No data found, symbol may be delisted
- USB$P: No data found, symbol may be delisted
- USWSW: 1d data not available for startTime=-2208988800 and endTime=1592829705. Only 100 years worth of day granularity data are allowed to be fetched per request.
- USB$A: No data found, symbol may be delisted
- USB$M: No data found, symbol may be delisted
- USB$H: No data found, symbol may be delisted
- USB$O: No data found, symbol may be delisted
- UUUU.W: No data found, symbol may be delisted
[*********************100%***********************]  100 of 100 completed

9 Failed downloads:
- VNO$K: No data found, symbol may be delisted
- VKTXW: 1d data not available for startTime=-2208988800

In [23]:
prices_adj = (pd.concat(prices_adj)
              .dropna(how='all', axis=1)
              .rename(columns=str.lower)
              .swaplevel())

In [24]:
prices_adj.index.names = ['ticker', 'date']

In [25]:
len(prices_adj.index.unique('ticker'))

5879

### Remove outliers

In [26]:
df = prices_adj.close.unstack('ticker')
pmax = df.pct_change().max()
pmin = df.pct_change().min()
to_drop = pmax[pmax > 1].index.union(pmin[pmin<-1].index)
len(to_drop)

696

In [27]:
prices_adj = prices_adj.drop(to_drop, level='ticker')

In [28]:
len(prices_adj.index.unique('ticker'))

5183

In [29]:
prices_adj.sort_index().loc[idx[:, '1990': '2019'], :].to_hdf(results_path / 'data.h5', 
                                                              'stocks/prices/adjusted')