# Download historical equity data for NASDAQ stocks from Yahoo Finance

In [1]:
import warnings
# Suppress every Python warning for the rest of the session
# (no category or module filter is given, so all warnings are ignored).
warnings.filterwarnings('ignore')

In [2]:
from time import time
from pathlib import Path
import pandas as pd

from pandas_datareader.nasdaq_trader import get_nasdaq_symbols
import yfinance as yf

In [3]:
# Short alias for pandas' IndexSlice helper; used further down to slice the
# (ticker, date) MultiIndex of the price panel by date range.
idx = pd.IndexSlice

In [4]:
# Directory that receives the HDF5 store written by the cells below.
results_path = Path('results', 'asset_pricing')
# `exist_ok=True` makes the call idempotent and avoids the check-then-create
# race of testing `.exists()` first.
results_path.mkdir(parents=True, exist_ok=True)

In [5]:
def chunks(l, n):
    """Yield consecutive slices of `l`, each containing at most `n` items.

    The final slice is shorter when `len(l)` is not a multiple of `n`.
    """
    yield from (l[offset:offset + n] for offset in range(0, len(l), n))

In [6]:
def format_time(t):
    """Return a formatted time string 'HH:MM:SS'
    based on a numeric duration in seconds (e.g. a difference of time() values).

    The duration is rounded to whole seconds once, before it is split into
    hours, minutes and seconds. Rounding each field separately would let a
    fractional remainder such as 59.6 s render as '00:00:60'.
    """
    total_seconds = int(round(t))
    m, s = divmod(total_seconds, 60)
    h, m = divmod(m, 60)
    return f'{h:0>2d}:{m:0>2d}:{s:0>2d}'

## Get NASDAQ symbols

In [7]:
# Retrieve the symbol directory through pandas-datareader; the result is a
# DataFrame indexed by ticker symbol (inspected in the next cell).
traded_symbols = get_nasdaq_symbols()

In [8]:
# Summarise the columns, dtypes and non-null counts of the symbol table.
traded_symbols.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9903 entries, A to ZYXI
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   Nasdaq Traded     9903 non-null   bool    
 1   Security Name     9903 non-null   object  
 2   Listing Exchange  9903 non-null   category
 3   Market Category   9903 non-null   object  
 4   ETF               9903 non-null   bool    
 5   Round Lot Size    9903 non-null   float64 
 6   Test Issue        9903 non-null   bool    
 7   Financial Status  4196 non-null   category
 8   CQS Symbol        5707 non-null   object  
 9   NASDAQ Symbol     9903 non-null   object  
 10  NextShares        9903 non-null   bool    
dtypes: bool(4), category(2), float64(1), object(4)
memory usage: 522.6+ KB


## Download metadata from Yahoo Finance

### NASDAQ symbols

In [9]:
# Collect the distinct symbols of every row not flagged as an ETF; the index
# of `traded_symbols` carries the ticker symbols.
# NOTE(review): rows flagged in the `Test Issue` column are not excluded here;
# confirm whether that is intentional.
all_tickers = (traded_symbols
               .loc[~traded_symbols['ETF']]
               .index
               .unique()
               .to_list())
n = len(all_tickers)  # total ticker count, reused by the progress estimates below
print(f'# Tickers: {n:,.0f}')

# Tickers: 7,544


In [10]:
# Build one yfinance `Tickers` container for every non-ETF symbol collected above.
yf_tickers = yf.Tickers(all_tickers)

In [11]:
def _to_numeric_if_possible(column):
    """Return `column` converted by `pd.to_numeric`, or unchanged if that fails.

    Replaces the deprecated `pd.to_numeric(..., errors='ignore')` with the
    same outcome: convertible columns become numeric, others are left as-is.
    """
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


# Depending on the yfinance release, `Tickers.tickers` is either a sequence of
# Ticker objects or a dict keyed by symbol; iterate over Ticker objects either
# way so the loop does not silently fail on plain strings.
ticker_objects = yf_tickers.tickers
if isinstance(ticker_objects, dict):
    ticker_objects = ticker_objects.values()

info_frames = []  # one single-column frame per ticker whose metadata loaded
failed = {}       # symbol -> repr of the exception raised while loading it
start = time()
for i, ticker in enumerate(ticker_objects, 1):
    try:
        info_frames.append(pd.Series(ticker.info).to_frame(ticker.ticker))
    except Exception as e:
        # Best effort: keep going, but record the failure for later inspection
        # instead of discarding it.
        failed[getattr(ticker, 'ticker', str(ticker))] = repr(e)
    if i % 100 == 0:
        # Extrapolate the remaining time from the average time per ticker so far.
        per_ticker = (time() - start) / i
        to_do = n - i
        to_go = to_do * per_ticker

        print(f'Success: {len(info_frames):5,.0f} / {i:5,.0f} | To go: {format_time(to_go)} ({to_do:5,.0f})')

if not info_frames:
    # pd.concat would fail with an unhelpful message on an empty list.
    raise RuntimeError(
        f'No ticker metadata could be downloaded ({len(failed):,.0f} failures); '
        'inspect `failed` for details.')

# Columns are tickers at this point: drop fields that are empty for every
# ticker, then transpose so that each row is one ticker.
info = pd.concat(info_frames, axis=1).dropna(how='all').T
info = info.apply(_to_numeric_if_possible)
# `key` is passed by keyword; positional use is deprecated in recent pandas.
info.to_hdf(results_path / 'data.h5', key='stocks/info')

Success:    83/  100 | To go: 06:07:55 (7,444)
Success:   165/  200 | To go: 05:59:37 (7,344)
Success:   236/  300 | To go: 05:54:24 (7,244)
Success:   316/  400 | To go: 05:46:35 (7,144)
Success:   399/  500 | To go: 05:42:38 (7,044)
Success:   481/  600 | To go: 05:38:07 (6,944)
Success:   557/  700 | To go: 05:37:05 (6,844)
Success:   641/  800 | To go: 05:33:51 (6,744)
Success:   724/  900 | To go: 05:28:23 (6,644)
Success:   810/1,000 | To go: 05:23:36 (6,544)
Success:   891/1,100 | To go: 05:19:13 (6,444)
Success:   977/1,200 | To go: 05:14:41 (6,344)
Success: 1,061/1,300 | To go: 05:09:07 (6,244)
Success: 1,144/1,400 | To go: 05:03:39 (6,144)
Success: 1,221/1,500 | To go: 04:58:57 (6,044)
Success: 1,298/1,600 | To go: 04:54:08 (5,944)
Success: 1,382/1,700 | To go: 04:48:12 (5,844)
Success: 1,460/1,800 | To go: 04:44:15 (5,744)
Success: 1,541/1,900 | To go: 04:38:30 (5,644)
Success: 1,628/2,000 | To go: 04:33:19 (5,544)
Success: 1,707/2,100 | To go: 04:28:30 (5,444)
Success: 1,78

## Download adjusted price data using yfinance

In [12]:
# Download the full adjusted price history in batches and keep one long
# (date, ticker)-indexed frame per batch.
chunk_size = 100  # tickers requested per yf.download call
prices_adj = []
done = 0          # tickers requested so far (the last chunk may be shorter)
start = time()
for i, chunk in enumerate(chunks(all_tickers, chunk_size), 1):
    # stack(-1) moves the innermost column level into the row index.
    prices_adj.append(yf.download(chunk, period='max', auto_adjust=True).stack(-1))

    # Base the estimate on the tickers actually processed. Assuming every
    # chunk holds `chunk_size` tickers makes `to_do` (and the ETA) negative
    # on a short final chunk.
    done += len(chunk)
    per_ticker = (time() - start) / done
    to_do = n - done
    to_go = to_do * per_ticker
    # NOTE(review): both "Success" figures count chunks, not tickers with data.
    print(f'Success: {len(prices_adj):5,.0f}/{i:5,.0f} | To go: {format_time(to_go)} ({to_do:5,.0f})')

[*********************100%***********************]  100 of 100 completed

15 Failed downloads:
- ACIC.U: No data found, symbol may be delisted
- AAIC$C: No data found, symbol may be delisted
- ABR$B: No data found, symbol may be delisted
- AAC.U: No data found, symbol may be delisted
- ACIC.W: No data found, symbol may be delisted
- ACR$C: No data found, symbol may be delisted
- ABR$C: No data found, symbol may be delisted
- ACKIW: 1d data not available for startTime=-2208988800 and endTime=1614159977. Only 100 years worth of day granularity data are allowed to be fetched per request.
- AACQW: 1d data not available for startTime=-2208988800 and endTime=1614159978. Only 100 years worth of day granularity data are allowed to be fetched per request.
- ACND.W: No data found, symbol may be delisted
- ABR$A: No data found, symbol may be delisted
- ACII.U: No data found, symbol may be delisted
- AAIC$B: No data found, symbol may be delisted
- ACND.U: No data found, symbol may be delisted
- AC

Success:     6/    6 | To go: 00:13:31 (6,944)
[*********************100%***********************]  100 of 100 completed

24 Failed downloads:
- ATEST.C: No data found, symbol may be delisted
- ATA.U: No data found, symbol may be delisted
- ATCO$D: No data found, symbol may be delisted
- ATH$D: No data found, symbol may be delisted
- ATAC.U: No data found, symbol may be delisted
- ATCO$E: No data found, symbol may be delisted
- ATEST.B: No data found, symbol may be delisted
- AVAN.W: No data found, symbol may be delisted
- ATH$A: No data found, symbol may be delisted
- ATCO$H: No data found, symbol may be delisted
- ATNFW: 1d data not available for startTime=-2208988800 and endTime=1614160047. Only 100 years worth of day granularity data are allowed to be fetched per request.
- AUUDW: 1d data not available for startTime=-2208988800 and endTime=1614160047. Only 100 years worth of day granularity data are allowed to be fetched per request.
- ATCO$I: No data found, symbol may be delisted
-

Success:    13/   13 | To go: 00:11:27 (6,244)
[*********************100%***********************]  100 of 100 completed

15 Failed downloads:
- CFG$D: No data found, symbol may be delisted
- CEQP$: No data found, symbol may be delisted
- CETXW: 1d data not available for startTime=-2208988800 and endTime=1614160117. Only 100 years worth of day granularity data are allowed to be fetched per request.
- CFIIW: 1d data not available for startTime=-2208988800 and endTime=1614160117. Only 100 years worth of day granularity data are allowed to be fetched per request.
- CFG$E: No data found, symbol may be delisted
- CCX.U: No data found, symbol may be delisted
- CCV.W: No data found, symbol may be delisted
- CFR$B: No data found, symbol may be delisted
- CELG.R: No data found, symbol may be delisted
- CDR$B: No data found, symbol may be delisted
- CEREW: 1d data not available for startTime=-2208988800 and endTime=1614160121. Only 100 years worth of day granularity data are allowed to be fetched

Success:    20/   20 | To go: 00:09:48 (5,544)
[*********************100%***********************]  100 of 100 completed

19 Failed downloads:
- DRH$A: No data found, symbol may be delisted
- DLNG$B: No data found, symbol may be delisted
- DMYI.U: No data found, symbol may be delisted
- DLR$L: No data found, symbol may be delisted
- DHR$A: No data found, symbol may be delisted
- DRIOW: 1d data not available for startTime=-2208988800 and endTime=1614160188. Only 100 years worth of day granularity data are allowed to be fetched per request.
- DNMR.W: No data found, symbol may be delisted
- DHCNL: 1d data not available for startTime=-2208988800 and endTime=1614160188. Only 100 years worth of day granularity data are allowed to be fetched per request.
- DMYI.W: No data found, symbol may be delisted
- DLR$J: No data found, symbol may be delisted
- DLNG$A: No data found, symbol may be delisted
- DMYD.W: No data found, symbol may be delisted
- DMYD.U: No data found, symbol may be delisted
- DM

Success:    28/   28 | To go: 00:08:14 (4,744)
[*********************100%***********************]  100 of 100 completed

17 Failed downloads:
- FUSE.U: No data found, symbol may be delisted
- GAB$G: No data found, symbol may be delisted
- GDV$G: No data found, symbol may be delisted
- GDYNW: 1d data not available for startTime=-2208988800 and endTime=1614160266. Only 100 years worth of day granularity data are allowed to be fetched per request.
- GCMGW: 1d data not available for startTime=-2208988800 and endTime=1614160267. Only 100 years worth of day granularity data are allowed to be fetched per request.
- FUSE.W: No data found, symbol may be delisted
- GBLIL: 1d data not available for startTime=-2208988800 and endTime=1614160268. Only 100 years worth of day granularity data are allowed to be fetched per request.
- FVT.U: No data found, symbol may be delisted
- GECCL: 1d data not available for startTime=-2208988800 and endTime=1614160270. Only 100 years worth of day granularity data 

Success:    35/   35 | To go: 00:07:15 (4,044)
[*********************100%***********************]  100 of 100 completed

12 Failed downloads:
- IEAWW: 1d data not available for startTime=-2208988800 and endTime=1614160350. Only 100 years worth of day granularity data are allowed to be fetched per request.
- IMRNW: 1d data not available for startTime=-2208988800 and endTime=1614160350. Only 100 years worth of day granularity data are allowed to be fetched per request.
- IMPX.W: No data found, symbol may be delisted
- IIAC.U: No data found, symbol may be delisted
- IIAC.W: No data found, symbol may be delisted
- IMPX.U: No data found, symbol may be delisted
- IIPR$A: No data found, symbol may be delisted
- IGZ: No data found, symbol may be delisted
- IGICW: 1d data not available for startTime=-2208988800 and endTime=1614160355. Only 100 years worth of day granularity data are allowed to be fetched per request.
- IMTXW: 1d data not available for startTime=-2208988800 and endTime=161416035

Success:    43/   43 | To go: 00:05:44 (3,244)
[*********************100%***********************]  100 of 100 completed

15 Failed downloads:
- MITT$A: No data found, symbol may be delisted
- MET$F: No data found, symbol may be delisted
- MFA$C: No data found, symbol may be delisted
- MITT$B: No data found, symbol may be delisted
- MET$A: No data found, symbol may be delisted
- MITT$C: No data found, symbol may be delisted
- MH$A: No data found, symbol may be delisted
- MH$D: No data found, symbol may be delisted
- MER$K: No data found, symbol may be delisted
- MH$C: No data found, symbol may be delisted
- MFA$B: No data found, symbol may be delisted
- MIT.U: No data found, symbol may be delisted
- MET$E: No data found, symbol may be delisted
- MILEW: 1d data not available for startTime=-2208988800 and endTime=1614160437. Only 100 years worth of day granularity data are allowed to be fetched per request.
- METXW: 1d data not available for startTime=-2208988800 and endTime=1614160437. O

Success:    52/   52 | To go: 00:04:13 (2,344)
[*********************100%***********************]  100 of 100 completed

21 Failed downloads:
- PAYAW: 1d data not available for startTime=-2208988800 and endTime=1614160534. Only 100 years worth of day granularity data are allowed to be fetched per request.
- PDAC.U: No data found, symbol may be delisted
- PAICW: 1d data not available for startTime=-2208988800 and endTime=1614160535. Only 100 years worth of day granularity data are allowed to be fetched per request.
- PCG$D: No data found, symbol may be delisted
- PDAC.W: No data found, symbol may be delisted
- PEB$D: No data found, symbol may be delisted
- PEB$C: No data found, symbol may be delisted
- PCG$C: No data found, symbol may be delisted
- PCG$B: No data found, symbol may be delisted
- PCPC.W: No data found, symbol may be delisted
- PCPC.U: No data found, symbol may be delisted
- PCG$G: No data found, symbol may be delisted
- PCG$A: No data found, symbol may be delisted
- PCG$I

Success:    60/   60 | To go: 00:02:52 (1,544)
[*********************100%***********************]  100 of 100 completed

21 Failed downloads:
- SF$A: No data found, symbol may be delisted
- SBE.U: No data found, symbol may be delisted
- SCPE.U: No data found, symbol may be delisted
- SCE$L: No data found, symbol may be delisted
- SCHW$C: No data found, symbol may be delisted
- SBG.U: No data found, symbol may be delisted
- SEAH.W: No data found, symbol may be delisted
- SF$C: No data found, symbol may be delisted
- SCE$J: No data found, symbol may be delisted
- SCE$K: No data found, symbol may be delisted
- SCE$G: No data found, symbol may be delisted
- SEAH.U: No data found, symbol may be delisted
- SBE.W: No data found, symbol may be delisted
- SBG.W: No data found, symbol may be delisted
- SCHW$D: No data found, symbol may be delisted
- SCPE.W: No data found, symbol may be delisted
- SF$B: No data found, symbol may be delisted
- SCVX.U: No data found, symbol may be delisted
- SCE$H:

Success:    67/   67 | To go: 00:01:36 (  844)
[*********************100%***********************]  100 of 100 completed

15 Failed downloads:
- TRITW: 1d data not available for startTime=-2208988800 and endTime=1614160733. Only 100 years worth of day granularity data are allowed to be fetched per request.
- TMTSW: 1d data not available for startTime=-2208988800 and endTime=1614160733. Only 100 years worth of day granularity data are allowed to be fetched per request.
- TINV.W: No data found, symbol may be delisted
- TMPMW: 1d data not available for startTime=-2208988800 and endTime=1614160734. Only 100 years worth of day granularity data are allowed to be fetched per request.
- TMAC.U: No data found, symbol may be delisted
- TPGY.U: No data found, symbol may be delisted
- TREB.U: No data found, symbol may be delisted
- TNP$D: No data found, symbol may be delisted
- TREB.W: No data found, symbol may be delisted
- TLGA.U: No data found, symbol may be delisted
- TLMDW: 1d data not availab

Success:    75/   75 | To go: 00:00:05 (   44)
[*********************100%***********************]  44 of 44 completed

12 Failed downloads:
- ZJZZT: 1d data not available for startTime=-2208988800 and endTime=1614160843. Only 100 years worth of day granularity data are allowed to be fetched per request.
- ZGYHR: 1d data not available for startTime=-2208988800 and endTime=1614160843. Only 100 years worth of day granularity data are allowed to be fetched per request.
- ZWZZT: 1d data not available for startTime=-2208988800 and endTime=1614160843. Only 100 years worth of day granularity data are allowed to be fetched per request.
- ZVV: 1d data not available for startTime=-2208988800 and endTime=1614160843. Only 100 years worth of day granularity data are allowed to be fetched per request.
- ZVZZC: 1d data not available for startTime=-2208988800 and endTime=1614160844. Only 100 years worth of day granularity data are allowed to be fetched per request.
- ZVZZT: 1d data not available for st

In [13]:
# Combine the per-chunk frames into one long panel, discard columns that hold
# no data at all, lower-case the column labels and swap the two innermost
# index levels.
prices_adj = (pd.concat(prices_adj)
              .dropna(axis='columns', how='all')
              .rename(str.lower, axis='columns')
              .swaplevel(i=-2, j=-1))

In [14]:
# Label the two index levels so later cells can refer to them by name.
prices_adj.index = prices_adj.index.set_names(['ticker', 'date'])

In [15]:
# Number of distinct tickers in the panel.
prices_adj.index.unique(level='ticker').size

6466

### Remove outliers

In [16]:
# Flag tickers whose close ever changes by more than +100% or less than -100%
# from one row to the next, and show how many were flagged.
df = prices_adj.close.unstack('ticker')  # wide frame: one column per ticker
pct_changes = df.pct_change()            # computed once, reused for both extremes
pmax = pct_changes.max()
pmin = pct_changes.min()
# NOTE(review): a change below -100% requires the price to change sign, so the
# `pmin < -1` branch presumably only catches bad data; confirm the threshold.
to_drop = pmax[pmax > 1].index.union(pmin[pmin < -1].index)
len(to_drop)

749

In [17]:
# Remove every row belonging to a ticker flagged in `to_drop`.
prices_adj = prices_adj.drop(index=to_drop, level='ticker')

In [18]:
# Number of distinct tickers left in the panel.
prices_adj.index.unique(level='ticker').size

5717

In [19]:
# Persist the 1990-2019 slice of the cleaned panel. The index is sorted first
# so the label-based slice on the date level is valid.
# `key` is passed by keyword; positional use is deprecated in recent pandas.
(prices_adj
 .sort_index()
 .loc[idx[:, '1990': '2019'], :]
 .to_hdf(results_path / 'data.h5', key='stocks/prices/adjusted'))