# Download historical equity data for NASDAQ stocks from Yahoo Finance

In [3]:
import warnings
# Suppress every warning for the rest of the session (this also hides
# deprecation warnings emitted by pandas and the download libraries).
warnings.filterwarnings('ignore')

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from pandas_datareader.nasdaq_trader import get_nasdaq_symbols
import yfinance as yf

In [5]:
# Apply seaborn's 'whitegrid' style to plots created from here on.
sns.set_style('whitegrid')

In [6]:
# Shorthand for MultiIndex slicing; used further down as idx[:, '1990': '2019'].
idx = pd.IndexSlice

In [5]:
def chunks(l, n):
    """Yield consecutive slices of `l`, each holding at most `n` items.

    Parameters
    ----------
    l : sequence
        Any object that supports ``len()`` and slicing (list, tuple, str, ...).
    n : int
        Length of each slice; the last slice is shorter when ``len(l)`` is
        not a multiple of `n`.

    Yields
    ------
    The slices ``l[0:n]``, ``l[n:2*n]``, ... in order.
    """
    yield from (l[start:start + n] for start in range(0, len(l), n))

## Get NASDAQ symbols

In [109]:
# Fetch the symbol directory through pandas-datareader's nasdaq_trader module.
traded_symbols = get_nasdaq_symbols()

In [110]:
# Summarise the columns, dtypes and non-null counts of the symbol table.
traded_symbols.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8882 entries, A to ZYXI
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   Nasdaq Traded     8882 non-null   bool    
 1   Security Name     8882 non-null   object  
 2   Listing Exchange  8882 non-null   category
 3   Market Category   8882 non-null   object  
 4   ETF               8882 non-null   bool    
 5   Round Lot Size    8882 non-null   float64 
 6   Test Issue        8882 non-null   bool    
 7   Financial Status  3559 non-null   category
 8   CQS Symbol        5323 non-null   object  
 9   NASDAQ Symbol     8882 non-null   object  
 10  NextShares        8882 non-null   bool    
dtypes: bool(4), category(2), float64(1), object(4)
memory usage: 468.8+ KB


## Download metadata from Yahoo Finance

### NASDAQ symbols

In [93]:
# Wrap every listed symbol that is not flagged as an ETF in a yfinance Tickers container.
tickers = yf.Tickers(
    traded_symbols.loc[~traded_symbols['ETF']].index.tolist()
)

In [None]:
# Collect the Yahoo Finance metadata of every symbol, one column per ticker.
# `tickers.tickers` is a dict {symbol: Ticker} in current yfinance releases and
# was a namedtuple of Ticker objects in older ones; normalise to Ticker objects
# so that `.info` / `.ticker` are never looked up on a plain symbol string.
ticker_objects = tickers.tickers
if isinstance(ticker_objects, dict):
    ticker_objects = ticker_objects.values()

info = [pd.Series(ticker.info).to_frame(ticker.ticker) for ticker in ticker_objects]

# Drop metadata fields that are empty for every ticker, then transpose so that
# each row describes one ticker.
info = pd.concat(info, axis=1).dropna(how='all').T


def _numeric_if_possible(column):
    """Return `column` cast to a numeric dtype, or unchanged if it cannot be cast."""
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


# Explicit replacement for `pd.to_numeric(..., errors='ignore')`, which pandas
# has deprecated in favour of catching the conversion error yourself.
info = info.apply(_numeric_if_possible)
info.to_hdf('data.h5', key='stocks/info')

## Download adjusted price data using yfinance

In [None]:
# Download the full split/dividend-adjusted price history in batches of 100 symbols.
#
# The batches are built from a plain list of symbols: at this point `tickers`
# is a `yf.Tickers` object, which supports neither `len()` nor slicing, so it
# cannot be passed to `chunks`. The list is the same non-ETF selection that was
# used to create `tickers`.
#
# The original cell also opened `pd.HDFStore('chunks.h5')` but never wrote to
# it; the unused store has been removed.
symbols = traded_symbols[~traded_symbols.ETF].index.to_list()

prices_adj = []
for i, chunk in enumerate(chunks(symbols, 100)):
    # Progress indicator: index of the batch currently being downloaded.
    print(i, end=' ', flush=True)
    # Move the last column level into the row index so every row is one
    # (date, ticker) observation.
    prices_adj.append(yf.download(chunk, period='max', auto_adjust=True).stack(-1))

In [None]:
# Combine the downloaded batches into one frame: discard columns that are
# entirely empty, lower-case the column labels and swap the two index levels.
prices_adj = (
    pd.concat(prices_adj, axis=0)
    .dropna(axis='columns', how='all')
    .rename(str.lower, axis='columns')
    .swaplevel()
)

In [61]:
# Label the two index levels.
prices_adj.index = prices_adj.index.set_names(['ticker', 'date'])

In [62]:
# Number of distinct tickers in the downloaded data.
prices_adj.index.unique(level='ticker').size

4314

### Remove outliers

In [None]:
# Flag tickers whose closing-price series contains an extreme one-period move.
# Wide frame of closes: one row per date, one column per ticker.
df = prices_adj.close.unstack('ticker')

# Compute the period-over-period returns once and reuse them for both extremes.
returns = df.pct_change()
pmax = returns.max()
pmin = returns.min()

# A ticker is dropped if its largest return exceeds +100% or its smallest
# return is below -100%.
# NOTE(review): a return below -1 looks unreachable for strictly positive
# prices -- confirm that this threshold is the intended one.
too_high = pmax[pmax > 1].index
too_low = pmin[pmin < -1].index
to_drop = too_high.union(too_low)
len(to_drop)

In [None]:
# Remove every row that belongs to one of the flagged tickers.
prices_adj = prices_adj.drop(index=to_drop, level='ticker')

In [None]:
# Number of distinct tickers left after removing the flagged ones.
prices_adj.index.unique(level='ticker').size

In [44]:
# Sort the index, keep the 1990-2019 window and persist it to the HDF5 store.
(prices_adj
 .sort_index()
 .loc[idx[:, '1990': '2019'], :]
 .to_hdf('data.h5', key='stocks/prices/adjusted'))

## Download price & adjustment factors using yfinance

In [13]:
# Reuse exactly the tickers that were saved with the adjusted prices.
tickers = (
    pd.read_hdf('data.h5', key='stocks/prices/adjusted')
    .index.unique(level='ticker')
    .tolist()
)

In [None]:
# Download unadjusted prices together with corporate actions
# (auto_adjust=False, actions=True), 100 tickers per request.
prices = []
for batch_no, batch in enumerate(chunks(tickers, 100)):
    # Progress indicator: index of the batch currently being downloaded.
    print(batch_no, end=' ', flush=True)
    raw = yf.download(batch, period='max', auto_adjust=False, actions=True)
    # Move the last column level into the row index.
    prices.append(raw.stack(-1))

In [None]:
# Combine the batches: discard all-empty columns, lower-case the column labels,
# remove the 'adj close' column and swap the two index levels.
prices = (
    pd.concat(prices, axis=0)
    .dropna(axis='columns', how='all')
    .rename(str.lower, axis='columns')
    .drop(columns='adj close')
    .swaplevel()
)

In [None]:
# Label the two index levels.
prices.index = prices.index.set_names(['ticker', 'date'])

In [50]:
# The cleaning chain that builds `prices` already drops 'adj close', so an
# unconditional second drop raises KeyError when the notebook is run top to
# bottom. `errors='ignore'` makes this cell a safe no-op when the column is
# already gone, while still removing it if it is present.
prices = prices.drop(columns='adj close', errors='ignore')

In [51]:
# Preview the first rows of the unadjusted price frame.
prices.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,close,dividends,high,low,open,stock splits,volume
ticker,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
AA,1962-01-02,6.532155,0.0,6.556185,6.532155,6.532155,0.0,55900.0
AA,1962-01-03,6.63228,0.0,6.63228,6.524145,6.532155,0.0,74500.0
AA,1962-01-04,6.63228,0.0,6.66432,6.63228,6.63228,0.0,80500.0
AA,1962-01-05,6.62427,0.0,6.65631,6.61626,6.63228,0.0,70500.0
AA,1962-01-08,6.408,0.0,6.60825,6.339915,6.60825,0.0,93800.0


In [52]:
# Remove the tickers flagged as outliers in the adjusted-price section.
# `prices` was downloaded for the tickers read back from the adjusted-price
# store, which was written after those outliers had been removed, so the
# flagged labels are normally absent here. Recent pandas versions raise
# KeyError for labels missing from the level; `errors='ignore'` keeps the cell
# a harmless no-op in that case.
prices = prices.drop(to_drop, level='ticker', errors='ignore')

In [56]:
# Sort the index and keep only the 1990-2019 window.
prices = (
    prices
    .sort_index()
    .loc[idx[:, '1990': '2019'], :]
)

In [57]:
# Show non-null counts per column. `null_counts` was deprecated in pandas 1.2
# and removed in pandas 2.0; `show_counts` is its replacement.
prices.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 14125803 entries, ('A', Timestamp('1999-11-18 00:00:00')) to ('ZYME', Timestamp('2019-12-31 00:00:00'))
Data columns (total 7 columns):
 #   Column        Non-Null Count     Dtype  
---  ------        --------------     -----  
 0   close         14124675 non-null  float64
 1   dividends     14125803 non-null  float64
 2   high          14124675 non-null  float64
 3   low           14124675 non-null  float64
 4   open          14124675 non-null  float64
 5   stock splits  14125803 non-null  float64
 6   volume        14124675 non-null  float64
dtypes: float64(7)
memory usage: 808.4+ MB


In [59]:
# Persist the unadjusted prices with the columns in a fixed order.
(prices
 .loc[:, ['open', 'high', 'low', 'close', 'volume', 'dividends', 'stock splits']]
 .to_hdf('data.h5', key='stocks/prices/unadjusted'))