## **Data Scraping and Insights from Historical Daily Prices of NASDAQ Stocks**

In [None]:
# Download window over the symbol list: the download cell processes
# symbols[offset : offset + limit], clipped to the end of the list.
offset = 0
# Maximum number of symbols to process; a falsy value (0 / None) makes the
# download cell fall back to len(symbols).
limit = 3000
# History span handed to yf.download(period=...).
period = 'max' # valid periods: 1d,5d,1mo,3mo,6mo,1y,2y,5y,10y,ytd,max

In [None]:
import pandas as pd

# Download the pipe-delimited directory of NASDAQ-traded securities.
data = pd.read_csv(
    "http://www.nasdaqtrader.com/dynamic/SymDir/nasdaqtraded.txt",
    sep='|',
)
# Keep only the rows whose 'Test Issue' column is exactly 'N'.
data_clean = data.loc[data['Test Issue'] == 'N']
# Plain Python list of tickers, in the same row order as data_clean.
symbols = data_clean['NASDAQ Symbol'].tolist()
print('total number of symbols traded = {}'.format(len(symbols)))

total number of symbols traded = 11017


In [None]:
! pip install yfinance > /dev/null 2>&1
! mkdir hist

In [None]:
import yfinance as yf
import os, contextlib


In [None]:
%%time

limit = limit if limit else len(symbols)
end = min(offset + limit, len(symbols))
is_valid = [False] * len(symbols)
# force silencing of verbose API
with open(os.devnull, 'w') as devnull:
    with contextlib.redirect_stdout(devnull):
        for i in range(offset, end):
            s = symbols[i]
            try:
              data = yf.download(s, period=period)
              if len(data.index) == 0:
                continue
              is_valid[i] = True
              data.to_csv('hist/{}.csv'.format(s))
            except KeyError:
              print(f"Error downloading data for symbol: {s}")

print('Total number of valid symbols downloaded = {}'.format(sum(is_valid)))

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
ERROR:yfinance:
1 Failed download:
ERROR:yfinance:['AACT=']: Exception('%ticker%: No timezone found, symbol may be delisted')
[*********************100%%**********************]  1 of 1 completed
ERROR:yfinance:
1 Failed download:
ERROR:yfinance:['AACT+']: Exception('%ticker%: No timezone found, symbol may be 

Total number of valid symbols downloaded = 2804
CPU times: user 5min 56s, sys: 13.9 s, total: 6min 10s
Wall time: 19min 2s





In [None]:
# Select the directory rows whose symbol was flagged in `is_valid`
# (a positional boolean list, one entry per row of data_clean).
valid_data = data_clean.loc[is_valid]
# Persist the metadata of the retained symbols, without the index column.
valid_data.to_csv('symbols_valid_meta.csv', index=False)
print(type(valid_data))
valid_data

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,Nasdaq Traded,Symbol,Security Name,Listing Exchange,Market Category,ETF,Round Lot Size,Test Issue,Financial Status,CQS Symbol,NASDAQ Symbol,NextShares
0,Y,A,"Agilent Technologies, Inc. Common Stock",N,,N,100.0,N,,A,A,N
1,Y,AA,Alcoa Corporation Common Stock,N,,N,100.0,N,,AA,AA,N
2,Y,AAA,Alternative Access First Priority CLO Bond ETF,P,,Y,100.0,N,,AAA,AAA,N
3,Y,AAAU,Goldman Sachs Physical Gold ETF Shares,Z,,Y,100.0,N,,AAAU,AAAU,N
4,Y,AACG,ATA Creativity Global - American Depositary Sh...,Q,G,N,100.0,N,N,,AACG,N
...,...,...,...,...,...,...,...,...,...,...,...,...
3003,Y,EFAS,Global X MSCI SuperDividend EAFE ETF,Q,G,Y,100.0,N,N,,EFAS,N
3004,Y,EFAV,iShares MSCI EAFE Min Vol Factor ETF,Z,,Y,100.0,N,,EFAV,EFAV,N
3005,Y,EFAX,SPDR MSCI EAFE Fossil Fuel Reserves Free ETF,P,,Y,100.0,N,,EFAX,EFAX,N
3006,Y,EFC,Ellington Financial Inc. Common Stock,N,,N,100.0,N,,EFC,EFC,N


In [None]:
!mkdir stocks
!mkdir etfs

In [None]:
# Split the retained symbols by the directory's 'ETF' flag. Rows whose flag
# is neither 'Y' nor 'N' end up in neither list.
etfs = valid_data.loc[valid_data['ETF'] == 'Y', 'NASDAQ Symbol'].tolist()
stocks = valid_data.loc[valid_data['ETF'] == 'N', 'NASDAQ Symbol'].tolist()

In [None]:
import shutil
from os.path import isfile, join

def move_symbols(symbols, dest):
    """Move each symbol's CSV from ``hist/`` into the ``dest`` folder.

    Args:
        symbols: iterable of symbol names; the file handled for each one is
            ``hist/<symbol>.csv``.
        dest: target folder. It is not created here, so it must already
            exist. Files keep their name.

    Symbols without a file in ``hist/`` are skipped instead of raising, so the
    function can be called again after a previous (full or partial) move. A
    one-line summary is printed when anything was skipped.
    """
    missing = []
    for s in symbols:
        filename = '{}.csv'.format(s)
        source = join('hist', filename)
        if not isfile(source):
            # Already moved, or never downloaded: nothing to do for this one.
            missing.append(s)
            continue
        shutil.move(source, join(dest, filename))
    if missing:
        print('Skipped {} symbol(s) with no file in hist/'.format(len(missing)))

# File each group's CSVs under its own folder: ETFs first, then stocks.
for symbol_group, folder in ((etfs, "etfs"), (stocks, "stocks")):
    move_symbols(symbol_group, folder)