# Downloading NASDAQ data using Yahoo! Finance.

In [1]:
# incase yahoo finance is not installed...
# !pip install --upgrade --no-cache-dir yfinance

In [2]:
import pandas as pd
import yfinance as yf
import os, contextlib
import shutil
from os.path import isfile, join

import os
import logging
import pandas as pd
import yfinance as yf
import requests
from io import StringIO
from concurrent.futures import ThreadPoolExecutor, as_completed
from requests.adapters import HTTPAdapter

In [3]:
# Set up a global session with increased connection pool size
global_session = requests.Session()
adapter = HTTPAdapter(pool_connections=100, pool_maxsize=100)
global_session.mount("http://", adapter)
global_session.mount("https://", adapter)

In [4]:
# Disable InsecureRequestWarning (only for testing purposes)
requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)

In [5]:
# Configure logging to log errors to "errors.log"
logging.basicConfig(filename="errors.log", level=logging.WARNING,
                    format='%(asctime)s - %(levelname)s - %(message)s')

In [6]:
def get_nasdaq_symbols(data_dir):
    """
    Download the list of NASDAQ traded symbols from the NASDAQ Trader website.
    The file is pipe-separated and contains footer rows and test issues.
    Filters out rows where 'Test Issue' is not 'N', saves the full
    reference DataFrame to the provided directory, and returns the list of symbols.
    """
    url = "http://www.nasdaqtrader.com/dynamic/SymDir/nasdaqtraded.txt"
    try:
        response = requests.get(url, verify=False)
        response.raise_for_status()  # Raise exception if the request failed
        data = StringIO(response.text)
        df = pd.read_csv(data, sep="|")
        # Filter out rows where 'Test Issue' is not 'N'
        df_clean = df[df['Test Issue'] == 'N']
        # Remove any potential trailer rows (often the last row contains summary info)
        df_clean = df_clean[df_clean['Symbol'].notna()]
        # Persist the full reference list to a CSV file in the provided directory
        ref_file = os.path.join(data_dir, "nasdaq_symbols_reference.csv")
        df_clean.to_csv(ref_file, index=False)
        symbols = df_clean['Symbol'].tolist()
        return symbols
    except Exception as e:
        logging.error(f"Error fetching NASDAQ traded symbols: {e}")
        return []

When you see the parameter `auto_adjust` in yfinance’s `download()` function, it refers to automatically adjusting historical price data for corporate actions—mainly stock splits and dividends.

### What Does Auto-Adjustment Mean?

- **Stock Splits:**  
  When a company splits its stock (e.g., a 2-for-1 split), the number of shares increases while the share price decreases proportionally. For instance, if a stock priced at \$100 splits 2-for-1, the new price will be around \$50. Without adjustment, the historical data would show a sudden drop in price on the split date, which could be misleading if you’re analyzing trends over time.

- **Dividends:**  
  Dividends can also affect the raw price data. If you reinvest dividends or need to calculate total returns, it’s useful to have prices that reflect those cash distributions.

### Why Adjust Prices?

When `auto_adjust=True` is set (the default in recent versions), yfinance automatically transforms the raw price data so that historical prices are consistent over time. This means:

- **Continuity in the Price Series:**  
  The historical prices are modified to "smooth out" the effect of splits and dividends. This makes it easier to compute returns or compare prices over long periods without being distorted by corporate actions.
  
- **Simplified Analysis:**  
  Analysts and investors often need adjusted prices to accurately calculate metrics like growth rates, returns, or moving averages. With adjusted prices, you don’t have to manually correct the data for splits or dividends.

### The Warning Message

The warning  
```
YF.download() has changed argument auto_adjust default to True
```  
is simply notifying you that the default behavior has changed. If you need the raw, unadjusted data (for example, if you want to see the exact trading prices as reported without any modification), you would now need to explicitly set `auto_adjust=False`. Otherwise, yfinance will automatically adjust the data for you.

### Summary

- **With `auto_adjust=True`:**  
  You receive price data that’s been normalized for stock splits and dividends, which is usually what you want for long-term analysis.

- **With `auto_adjust=False`:**  
  You get the raw data, which might show sudden jumps or drops due to corporate actions—this is useful if you need to see the actual trading history without adjustments.

In [7]:
def download_symbol_data(symbol, period="max"):
    """
    Download historical data for a single symbol using yfinance.
    Returns a tuple: (symbol, DataFrame or None)
    Passes the global_session to reuse connections.
    """
    try:
        auto_adjust_prices = True #Turn to False for unadjusted data...
        # data = yf.download(symbol, period=period, progress=False, session=global_session, auto_adjust = auto_adjust_prices) #using global_session seems to be slowing things down...
        data = yf.download(symbol, period=period, progress=False, auto_adjust = auto_adjust_prices)
        return symbol, data
    except Exception as e:
        logging.error(f"Error downloading data for {symbol}: {e}")
        return symbol, None

In [8]:
def classify_symbol(symbol):
    """
    Classify the symbol as 'ETF' or 'Stock' using yfinance Ticker info.
    Passes the global_session so that requests use the custom connection pool.
    If classification fails or is unknown, returns "Unknown".
    """
    try:
        ticker = yf.Ticker(symbol, session=global_session)
        info = ticker.info
        qtype = info.get("quoteType", None)
        if qtype == "ETF":
            return "ETF"
        elif qtype == "EQUITY":
            return "Stock"
        else:
            logging.error(f"Unknown or missing quoteType for {symbol}. Info: {info}")
            return "Unknown"
    except Exception as e:
        logging.error(f"Error classifying symbol {symbol}: {e}")
        return "Unknown"

In [9]:
# Create the main data directory first
base_dir = "data"
os.makedirs(base_dir, exist_ok=True)

In [10]:
# Fetch NASDAQ traded symbols and persist the reference list
symbols = get_nasdaq_symbols(base_dir)
if not symbols:
    print("No symbols found. Please check errors.log for details.")
print(f"Found {len(symbols)} traded symbols on NASDAQ. Reference list saved to {os.path.join(base_dir, 'nasdaq_symbols_reference.csv')}")

Found 11238 traded symbols on NASDAQ. Reference list saved to data\nasdaq_symbols_reference.csv


In [11]:
# Create classification subdirectories inside the data folder
folders = {
    "ETF": os.path.join(base_dir, "ETF"),
    "Stock": os.path.join(base_dir, "Stock"),
    "Unknown": os.path.join(base_dir, "Unknown")
}
for folder in folders.values():
    os.makedirs(folder, exist_ok=True)

In [12]:
# Dictionary to store downloaded data (if needed)
all_data = {}

In [13]:
# Set up ThreadPoolExecutor for parallel downloads
max_workers = 8  # adjust based on your system/network speed

In [14]:
# Set up ThreadPoolExecutor for parallel downloads
with ThreadPoolExecutor(max_workers=max_workers) as executor:
    future_to_symbol = {executor.submit(download_symbol_data, sym): sym for sym in symbols}
    
    for future in as_completed(future_to_symbol):
        symbol = future_to_symbol[future]
        try:
            sym, data = future.result()
            if data is not None and not data.empty:
                # Classify the symbol (ETF or Stock)
                classification = classify_symbol(sym)
                if classification not in folders:
                    classification = "Unknown"
                
                # Save data to CSV in the corresponding folder
                file_path = os.path.join(folders[classification], f"{sym}.csv")
                data.to_csv(file_path)
                all_data[sym] = data
                print(f"Downloaded and saved data for {sym} in folder '{classification}'")
            else:
                error_msg = f"No data available for {sym}"
                logging.error(error_msg)
                print(error_msg)
        except Exception as exc:
            logging.error(f"{symbol} generated an exception: {exc}")
            print(f"{symbol} generated an exception. See errors.log for details.")

Downloaded and saved data for AAA in folder 'Unknown'
Downloaded and saved data for A in folder 'Unknown'
Downloaded and saved data for AAAU in folder 'Unknown'
Downloaded and saved data for AA in folder 'Unknown'
Downloaded and saved data for AACT in folder 'Unknown'
Downloaded and saved data for AACT.U in folder 'Unknown'
Downloaded and saved data for AACT.W in folder 'Unknown'
Downloaded and saved data for AACBU in folder 'Unknown'
Downloaded and saved data for AACG in folder 'Unknown'
Downloaded and saved data for AADR in folder 'Unknown'
Downloaded and saved data for AAME in folder 'Unknown'
Downloaded and saved data for AAL in folder 'Unknown'
Downloaded and saved data for AAM in folder 'Unknown'
Downloaded and saved data for AAM.W in folder 'Unknown'
Downloaded and saved data for AAMI in folder 'Unknown'
Downloaded and saved data for AAM.U in folder 'Unknown'
Downloaded and saved data for AAOI in folder 'Unknown'
Downloaded and saved data for AAPB in folder 'Unknown'
Downloaded 