### This script downloads historical daily OHLCV data for the stock symbols listed in the configured symbols file (Nasdaq or NYSE). It only needs to be run once, for the initial download; afterwards, the scripts for updating existing data should be used.
### The data is saved to the data/ folder, one .csv file per symbol.

In [2]:
import yfinance as yf
import pandas as pd
import os
# TODO: Multithread the process

In [4]:
# Symbol-list CSV files known to this notebook
# (before, only stocks from 'nasdaq_stocks_stable.csv' were fetched).
stable_symbols_files = [
    'nasdaq_symbols_stable.csv',
    'nyse_symbols_stable.csv',
]

# Symbol list that is actually read when the symbols are loaded
stable_symbols_file = 'nyse_symbols_stable.csv'

# Folder that receives the downloaded OHLCV .csv files
ohlcv_folder = 'data'

# Number of threads to use (not referenced by the visible cells yet)
num_threads = 6

In [5]:
# Create the data folder if it doesn't exist; exist_ok=True makes re-running
# this cell harmless when the folder is already there
os.makedirs(ohlcv_folder, exist_ok=True)

In [7]:
# Load the stable symbols from the configured symbols file (stable_symbols_file).
# pandas' default missing-value parsing would turn a ticker that is spelled like
# an NA marker (e.g. 'NA', 'NULL', 'None') into NaN, so default NA handling is
# switched off and only empty cells are treated as missing.
# NOTE: this NA handling applies to every column of symbols_df, not just 'Symbol'.
symbols_df = pd.read_csv(
    stable_symbols_file,
    dtype={'Symbol': str},  # keep tickers as text, never numeric
    keep_default_na=False,
    na_values=[''],
)
# Convert to a list, skipping rows whose Symbol cell is blank
symbols = symbols_df['Symbol'].dropna().tolist()

In [9]:
# Number of symbols loaded from the symbols file
len(symbols)

4877

In [27]:
# Symbols that already have a CSV in the OHLCV folder. Only the trailing
# '.csv' extension is stripped (not everything after the first dot), so a
# dotted ticker such as 'BRK.B' saved as 'BRK.B.csv' is still recognised.
# The folder comes from ohlcv_folder so it stays in sync with where
# fetch_save_ohlcv writes its files.
existing_symbols = {
    os.path.splitext(data_file)[0]
    for data_file in os.listdir(ohlcv_folder)
    if data_file.endswith('.csv')
}

In [29]:
# Keep only the symbols that are not yet present in existing_symbols
symbols_to_fetch = [
    ticker
    for ticker in symbols
    if ticker not in existing_symbols
]

In [31]:
# Number of symbols still to be downloaded
len(symbols_to_fetch)

0

In [6]:
# Earliest date requested from yfinance; read as a global by fetch_save_ohlcv
start_date = '2017-01-01'

In [8]:
# fetches and saves a symbol to file
def fetch_save_ohlcv(symbol):
    """Download the OHLCV history of ``symbol`` and save it as a CSV file.

    The data is requested from yfinance starting at the module-level
    ``start_date`` and written to ``<ohlcv_folder>/<symbol>.csv``.  Problems
    are reported with ``print`` instead of being raised, so a loop over many
    symbols keeps going after a bad ticker.

    Args:
        symbol: Ticker symbol to download; also used as the CSV file name.

    Returns:
        None.  The result is the CSV file on disk plus a status message.
    """
    output_path = f'{ohlcv_folder}/{symbol}.csv'
    try:
        # Fetch daily OHLCV data
        stock_data = yf.download(symbol, start=start_date)  # Adjust start date as needed

        # yfinance normally signals a failed download (unknown or delisted
        # ticker, network problem) with an empty result rather than an
        # exception.  Writing that out would leave a header-only CSV that
        # makes the symbol look as if it had already been fetched.
        if stock_data is None or stock_data.empty:
            print(f"Failed to fetch data for {symbol}: no data returned")
            return

        # Store prices as float32 instead of float64 to reduce memory use
        for col in stock_data.select_dtypes(include=['float64']).columns:
            stock_data[col] = stock_data[col].astype('float32')

        # Save the data to a CSV file named according to the symbol
        stock_data.to_csv(output_path)

        print(f"Saved data for {symbol} to {output_path}")

    except Exception as e:
        # Best effort by design: report the failure and let the caller go on
        # with the next symbol.
        print(f"Failed to fetch data for {symbol}: {e}")

In [14]:
# Manual override: replaces the symbols_to_fetch list computed earlier with a
# single ticker
symbols_to_fetch = ['NRO']

In [16]:
# Download and save every symbol in symbols_to_fetch, one at a time
for symbol in symbols_to_fetch:
    fetch_save_ohlcv(symbol)

[*********************100%%**********************]  1 of 1 completed
Saved data for NRO to data/NRO.csv


  df.index += _pd.TimedeltaIndex(dst_error_hours, 'h')
