In [4]:
import warnings
from IPython.display import clear_output
# Suppress every warning for the rest of the session. Note that this blanket
# filter also hides deprecation notices raised by the libraries imported below.
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import random
import csv
import yfinance as yf
import datetime

ETF ticker source: 
https://finance.yahoo.com/etfs/?count=100&guce_referrer=aHR0cHM6Ly93d3cuZ29vZ2xlLmNvbS8&guce_referrer_sig=AQAAAGlqeLAfPj2GvkSnTOykXCTYzfK2sRYb4Dq-WKV-73O-thvqsk4NUimbru0k5_piUM8X85RAfOMFcUJMBMsnL3HUeJDOkZqXV_qnKXsiMK3hE2NHGzC7bfz8MOwZikYtb1WFFVpu7hdpx47p2cWamSGH_DxKg3pm1vN_GB_006qs&offset=0

In [5]:
def ETF_Data_Generator(csv_input, start_date, end_date, ETF_Class=None, ETF_Class_Proportion=None):
    """Build a ticker/class table and a daily adjusted-close table for a list of ETFs.

    Ticker symbols are read from the first field of every other row of
    ``csv_input`` (rows 0, 2, 4, ...). Each unique ticker is labelled with a
    randomly drawn class, its daily history is requested through
    ``yf.download`` and the 'Adj Close' column is kept. Both tables are written
    to the working directory as 'Ticker_Class_df.csv' and 'ETF_df.csv'.

    Parameters
    ----------
    csv_input : str
        Path of the CSV file holding the tickers (e.g. 'ETFs_List.csv').
    start_date, end_date
        Bounds of the download window, forwarded unchanged to ``yf.download``.
    ETF_Class : sequence, optional
        Class labels to draw from. When omitted or empty, the five labels
        'A'..'E' are used.
    ETF_Class_Proportion : sequence of numbers, optional
        Sampling weights matching ``ETF_Class``; ``None`` means equal weights.
        Ignored when ``ETF_Class`` is omitted (equal weights are used).

    Returns
    -------
    (Ticker_Class_df, ETF_df) : tuple of pandas.DataFrame
        ``Ticker_Class_df`` has the columns 'Tickers' and 'Class'.
        ``ETF_df`` has one column per ticker, indexed by the union of the
        dates returned for the individual tickers.

    Notes
    -----
    Class labels come from the global ``random`` state; call ``random.seed``
    beforehand for a reproducible assignment.
    """
    # Read the raw rows of the ticker file.
    with open(csv_input, newline='') as lines:
        ETF_tickers_wrangled = list(csv.reader(lines))

    # Tickers live in the first field of the even-numbered rows. Empty rows and
    # blank symbols are skipped instead of raising IndexError. dict.fromkeys
    # removes duplicates while keeping first-seen order, so the column order of
    # the output is the same on every run (a set gave an arbitrary order).
    symbols = (row[0].strip() for row in ETF_tickers_wrangled[::2] if row)
    ETF_tickers = list(dict.fromkeys(symbol for symbol in symbols if symbol))
    Sample_Length = len(ETF_tickers)

    # Draw one class label per ticker.
    if ETF_Class is not None and len(ETF_Class) > 0:
        Class_Samples = random.choices(ETF_Class, weights=ETF_Class_Proportion, k=Sample_Length)
    else:
        Random_Classes = ['A', 'B', 'C', 'D', 'E']
        equal_weights = np.ones(len(Random_Classes)) / len(Random_Classes)
        Class_Samples = random.choices(Random_Classes, weights=equal_weights, k=Sample_Length)

    Ticker_Class_df = pd.DataFrame({'Tickers': ETF_tickers, 'Class': Class_Samples})

    # Report progress roughly once per percent. The step is clamped to 1 so that
    # lists shorter than 99 tickers no longer produce a zero step (ValueError).
    status_step = max(1, Sample_Length // 99)

    # Collect the series first and join them once at the end. Assigning columns
    # one at a time into an empty frame pins the index to the first ticker's
    # dates and silently drops every other date.
    price_columns = {}
    for status_counter, ETF in enumerate(ETF_tickers):
        if status_counter % status_step == 0:
            clear_output(wait=True)  # Replace the previous status line
            print("{}% complete!".format(int(status_counter*100/Sample_Length)))
        # auto_adjust=False is passed explicitly so the 'Adj Close' column does
        # not depend on the library's default for that option.
        ETF_data = yf.download(tickers=ETF, start=start_date, end=end_date, interval='1d',
                               auto_adjust=False, progress=False)
        adj_close = ETF_data['Adj Close']
        # NOTE(review): some yfinance releases appear to return one column per
        # ticker here (a DataFrame) - confirm against the installed version.
        if isinstance(adj_close, pd.DataFrame):
            adj_close = adj_close.iloc[:, 0]
        price_columns[ETF] = adj_close

    # pd.concat rejects an empty collection, so fall back to an empty frame.
    ETF_df = pd.concat(price_columns, axis=1) if price_columns else pd.DataFrame()

    Ticker_Class_df.to_csv('Ticker_Class_df.csv', index=False)
    # NOTE(review): index=False leaves the dates out of the saved file. Kept
    # as-is because existing readers of 'ETF_df.csv' may expect this layout.
    ETF_df.to_csv('ETF_df.csv', index=False)
    clear_output()
    print("Data generation complete!")
    return Ticker_Class_df, ETF_df

In [3]:
# Download window handed to the generator: 1 Jan 2018 through 10 Mar 2024.
start_date = datetime.datetime(year=2018, month=1, day=1)
end_date = datetime.datetime(year=2024, month=3, day=10)

# Run the generator for its side effects (the CSV files it writes);
# both returned frames are deliberately discarded.
_, _ = ETF_Data_Generator(
    csv_input='ETFs_List.csv',
    start_date=start_date,
    end_date=end_date,
)

Data generation complete!
