In [2]:
import yfinance as yf
import pandas as pd
import seaborn as sns
import os

In [None]:
# Universe: the most traded stocks (plus the ^VIX index) as of 19 November 2025,
# restricted to tickers that were already listed before 2005.
most_traded_ticker = ["^VIX", "AAPL", "ABEV", "ABT", "AES", "AMAT", "AMD", "AMZN", "APA", "APH", "ASX", "AZN", "B", "BA", "BAC", "BAX", "BB", "BBD", "BBWI", "BCS", "BKR", "BMY", "BP", "BSX", "C", "CAG", "CCJ", "CCL", "CDE", "CLF", "CMCSA", "CNC", "CNQ", "CPRT", "CRM", "CSCO", "CSX", "CTRA", "CVX", "DIS", "DVN", "EQNR", "EQT", "ERIC", "ES", "EXAS", "EXC", "F", "FAST", "FCX", "FE", "FHN", "GAP", "GGB", "GOOG", "GOOGL", "GSK", "HAL", "HBAN", "HD", "HL", "HPQ", "HST", "IAG", "INFY", "INTC", "IPG", "IRM", "ITUB", "JHX", "JNJ", "KEY", "KGC", "KMB", "KO", "KR", "LEN", "LNW", "LOW", "LRCX", "LUMN", "LUV", "LYG", "MCHP", "MDLZ", "MDT", "MNST", "MRK", "MRVL", "MS", "MSFT", "MSTR", "MU", "NEE", "NEM", "NFLX", "NGD", "NKE", "NOK", "NVDA", "NVO", "ON", "ORCL", "OXY", "PBR", "PBR-A", "PCG", "PFE", "PLUG", "PPL", "PTEN", "QCOM", "RF", "RIG", "SBUX", "SHEL", "SLB", "SONY", "T", "TEVA", "TGT", "TJX", "TSCO", "TSM", "TU", "TXN", "UMC", "UNH", "USB", "VALE", "VLY", "VTRS", "VZ", "WDC", "WFC", "WIT", "WMT", "XOM"]
# Date window handed to the price download below as its start/end arguments.
start_date = "2005-01-01"
end_date = "2025-09-30"

In [None]:
# Get price for each ticker and store it.
# The CSV acts as an on-disk cache: the download only happens when the file is missing.

if not os.path.exists('raw_data/most_traded_stocks_data.csv'):
    print("Downloading data...")
    # to_csv does not create missing parent folders, so make sure the cache
    # directory exists before writing (otherwise a fresh checkout fails here).
    os.makedirs('raw_data', exist_ok=True)
    yf.download(
        most_traded_ticker,
        start=start_date,
        end=end_date,
        group_by='ticker',
        auto_adjust=True,
        threads=True,
    ).to_csv('raw_data/most_traded_stocks_data.csv')
else:
    print("Loading data from CSV...")

# Both paths read the frame back from the CSV so that a fresh download and a
# cached load yield exactly the same structure (two header rows: ticker, field).
tickers_data = pd.read_csv('raw_data/most_traded_stocks_data.csv', header=[0,1], index_col=0, parse_dates=True)

Downloading data...


[*********************100%***********************]  138 of 138 completed


In [None]:
# Get the traded currency for each ticker.
# The CSV acts as an on-disk cache: the per-ticker lookups only run when the file is missing.
if os.path.exists('raw_data/ticker_currency.csv'):
    print("Loading ticker currency from CSV...")
else:
    print("Downloading ticker currency data...")
    downloaded_currency = {}
    for ticker in most_traded_ticker:
        # 'N/A' marks tickers whose info payload carries no currency field.
        downloaded_currency[ticker] = yf.Ticker(ticker).info.get('currency', 'N/A')
    # to_csv does not create missing parent folders.
    os.makedirs('raw_data', exist_ok=True)
    pd.Series(downloaded_currency).to_csv('raw_data/ticker_currency.csv')

# Both paths read the mapping back from the CSV so they return the same thing.
# - The single value column is selected by position: the unnamed Series is
#   written with the header "0", which read_csv parses as the *string* '0',
#   so looking it up with the integer 0 raises KeyError.
# - keep_default_na=False stops read_csv from turning the 'N/A' fallback into
#   NaN, which would hide tickers with an unknown currency from later checks.
ticker_currency = (
    pd.read_csv('raw_data/ticker_currency.csv', index_col=0, keep_default_na=False)
    .iloc[:, 0]
    .to_dict()
)

Loading ticker currency from CSV...


In [None]:
# Check that every stock trades in USD (avoids foreign-exchange issues)
def histogram_of_traded_currencies(ticker_currency):
    """Count how many tickers trade in each currency.

    Parameters
    ----------
    ticker_currency : dict
        Mapping of ticker symbol -> currency code.

    Returns
    -------
    pandas.DataFrame
        One row per distinct currency with columns ``'Currency'`` and
        ``'Count'``, ordered from most to least frequent. Tickers whose
        currency is missing (None/NaN) are kept as their own row instead of
        being dropped, so the counts always add up to ``len(ticker_currency)``.
    """
    currencies = pd.Series(list(ticker_currency.values()), dtype=object)

    # dropna=False: a missing currency must stay visible, otherwise an
    # "all USD" result could hide tickers whose currency is unknown.
    currency_counts = (
        currencies
        .value_counts(dropna=False)
        .rename_axis('Currency')
        .reset_index(name='Count')
    )

    return currency_counts
    
# Tabulate the currencies of the whole universe and show the table as plain text.
currency_count = histogram_of_traded_currencies(ticker_currency)
print(currency_count)

  Currency  Count
0      USD    138


In [None]:
# Checking data quality for NA values
def check_data_quality(tickers_data, tickers=None):
    """Summarise missing values per ticker and per price field.

    Parameters
    ----------
    tickers_data : pandas.DataFrame
        Price frame with two column levels: ticker on the first level, field
        on the second, so that ``tickers_data[ticker]`` is a per-ticker frame.
    tickers : iterable of str, optional
        Tickers to inspect. Defaults to every ticker present in
        ``tickers_data`` (in column order), so the function no longer depends
        on a notebook-level ticker list.

    Returns
    -------
    (pandas.DataFrame, int)
        * A frame with one column per ticker and one row per field holding the
          NA count, plus a final ``'Total Missing'`` row with the per-ticker sum.
        * The grand total of missing values across all inspected tickers
          (``0`` when there is nothing to inspect).
    """
    if tickers is None:
        # unique() keeps first-appearance order, so output columns follow the data.
        tickers = tickers_data.columns.get_level_values(0).unique()

    na_summary = {}
    for ticker in tickers:
        na_count = tickers_data[ticker].isna().sum()
        # Append the per-ticker total as an extra row of the same Series.
        na_count['Total Missing'] = na_count.sum()
        na_summary[ticker] = na_count

    na_summary_df = pd.DataFrame(na_summary)
    if na_summary_df.empty:
        # No tickers inspected: there is no 'Total Missing' row to read.
        return na_summary_df, 0

    sum_total_missing = na_summary_df.loc['Total Missing'].sum()
    return na_summary_df, sum_total_missing

# Run the NA audit on the full price frame and report the grand total.
quality_report = check_data_quality(tickers_data)
na_summary_df = quality_report[0]
sum_total_missing = quality_report[1]
print(f'Total missing values across all tickers: {sum_total_missing}')

Total missing values across all tickers: 0
