In [1]:
# pip install pandas yfinance pyarrow tqdm
import pandas as pd, yfinance as yf, textwrap, re, math, requests, time
from pathlib import Path
from tqdm import tqdm


Parses 500 most weighted tickers in the iShares MSCI World ETF

In [19]:
csv_file = "../data/archive/URTH_holdings.csv"  # Fund Holdings as of,""Jul 25, 2025"""
raw = pd.read_csv(csv_file)
tickers = (
    raw["Ticker"]
    .dropna()   # only drops National Bank of Canada with "NaN" Ticker
    .head(500)  # only considers 500 most weighted companies
    .tolist()
)

# (optional) manual fixes for non‑Yahoo tickers
# mapping = {"ADS": "ADS.DE", "NESN": "NESN.SW", ...}
# tickers = [mapping.get(t, t) for t in tickers]

yfinance requires exchange suffixes for non-american tickers, e.g., NOVN has Yahoo symbol NOVN.SW where .SW represents the SIX exchange in Switzerland

In [18]:
# Find strings containing spaces
strings_with_spaces = [t for t in tickers if ' ' in t]

# Check if any were found
if strings_with_spaces:
    print("Found strings with spaces:", strings_with_spaces)
else:
    print("No strings with spaces found")


Found strings with spaces: ['NOVO B', 'INVE B', 'VOLV B', 'NDA FI', 'ATCO A', 'ASSA B']


In [26]:
def resolve_yf_symbol(raw: str, retries: int = 3):
    """
    Return a valid Yahoo Finance ticker for `raw`
    using Yahoo’s public search endpoint.
    """
    # # quick patch for common manual errors -----------------------------
    # manual = {
    #     "BRKB": "BRK-B", "NOVO B": "NOVO-B.CO",
    #     "BP.": "BP.L", "RR.": "RR.L"
    # }
    # if raw in manual:
    #     return manual[raw]

    # exchanges " " with "-"
    raw = raw.replace(" ", "-")

    # checks the stock exchange of tickers
    url = "https://query2.finance.yahoo.com/v1/finance/search"
    for _ in range(retries):
        r = requests.get(url, params={"q": raw, "quotes_count": 1, "news_count": 0},
                         headers={"User-Agent": "Mozilla/5.0"})
        if r.ok and r.json().get("quotes"):
            return r.json()["quotes"][0]["symbol"]
        time.sleep(0.5)          # back‑off on errorsx
    return None                  # couldn’t resolve

tickers_raw = ['ATCO A', 'LSEG', 'EOAN', 'WBC', 'ENEL', '8031', 'HOLN', '6861', '7267', 'LONN', 'D05', 'O39', '6857', 'ENGI', 'VOLV B', '8058', 'CPG', 'CSU', '9433', '8411', 'BARC', 'LLOY', 'EXPN', '4502', 'GLEN', '6503', '8001', 'NG.', '388', '9983', 'NDA FI', 'BA.', '4063', 'XTSLA', 'DB1', '9434', '8766', 'INVE B', '7741', '4568', 'ADYEN', 'BAS']           # your original list
tickers_fixed = [resolve_yf_symbol(t) or t for t in tickers_raw]
# tickers_fixed = [t for t in tickers_fixed if t is not None]  # drop None’s


In [None]:
['ATCO-A.ST', 'LSEG.L', 'EOAN.DE', 'WBCPH.AX', 'ENIC', '8031.T', 'HOLN.SW', '6861.T', '7267.T', 'LONN.SW', 'D05.SI', 'O39.SI', '6857.T', '047040.KS', 'VOLV-B.ST', '8058.T', '0P0001JNFN', 'CSU.TO', '9433.T', '8411.HK', 'BCS', 'LLOY.L', 'EXPN.L', '4502.T', 'GLEN.L', '6503.T', '8001.T', 'NGNISX=X', '3882.HK', '9983.HK', 'NDA-FI.HE', 'BAMRSD=X', '4063.T', 'XTSLA', 'DB1.DE', '9434P.T', '8766.T', 'INVE-B.ST', '7741.T', '4568.TWO', 'ADYEY', 'NGO=F']

In [27]:
data = yf.download(tickers_fixed, group_by="ticker", threads=False, multi_level_index=True)
data.xs('Close', level=1, axis=1)

  data = yf.download(tickers_fixed, group_by="ticker", threads=False, multi_level_index=True)
[*********************100%***********************]  42 of 42 completed

2 Failed downloads:
['9434P.T']: YFPricesMissingError('possibly delisted; no price data found  (period=1mo)')
['XTSLA']: YFPricesMissingError('possibly delisted; no price data found  (period=1mo) (Yahoo error = "No data found, symbol may be delisted")')


Ticker,3882.HK,8001.T,6857.T,GLEN.L,LSEG.L,8031.T,ATCO-A.ST,9983.HK,BAMRSD=X,CSU.TO,...,NGO=F,047040.KS,D05.SI,LLOY.L,ADYEY,6503.T,INVE-B.ST,HOLN.SW,NDA-FI.HE,NGNISX=X
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2025-06-30,1.16,7556.0,10655.0,283.600006,10635.0,2947.0,152.850006,1.1,,4993.200195,...,,4180.0,44.91,76.699997,18.309999,3111.0,279.75,58.900002,12.61,
2025-07-01,,7523.0,10765.0,291.25,10650.0,2925.0,153.25,,,,...,,4225.0,45.110001,76.360001,18.02,3142.0,279.799988,58.400002,12.435,
2025-07-02,1.15,7605.0,10425.0,306.0,10805.0,2957.0,157.350006,1.08,,4973.240234,...,,4160.0,44.849998,73.540001,18.01,3081.0,281.649994,59.52,12.51,
2025-07-03,1.13,7592.0,10530.0,306.200012,10785.0,3015.0,157.899994,1.1,,5030.089844,...,,4180.0,44.950001,75.879997,18.0,3056.0,283.399994,59.68,12.685,
2025-07-04,1.15,7537.0,10675.0,302.850006,10700.0,2997.5,156.800003,1.12,,5005.0,...,,4095.0,45.209999,75.360001,,3085.0,281.299988,59.360001,12.52,
2025-07-07,1.12,7575.0,10685.0,297.899994,10800.0,2996.0,157.100006,1.14,,4998.970215,...,,4035.0,45.669998,75.68,18.17,3056.0,282.0,60.740002,12.665,
2025-07-08,1.13,7619.0,10955.0,306.399994,10870.0,3026.0,158.0,1.13,,4972.02002,...,,4100.0,45.73,75.660004,18.34,3066.0,284.149994,61.459999,12.635,
2025-07-09,1.11,7663.0,10980.0,298.200012,10840.0,3064.0,160.0,1.12,,4956.279785,...,,4175.0,45.650002,75.559998,18.48,3116.0,286.200012,62.560001,12.935,
2025-07-10,1.11,7590.0,11245.0,309.950012,10880.0,3018.0,163.449997,1.14,,4955.589844,...,,4155.0,45.82,76.040001,17.690001,3096.0,293.600006,63.560001,12.96,
2025-07-11,1.11,7617.0,11325.0,312.5,10740.0,3046.0,160.649994,1.13,,4888.060059,...,,4095.0,45.990002,75.68,17.65,3068.0,289.200012,63.16,12.66,


In [20]:
data = yf.download(tickers, group_by="ticker", threads=False, multi_level_index=True)
data.xs('Close', level=1, axis=1)

  data = yf.download(tickers, group_by="ticker", threads=False, multi_level_index=True)
[*********************100%***********************]  495 of 495 completed

122 Failed downloads:
['ATCO A', 'EOAN', '6098', 'PGHN', '3382', '4519', 'LONN', 'UMG', 'CS', 'U11', '9432', 'CSU', '6503', 'GLEN', '388', 'NDA FI', 'NESN', '9984', '4063', 'ULVR', 'DB1', 'SIKA', '7741', 'RR.', '4568', 'HSBA', 'BAS', '8750', 'LSEG', '8002', '7974', '7203', '8031', '1299', '6861', '6702', 'ASSA B', 'CPG', 'LLOY', '4502', 'NG.', '9983', 'ZURN', '9434', '8725', 'MUV2', 'ADYEN', 'BRKB', '8316', '8306', 'BAYN', 'BP.', 'NOVO B', '7011', '6301', 'O39', '7267', 'ADS', 'VOLV B', '8058', '8411', '8035', 'EXPN', 'BA.', 'XTSLA', 'ABBN', '8766', 'INVE B', 'WBC', 'ENEL', 'NOVN', 'HOLN', 'D05', '6367', '6857', 'ENGI', '9433', 'UBSG', '6501', 'BARC', '8001', '2914', '6758', 'CBK', '6701']: YFPricesMissingError('possibly delisted; no price data found  (period=1mo) (Yahoo error = "No data found, symbol may be delisted")')
['SGO

Ticker,MCO,ACN,JNJ,ATCO A,GOOGL,FTNT,TEL,EOAN,PCG,APP,...,CNQ,CNR,CBK,AXP,MRK,6701,SRE,SIE,ETN,DGE
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2025-06-30,501.589996,298.890015,152.75,,176.229996,105.720001,168.669998,,13.94,350.079987,...,31.4,69.739998,,318.176697,79.160004,,75.769997,,356.98999,
2025-07-01,502.480011,302.619995,155.919998,,175.839996,102.43,170.270004,,14.13,336.690002,...,31.66,65.269997,,321.717743,81.809998,,76.18,,355.040009,
2025-07-02,497.119995,302.290009,155.559998,,178.639999,102.209999,173.410004,,13.99,336.0,...,32.41,73.25,,324.789978,82.389999,,74.82,,358.190002,
2025-07-03,505.059998,304.779999,156.009995,,179.529999,105.660004,174.289993,,13.91,341.640015,...,32.220001,72.779999,,328.130005,80.93,,75.120003,,362.220001,
2025-07-07,502.369995,300.540009,155.270004,,176.789993,106.650002,171.139999,,13.64,345.0,...,31.75,70.75,,322.730011,80.900002,,74.389999,,358.48999,
2025-07-08,499.019989,303.329987,155.789993,,174.360001,107.540001,172.429993,,13.67,344.75,...,32.130001,69.470001,,316.980011,81.370003,,74.32,,356.980011,
2025-07-09,502.220001,297.399994,156.279999,,176.619995,107.650002,172.649994,,13.52,352.73999,...,31.98,69.080002,,317.350006,83.709999,,74.489998,,359.779999,
2025-07-10,505.720001,288.359985,157.690002,,177.619995,100.199997,175.029999,,13.54,346.320007,...,31.790001,72.389999,,325.23999,84.019997,,74.900002,,357.640015,
2025-07-11,499.529999,281.059998,156.899994,,180.190002,99.059998,177.110001,,13.42,335.100006,...,32.099998,75.599998,,319.470001,83.360001,,74.559998,,360.619995,
2025-07-14,503.420013,279.98999,156.820007,,181.559998,102.970001,176.179993,,13.39,355.899994,...,31.790001,75.290001,,320.920013,83.669998,,74.550003,,360.290009,


In [21]:
set(['NOVO B', 'INVE B', 'VOLV B', 'NDA FI', 'ATCO A', 'ASSA B']).issubset(set(['ATCO A', 'EOAN', '6098', 'PGHN', '3382', '4519', 'LONN', 'UMG', 'CS', 'U11', '9432', 'CSU', '6503', 'GLEN', '388', 'NDA FI', 'NESN', '9984', '4063', 'ULVR', 'DB1', 'SIKA', '7741', 'RR.', '4568', 'HSBA', 'BAS', '8750', 'LSEG', '8002', '7974', '7203', '8031', '1299', '6861', '6702', 'ASSA B', 'CPG', 'LLOY', '4502', 'NG.', '9983', 'ZURN', '9434', '8725', 'MUV2', 'ADYEN', 'BRKB', '8316', '8306', 'BAYN', 'BP.', 'NOVO B', '7011', '6301', 'O39', '7267', 'ADS', 'VOLV B', '8058', '8411', '8035', 'EXPN', 'BA.', 'XTSLA', 'ABBN', '8766', 'INVE B', 'WBC', 'ENEL', 'NOVN', 'HOLN', 'D05', '6367', '6857', 'ENGI', '9433', 'UBSG', '6501', 'BARC', '8001', '2914', '6758', 'CBK', '6701']))


True

In [13]:
# -------- 2. Batch‑download
start = "1980-01-01"
parquet_path = Path("msci_world_daily.parquet")
bsize = 200
frames = []

for batch in tqdm(range(math.ceil(len(tickers)/bsize))):
    block = tickers[batch*bsize : (batch+1)*bsize]
    data = yf.download(block, start=start, group_by="ticker", progress=False)   # progress bar hidden,
    frames.append(data.xs('Close', level=1, axis=1))    # keep just Close prices to save space


  data = yf.download(block, start=start, group_by="ticker", progress=False)   # progress bar hidden,
HTTP Error 404: 
HTTP Error 404: 

25 Failed downloads:
['8316', 'NOVO B', '6098', '7011', 'BP.', '8306', 'NOVN', '7974', '7203', '1299', 'CS', 'UBSG', '8035', '6501', 'ZURN', 'NESN', '9984', 'ULVR', 'ABBN', '6758', 'RR.', 'MUV2', 'HSBA', 'BRKB']: YFTzMissingError('possibly delisted; no timezone found')
['BATS']: YFPricesMissingError('possibly delisted; no price data found  (1d 1980-01-01 -> 2025-07-30)')
  data = yf.download(block, start=start, group_by="ticker", progress=False)   # progress bar hidden,

43 Failed downloads:
['ATCO A', 'LSEG', 'EOAN', 'WBC', 'ENEL', '8031', 'HOLN', '6861', '7267', 'LONN', 'D05', 'O39', '6857', 'ENGI', 'VOLV B', '8058', 'CPG', 'CSU', '9433', '8411', 'BARC', 'LLOY', 'EXPN', '4502', 'GLEN', '6503', '8001', 'NG.', '388', '9983', 'NDA FI', 'BA.', '4063', 'XTSLA', 'DB1', '9434', '8766', 'INVE B', '7741', '4568', 'ADYEN', 'BAS']: YFTzMissingError('possibly de

In [None]:
prices = pd.concat(frames, axis=1).sort_index()
prices.to_parquet(parquet_path, compression="zstd")

In [None]:
# Let's inspect what data we get from yfinance
sample_data = yf.download(tickers[0], start=start, auto_adjust=True)
print("Sample data columns:", sample_data.columns.tolist())


In [None]:
# -------- 3. Quick sanity snapshot
snapshot = prices.tail(1).T.reset_index()
snapshot.columns = ["Ticker", "Last Close"]
snapshot.to_csv("snapshot_latest.csv", index=False)

print("Saved", parquet_path, "and snapshot_latest.csv")