In [1]:
!pip -q install yfinance statsmodels plotly


In [2]:
import numpy as np
import pandas as pd
import yfinance as yf
import statsmodels.api as sm
import plotly.express as px


In [3]:
START = "2018-01-01"
END = None              # None = today
FREQ = "W"              # "D" daily or "W" weekly (weekly often cleaner for TE)

# Choose ONE benchmark (index or ETF proxy). Examples: "SPY", "^GSPC", "QQQ", "ACWI"
BENCHMARK_TICKER = "^NSEI"

# ETFs you want to evaluate (friendly_name: ticker)
ETF_TICKERS = {
    "iShares MSCI": "INDA",
    "Frnklin FTSE": "FLIN",
    "WisdomTree India Earnings Fund": "EPI",
    "iShares MSCI India Small-Cap": "SMIN",
    "iShares India 50 ETF": "INDY",
    "Columbia Consumer ETF": "INCO",
    "Invesco India ETF": "PIN",
    "The India Internet": "INQQ",
    "VanEck India Growth Leaders": "GLIN",
    "Daily MSCI India": "INDL",
    # add more:
    # "India": "INDA",
    # "EM": "EEM",
}

In [4]:

def download_ohlcv(tickers, start, end):
    raw = yf.download(tickers, start=start, end=end, progress=False, auto_adjust=False)
    # expected MultiIndex columns for multiple tickers
    return raw

def resample_prices(px: pd.DataFrame, freq: str):
    if freq.upper().startswith("W"):
        return px.resample("W-FRI").last()
    return px

def safe_prices(series: pd.Series) -> pd.Series:
    # Fix bad ticks: non-positive -> NaN -> ffill
    s = series.copy()
    s = s.mask(s <= 0, np.nan).ffill()
    return s.dropna()

def returns_simple(px: pd.Series, freq: str) -> pd.Series:
    px = safe_prices(px)
    px = resample_prices(px.to_frame("p"), freq)["p"]
    return px.pct_change().dropna()

def annual_factor(freq: str) -> float:
    return 52.0 if freq.upper().startswith("W") else 252.0

def tracking_metrics(etf_ret: pd.Series, bmk_ret: pd.Series, freq: str):
    df = pd.DataFrame({"etf": etf_ret, "bmk": bmk_ret}).dropna()
    df["active"] = df["etf"] - df["bmk"]

    ann = annual_factor(freq)
    te = df["active"].std() * np.sqrt(ann)                    # Tracking Error (annualized)
    td = df["active"].mean() * ann                            # Tracking Difference (annualized mean active return)
    ir = (df["active"].mean() / df["active"].std()) * np.sqrt(ann) if df["active"].std() > 0 else np.nan

    # Regression-based beta/alpha vs benchmark (not CAPM, just benchmark regression)
    X = sm.add_constant(df["bmk"])
    fit = sm.OLS(df["etf"], X).fit()
    alpha_ann = fit.params["const"] * ann
    beta = fit.params["bmk"]
    r2 = fit.rsquared

    return {
        "TrackingError_ann": float(te),
        "TrackingDiff_ann": float(td),
        "InfoRatio": float(ir),
        "Alpha_vs_Bmk_ann": float(alpha_ann),
        "Beta_vs_Bmk": float(beta),
        "R2_vs_Bmk": float(r2),
        "Obs": int(len(df))
    }

def liquidity_metrics(adj_close: pd.Series, volume: pd.Series, freq: str):
    # Use daily data for liquidity by default (even if returns are weekly)
    px = safe_prices(adj_close)
    vol = volume.reindex(px.index).replace(0, np.nan)

    dollar_vol = (px * vol).dropna()

    # Simple liquidity proxies
    avg_vol = float(vol.dropna().mean())
    avg_dollar_vol = float(dollar_vol.mean())
    med_dollar_vol = float(dollar_vol.median())

    # % of days with "missing/zero" volume (a red flag for liquidity)
    pct_missing_vol = float((volume.reindex(px.index).fillna(0) <= 0).mean())

    # Amihud illiquidity proxy: |return| / dollar volume  (lower is more liquid)
    ret_d = px.pct_change().dropna()
    dv_align = dollar_vol.reindex(ret_d.index).dropna()
    ret_align = ret_d.reindex(dv_align.index)
    amihud = float((ret_align.abs() / dv_align).replace([np.inf, -np.inf], np.nan).dropna().mean())

    return {
        "AvgVolume_shares": avg_vol,
        "AvgDollarVolume": avg_dollar_vol,
        "MedianDollarVolume": med_dollar_vol,
        "PctDays_ZeroOrMissingVolume": pct_missing_vol,
        "Amihud_Illiquidity": amihud,
    }

def explain_tracking_row(name, m):
    te = m["TrackingError_ann"]
    td = m["TrackingDiff_ann"]
    ir = m["InfoRatio"]
    beta = m["Beta_vs_Bmk"]
    r2 = m["R2_vs_Bmk"]
    alpha = m["Alpha_vs_Bmk_ann"]

    return f"""
[{name}] Tracking vs {BENCHMARK_TICKER}
- Tracking Error (annualized): {te:.2%}
  Meaning: typical yearly size of deviations from the benchmark. Lower = tracks closer.
  Numbers: Under 0.5% is Excellent, 0.5-1.5% is Good/Acceptable and anything above 2% is very high and indicates significant performance deviation from the index.
- Tracking Difference (annualized): {td:.2%}
  Meaning: average annual over/underperformance vs benchmark. Positive = beat benchmark on average.
- Information Ratio (IR): {ir:.2f}
  Meaning: active return per unit of tracking error. Higher is better (if stable).
- Beta vs benchmark: {beta:.2f}, R²: {r2:.2f}
  Meaning: beta shows sensitivity to benchmark; R² shows how much ETF moves are explained by benchmark.
- Alpha vs benchmark (annualized intercept): {alpha:.2%}
  Meaning: average return not explained by benchmark exposure (not “true” CAPM alpha—benchmark-based).
"""

def explain_liquidity_row(name, l):
    adv = l["AvgDollarVolume"]
    amihud = l["Amihud_Illiquidity"]
    pct0 = l["PctDays_ZeroOrMissingVolume"]

    return f"""
[{name}] Liquidity proxies
- Average Dollar Volume (ADV): ${adv:,.0f} per day
  Meaning: bigger ADV usually = easier to trade with less price impact.
- Amihud Illiquidity: {amihud:.2e}
  Meaning: |return| per $ traded. Lower = more liquid (less price move per unit volume).
- % Days zero/missing volume: {pct0:.2%}
  Meaning: should be ~0% for liquid ETFs; higher can indicate stale/poor data or illiquidity.
"""


In [5]:
tickers_all = list(set(list(ETF_TICKERS.values()) + [BENCHMARK_TICKER]))
raw = download_ohlcv(tickers_all, START, END)

# Grab Adj Close and Volume in a ticker-safe way
adj = raw["Adj Close"] if isinstance(raw.columns, pd.MultiIndex) else raw[["Adj Close"]]
vol = raw["Volume"] if isinstance(raw.columns, pd.MultiIndex) else raw[["Volume"]]

# Benchmark returns (for tracking)
bmk_px = adj[BENCHMARK_TICKER] if BENCHMARK_TICKER in adj.columns else adj.iloc[:, 0]
bmk_ret = returns_simple(bmk_px, FREQ)

In [6]:
tracking_rows = []
liquidity_rows = []
explanations = []

for name, tkr in ETF_TICKERS.items():
    if tkr not in adj.columns:
        print(f"Skipping {name} ({tkr}): no data returned.")
        continue

    etf_ret = returns_simple(adj[tkr], FREQ)

    # Check if etf_ret is empty before proceeding with tracking calculations
    if etf_ret.empty or bmk_ret.empty:
        print(f"Skipping {name} ({tkr}): no valid return data for tracking metrics (either ETF or benchmark returns are empty).")
        t_metrics = {
            "TrackingError_ann": np.nan,
            "TrackingDiff_ann": np.nan,
            "InfoRatio": np.nan,
            "Alpha_vs_Bmk_ann": np.nan,
            "Beta_vs_Bmk": np.nan,
            "R2_vs_Bmk": np.nan,
            "Obs": 0
        }
        tracking_rows.append({"ETF": name, "Ticker": tkr, **t_metrics})
        explanations.append(f"\n[{name}] No valid return data for tracking metrics (either ETF or benchmark returns are empty).")
    else:
        # tracking
        t_metrics = tracking_metrics(etf_ret, bmk_ret, FREQ)
        tracking_rows.append({"ETF": name, "Ticker": tkr, **t_metrics})
        explanations.append(explain_tracking_row(name, t_metrics))

    # liquidity (daily-based) - we can still try to get liquidity metrics
    # even if returns were empty, as liquidity uses daily prices and volumes.
    # However, if adj[tkr] is completely empty/NaN, safe_prices will return an empty series,
    # which liquidity_metrics should handle gracefully by returning NaNs.
    l_metrics = liquidity_metrics(adj[tkr], vol[tkr] if tkr in vol.columns else pd.Series(dtype=float), FREQ)
    liquidity_rows.append({"ETF": name, "Ticker": tkr, **l_metrics})
    explanations.append(explain_liquidity_row(name, l_metrics))

tracking_table = pd.DataFrame(tracking_rows).sort_values("TrackingError_ann")
liquidity_table = pd.DataFrame(liquidity_rows).sort_values("AvgDollarVolume", ascending=False)

tracking_table

Unnamed: 0,ETF,Ticker,TrackingError_ann,TrackingDiff_ann,InfoRatio,Alpha_vs_Bmk_ann,Beta_vs_Bmk,R2_vs_Bmk,Obs
4,iShares India 50 ETF,INDY,0.084448,-0.050222,-0.594708,-0.052155,1.015687,0.788835,419
1,Frnklin FTSE,FLIN,0.088128,-0.04068,-0.4616,-0.0377,0.976314,0.76036,414
6,Invesco India ETF,PIN,0.09062,-0.043287,-0.477677,-0.038767,0.963327,0.745417,419
0,iShares MSCI,INDA,0.091456,-0.049014,-0.535931,-0.050144,1.009169,0.758587,419
2,WisdomTree India Earnings Fund,EPI,0.095263,-0.032482,-0.340973,-0.036532,1.03286,0.752617,419
5,Columbia Consumer ETF,INCO,0.121855,-0.047536,-0.390106,-0.040052,0.939277,0.606741,419
3,iShares MSCI India Small-Cap,SMIN,0.138604,-0.054806,-0.395411,-0.057515,1.021979,0.583957,419
7,The India Internet,INQQ,0.140262,-0.10298,-0.734195,-0.122614,1.187996,0.526383,197
8,VanEck India Growth Leaders,GLIN,0.148386,-0.140643,-0.947822,-0.150939,1.083536,0.581078,419
9,Daily MSCI India,INDL,0.365252,-0.065247,-0.178634,-0.266034,2.629084,0.733072,419


In [7]:
liquidity_table

Unnamed: 0,ETF,Ticker,AvgVolume_shares,AvgDollarVolume,MedianDollarVolume,PctDays_ZeroOrMissingVolume,Amihud_Illiquidity
0,iShares MSCI,INDA,4333070.0,176503700.0,147811900.0,0.029313,5.864656e-11
2,WisdomTree India Earnings Fund,EPI,1067965.0,31484040.0,26483640.0,0.029313,3.867458e-10
3,iShares MSCI India Small-Cap,SMIN,106162.7,6112630.0,2717965.0,0.029313,5.949246e-09
4,iShares India 50 ETF,INDY,120325.3,4213373.0,2715919.0,0.029313,4.003348e-09
1,Frnklin FTSE,FLIN,114975.3,4213326.0,235746.5,0.039435,1.766368e-07
9,Daily MSCI India,INDL,56360.57,2423966.0,1700569.0,0.029313,1.223644e-08
5,Columbia Consumer ETF,INCO,20832.5,1109812.0,404566.4,0.029313,4.084931e-08
8,VanEck India Growth Leaders,GLIN,24378.52,965611.4,493071.0,0.029313,4.404732e-08
6,Invesco India ETF,PIN,46673.67,817412.8,527610.3,0.029313,2.450196e-08
7,The India Internet,INQQ,18334.92,262850.0,157361.2,0.036735,8.537772e-07


In [8]:
fig1 = px.bar(tracking_table, x="ETF", y="TrackingError_ann", title=f"Tracking Error (annualized) vs {BENCHMARK_TICKER}")
fig1.update_layout(height=450)
fig1.show()

# Liquidity: Avg Dollar Volume bar
fig2 = px.bar(liquidity_table, x="ETF", y="AvgDollarVolume", title="Average Dollar Volume (ADV)")
fig2.update_layout(height=450)
fig2.show()

In [9]:
# -------------------------
# 6) PRINT EXPLANATIONS NEXT TO *YOUR* RESULTS
# -------------------------
print("===== WHAT YOUR NUMBERS MEAN (with your results) =====")
print(f"Benchmark: {BENCHMARK_TICKER} | Frequency: {'Weekly' if FREQ.startswith('W') else 'Daily'}")
print("\nKey concepts:")
print("- Tracking Error: how tightly ETF follows benchmark (annualized std of active returns).")
print("- Tracking Difference: average annual over/underperformance vs benchmark (annualized mean active return).")
print("- Information Ratio: active return per unit of tracking error (higher is better, but can be unstable).")
print("- Liquidity proxies: ADV (bigger=more liquid), Amihud (lower=more liquid), and % zero-volume days.\n")

for block in explanations:
    print(block)

===== WHAT YOUR NUMBERS MEAN (with your results) =====
Benchmark: ^NSEI | Frequency: Weekly

Key concepts:
- Tracking Error: how tightly ETF follows benchmark (annualized std of active returns).
- Tracking Difference: average annual over/underperformance vs benchmark (annualized mean active return).
- Information Ratio: active return per unit of tracking error (higher is better, but can be unstable).
- Liquidity proxies: ADV (bigger=more liquid), Amihud (lower=more liquid), and % zero-volume days.


[iShares MSCI] Tracking vs ^NSEI
- Tracking Error (annualized): 9.15%
  Meaning: typical yearly size of deviations from the benchmark. Lower = tracks closer.
  Numbers: Under 0.5% is Excellent, 0.5-1.5% is Good/Acceptable and anything above 2% is very high and indicates significant performance deviation from the index.
- Tracking Difference (annualized): -4.90%
  Meaning: average annual over/underperformance vs benchmark. Positive = beat benchmark on average.
- Information Ratio (IR): -0.54