<a href="https://colab.research.google.com/github/shatlykgurdov/3.1.2/blob/main/gwp1_task_2-4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ==========================================================
# MScFE 600 – Group Work Project 1
# Tasks 2–4 Combined Notebook  (with lettered subparts)
# ==========================================================

# --------------------------
# Common Imports
# --------------------------
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from numpy.linalg import eigh
from scipy.optimize import curve_fit
from scipy.interpolate import CubicSpline
from datetime import date, timedelta

np.random.seed(42)  # for reproducibility

# ==========================================================
# Task 2 – Yield Curve Modeling
# (a)–(g)
# ==========================================================

# (a) Pick government securities from a country (example placeholders: US Treasuries)
#     -> Using example data here; replace with actual country/maturities if needed.
# (b) Pick maturities from short-term to long-term (0.5Y–30Y)
# Example curve sample: each pair is (maturity in years, yield in percent).
_curve_points = [
    (0.5, 4.5),
    (2, 4.7),
    (5, 4.9),
    (10, 5.1),
    (20, 5.3),
    (30, 5.4),
]
# Split the pairs column-wise into the two arrays used by the fits below.
maturities, yields_pct = (np.array(column) for column in zip(*_curve_points))

# (c) Fit a Nelson–Siegel model
def _ns_terms_stable(tau, lam):
    x = tau / lam
    term1 = np.empty_like(x, dtype=float)
    small = np.abs(x) < 1e-6
    term1[~small] = (-np.expm1(-x[~small])) / x[~small]
    term1[small] = 1 - x[small]/2 + x[small]**2/6
    term2 = term1 - np.exp(-x)
    return term1, term2

def nelson_siegel(tau, beta0, beta1, beta2, lam):
    """Evaluate the Nelson–Siegel curve at maturities ``tau``.

    The result is ``beta0 + beta1 * g1 + beta2 * g2``, where ``(g1, g2)`` are
    the two loadings returned by ``_ns_terms_stable(tau, lam)``.
    """
    slope_loading, curvature_loading = _ns_terms_stable(tau, lam)
    curve = beta0 + beta1 * slope_loading
    curve = curve + beta2 * curvature_loading
    return curve

# Starting guess and box constraints, each ordered (beta0, beta1, beta2, lam).
p0 = [yields_pct.mean(), -1.0, 1.0, 2.0]
lower_limits = [0.0, -10.0, -10.0, 0.05]
upper_limits = [20.0, 10.0, 10.0, 50.0]
bounds = (lower_limits, upper_limits)

# Bounded least-squares fit; the returned covariance estimate is discarded.
params, _ = curve_fit(
    nelson_siegel,
    maturities,
    yields_pct,
    p0=p0,
    bounds=bounds,
    maxfev=20000,
)
beta0, beta1, beta2, lam = params

# (d) Fit a Cubic Spline model
# Interpolating cubic spline through the observed points.
cs = CubicSpline(maturities, yields_pct, bc_type='not-a-knot')

# Evaluate both models on a dense grid spanning the observed maturities.
tau_fit = np.linspace(maturities.min(), maturities.max(), 400)
ns_fit = nelson_siegel(tau_fit, *params)
spline_fit = cs(tau_fit)

# Overlay the observations and the two fitted curves on a single axes.
fig, ax = plt.subplots(figsize=(9, 6))
ax.scatter(maturities, yields_pct, label="Observed Yields", zorder=3)
ax.plot(tau_fit, ns_fit, label="Nelson–Siegel Fit", linewidth=2)
ax.plot(tau_fit, spline_fit, label="Cubic Spline Fit", linestyle="--", linewidth=2)
ax.set_xlabel("Maturity (Years)")
ax.set_ylabel("Yield (%)")
ax.set_title("Task 2: Yield Curve — Nelson–Siegel vs Cubic Spline")
ax.legend()
fig.tight_layout()
plt.show()

# (e) Compare models in terms of fit (RMSE) and interpretation
def rmse(y_true, y_hat):
    """Return the root-mean-square error between observed and fitted values.

    Parameters
    ----------
    y_true, y_hat : array_like
        Observed and fitted values. Both are converted to float arrays first,
        so plain Python lists and tuples subtract element-wise instead of
        raising ``TypeError``.

    Returns
    -------
    numpy.float64
        ``sqrt(mean((y_true - y_hat) ** 2))``.
    """
    residuals = np.asarray(y_true, dtype=float) - np.asarray(y_hat, dtype=float)
    return np.sqrt(np.mean(residuals**2))

# Score each model at the observed maturities only.
ns_on_points = nelson_siegel(maturities, *params)
spline_on_points = cs(maturities)
rmse_ns, rmse_spline = (
    rmse(yields_pct, fitted) for fitted in (ns_on_points, spline_on_points)
)

print(
    "Task 2 (e): Fit comparison (lower = better)",
    f"  RMSE Nelson–Siegel: {rmse_ns:.6f}",
    f"  RMSE Cubic Spline : {rmse_spline:.6f}",
    sep="\n",
)

# (f) Specify model parameter levels
print(
    "\nTask 2 (f): Nelson–Siegel parameters",
    f"  β0 (level)     = {beta0:.6f}",
    f"  β1 (slope)     = {beta1:.6f}",
    f"  β2 (curvature) = {beta2:.6f}",
    f"  λ  (decay)     = {lam:.6f}",
    sep="\n",
)

# (g) Ethics of smoothing (write in report)
# TODO (report): Explain why using a parametric smoother like Nelson–Siegel for yield curves
# is not inherently unethical when used transparently to estimate the term structure (cite sources),
# and contrast with manipulative smoothing that obscures risk.


# ==========================================================
# Task 3 – Exploiting Correlation (PCA)
# (a)–(j)
# ==========================================================

# ---- Uncorrelated toy data ----
# (a) Generate 5 uncorrelated Gaussian RVs simulating yield changes
# Draw 250 rows of 5 independent normal variates (mean 0, std 0.01).
np.random.seed(0)
X = np.random.normal(loc=0, scale=0.01, size=(250, 5))
cols = ["Y%d" % k for k in range(1, 6)]
df_sim = pd.DataFrame(X, columns=cols)

# (b) PCA via eigen-decomposition of the correlation matrix
C = np.corrcoef(df_sim.T)
eigvals, eigvecs = eigh(C)
# Reorder so the largest eigenvalue (and its eigenvector column) comes first.
idx = np.argsort(eigvals)[::-1]
eigvals, eigvecs = eigvals[idx], eigvecs[:, idx]
# Share of total variance carried by each component.
var_exp = eigvals / np.sum(eigvals)

# (c) Paragraph explanation (print the variance shares you’ll discuss in report)
print("\nTask 3 (a–c): Variance explained (simulated, uncorrelated)")
for i, v in enumerate(var_exp, start=1):
    print(f"  Component {i}: {v:.3f}")

# (d) Scree plot (simulated): variance share against component number
component_numbers = range(1, len(var_exp) + 1)
fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(component_numbers, var_exp, marker="o")
ax.set_xlabel("Principal Component")
ax.set_ylabel("Variance Explained")
ax.set_title("Task 3 (d): Screeplot – Simulated (Uncorrelated)")
fig.tight_layout()
plt.show()

# ---- Realistic yield data ----
# (e) Collect daily closing yields for 5 gov’t securities over ~6 months
#     Try FRED via pandas_datareader; if unavailable (offline), simulate a realistic term-structure dynamic.
# Build `yields` (one column per tenor) and `source` (label used in later output).
# Primary path: FRED via pandas_datareader. Any failure (package missing, no
# network, too little data) drops to a seeded simulation so the notebook still runs.
try:
    import pandas_datareader.data as pdr

    end = date.today()
    start = end - timedelta(days=210)
    fred_series = {"3M": "DGS3MO", "2Y": "DGS2", "5Y": "DGS5", "10Y": "DGS10", "30Y": "DGS30"}
    frames = []
    for label, code in fred_series.items():
        s = pdr.DataReader(code, "fred", start, end)
        # Reduce a one-column frame to a Series so it can be renamed to the tenor label.
        if isinstance(s, pd.DataFrame):
            s = s.iloc[:, 0]
        frames.append(s.rename(label))
    yields = pd.concat(frames, axis=1).ffill().dropna()
    # An empty or very short download cannot support the PCA below; treat it
    # like any other fetch failure (same 60-row floor as the Task 4 price check).
    if yields.shape[0] < 60:
        raise RuntimeError("Insufficient data fetched; falling back to simulation.")
    source = "FRED"
except Exception as exc:
    # Deliberate best-effort fallback, but say why it happened instead of hiding it.
    print(f"FRED download unavailable ({type(exc).__name__}: {exc}); using simulated yields.")
    # Simulated fallback with Level/Slope/Curvature factors
    np.random.seed(123)
    n = 126
    maturities_lbl = ["3M", "2Y", "5Y", "10Y", "30Y"]
    epsL = np.random.normal(0, 0.015, n)
    epsS = np.random.normal(0, 0.02, n)
    epsC = np.random.normal(0, 0.012, n)
    # Each factor path is the cumulative sum of its shocks.
    L, S, Cc = np.cumsum(epsL), np.cumsum(epsS), np.cumsum(epsC)
    # Per-tenor loadings on (L, S, Cc) and per-tenor base yield.
    load = {"3M": (1, 1, -0.2), "2Y": (1, 0.4, 0.4), "5Y": (1, 0, 1), "10Y": (1, -0.3, 0.6), "30Y": (1, -0.6, -0.1)}
    base = {"3M": 5.2, "2Y": 4.9, "5Y": 4.7, "10Y": 4.6, "30Y": 4.5}
    # Whole-column arithmetic replaces the former cell-by-cell .loc loop; the
    # terms are added in the same order, so the values are unchanged.
    Y = pd.DataFrame(
        {
            m: base[m] + load[m][0] * L + load[m][1] * S + load[m][2] * Cc
            for m in maturities_lbl
        },
        index=np.arange(n),
        columns=maturities_lbl,
        dtype=float,
    )
    yields = Y
    source = "Simulated fallback"

# (f) Compute daily yield changes
# First differences of the yield levels; the leading all-NaN row is dropped.
dy = yields.diff().dropna()

# (g) Re-run PCA on the correlation matrix of the daily changes
C_real = np.corrcoef(dy.T)
eigvals_r, eigvecs_r = eigh(C_real)
# Reorder so the largest eigenvalue (and its eigenvector column) comes first.
idx = np.argsort(eigvals_r)[::-1]
eigvals_r, eigvecs_r = eigvals_r[idx], eigvecs_r[:, idx]
var_exp_r = eigvals_r / np.sum(eigvals_r)

# (h) Variance shares for the yield changes
print(f"\nTask 3 (e–h) [{source}]: Variance explained")
for i, v in enumerate(var_exp_r, start=1):
    print(f"  Component {i}: {v:.3f}")

# (i) Scree plot: variance share against component number
component_numbers = range(1, len(var_exp_r) + 1)
fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(component_numbers, var_exp_r, marker="o")
ax.set_xlabel("Principal Component")
ax.set_ylabel("Variance Explained")
ax.set_title(f"Task 3 (i): Screeplot – Gov’t Yield Changes [{source}]")
fig.tight_layout()
plt.show()

# (j) Compare the two screeplots (write in report)
# TODO (report): Discuss how uncorrelated data yields flat-ish variance shares,
# while real yield changes are typically dominated by Level/Slope/Curvature PCs.


# ==========================================================
# Task 4 – Empirical Analysis of ETFs
# (a)–(f)
# NOTE: The current code focuses on SPY/TLT/GLD (bonds/equity/gold).
# The official prompt asks for a SECTOR ETF's top 30 holdings (e.g., XLRE).
# Below we label (c)–(f) for the current analysis, and leave (a)–(b) placeholders.
# ==========================================================

# (a) Find the 30 largest holdings of a chosen sector ETF (e.g., XLRE)
# TODO (data & report): If internet access is available, pull holdings from a data provider;
# otherwise paste a static table of top 30 holdings from the fund factsheet.

# (b) Get at least 6 months (~120 data points)
# TODO (data): If you cannot install yfinance, constrain to simulated prices below.
START, END = "2022-01-01", "2023-01-01"  # > 6 months for illustration
tickers = ["SPY", "TLT", "GLD"]

# Build `prices` (one column per ticker). Primary path: yfinance. Any failure
# (package missing, no network, unusable download) drops to simulated prices.
use_simulated = False
try:
    import yfinance as yf

    df_raw = yf.download(tickers, start=START, end=END, auto_adjust=True, progress=False)
    if isinstance(df_raw.columns, pd.MultiIndex):
        prices = df_raw['Close'].copy()
    else:
        prices = df_raw[['Close']] if 'Close' in df_raw.columns else df_raw.copy()
        if prices.shape[1] == 1:
            prices.columns = [tickers[0]]
    # Reject the download if ANY ticker is entirely missing, not only if all
    # are: one all-NaN column would make pct_change().dropna() discard every
    # row and leave the covariance/PCA steps with no data.
    if prices.empty or prices.isna().all().any() or prices.shape[0] < 60:
        raise RuntimeError("Insufficient data fetched; falling back to simulation.")
except Exception as exc:
    # Deliberate best-effort fallback, but say why it happened instead of hiding it.
    print(f"yfinance download unavailable ({type(exc).__name__}: {exc}); using simulated prices.")
    use_simulated = True
    dates = pd.date_range(start=START, end=END, freq="B")
    n = len(dates)
    # Geometric random walk simulation with modest drift/vol
    drift = np.array([0.06, 0.02, 0.03]) / 252.0
    vol = np.array([0.15, 0.12, 0.10]) / np.sqrt(252.0)
    shocks = np.random.normal(0, 1, size=(n, len(tickers)))
    log_rets = drift + vol * shocks
    # Row 0 is the starting price of 100; every later row multiplies the row
    # above by exp(log return). A cumulative product performs those same
    # multiplications in the same order as the former row-by-row loop.
    growth = np.exp(log_rets)
    growth[0] = 100.0
    prices = pd.DataFrame(np.cumprod(growth, axis=0), index=dates, columns=tickers)

print(f"\nTask 4 data source: {'Simulated' if use_simulated else 'yfinance'}")

# (c) Compute daily returns
# Simple daily returns; rows containing NaN (including the first) are dropped.
returns = prices.pct_change().dropna()

# (d) Covariance matrix of the daily returns
cov_mat = returns.cov()

# (e) PCA from the eigen-decomposition of the covariance matrix
eigvals_e, eigvecs_e = eigh(cov_mat.values)
# Reorder so the largest eigenvalue (and its eigenvector column) comes first.
idx = np.argsort(eigvals_e)[::-1]
eigvals_e, eigvecs_e = eigvals_e[idx], eigvecs_e[:, idx]
var_exp_e = eigvals_e / np.sum(eigvals_e)

print("\nTask 4 (e): PCA variance explained")
for i, v in enumerate(var_exp_e, start=1):
    print(f"  Component {i}: {v:.3f}")

# (f) SVD of the column-mean-centered returns matrix
ret_values = returns.values
X = ret_values - ret_values.mean(axis=0, keepdims=True)
U, S, VT = np.linalg.svd(X, full_matrices=False)
print("\nTask 4 (f): SVD summary")
print(f"  Singular values: {np.round(S, 6)}")
# Connection: eigenvalues of (X^T X)/(n-1) equal (S^2)/(n-1)
n_obs = X.shape[0]
svd_var = (S**2) / (n_obs - 1)
svd_var_ratio = svd_var / svd_var.sum()
print(f"  Variance explained via SVD mapping: {np.round(svd_var_ratio, 3)}")

# Extra: Annualized performance metrics (helpful but beyond (a)-(f))
# Annualize the daily mean and standard deviation using 252 periods per year.
mean_ann = returns.mean() * 252
vol_ann = returns.std() * np.sqrt(252)
# Ratio of annualized mean to annualized volatility; no risk-free rate is
# subtracted. An infinite ratio (zero volatility) is reported as NaN.
sharpe = (mean_ann / vol_ann).replace([np.inf, -np.inf], np.nan)

metrics = pd.DataFrame({
    "Mean Return (ann.)": mean_ann,
    "Volatility (ann.)": vol_ann,
    "Sharpe Ratio": sharpe
}).round(4)

# Compute the correlation matrix once and reuse it for the printout and the plot.
corr = returns.corr()

print("\nTask 4 – Annualized Performance Metrics:\n", metrics)
print("\nTask 4 – Correlation Matrix:\n", corr.round(3))

# Correlation heatmap (visual)
plt.figure(figsize=(6, 5))
# Pin the colour scale to [-1, 1] so the diverging colormap is centred on zero
# correlation instead of on whatever range this sample happens to span.
plt.imshow(corr, cmap="coolwarm", interpolation="none", vmin=-1, vmax=1)
tick_positions = range(len(corr))
plt.xticks(tick_positions, returns.columns, rotation=45)
plt.yticks(tick_positions, returns.columns)
plt.colorbar(label="Correlation")
plt.title("Task 4: ETF Correlation Matrix (Daily Returns)")
plt.tight_layout()
plt.show()
