In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from run_backtest import load_trackers
from utils import clean_stale_prices

from entities import EM_CDS_TRACKER_DICT, FX_TRACKER_DICT
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA

from matplotlib.ticker import PercentFormatter
import matplotlib.pyplot as plt

In [2]:
# Root directory for every exported chart. The plotting helpers in this cell
# write into its "Charts_overleaf" and "Charts_powerpoint" sub-folders.
# NOTE(review): absolute, machine-specific path -- adjust before running the
# notebook on another computer.
OUTPUT_FOLDER = Path(r"C:\Users\pcampos\OneDrive - Insper - Instituto de Ensino e Pesquisa\Dissertação Mestrado\Analysis")

def plot_bar_contribution(
    heights: "pd.Series | np.ndarray",
    title: str,
):
    """Plot per-component explained-variance shares as stacked "waterfall" bars.

    Each bar starts where the cumulative sum of the previous bars ended, so the
    top of the last bar is the total share explained by all plotted components.
    The figure is saved under ``OUTPUT_FOLDER`` as
    ``Charts_overleaf/{title}_pca.pdf`` and ``Charts_powerpoint/{title}_pca.svg``
    (the SVG with a transparent background) and is then shown.

    Parameters
    ----------
    heights
        One share per component, as fractions: the y axis is fixed to 0-1 and
        formatted as percentages. Any 1-D array-like is accepted.
    title
        Chart title. It is also used verbatim inside the output file names.
    """
    # Work on a plain float array so Series, ndarray and list inputs behave alike.
    heights = np.asarray(heights, dtype=float)
    positions = list(range(1, len(heights) + 1))

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.bar(
        positions,
        heights,
        # Bottom of each bar = sum of all previous bars.
        bottom=np.cumsum(heights) - heights,
        alpha=0.5,
        align="center",
        label="Explained Variance",
    )

    ax.set_ylim(0, 1)
    ax.set_xlabel("Componente Principal", labelpad=10)
    ax.set_ylabel("Explicação da Variância", labelpad=10)
    ax.set_title(title, fontweight="bold", pad=20)

    # Minor ticks at the half positions give vertical grid lines *between* bars.
    ax.xaxis.set_major_locator(plt.MultipleLocator(1))
    ax.xaxis.set_minor_locator(plt.MultipleLocator(0.5))
    ax.grid(True, which="major", axis="y", linestyle="--", linewidth=0.75)
    ax.grid(True, which="minor", axis="x", linestyle="--", linewidth=0.75)

    ax.tick_params(axis="x", which="both", length=0)
    ax.yaxis.set_major_formatter(PercentFormatter(1))
    ax.set_xticks(positions)
    fig.tight_layout()

    exports = (
        (f"Charts_overleaf/{title}_pca.pdf", {}),
        (f"Charts_powerpoint/{title}_pca.svg", {"transparent": True}),
    )
    for relative_path, extra_options in exports:
        target = OUTPUT_FOLDER.joinpath(relative_path)
        # savefig does not create folders; make sure the destination exists.
        target.parent.mkdir(parents=True, exist_ok=True)
        fig.savefig(target, bbox_inches="tight", **extra_options)
    plt.show()


def plot_historical_correlation(
    rolling_corr: pd.Series,
    total_correlation: float,
    title: str = "Correlação entre a Primeira Componente Principal de FX e a de CDS",
    y_lim: tuple[float, float] = (0, 1),
    reference_label: str = "Correlação Média do Período",
):
    """Plot a time-varying statistic against its full-sample reference level.

    The series is drawn as a line and ``total_correlation`` as a dashed red
    horizontal line. The figure is saved under ``OUTPUT_FOLDER`` as
    ``Charts_overleaf/{title}_pca_corr.pdf`` and
    ``Charts_powerpoint/{title}_pca_corr.svg`` (the SVG with a transparent
    background) and is then shown.

    Parameters
    ----------
    rolling_corr
        Series to plot; its ``name`` becomes the legend entry of the line.
        The x axis starts at the first non-missing observation.
    total_correlation
        Height of the horizontal reference line.
    title
        Chart title. It is also used verbatim inside the output file names.
    y_lim
        ``(lower, upper)`` limits of the y axis.
    reference_label
        Legend entry of the reference line. The default keeps the previous
        hard-coded text; pass another label when the plotted statistic is not
        a correlation (for example a beta).
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    rolling_corr.plot(ax=ax)
    ax.axhline(
        total_correlation,
        color="red",
        linestyle="--",
        label=reference_label,
    )
    ax.grid(True, which="both", axis="both", linestyle="--", linewidth=0.75)
    # Clear whatever x label the pandas plot call may have set.
    ax.set_xlabel("")
    # Skip any leading missing values so the line starts at the left edge.
    ax.set_xlim(rolling_corr.dropna().index.min(), rolling_corr.index.max())
    ax.set_ylim(*y_lim)
    ax.legend(ncol=2)
    ax.set_title(title, fontweight="bold", pad=20)
    fig.tight_layout()

    exports = (
        (f"Charts_overleaf/{title}_pca_corr.pdf", {}),
        (f"Charts_powerpoint/{title}_pca_corr.svg", {"transparent": True}),
    )
    for relative_path, extra_options in exports:
        target = OUTPUT_FOLDER.joinpath(relative_path)
        # savefig does not create folders; make sure the destination exists.
        target.parent.mkdir(parents=True, exist_ok=True)
        fig.savefig(target, bbox_inches="tight", **extra_options)
    plt.show()

In [12]:
# Last date (inclusive) kept in the sample.
SAMPLE_END = "2024-09-30"

# Load both tracker families and tag every column with its asset class so the
# two groups can be separated again after the joint cleaning below.
fx = load_trackers(FX_TRACKER_DICT).rename(columns=lambda col: col + "_fx")
cds = load_trackers(EM_CDS_TRACKER_DICT).rename(columns=lambda col: col + "_cds")

# Put both families on one calendar and carry the last observation forward.
# `.ffill()` replaces `fillna(method="ffill")`, which pandas has deprecated.
trackers = pd.concat([fx, cds], axis=1).ffill()

# Clean each column independently (on a copy, so `trackers` itself is never
# modified by the helper), then reassemble the frame.
trackers = pd.concat(
    [clean_stale_prices(trackers[col].copy()) for col in trackers], axis=1
)
trackers = trackers.loc[:SAMPLE_END]

# Split the cleaned frame back into the two families via the tags added above.
fx = trackers.filter(like="fx")
cds = trackers.filter(like="cds")

In [None]:
RETURN_WINDOW = 21  # number of rows spanned by each log return


def standardized_log_returns(
    prices: pd.DataFrame,
    start: str,
    window: int = RETURN_WINDOW,
) -> pd.DataFrame:
    """Return z-scored ``window``-row log returns for the complete columns only.

    Steps: take logs, forward-fill gaps of at most five consecutive rows,
    difference over ``window`` rows, drop rows where every column is missing,
    keep rows from ``start`` onwards, drop every column that still has a
    missing value, then subtract each column's mean and divide by its standard
    deviation.
    """
    # `.ffill(limit=5)` replaces the deprecated `fillna(method="ffill", limit=5)`.
    log_returns = np.log(prices).ffill(limit=5).diff(window).dropna(how="all")
    complete = log_returns.loc[start:].dropna(how="any", axis=1)
    return (complete - complete.mean()) / complete.std()


# NOTE(review): each family is standardised on its own sample (different start
# dates) *before* the two are cut to their common dates below, so the aligned
# columns are not exactly zero-mean / unit-variance on the common sample.
# Confirm that this is intended.
df_cds = standardized_log_returns(cds, start="2009-12-10")
df_fx = standardized_log_returns(fx, start="2004-08-02")

# Keep only the dates on which both families are fully observed.
new_index = df_fx.dropna().index.intersection(df_cds.dropna().index)
df_fx = df_fx.reindex(new_index)
df_cds = df_cds.reindex(new_index)

df_all = pd.concat([df_cds, df_fx], axis=1).dropna()
df_all

In [None]:
N_COMPONENTS = 10  # principal components kept in every decomposition
EWM_HALFLIFE = 252  # half-life, in observations, of the exponentially weighted statistics
EWM_WARMUP = 100  # leading exponentially weighted observations dropped before plotting

# --- Explained variance: joint, CDS-only and FX-only decompositions -------------------
pca_all = PCA(n_components=N_COMPONENTS).fit(df_all)
plot_bar_contribution(
    pca_all.explained_variance_ratio_,
    f"Variação explicada pelas {N_COMPONENTS} primeiras Componentes Principais (CDS + FX trackers)",
)

# Each model is fitted exactly once; the scores from that single fit feed every
# chart below (the data never changes between charts, so re-fitting was redundant).
pca_cds = PCA(n_components=N_COMPONENTS)
scores_cds = pd.DataFrame(pca_cds.fit_transform(df_cds), index=new_index)
plot_bar_contribution(
    pca_cds.explained_variance_ratio_,
    f"Variação explicada pelas {N_COMPONENTS} primeiras Componentes Principais (CDS Trackers)",
)

pca_fx = PCA(n_components=N_COMPONENTS)
scores_fx = pd.DataFrame(pca_fx.fit_transform(df_fx), index=new_index)
# Kept with its final value in case a later cell reads this name.
explained_variance_ratio = pca_fx.explained_variance_ratio_
plot_bar_contribution(
    pca_fx.explained_variance_ratio_,
    f"Variação explicada pelas {N_COMPONENTS} primeiras Componentes Principais (FX trackers)",
)

# --- Correlation between matching FX and CDS components -------------------------------
pca_1_cds, pca_2_cds, pca_3_cds = (scores_cds.iloc[:, k] for k in range(3))
pca_1_fx, pca_2_fx, pca_3_fx = (scores_fx.iloc[:, k] for k in range(3))

for ordinal, pc_cds, pc_fx in (
    ("primeira", pca_1_cds, pca_1_fx),
    ("segunda", pca_2_cds, pca_2_fx),
    ("terceira", pca_3_cds, pca_3_fx),
):
    plot_historical_correlation(
        pc_cds.ewm(halflife=EWM_HALFLIFE)
        .corr(pc_fx)
        .iloc[EWM_WARMUP:]
        .rename(f"Correlação EWM (meia-vida = {EWM_HALFLIFE} dias)"),
        pc_cds.corr(pc_fx),
        title=f"Correlação entre a {ordinal} Componente Principal de FX e a de CDS",
        y_lim=(-1, 1),
    )

# --- Beta of the first CDS component on the first FX component ------------------------
# Beta = cov(CDS, FX) / var(FX), exponentially weighted for the line and
# full-sample for the reference level.
# NOTE(review): plot_historical_correlation labels the dashed reference line
# "Correlação Média do Período" by default, although here it is a beta.
plot_historical_correlation(
    (
        pca_1_cds.ewm(halflife=EWM_HALFLIFE).cov(pca_1_fx)
        / pca_1_fx.ewm(halflife=EWM_HALFLIFE).var()
    )
    .iloc[EWM_WARMUP:]
    .rename(f"Beta EWM (meia-vida = {EWM_HALFLIFE} dias)"),
    pca_1_cds.cov(pca_1_fx) / pca_1_fx.var(),
    y_lim=(0, 2),
    title="Beta entre as primeiras componentes principais de FX e a de CDS",
)

In [19]:
# Put the explained-variance ratios of the FX decomposition on the system
# clipboard (side effect only; nothing is displayed).
pd.Series(
    data=pca_fx.explained_variance_ratio_,
).to_clipboard()

# Tail Similarity

In [31]:
N = 5  # number of quantile buckets of the FX series
TAIL_WINDOW = 21  # rows summed by the rolling windows below

tail_fx = pca_1_fx.rolling(TAIL_WINDOW).sum().dropna().copy()
# Loop-invariant, so it is computed once instead of once per bucket.
tail_cds = pca_1_cds.rolling(TAIL_WINDOW).sum()

# Correlation between the two rolling sums, restricted to the dates on which
# the FX rolling sum falls inside each quantile bucket.
percentils = {}
for i in range(N):
    # i / N is exact at both ends (0.0 and 1.0). i * (1 / N) can fall just
    # short of 1.0 through rounding (e.g. N = 49), which would silently drop
    # the maximum from the last bucket.
    lower_bound, upper_bound = i / N, (i + 1) / N
    # Both edges are inclusive, so a value exactly on a shared edge is counted
    # in both neighbouring buckets.
    in_bucket = tail_fx.between(
        tail_fx.quantile(lower_bound), tail_fx.quantile(upper_bound)
    )
    tail_fx_filtered = tail_fx[in_bucket]
    percentils[f"{lower_bound:.0%} - {upper_bound:.0%}"] = tail_fx_filtered.corr(
        tail_cds.reindex(tail_fx_filtered.index)
    )
pd.Series(percentils).to_clipboard()