# Supervised Learning on FOMC Statements: Predicting Rate Decisions and Market Reactions

In [None]:
# hide
%load_ext autoreload
%autoreload 2
%matplotlib inline

from datetime import date
from pathlib import Path

import pandas as pd
from matplotlib import pyplot as plt
from skfin.plot import bar, line
from tqdm.auto import tqdm

import numpy as np
from pandas.tseries.offsets import BDay
from skfin.datasets_ import load_kf_returns
from skfin.text import show_text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import ElasticNet, ElasticNetCV
from sklearn.pipeline import Pipeline

## Supervised learning: vector representation + Elastic net

In this section, we use the corpus of FOMC statements for supervised learning. More precisely, we match the text of the statements to the decision of the committee to raise rates, decrease rates or do nothing.  

In practice, this is implemented with a `scikit-learn` `Pipeline` that chains a `TfidfVectorizer` with a logistic regression.

In [None]:
from skfin.datasets_ import load_fomc_statements
from skfin.dataloaders.constants.dates import load_fomc_change_date

# Rate-change dates, unpacked into two collections; the names suggest
# (increases, decreases) — confirm against skfin's loader.
fomc_change_up, fomc_change_dw = load_fomc_change_date()

# FOMC statement corpus. `force_reload=False` presumably reuses a cached copy
# instead of fetching again — TODO confirm against skfin's loader.
statements = load_fomc_statements(force_reload=False)

In [None]:
# Hand-entered dates (ISO strings) grouped under short labels. They are pooled
# into the "other" category when the `dates` mapping is built, and several
# groups are highlighted on the implied-rate figure.
other = dict(
    other_dt_change=["2003-01-09", "2008-03-16", "2011-06-22"],
    statements_dt_change_other=["2007-08-16"],
    qe1=["2008-11-25", "2008-12-01", "2008-12-16", "2009-03-18"],
    qe2=["2010-11-03"],
    twist=["2011-09-21", "2012-06-20"],
    qe3=["2012-09-13", "2012-12-12", "2013-12-13"],
    corona=["2020-03-20"],
)

In [None]:
# Statement dates by category: rate increases, rate decreases, the pooled
# hand-labelled "other" events, and finally every remaining statement date.
dates = {"up": fomc_change_up, "dw": fomc_change_dw}
dates["other"] = [day for group in other.values() for day in group]
# "no change" = statement dates that are in none of the categories above.
dates["no change"] = statements.index.difference(
    [day for group in dates.values() for day in group]
)

In [None]:
from skfin.text import coefs_plot, show_text
from sklearn.linear_model import ElasticNet, LogisticRegression
from sklearn.preprocessing import FunctionTransformer

In [None]:
# Classifier: statement text -> tf-idf n-gram weights -> log1p -> logistic
# regression with an elastic-net penalty.
est = Pipeline(
    [
        (
            "tfidf",
            TfidfVectorizer(
                vocabulary=None,
                ngram_range=(1, 3),  # unigrams, bigrams and trigrams
                max_features=500,  # cap on the vocabulary size
                stop_words="english",
                token_pattern=r"\b[a-zA-Z]{3,}\b",  # alphabetic tokens of 3+ letters
            ),
        ),
        ("log1p", FunctionTransformer(np.log1p)),
        (
            "reg",
            LogisticRegression(
                C=1,
                l1_ratio=0.35,
                penalty="elasticnet",
                solver="saga",
                max_iter=500,
                # The saga solver shuffles the data while optimising; pinning
                # the seed makes Restart & Run All reproduce the coefficients.
                random_state=0,
            ),
        ),
    ]
)
# Training sample: only the statements on a rate-change date, labelled
# +1 (dates in `fomc_change_up`) or -1 (dates in `fomc_change_dw`).
X, y = pd.concat(
    [
        statements.loc[fomc_change_up].assign(change=1),
        statements.loc[fomc_change_dw].assign(change=-1),
    ]
).pipe(lambda df: (df["text"], df["change"]))
est.fit(X, y)
# Vocabulary terms sorted by their column position in the tf-idf matrix, so
# that they line up with the fitted coefficients.
vocab_ = pd.Series(est.named_steps["tfidf"].vocabulary_).sort_values().index

In [None]:
# Fitted coefficients with one row per vocabulary term.
interpret_coef = pd.DataFrame(
    data=est.named_steps["reg"].coef_.T,
    index=vocab_,
)
coefs_plot(interpret_coef, title="Interpreted coefficients for trained model")

A useful trick: a linear regression (e.g. `ElasticNet`) fitted on the ±1 labels is faster than a logistic regression and works just as well (sometimes even better).

In [None]:
# Training sample: statements on rate-change dates, labelled +1 for dates in
# `fomc_change_up` and -1 for dates in `fomc_change_dw`.
rate_moves = pd.concat(
    [
        statements.loc[fomc_change_up].assign(change=1),
        statements.loc[fomc_change_dw].assign(change=-1),
    ]
)
X, y = rate_moves["text"], rate_moves["change"]

# Same text features as the classifier, but with a linear ElasticNet
# regression on the +1/-1 labels as the final step.
est = Pipeline(
    steps=[
        (
            "tfidf",
            TfidfVectorizer(
                vocabulary=None,
                ngram_range=(1, 3),
                max_features=500,
                stop_words="english",
                token_pattern=r"\b[a-zA-Z]{3,}\b",
            ),
        ),
        ("log1p", FunctionTransformer(np.log1p)),
        ("reg", ElasticNet(alpha=0.01)),
    ]
)
est.fit(X, y)

# Vocabulary terms ordered by tf-idf column position (matches coefficient order).
vocab_ = pd.Series(est.named_steps["tfidf"].vocabulary_).sort_values().index

In [None]:
# Fitted regression coefficients, one row per vocabulary term.
interpret_coef = pd.DataFrame(
    data=est.named_steps["reg"].coef_.T,
    index=vocab_,
)
coefs_plot(interpret_coef, title="Interpreted coefficients for trained model")

In [None]:
fig, ax = plt.subplots(figsize=(8, 6))

# Model prediction for every statement, carried forward on a business-day grid.
statement_pred = pd.Series(est.predict(statements["text"]), index=statements.index)
pred_tfidf = statement_pred.resample("B").last().ffill()

# +1 / -1 markers on the rate-change dates and 0 elsewhere, on the same grid.
up_marks = pd.Series(1, index=fomc_change_up).reindex(pred_tfidf.index).fillna(0)
dw_marks = pd.Series(-1, index=fomc_change_dw).reindex(pred_tfidf.index).fillna(0)

plot_frame = (
    pred_tfidf.rename("implied rate")
    .to_frame()
    .join(up_marks.rename("up"))
    .join(dw_marks.rename("dw"))
)
line(
    plot_frame,
    sort=False,
    ax=ax,
    title="Implied interest rate (with forward information)",
)

# Overlay the hand-labelled event groups as star markers; the legend entries
# below are listed in the same order as the lines were drawn.
cols = ["corona", "twist", "qe1", "qe2", "qe3"]
for c in cols:
    ax.plot(pred_tfidf.loc[other[c]], marker="*", ms=10)
legend_labels = ["implied rate", "up", "down"] + cols
ax.legend(legend_labels, loc="center left", bbox_to_anchor=(1, 0.5));

In [None]:
# The ten terms with the largest and the ten with the smallest coefficients.
lexica = dict(
    positive=interpret_coef.squeeze().nlargest(10),
    negative=interpret_coef.squeeze().nsmallest(10),
)

In [None]:
# In-sample predictions sorted ascending; keep the dates of the lowest and the
# highest prediction and display those two statements.
sorted_pred = pd.Series(est.predict(X), index=X.index).sort_values()
idx_ = [sorted_pred.index[0], sorted_pred.index[-1]]
show_text(statements.loc[idx_], lexica=lexica, n=None)

### Comparison with sentence transformer embeddings

To test the usefulness of the `SentenceTransformer` embeddings, we regress the rate decision on the embeddings. Warning: this is a full-sample regression, so it is only an illustration, not a statistical test.

In [None]:
from sentence_transformers import SentenceTransformer

lm_name = "all-distilroberta-v1"
# `trust_remote_code` is deliberately not enabled: this is a standard
# sentence-transformers checkpoint, so there is no reason to allow execution of
# code shipped with the model repository.
m = SentenceTransformer(lm_name, device="cpu")
# Encode every statement text; small batches keep memory use low on CPU.
X_sbert = m.encode(statements["text"].values, batch_size=2)

In [None]:
# Embeddings as a frame indexed like the statements.
df = pd.DataFrame(X_sbert, index=statements.index)

# Training sample: embeddings on rate-change dates, labelled +1 for dates in
# `fomc_change_up` and -1 for dates in `fomc_change_dw`.
sbert_moves = pd.concat(
    [
        df.loc[fomc_change_up].assign(change=1),
        df.loc[fomc_change_dw].assign(change=-1),
    ]
)
X_ = sbert_moves.drop(columns="change")
y_ = sbert_moves["change"]

m = ElasticNet(alpha=0.01)
m.fit(X_, y_)

# Prediction for every statement, carried forward on a business-day grid.
statement_pred_sbert = pd.Series(m.predict(df), index=statements.index)
pred_sbert = statement_pred_sbert.resample("B").last().ffill()

In [None]:
# Put both prediction series side by side and read their pairwise correlation.
both_preds = pd.concat({"sbert": pred_sbert, "tdfidf": pred_tfidf}, axis=1)
corr_tfidf_sbert = both_preds.corr().loc["sbert", "tdfidf"]
print(
    f"The correlation of the in-sample prediction for the decisions of the Fed for the two text representations (tfidf and sbert) is {corr_tfidf_sbert:.2f}."
)

In [None]:
# Both in-sample prediction series, each divided by its own standard deviation
# so that they can be compared on one axis.
line(
    pd.concat({"sbert": pred_sbert, "tfidf": pred_tfidf}, axis=1).pipe(
        lambda x: x.div(x.std())
    ),
    title="In-sample rate-decision predictions (unit standard deviation): tfidf vs sbert",
)

## Sentiment in FOMC statements: supervised learning

Building on the previous analyses, we build here a `scikit-learn` pipeline with a `TfidfVectorizer` and a regularized regression (`ElasticNet`). The target is the market excess return (`Mkt-RF`) on the day of the statement, scaled by an exponentially weighted estimate of its volatility.

In [None]:
# Load the daily factor file and keep its "Daily" entry.
kf_factors = load_kf_returns(filename="F-F_Research_Data_Factors_daily")
ret = kf_factors["Daily"]

In [None]:
# Statement dates that are replaced by the following business day when
# selecting returns.
special_days = ["2008-01-22", "2010-05-09", "2020-03-15"]
idx0 = pd.to_datetime(pd.Index(special_days))
next_business_day = idx0 + BDay(1)

# All statement dates, with the special days swapped for their next business day.
# NOTE(review): the shifted dates are generally not statement dates, so a later
# intersection with `statements.index` would drop them — confirm this is intended.
idx = statements.index.difference(idx0).union(next_business_day)

# Returns divided by their exponentially weighted standard deviation (com=252),
# restricted to the selected dates that exist in the return data.
scaled_ret = ret.div(ret.ewm(com=252).std())
ret_fomc = scaled_ret.loc[ret.index.intersection(idx)]

In [None]:
# Target: scaled market excess return on the selected dates; features: the
# statement text on the dates present in both series.
mkt_ret = ret_fomc["Mkt-RF"].dropna()
idx_ = mkt_ret.index.intersection(statements.index)
X = statements["text"].loc[idx_]
y = mkt_ret.loc[idx_]

# tf-idf n-gram features fed straight into an ElasticNet regression.
est = Pipeline(
    steps=[
        (
            "tfidf",
            TfidfVectorizer(
                vocabulary=None,
                ngram_range=(1, 3),
                max_features=500,
                stop_words="english",
                token_pattern=r"\b[a-zA-Z]{3,}\b",
            ),
        ),
        ("reg", ElasticNet(alpha=0.0075)),
    ]
)
est.fit(X, y)

# Vocabulary terms ordered by tf-idf column position, paired with the fitted
# coefficients (one row per term).
vocab_ = pd.Series(est.named_steps["tfidf"].vocabulary_).sort_values().index
interpret_coef = pd.DataFrame(
    data=est.named_steps["reg"].coef_.T,
    index=vocab_,
)
coefs_plot(interpret_coef, title="Interpreted coefficients for trained model")

In [None]:
# The ten terms with the largest and the ten with the smallest coefficients
# in the market-return model.
lexica = dict(
    positive=interpret_coef.squeeze().nlargest(10),
    negative=interpret_coef.squeeze().nsmallest(10),
)

In [None]:
# In-sample predictions sorted ascending; show the statements with the lowest
# and the highest predicted value.
sorted_pred = pd.Series(est.predict(X), index=X.index).sort_values()
idx_ = [sorted_pred.index[0], sorted_pred.index[-1]]
show_text(statements.loc[idx_], lexica=lexica, n=None)