# Text

In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import pandas as pd
from datetime import date
from tqdm.auto import tqdm
from pathlib import Path

from matplotlib import pyplot as plt
from ml4pmt.plot import bar, line

## Loading the FOMC statements 

In [None]:
from ml4pmt.dataset import load_fomc_statements
# Load the FOMC statements. Later cells read the `text` column and use the
# index for date-based selection (`.index.year`, `.loc['YYYY-MM-DD']`).
# force_reload=False presumably reuses a local copy instead of fetching the
# data again -- TODO confirm in ml4pmt.dataset.
statements = load_fomc_statements(force_reload=False)

In [None]:
# `show_text` is otherwise only imported in the supervised-learning section
# further down, so on a fresh kernel (Restart & Run All) this cell would raise
# a NameError. Import it here so the cell runs in notebook order.
from ml4pmt.text import show_text

show_text(statements)

In [None]:
# Hand-picked dates as ISO strings. Not referenced by any other cell visible
# in this notebook -- presumably unscheduled/emergency announcements;
# TODO confirm the intent or remove.
special_days = ['2008-01-22', '2010-05-09', '2020-03-15']

## TFIDF vectorization 

In order to extract features from text, the simplest way is to count words. In `scikit-learn`, this is done with the function `CountVectorizer`. A slightly more advanced feature is to select words based on a `TFIDF` score, defined as the product of the term frequency (`TF`) and the inverse document frequency (`IDF`). More precisely, the `TFIDF` score trades off: 
- the terms that are frequent and therefore important in a corpus; 
- the terms that appear in almost all documents and therefore do not help to discriminate across documents. 

In `TfidfVectorizer`, terms can be filtered additionally with: 
- a `stop word` list
- min and max document frequencies or counts 
- some token pattern (e.g. that eliminates the short tokens). 

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF, PCA
from sklearn.pipeline import Pipeline

In [None]:
# TFIDF features over uni-, bi- and tri-grams built from purely alphabetic
# tokens of at least three letters. English stop words are dropped; a term is
# kept only if it occurs in at least 5 statements (min_df, a count) and in at
# most 80% of them (max_df, a proportion).
vectorizer = TfidfVectorizer(
    token_pattern=r'\b[a-zA-Z]{3,}\b',
    ngram_range=(1, 3),
    stop_words='english',
    min_df=5,
    max_df=0.8,
)
# Sparse statement-by-term matrix of TFIDF scores.
X = vectorizer.fit_transform(statements['text'].values)

In [None]:
# Terms retained by the vectorizer, in the column order of X.
cols = vectorizer.get_feature_names_out()
print(cols.shape[0])
# Peek at the first ten terms.
cols[:10].tolist()

Here are the tokens with the highest average TFIDF score across the statements:

In [None]:
# Dense statement-by-term frame of TFIDF scores.
df = pd.DataFrame(
    data=X.toarray(),
    columns=cols,
    index=statements['text'].index,
)
# Thirty terms with the highest mean score over all statements.
bar(
    df.mean().sort_values(ascending=False).iloc[:30],
    horizontal=True,
)

## Principal component exploration

To describe the matrix of tfidf scores, we first perform a simple principal component analysis (`PCA`) with two modes. 

In [None]:
# numpy is otherwise only imported in the supervised-learning section further
# down; import it here so this cell (and the plotting cells that follow, which
# also use `np`) run on a fresh kernel.
import numpy as np

# The TFIDF matrix is transposed before fitting, so PCA treats terms as
# samples and statements as features: `components_` has shape
# (2, n_statements), and its transpose gives one row of two loadings per
# statement.
m = PCA(n_components=2).fit(np.log1p(X.toarray().T))
df = pd.DataFrame(m.components_.T, index=statements.index)

In [None]:
# Scatter the statements in the plane of the two principal components, one
# colour per calendar year, with colours sampled evenly from the RdBu colormap.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(8, 7))
years = list(map(str, df.index.year.unique()))
colors = plt.cm.RdBu(np.linspace(0, 1, len(years)))
for year, color in zip(years, colors):
    # A 'YYYY' label selects every statement of that year.
    ax.scatter(x=df.loc[year, 0], y=df.loc[year, 1], color=color)
ax.legend(years, loc='center left', bbox_to_anchor=(1, 0.5))
ax.set_xlabel("PC 0")
ax.set_ylabel("PC 1")

# Annotate a single statement with its date.
d = '2020-03-03'
ax.text(x=df.loc[d, 0], y=df.loc[d, 1], s=d);

These two modes can be related to labor market and growth. 

In [None]:
def func(x):
    """Return the five largest values of ``x`` followed by its five smallest.

    Both halves are ordered from largest to smallest value.
    """
    descending = x.sort_values(ascending=False)
    return pd.concat([x.nlargest(), descending.tail(5)])


# Score of every term (rows of the transposed TFIDF matrix) on the two
# principal components.
W = pd.DataFrame(m.transform(np.log1p(X.toarray().T)), index=cols)

fig, ax = plt.subplots(1, 2, figsize=(16, 5))
fig.subplots_adjust(wspace=.25)
for i in range(2):
    bar(func(W[i]), horizontal=True, ax=ax[i])

## Unsupervised learning: document clustering

It is often informative to group tokens into topics that explain differences across documents. A powerful algorithm is the non-negative matrix factorisation (`NMF`): for a non-negative matrix $X$ (such as the one with tfidf scores), `NMF` finds two other non-negative matrices $W$ and $H$ such that: 

$$ X \approx W H. $$ 

The number of topics (called `n_components` in the `scikit-learn` implementation) determines the number of columns in $W$ and the number of rows in $H$. 

In [None]:
# Factorise the TFIDF matrix into `n_components` non-negative topics.
n_components = 8
m = NMF(
    n_components=n_components,
    init='nndsvd', solver='cd', beta_loss='frobenius',
    alpha_W=0, l1_ratio=0,
    max_iter=500,
    random_state=1,
)
# `fit` returns the estimator itself; re-binding keeps the cell output empty.
m = m.fit(X)

In [None]:
# One panel per topic, showing its ten highest-weighted terms. The grid is
# derived from the fitted model rather than hard-coded, so the figure follows
# whatever number of topics the NMF was fitted with (for 8 topics this is the
# same 4x2 grid of size 20x16).
n_topics = m.components_.shape[0]
n_rows = (n_topics + 1) // 2  # two panels per row, rounding up
fig, ax = plt.subplots(n_rows, 2, figsize=(20, 4 * n_rows), sharex=True, squeeze=False)
ax = ax.ravel()
for i in range(n_topics):
    bar(pd.Series(m.components_[i, :], cols)
          .sort_values(ascending=False).head(10),
        horizontal=True, ax=ax[i], title=i)
# With an odd number of topics the last panel is unused: hide it.
for spare in ax[n_topics:]:
    spare.set_visible(False)

Are these topics interesting? This is a matter of interpretation, but at least the graph below shows that these topics capture a strong element of time-clustering, which makes them a bit less useful.  

In [None]:
# Per-statement topic loadings. The rows of `m.transform(X)` follow the rows
# of `statements`, so index by `statements.index` directly: `df` is re-bound
# by several cells of this notebook, and borrowing its index only works for
# one particular execution order.
W = pd.DataFrame(m.transform(X), index=statements.index)
line(W.resample('B').last().ffill(), cumsum=True, title='Cumulative topic loadings')

## Supervised learning: TFIDF + Elastic net

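In this section, we use the corpus of FOMC statements for supervised learning. More precisely, we match the text of the statements to the decision of the committee to raise rates, decrease rates or do nothing. In the code below, only the statements released on rate-change dates are used for training, labelled `+1` for an increase and `-1` for a decrease.  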

In practice, this is implemented using `scikit-learn` pipelines, chaining the `TfidfVectorizer` with a logistic regression. 

In [None]:
import numpy as np 
from ml4pmt.dataset import load_fomc_change_date
# Two collections of dates, used below as row labels into `statements` and
# labelled +1 (`up`) and -1 (`dw`) -- presumably the dates of rate increases
# and decreases; TODO confirm against ml4pmt.dataset.
fomc_change_up, fomc_change_dw = load_fomc_change_date()

In [None]:
from sklearn.linear_model import LogisticRegression, ElasticNet
from ml4pmt.text import coefs_plot, show_text

In [None]:
# TFIDF features (the 500 top n-grams by corpus frequency) feeding an
# elastic-net-penalised logistic regression.
est = Pipeline([('tfidf', TfidfVectorizer(vocabulary=None,
                                          ngram_range=(1, 3),
                                          max_features=500,
                                          stop_words='english',
                                          token_pattern=r'\b[a-zA-Z]{3,}\b')),
                # The `saga` solver shuffles the data, so without a seed the
                # fitted coefficients change from run to run. Use the same
                # seed as the NMF cell to keep the notebook reproducible.
                ('reg', LogisticRegression(C=1, l1_ratio=.35,
                                           penalty='elasticnet',
                                           solver='saga', max_iter=500,
                                           random_state=1)),
               ])
# Training set: statements on the rate-change dates, labelled +1 (up) or -1 (down).
X, y = pd.concat([statements.loc[fomc_change_up].assign(change=1),
                  statements.loc[fomc_change_dw].assign(change=-1)]).pipe(lambda labelled: (labelled['text'], labelled['change']))
est.fit(X, y);
# `vocabulary_` maps term -> column index; sorting by that index yields the
# terms in the same order as the columns of the regression coefficients.
vocab_ = pd.Series(est.named_steps['tfidf'].vocabulary_).sort_values().index

In [None]:
# One row per vocabulary term holding its fitted coefficient.
interpret_coef = pd.DataFrame(
    data=est.named_steps['reg'].coef_.T,
    index=vocab_,
)
coefs_plot(interpret_coef, title='Interpreted coefficients for trained model')

A useful trick is to use a linear regression (e.g. `ElasticNet`) instead of a logistic regression: it is faster and as effective (sometimes even better).

In [None]:
# TFIDF features (at most 500 n-grams) feeding a linear ElasticNet regressor
# fitted on the +1 / -1 labels.
est = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(
        token_pattern=r'\b[a-zA-Z]{3,}\b',
        stop_words='english',
        ngram_range=(1, 3),
        max_features=500,
        vocabulary=None,
    )),
    ('reg', ElasticNet(alpha=0.01)),
])
# Training set: statements on the rate-change dates, labelled +1 (up) or -1 (down).
X, y = (
    pd.concat([
        statements.loc[fomc_change_up].assign(change=1),
        statements.loc[fomc_change_dw].assign(change=-1),
    ])
    .pipe(lambda labelled: (labelled['text'], labelled['change']))
)
est.fit(X, y);
# Terms ordered by their column index in the TFIDF matrix.
vocab_ = pd.Series(est.named_steps['tfidf'].vocabulary_).sort_values().index

In [None]:
# One row per vocabulary term holding its fitted coefficient.
interpret_coef = pd.DataFrame(
    data=est.named_steps['reg'].coef_.T,
    index=vocab_,
)
coefs_plot(interpret_coef, title='Interpreted coefficients for trained model')

In [None]:
# Event dates (ISO strings) used to annotate the implied-rate plot below.
# `other_dt_change` and `statements_dt_change_other` are not referenced by any
# other cell visible in this notebook.
other_dt_change = ['2003-01-09', '2008-03-16', '2011-06-22']
statements_dt_change_other = ['2007-08-16']
# Names presumably refer to the Fed's asset-purchase programmes (QE1-3,
# "Operation Twist") and the March 2020 response -- TODO confirm.
qe1 = ['2008-11-25', '2008-12-01', '2008-12-16', '2009-03-18']
qe2 = ['2010-11-03']
twist = ['2011-09-21', '2012-06-20']
# NOTE(review): verify '2013-12-13' against the statement dates; the December
# 2013 FOMC statement may be dated 2013-12-18.
qe3 = ['2012-09-13', '2012-12-12', '2013-12-13']
corona = ['2020-03-20']

In [None]:
fig, ax = plt.subplots(figsize=(8, 6))
# Model output for every statement, carried forward on a business-day grid.
df = pd.Series(est.predict(statements['text']), index=statements.index).resample('B').last().ffill()
# Overlay +1 / -1 indicators on the rate-change dates (0 elsewhere).
line(
    df.rename('implied rate').to_frame()
      .join(pd.Series(1, index=fomc_change_up).reindex(df.index).fillna(0).rename('up'))
      .join(pd.Series(-1, index=fomc_change_dw).reindex(df.index).fillna(0).rename('dw')),
    sort=False, ax=ax, title='Implied interest rate (with forward information)',
)
# Star markers on the event dates, drawn in the same order as the legend
# labels; qe1 and qe3 are drawn without a connecting line.
for dates, style in [(corona, {}), (twist, {}), (qe1, {'ls': 'None'}),
                     (qe2, {}), (qe3, {'ls': 'None'})]:
    ax.plot(df.loc[dates], marker='*', ms=10, **style)
ax.legend(
    ['implied rate', 'up', 'down', 'corona', 'twist', 'qe1', 'qe2', 'qe3'],
    loc='center left',
    bbox_to_anchor=(1, 0.5),
);

In [None]:
# Ten terms with the largest and ten with the smallest fitted coefficients.
lexica = dict(
    positive=interpret_coef.squeeze().nlargest(10),
    negative=interpret_coef.squeeze().nsmallest(10),
)

In [None]:
# Labels of the training statements with the lowest and the highest prediction.
idx_ = (
    pd.Series(est.predict(X), index=X.index)
    .sort_values()
    .index[[0, -1]]
    .tolist()
)
show_text(statements.loc[idx_], lexica=lexica, n=None)