# Sentiment in FOMC statements

In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import pandas as pd
from datetime import date
from tqdm.auto import tqdm
from pathlib import Path

from matplotlib import pyplot as plt

from skfin.plot import bar, line

## Sentiment in FOMC statements: Loughran-McDonald dictionary

In this section, we measure sentiment with the Loughran-McDonald sentiment dictionary in two ways:
- sentiment = (#positive - #negative)/(#positive + #negative)
- sentiment = (#positive - #negative)/(#words)

In the first case, short documents (with few or no sentiment words) might lead to noisy or biased estimates, and the ratio is undefined (0/0) when a statement contains no sentiment word at all.

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from skfin.datasets import load_fomc_statements, load_loughran_mcdonald_dictionary
from skfin.text import coefs_plot
from skfin.text import show_text
from skfin.plot import line

In [None]:
# Load the two inputs of this section through the skfin dataset helpers.
# Further down, `statements` is read through its 'text' column and its index
# is resampled to business days, while `lm` is filtered on its `Word`,
# `Negative` and `Positive` columns.
statements = load_fomc_statements()
lm = load_loughran_mcdonald_dictionary()

In [None]:
# Corpus passed to the vectorizers: the 'text' column of `statements`,
# keeping the index of `statements`.
X = statements['text']

In [None]:
# Row selectors for the dictionary table: each entry maps the frame to a
# boolean mask that keeps one family of words.
funcs = {
    'negative': lambda x: x.Negative.gt(0),
    'positive': lambda x: x.Positive.gt(0),
    'all': lambda x: ~x.Word.isna(),
}
def get_total_count(X, lm, func):
    """Count, for each document, the occurrences of a subset of dictionary words.

    Parameters
    ----------
    X : pandas.Series
        Documents to scan; its index becomes the index of the result.
    lm : pandas.DataFrame
        Dictionary table with a `Word` column.
    func : callable or boolean mask
        Row selector handed to `lm.loc[...]`; the selected words (lower-cased)
        form the vocabulary that is counted.

    Returns
    -------
    pandas.Series
        Total number of vocabulary hits in each document, indexed like `X`.
    """
    import numpy as np  # local import: numpy is only imported in a later cell

    m = CountVectorizer(vocabulary=lm.loc[func].Word.str.lower().values)
    # Sum the sparse document-term matrix directly. Converting it to a dense
    # array first allocates an (n_documents x n_words) block only to reduce it
    # to one number per row, which is wasteful for a large vocabulary.
    totals = m.fit_transform(X).sum(axis=1)
    return pd.Series(np.asarray(totals).ravel(), index=X.index)

# One column of per-statement counts for each selector in `funcs`.
lm_counts = pd.concat(
    {name: get_total_count(X, lm, selector) for name, selector in funcs.items()},
    axis=1,
)

In [None]:
# Net tone relative to the number of sentiment words, carried forward on a
# business-day grid between statements.
line(
    lm_counts.positive.sub(lm_counts.negative)
    .div(lm_counts.positive.add(lm_counts.negative))
    .resample('B')
    .last()
    .ffill(),
    legend=False,
    title='Sentiment=(pos - neg)/(pos + neg) in FOMC statements',
)

In [None]:
# Net tone relative to the count of all dictionary words, carried forward on
# a business-day grid between statements.
line(
    lm_counts.positive.sub(lm_counts.negative)
    .div(lm_counts['all'])
    .resample('B')
    .last()
    .ffill(),
    legend=False,
    title='Sentiment=(pos - neg)/(all) in FOMC statements',
)

In [None]:
# Unit-weight lexica (lower-cased word -> 1) for the negative and positive
# dictionary words, used to highlight the selected statements.
lm_lexica = {
    label: pd.Series(1, index=lm.loc[lm[column] > 0].Word.str.lower().values)
    for label, column in [('negative', 'Negative'), ('positive', 'Positive')]
}
show_text(
    statements.loc[['2000-12-19', '2013-12-18', '2014-01-29']],
    lexica=lm_lexica,
    n=None,
)

## Sentiment in FOMC statements: supervised learning

Building on previous analyses, we build here a `scikit-learn` `Pipeline` with a `TfidfVectorizer` and a regularized regression, `ElasticNet`. The target is the market excess return (`Mkt-RF`), scaled by its exponentially weighted volatility, on the day of the statement.

In [None]:
import numpy as np
from skfin.datasets import load_kf_returns
from skfin.text import show_text
from pandas.tseries.offsets import BDay
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import ElasticNet, ElasticNetCV

In [None]:
# Load the "F-F_Research_Data_Factors_daily" dataset and keep its 'Daily'
# entry; the 'Mkt-RF' column of this table is the regression target below.
ret = load_kf_returns(filename="F-F_Research_Data_Factors_daily")['Daily']

In [None]:
# Return dates attached to the statements: every statement date, except that
# the three special days are replaced by the business day that follows them.
special_days = ['2008-01-22', '2010-05-09', '2020-03-15']
idx0 = pd.to_datetime(special_days)
idx = statements.index.difference(idx0).union(idx0 + BDay(1))

# Returns scaled by their exponentially weighted volatility (com=252), kept
# only on the dates selected above.
# NOTE(review): the volatility estimate at each date includes that date's own
# return -- confirm this is intended.
ret_fomc = (ret / ret.ewm(com=252).std()).loc[ret.index.intersection(idx)]

In [None]:
# Text regression: TF-IDF features on 1- to 3-grams (at most 500 terms,
# English stop words removed, alphabetic tokens of 3+ letters) followed by an
# elastic-net regression.
est = Pipeline(
    steps=[
        (
            'tfidf',
            TfidfVectorizer(
                vocabulary=None,
                ngram_range=(1, 3),
                max_features=500,
                stop_words='english',
                token_pattern=r'\b[a-zA-Z]{3,}\b',
            ),
        ),
        ('reg', ElasticNet(alpha=0.0075)),
    ]
)

# Keep only the dates that have both a statement and a non-missing target.
# NOTE(review): returns in `ret_fomc` that were moved to the next business day
# have no matching label in `statements.index` if the statements are indexed
# by release date, so the intersection drops them -- confirm this is intended.
y = ret_fomc['Mkt-RF'].dropna()
X = statements['text']
idx_ = y.index.intersection(X.index)
X = X.loc[idx_]
y = y.loc[idx_]
est.fit(X, y);

# Order the fitted terms by column position so that they line up with the
# regression coefficients.
vocab_ = pd.Series(est.named_steps['tfidf'].vocabulary_).sort_values().index
interpret_coef = pd.DataFrame(est.named_steps['reg'].coef_.T, index=vocab_)
coefs_plot(interpret_coef, title='Interpreted coefficients for trained model')

In [None]:
# Learned lexica: the ten terms with the largest and the ten terms with the
# smallest fitted coefficients.
lexica = dict(
    positive=interpret_coef.squeeze().nlargest(10),
    negative=interpret_coef.squeeze().nsmallest(10),
)

In [None]:
# Dates of the statements with the lowest and the highest in-sample prediction.
idx_ = (
    pd.Series(est.predict(X), index=X.index)
    .sort_values()
    .index[[0, -1]]
    .tolist()
)
show_text(statements.loc[idx_], lexica=lexica, n=None)