# Processing Federal Reserve statements

This processes all collated Federal Reserve statements from the [scraper](https://github.com/pmagtulis/fed-statement-scraper.git).

## Do all your imports

In [1]:
import pandas as pd
import numpy as np
import re
import altair as alt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import stopwordsiso as stopwords

## Read CSV

In [2]:
# Load the collated statements written out by the scraper, then preview the
# most recent rows.
merged = pd.read_csv("merged.csv")
merged.tail(15)

Unnamed: 0,meetings,links,statements
39,2022-03-16,https://www.federalreserve.gov/newsevents/pres...,Indicators of economic activity and employment...
40,2022-05-04,https://www.federalreserve.gov/newsevents/pres...,Although overall economic activity edged down ...
41,2022-06-15,https://www.federalreserve.gov/newsevents/pres...,Overall economic activity appears to have pick...
42,2022-07-27,https://www.federalreserve.gov/newsevents/pres...,Recent indicators of spending and production h...
43,2022-09-21,https://www.federalreserve.gov/newsevents/pres...,Recent indicators point to modest growth in sp...
44,2022-11-02,https://www.federalreserve.gov/newsevents/pres...,Recent indicators point to modest growth in sp...
45,2022-12-14,https://www.federalreserve.gov/newsevents/pres...,Recent indicators point to modest growth in sp...
46,2023-02-01,https://www.federalreserve.gov/newsevents/pres...,Recent indicators point to modest growth in sp...
47,2023-03-22,https://www.federalreserve.gov/newsevents/pres...,Recent indicators point to modest growth in sp...
48,2023-05-03,https://www.federalreserve.gov/newsevents/pres...,Economic activity expanded at a modest pace in...


## Data cleaning

We need to remove the entries whose statement is NaN for now, because otherwise they cannot be processed by our vectorizer.

In [3]:
# Drop every row whose statement text is missing rather than hard-coding row
# labels: fixed labels go stale as soon as the scraper output changes, and
# dropping them a second time (re-running this cell) raises a KeyError.
merged = merged.dropna(subset=['statements'])
merged.tail(10)

Unnamed: 0,meetings,links,statements
42,2022-07-27,https://www.federalreserve.gov/newsevents/pres...,Recent indicators of spending and production h...
43,2022-09-21,https://www.federalreserve.gov/newsevents/pres...,Recent indicators point to modest growth in sp...
44,2022-11-02,https://www.federalreserve.gov/newsevents/pres...,Recent indicators point to modest growth in sp...
45,2022-12-14,https://www.federalreserve.gov/newsevents/pres...,Recent indicators point to modest growth in sp...
46,2023-02-01,https://www.federalreserve.gov/newsevents/pres...,Recent indicators point to modest growth in sp...
47,2023-03-22,https://www.federalreserve.gov/newsevents/pres...,Recent indicators point to modest growth in sp...
48,2023-05-03,https://www.federalreserve.gov/newsevents/pres...,Economic activity expanded at a modest pace in...
49,2023-06-14,https://www.federalreserve.gov/newsevents/pres...,Recent indicators suggest that economic activi...
50,2023-07-26,https://www.federalreserve.gov/newsevents/pres...,Recent indicators suggest that economic activi...
51,2023-09-20,https://www.federalreserve.gov/newsevents/pres...,Recent indicators suggest that economic activi...


## Text analysis

Now we can proceed with the text analysis proper. First, we set the parameters in the cell immediately below, most importantly the stopwords we want our analysis to disregard.

In [4]:
def preprocess_text(text):
    """Normalise a statement before tokenisation.

    The text is lower-cased and every run of digits is stripped out, so
    numbers never end up in the vocabulary.
    """
    lowered = text.lower()
    return re.sub(r'\d+', '', lowered)

In [5]:
# Parameters shared by the vectorizers in this notebook.
y_columns = ['meetings', 'statements']
BINARY=False          # keep real counts instead of 0/1 presence flags
NGRAM_RANGE=(1,1)     # single words only
MIN_DF=5 #omit words not used that much in documents.

# English stopwords, extended with names of FOMC members, which often appear
# in the Fed's statements and would otherwise pollute the vocabulary.
STPWORDS=stopwords.stopwords(["en"])
STPWORDS.update(['committee', 'patrick', 'harker', 'jerome', 'powell', 'lael', 'brainard',
                'michelle', 'bowman', 'christopher', 'waller', 'lisa', 'cook','esther',
                'george', 'loretta', 'mester', 'james', 'bullard', 'john', 'williams'])

vectorizer = CountVectorizer(
    # STPWORDS is a set, but scikit-learn documents `stop_words` as
    # 'english', a list, or None, so hand it a (deterministically ordered) list.
    stop_words=sorted(STPWORDS),
    ngram_range=NGRAM_RANGE,
    binary=BINARY,
    min_df=MIN_DF,
    preprocessor=preprocess_text
)

## Vectorizing

Simple counting of the words that occur in each statement.

In [6]:
# Learn the vocabulary from the statements and build the document-term matrix
# (one row per statement, one column per retained term).
X = vectorizer.fit_transform(merged['statements'])
X  # show the sparse-matrix summary



<52x302 sparse matrix of type '<class 'numpy.int64'>'
	with 5950 stored elements in Compressed Sparse Row format>

In [7]:
# Expand the sparse count matrix into a DataFrame with one column per
# vocabulary term. No rounding is needed here: the values are integer counts.
fed_vectors = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
fed_vectors.tail(15)

Unnamed: 0,accommodative,account,achieve,achieved,action,activity,addition,additional,adjust,adjustments,...,virus,voted,voting,war,weak,weaker,weathered,weigh,weighing,wide
37,2,1,1,0,1,2,0,0,2,0,...,2,0,1,0,0,0,0,0,0,1
38,2,1,1,0,1,2,0,0,1,0,...,2,1,1,0,0,0,0,0,0,1
39,0,1,1,0,2,2,1,1,1,0,...,0,1,2,0,0,0,0,1,0,1
40,0,1,1,0,1,2,2,1,1,0,...,0,1,1,0,0,0,0,1,0,1
41,0,1,1,0,2,2,2,1,1,0,...,0,1,2,0,0,0,0,0,1,1
42,0,1,1,0,1,1,1,1,1,0,...,0,0,1,2,0,0,0,0,1,1
43,0,1,1,0,1,1,1,1,1,0,...,0,0,1,2,0,0,0,0,1,1
44,0,2,1,0,1,2,1,1,1,0,...,0,0,1,2,0,0,0,0,1,1
45,0,2,1,0,1,2,1,0,1,0,...,0,0,1,2,0,0,0,0,1,1
46,0,2,1,0,1,1,1,0,1,0,...,0,0,1,1,0,0,0,0,0,1


In [8]:
# Raw counts for a handful of terms of interest; edit the list to track others.
fed_slice2 = fed_vectors.loc[:, ['inflation', 'transitory', 'employment', 'raise', 'reduce']]
fed_slice2.sort_index().round(2)

Unnamed: 0,inflation,transitory,employment,raise,reduce
0,13,0,3,0,0
1,13,0,2,1,0
2,12,0,2,0,0
3,9,0,2,1,0
4,9,0,2,0,0
5,8,0,2,1,0
6,8,0,2,0,0
7,8,0,2,1,0
8,9,0,2,0,0
9,9,0,3,0,0


In [9]:
# Flip the table so each row is a term and each column is a statement.
# NOTE: this rebinds fed_vectors, so running the cell twice flips it back.
fed_vectors = fed_vectors.T

In [10]:
# Inspect the transposed counts: terms as rows, statements as columns.
fed_vectors

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,42,43,44,45,46,47,48,49,50,51
accommodative,1,1,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
account,1,1,1,1,1,1,1,1,1,1,...,1,1,2,2,2,2,2,2,2,2
achieve,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1
achieved,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
action,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
weaker,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
weathered,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
weigh,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,1,1,1
weighing,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,0,0,0,0,0,0


In [11]:
# Rank terms by their count in the most recent statement (the last column)
# instead of a hard-coded column label, which goes stale as statements are added.
latest_statement = fed_vectors.columns[-1]
fed_vectors.sort_values(latest_statement, ascending=False).head(20)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,42,43,44,45,46,47,48,49,50,51
inflation,13,13,12,9,9,8,8,8,9,9,...,7,7,9,9,8,9,9,9,9,9
policy,3,3,3,2,2,1,1,1,1,1,...,3,3,6,6,6,8,7,7,7,7
monetary,3,3,3,2,2,1,1,1,1,1,...,3,3,6,6,6,7,6,6,6,6
economic,6,7,6,4,4,4,4,6,4,4,...,3,3,5,5,4,4,5,5,5,5
percent,5,5,5,5,5,4,4,4,4,4,...,3,3,4,4,4,4,4,4,4,4
remains,1,1,1,1,1,0,0,0,0,2,...,1,1,1,1,1,2,3,3,3,3
continue,0,0,0,0,0,0,0,1,0,0,...,2,2,2,2,2,2,2,2,3,3
activity,2,2,2,2,2,2,2,2,2,2,...,1,1,2,2,1,2,3,3,3,3
rate,7,7,7,5,5,5,5,5,5,5,...,3,3,3,3,3,3,3,3,3,3
securities,0,0,0,0,0,0,0,0,0,0,...,2,2,2,2,2,2,2,2,2,2


# TF-IDF

In [12]:
# Re-vectorize the same statements with TF-IDF weights instead of raw counts,
# reusing the parameters of the count-based run.
vectorizer = TfidfVectorizer(
    # STPWORDS is a set, but scikit-learn documents `stop_words` as
    # 'english', a list, or None, so hand it a (deterministically ordered) list.
    stop_words=sorted(STPWORDS),
    ngram_range=NGRAM_RANGE,
    binary=BINARY,
    min_df=MIN_DF,
    preprocessor=preprocess_text
)
X = vectorizer.fit_transform(merged['statements'])
# One row per statement, one column per vocabulary term.
fed_idf = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
fed_idf.round(2)



Unnamed: 0,accommodative,account,achieve,achieved,action,activity,addition,additional,adjust,adjustments,...,virus,voted,voting,war,weak,weaker,weathered,weigh,weighing,wide
0,0.06,0.03,0.0,0.0,0.03,0.06,0.0,0.0,0.0,0.11,...,0.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,0.0,0.03
1,0.06,0.03,0.0,0.0,0.03,0.06,0.0,0.0,0.0,0.1,...,0.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,0.0,0.03
2,0.06,0.03,0.0,0.0,0.03,0.06,0.0,0.0,0.0,0.11,...,0.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,0.0,0.03
3,0.07,0.04,0.0,0.0,0.04,0.08,0.0,0.0,0.0,0.07,...,0.0,0.0,0.04,0.0,0.0,0.0,0.0,0.0,0.0,0.04
4,0.08,0.04,0.0,0.0,0.04,0.08,0.0,0.0,0.0,0.07,...,0.0,0.0,0.04,0.0,0.0,0.0,0.0,0.0,0.0,0.04
5,0.0,0.04,0.0,0.0,0.04,0.09,0.0,0.0,0.0,0.07,...,0.0,0.0,0.04,0.0,0.0,0.0,0.0,0.0,0.0,0.04
6,0.0,0.04,0.0,0.0,0.04,0.08,0.0,0.0,0.0,0.07,...,0.0,0.0,0.04,0.0,0.0,0.0,0.0,0.0,0.0,0.04
7,0.0,0.04,0.0,0.0,0.04,0.08,0.0,0.0,0.0,0.07,...,0.0,0.0,0.04,0.0,0.0,0.0,0.0,0.0,0.0,0.04
8,0.0,0.04,0.0,0.0,0.04,0.08,0.0,0.0,0.0,0.14,...,0.0,0.0,0.04,0.0,0.0,0.0,0.0,0.0,0.0,0.04
9,0.0,0.04,0.0,0.0,0.04,0.08,0.0,0.0,0.0,0.13,...,0.0,0.0,0.04,0.0,0.0,0.0,0.0,0.0,0.0,0.04


In [13]:
# Flip the TF-IDF table so each row is a term and each column is a statement.
fed_idf2 = fed_idf.T

In [14]:
# Rank terms by TF-IDF weight in the most recent statement. Using the last
# column keeps this correct when further statements are appended.
fed_idf2.sort_values(fed_idf2.columns[-1], ascending=False).head(15)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,42,43,44,45,46,47,48,49,50,51
inflation,0.415084,0.386739,0.369012,0.351871,0.358899,0.344388,0.329075,0.31161,0.362574,0.344673,...,0.318346,0.31514,0.338661,0.342467,0.329796,0.3348,0.348406,0.342897,0.346382,0.34546
policy,0.090505,0.084325,0.087164,0.07388,0.075356,0.040674,0.038865,0.036803,0.038064,0.036185,...,0.128909,0.12761,0.213321,0.215718,0.233703,0.281184,0.256035,0.251987,0.254548,0.25387
extent,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.113438,0.204727,0.213047,0.209678,0.21181,0.211246
monetary,0.087153,0.081201,0.083936,0.071144,0.072565,0.039167,0.037426,0.035439,0.036654,0.034844,...,0.124134,0.122883,0.205419,0.207728,0.225047,0.236923,0.21133,0.207988,0.210102,0.209543
email,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.18988,0.192014,0.208023,0.187715,0.195344,0.192255,0.194209,0.193692
economic,0.187983,0.204337,0.181044,0.153453,0.156518,0.168964,0.161451,0.229323,0.158121,0.150314,...,0.133875,0.132526,0.184615,0.18669,0.161804,0.146008,0.189928,0.186924,0.188824,0.188322
additional,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.099528,0.098525,0.08235,0.0,0.0,0.081411,0.08472,0.166761,0.168456,0.168007
remains,0.046568,0.043388,0.04485,0.057022,0.058161,0.0,0.0,0.0,0.0,0.11171,...,0.066329,0.06566,0.054881,0.055498,0.060125,0.10851,0.16938,0.166702,0.168396,0.167948
percent,0.150842,0.140541,0.145274,0.184701,0.18839,0.162696,0.155462,0.147211,0.152256,0.144738,...,0.128909,0.12761,0.142214,0.143812,0.155802,0.140592,0.146306,0.143992,0.145456,0.145069
continue,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.045413,0.0,0.0,...,0.106044,0.104976,0.087742,0.088728,0.096126,0.086742,0.090267,0.08884,0.134614,0.134255


## Looking for specific words

In this part, we look for specific words and see how relevant they were in the Fed's statements over time.

The cell below covers four words initially, ones that we believe matter to the Fed's statements.

In [19]:
# TF-IDF weights for the terms we want to follow; edit the list to track others.
fed_slice = fed_idf.loc[:, ['inflation', 'raise', 'increase', 'reduce']]
fed_slice.sort_index().round(2)

Unnamed: 0,inflation,raise,increase,reduce
0,0.42,0.0,0.0,0.0
1,0.39,0.06,0.0,0.0
2,0.37,0.0,0.0,0.0
3,0.35,0.08,0.0,0.0
4,0.36,0.0,0.0,0.0
5,0.34,0.08,0.0,0.0
6,0.33,0.0,0.0,0.0
7,0.31,0.08,0.0,0.0
8,0.36,0.0,0.0,0.0
9,0.34,0.0,0.0,0.0


In [20]:
# Reshape from wide (one column per term) to long (one row per statement/term
# pair). The index levels and the value column are named explicitly, so the
# result does not depend on pandas' auto-generated 'level_0'/'level_1' labels.
fed_slice = (
    fed_slice.stack()
    .rename_axis(index=['sona_no', 'term'])
    .reset_index(name='tfidf')
)
fed_slice.head()

Unnamed: 0,sona_no,term,tfidf
0,0,inflation,0.415084
1,0,raise,0.0
2,0,increase,0.0
3,0,reduce,0.0
4,1,inflation,0.386739


In [21]:
# Within each statement, order the terms from highest to lowest TF-IDF weight
# and keep at most the first ten of them.
top_tfidf = (
    fed_slice
    .sort_values(by=['sona_no', 'tfidf'], ascending=[True, False])
    .groupby(['sona_no'])
    .head(10)
)
top_tfidf.head()

Unnamed: 0,sona_no,term,tfidf
0,0,inflation,0.415084
1,0,raise,0.0
2,0,increase,0.0
3,0,reduce,0.0
4,1,inflation,0.386739


## Chart it

In [25]:
# Terms in this list will get a red dot in the visualization.
# NOTE: these placeholder values match none of the terms selected for
# fed_slice, so no dot is drawn until the list is changed.
term_list = ['boss', 'wangwang'] # you can change this

# Add a tiny amount of jitter to break ties in the term ranking. The generator
# is seeded so the ranking (and therefore the chart) is identical on every run.
rng = np.random.default_rng(0)
top_tfidf_plusRand = top_tfidf.copy()
top_tfidf_plusRand['tfidf'] = top_tfidf_plusRand['tfidf'] + rng.random(top_tfidf.shape[0])*0.0001

# Base for all layers: rank the terms within each statement by descending tfidf.
base = alt.Chart(top_tfidf_plusRand).encode(
    x = 'rank:O',
    y = 'sona_no:N'
).transform_window(
    rank = "rank()",
    sort = [alt.SortField("tfidf", order="descending")],
    groupby = ["sona_no"],
)

# Heatmap layer: cell colour encodes the tfidf weight.
heatmap = base.mark_rect().encode(
    color = 'tfidf:Q'
)

# Circle layer: red for terms in term_list, fully transparent otherwise.
circle = base.mark_circle(size=100).encode(
    color = alt.condition(
        alt.FieldOneOfPredicate(field='term', oneOf=term_list),
        alt.value('red'),
        alt.value('#FFFFFF00')
    )
)

# Text layer: the term itself, in white where tfidf >= 0.23 and black otherwise.
text = base.mark_text(baseline='middle').encode(
    text = 'term:N',
    color = alt.condition(alt.datum.tfidf >= 0.23, alt.value('white'), alt.value('black'))
)

# Display the three superimposed layers.
(heatmap + circle + text).properties(width = 600, height=400)