# Processing Federal Reserve statements

This processes all collated Federal Reserve statements from the [scraper](https://github.com/pmagtulis/fed-statement-scraper.git).

## Do all your imports

In [1]:
import pandas as pd
import numpy as np
import re
import altair as alt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import stopwordsiso as stopwords

## Read CSV

In [2]:
# Load the collated statements CSV produced by the scraper.
merged = pd.read_csv('merged.csv')
# Show the last rows to confirm the file loaded as expected.
merged.tail(30)

Unnamed: 0,meetings,links,statements
17,2020-01-29,https://www.federalreserve.gov/newsevents/pres...,Information received since the Federal Open Ma...
18,2020-03-03,https://www.federalreserve.gov/newsevents/pres...,The fundamentals of the U.S. economy remain st...
19,2020-03-15,https://www.federalreserve.gov/newsevents/pres...,The coronavirus outbreak has harmed communitie...
20,2020-03-19,https://www.federalreserve.gov/newsevents/pres...,The Federal Reserve on Thursday announced the ...
21,2020-03-23,https://www.federalreserve.gov/newsevents/pres...,The Federal Reserve is committed to use its fu...
22,2020-03-31,https://www.federalreserve.gov/newsevents/pres...,The Federal Reserve on Tuesday announced the e...
23,2020-04-29,https://www.federalreserve.gov/newsevents/pres...,The Federal Reserve is committed to using its ...
24,2020-06-10,https://www.federalreserve.gov/newsevents/pres...,The Federal Reserve is committed to using its ...
25,2020-07-29,https://www.federalreserve.gov/newsevents/pres...,The Federal Reserve is committed to using its ...
26,2020-08-27,https://www.federalreserve.gov/newsevents/pres...,Following an extensive review that included nu...


## From July 2020 to 2022

In [3]:
# Discard the first 25 rows by index label; the remaining frame starts at the
# 2020-07-29 statement (row label 25).
df_twoyears = merged.drop(index=merged.index[:25])
df_twoyears

Unnamed: 0,meetings,links,statements
25,2020-07-29,https://www.federalreserve.gov/newsevents/pres...,The Federal Reserve is committed to using its ...
26,2020-08-27,https://www.federalreserve.gov/newsevents/pres...,Following an extensive review that included nu...
27,2020-09-16,https://www.federalreserve.gov/newsevents/pres...,The Federal Reserve is committed to using its ...
28,2020-11-05,https://www.federalreserve.gov/newsevents/pres...,The Federal Reserve is committed to using its ...
29,2020-12-16,https://www.federalreserve.gov/newsevents/pres...,The Federal Reserve is committed to using its ...
30,2021-01-27,https://www.federalreserve.gov/newsevents/pres...,The Federal Reserve is committed to using its ...
31,2021-03-17,https://www.federalreserve.gov/newsevents/pres...,The Federal Reserve is committed to using its ...
32,2021-04-28,https://www.federalreserve.gov/newsevents/pres...,The Federal Reserve is committed to using its ...
33,2021-06-16,https://www.federalreserve.gov/newsevents/pres...,The Federal Reserve is committed to using its ...
34,2021-07-28,https://www.federalreserve.gov/newsevents/pres...,The Federal Reserve is committed to using its ...


## Text analysis

Now we can proceed with the text analysis proper. First, we set the parameters in the cells immediately below, most importantly the stopwords we want the analysis to disregard.

In [4]:
def preprocess_text(text):
    """Normalize a statement before it is tokenized.

    Lowercases ``text`` and then deletes every run of digits.

    Args:
        text: The raw statement text (a ``str``).

    Returns:
        The lowercased text with all digit characters removed.
    """
    lowered = text.lower()
    return re.sub(r'\d+', '', lowered)

In [5]:
# Analysis parameters shared by the vectorizers in this notebook.
# NOTE(review): `y_columns` is not referenced by any cell visible here.
y_columns = ['meetings', 'statements']
BINARY = False        # keep real term counts instead of 0/1 presence flags
NGRAM_RANGE = (1, 1)  # single words only
MIN_DF = 5            # omit terms that appear in fewer than 5 statements

# English stopwords plus extra words to disregard: "committee" and the names
# of FOMC members, which often appear in the Fed's statements.
STPWORDS = stopwords.stopwords(["en"])
STPWORDS.update(['committee', 'patrick', 'harker', 'jerome', 'powell', 'lael', 'brainard',
                'michelle', 'bowman', 'christopher', 'waller', 'lisa', 'cook','esther',
                'george', 'loretta', 'mester', 'james', 'bullard', 'john', 'williams'])

vectorizer = CountVectorizer(
    # scikit-learn documents `stop_words` as 'english', a list, or None, and
    # releases with parameter validation reject a set. Pass a sorted list so
    # the argument is both accepted and deterministic; STPWORDS stays a set.
    stop_words=sorted(STPWORDS),
    ngram_range=NGRAM_RANGE,
    binary=BINARY,
    min_df=MIN_DF,
    # A custom preprocessor replaces the vectorizer's built-in lowercasing,
    # so preprocess_text is responsible for lowercasing the text itself.
    preprocessor=preprocess_text
)

## Vectorizing

A simple count of how often each word occurs in each statement.

In [6]:
# Learn the vocabulary from the statements and build the sparse
# statement-by-term count matrix in one step.
X = vectorizer.fit_transform(raw_documents=df_twoyears['statements'])
X



<22x198 sparse matrix of type '<class 'numpy.int64'>'
	with 2620 stored elements in Compressed Sparse Row format>

In [7]:
# Dense view of the count matrix: one row per statement, one column per
# vocabulary term, values are raw counts.
fed_vectors = pd.DataFrame(
    X.toarray(),
    columns=vectorizer.get_feature_names_out(),
)
# The counts are integers, so no rounding is needed before display.
fed_vectors.tail(15)

Unnamed: 0,accommodative,account,achieve,achieved,action,activity,addition,additional,adjust,adversely,...,vaccinations,vice,virus,voted,voting,war,weaker,weigh,weighing,wide
7,3,1,2,1,1,1,1,0,1,1,...,2,1,1,0,1,0,0,1,0,1
8,3,1,2,1,1,1,1,0,1,1,...,2,1,1,0,1,0,0,0,0,1
9,3,1,2,1,1,1,0,0,1,1,...,2,1,1,0,1,0,0,0,0,1
10,3,1,2,1,1,1,0,0,1,1,...,2,1,1,0,1,0,0,0,0,1
11,3,1,2,1,1,2,0,0,2,1,...,2,1,1,0,1,0,0,0,0,1
12,2,1,1,0,1,2,0,0,2,1,...,2,1,2,0,1,0,0,0,0,1
13,2,1,1,0,1,2,0,0,1,1,...,1,1,2,1,1,0,0,0,0,1
14,0,1,1,0,2,2,1,1,1,0,...,0,1,0,1,2,0,0,1,0,1
15,0,1,1,0,1,2,2,1,1,0,...,0,1,0,1,1,0,0,1,0,1
16,0,1,1,0,2,2,2,1,1,0,...,0,1,0,1,2,0,0,0,1,1


In [8]:
# Swap rows and columns: terms become rows, statements become columns.
fed_vectors2 = fed_vectors.T

In [9]:
# Display the transposed count matrix (terms as rows, statements as columns).
fed_vectors2

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
accommodative,0,0,2,3,3,3,3,3,3,3,...,2,2,0,0,0,0,0,0,0,0
account,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,2,2,2
achieve,1,2,3,2,2,2,2,2,2,2,...,1,1,1,1,1,1,1,1,1,1
achieved,0,0,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
action,1,0,2,1,1,1,1,1,1,1,...,1,1,2,1,2,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
war,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,2,2,2,2,1
weaker,1,0,1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
weigh,1,0,1,1,1,1,1,1,0,0,...,0,0,1,1,0,0,0,0,0,0
weighing,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,1,1,1,0


In [10]:
# Ten most frequent terms in the statement stored under column label 21,
# i.e. the last of the 22 vectorized statements.
fed_vectors2.sort_values(by=21, ascending=False).head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
inflation,6,5,11,10,10,10,10,9,9,9,...,7,6,6,7,7,7,7,9,9,8
policy,4,14,7,5,5,5,5,6,6,6,...,5,4,4,4,3,3,3,6,6,6
monetary,3,11,4,4,4,4,4,4,4,4,...,3,3,4,4,3,3,3,6,6,6
range,4,2,6,4,4,4,4,4,4,4,...,4,3,4,3,4,3,3,4,4,4
economic,6,1,5,5,5,5,5,4,3,3,...,5,4,4,4,4,3,3,5,5,4
percent,2,4,8,7,7,7,8,7,7,7,...,3,3,4,3,5,3,3,4,4,4
rate,1,1,3,2,2,2,2,2,2,2,...,3,4,4,3,4,3,3,3,3,3
stance,1,0,3,3,3,3,3,3,3,3,...,2,2,3,3,2,2,2,3,3,3
target,2,0,4,2,2,2,2,2,2,2,...,2,2,3,2,3,2,2,3,3,3
continue,2,1,2,2,3,2,2,2,3,4,...,3,3,1,1,2,2,2,2,2,2


## Look for specific words

In [11]:
# Raw counts for a hand-picked set of terms; every term listed must exist in
# the fitted vocabulary, otherwise the selection raises a KeyError.
fed_vectors_slice = fed_vectors.loc[
    :,
    ['inflation', 'raise', 'increase', 'reduce', 'employment', 'job', 'transitory', 'elevated'],
]
fed_vectors_slice

Unnamed: 0,inflation,raise,increase,reduce,employment,job,transitory,elevated
0,6,0,1,0,5,0,0,0
1,5,0,1,0,3,1,0,0
2,11,0,1,0,6,0,0,0
3,10,0,1,0,5,0,0,0
4,10,0,1,0,6,0,0,0
5,10,0,1,0,6,0,0,0
6,10,0,1,0,6,0,0,0
7,9,0,1,0,5,0,1,0
8,9,0,1,1,5,0,1,0
9,9,0,1,1,5,0,1,0


# TF-IDF

In [12]:
# Re-vectorize the same statements with TF-IDF weights instead of raw counts.
# NOTE: this rebinds `vectorizer` and `X`, which the count-vectorizer cells
# above also assign, so run the notebook top to bottom.
vectorizer = TfidfVectorizer(
    # `stop_words` is documented as 'english', a list, or None; recent
    # scikit-learn releases reject a set, so convert to a sorted list.
    stop_words=sorted(STPWORDS),
    ngram_range=NGRAM_RANGE,
    binary=BINARY,
    min_df=MIN_DF,
    preprocessor=preprocess_text
)
X = vectorizer.fit_transform(df_twoyears['statements'])
# One row per statement, one column per term, values are TF-IDF weights.
fed_idf = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
fed_idf.round(2)



Unnamed: 0,accommodative,account,achieve,achieved,action,activity,addition,additional,adjust,adversely,...,vaccinations,vice,virus,voted,voting,war,weaker,weigh,weighing,wide
0,0.0,0.04,0.04,0.0,0.04,0.08,0.05,0.0,0.04,0.0,...,0.0,0.04,0.06,0.0,0.04,0.0,0.09,0.07,0.0,0.04
1,0.0,0.04,0.08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04
2,0.1,0.03,0.1,0.06,0.07,0.07,0.04,0.0,0.03,0.0,...,0.0,0.03,0.05,0.0,0.07,0.0,0.07,0.06,0.0,0.03
3,0.17,0.04,0.07,0.06,0.04,0.07,0.05,0.0,0.04,0.0,...,0.0,0.04,0.05,0.08,0.04,0.0,0.08,0.07,0.0,0.04
4,0.16,0.03,0.07,0.06,0.04,0.07,0.05,0.0,0.04,0.0,...,0.0,0.04,0.05,0.0,0.04,0.0,0.08,0.06,0.0,0.03
5,0.16,0.03,0.07,0.06,0.04,0.07,0.05,0.0,0.04,0.06,...,0.06,0.04,0.05,0.0,0.04,0.0,0.08,0.06,0.0,0.03
6,0.16,0.03,0.07,0.06,0.04,0.07,0.04,0.0,0.04,0.06,...,0.06,0.04,0.05,0.0,0.04,0.0,0.0,0.06,0.0,0.03
7,0.16,0.03,0.07,0.06,0.04,0.04,0.04,0.0,0.04,0.06,...,0.13,0.04,0.05,0.0,0.04,0.0,0.0,0.06,0.0,0.03
8,0.16,0.03,0.07,0.06,0.04,0.04,0.04,0.0,0.04,0.06,...,0.13,0.04,0.05,0.0,0.04,0.0,0.0,0.0,0.0,0.03
9,0.16,0.03,0.07,0.06,0.03,0.03,0.0,0.0,0.03,0.06,...,0.12,0.03,0.05,0.0,0.03,0.0,0.0,0.0,0.0,0.03


In [13]:
# Transpose so that each row is a term and each column is a statement.
fed_idf2 = fed_idf.T

In [15]:
# Top ten terms by TF-IDF weight in the statement under column label 21.
# Observation: Nov 2021 marked a drastic change in the top 10 -- "accommodative" is gone.
fed_idf2.sort_values(by=21, ascending=False).head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
inflation,0.239336,0.205819,0.351147,0.355072,0.349144,0.346108,0.34539,0.308599,0.309067,0.300311,...,0.238014,0.255599,0.277695,0.329465,0.318721,0.346764,0.346764,0.377892,0.379735,0.381407
monetary,0.119668,0.452803,0.12769,0.142029,0.139657,0.138443,0.138156,0.137155,0.137363,0.133472,...,0.102006,0.1278,0.18513,0.188266,0.136595,0.148613,0.148613,0.251928,0.253157,0.286055
policy,0.159557,0.576294,0.223457,0.177536,0.174572,0.173054,0.172695,0.205732,0.206044,0.200208,...,0.17001,0.1704,0.18513,0.188266,0.136595,0.148613,0.148613,0.251928,0.253157,0.286055
economic,0.239336,0.041164,0.159612,0.177536,0.174572,0.173054,0.172695,0.137155,0.103022,0.100104,...,0.17001,0.1704,0.18513,0.188266,0.182126,0.148613,0.148613,0.20994,0.210964,0.190703
range,0.159557,0.082328,0.191535,0.142029,0.139657,0.138443,0.138156,0.137155,0.137363,0.133472,...,0.136008,0.1278,0.18513,0.141199,0.182126,0.148613,0.148613,0.167952,0.168771,0.190703
percent,0.079779,0.164655,0.25538,0.24855,0.2444,0.242275,0.276312,0.240021,0.240385,0.233575,...,0.102006,0.1278,0.18513,0.141199,0.227658,0.148613,0.148613,0.167952,0.168771,0.190703
increases,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.084832,0.086268,0.083455,0.090798,0.090798,0.15392,0.154671,0.174771
elevated,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.053402,0.066905,0.072689,0.07392,0.071509,0.077801,0.077801,0.065944,0.066266,0.149754
target,0.083325,0.0,0.133366,0.074171,0.072933,0.072299,0.072149,0.071626,0.071734,0.069702,...,0.071027,0.088987,0.145019,0.098317,0.142667,0.10348,0.10348,0.131563,0.132205,0.149385
stance,0.041663,0.0,0.100024,0.111257,0.109399,0.108448,0.108223,0.107439,0.107602,0.104554,...,0.071027,0.088987,0.145019,0.147476,0.095111,0.10348,0.10348,0.131563,0.132205,0.149385


## Looking for specific words

In this part, we look for specific words and see how relevant they were in the Fed's statements over time.

The cell below initially covers eight words that we believe matter to the Fed's statements.

In [16]:
# TF-IDF weights for the selected terms only (wide form: one column per term).
fed_slice = fed_idf.loc[
    :,
    ['inflation', 'raise', 'increase', 'reduce', 'transitory', 'elevated', 'risen', 'employment'],
]  # you can change this
fed_slice.sort_index().round(4)

Unnamed: 0,inflation,raise,increase,reduce,transitory,elevated,risen,employment
0,0.2393,0.0,0.0569,0.0,0.0,0.0,0.0,0.1994
1,0.2058,0.0,0.0588,0.0,0.0,0.0,0.0,0.1235
2,0.3511,0.0,0.0456,0.0,0.0,0.0,0.0555,0.1915
3,0.3551,0.0,0.0507,0.0,0.0,0.0,0.0617,0.1775
4,0.3491,0.0,0.0498,0.0,0.0,0.0,0.0607,0.2095
5,0.3461,0.0,0.0494,0.0,0.0,0.0,0.0601,0.2077
6,0.3454,0.0,0.0493,0.0,0.0,0.0,0.06,0.2072
7,0.3086,0.0,0.0489,0.0,0.0804,0.0,0.1192,0.1714
8,0.3091,0.0,0.049,0.0805,0.0805,0.0,0.1193,0.1717
9,0.3003,0.0,0.0476,0.0782,0.0782,0.0,0.116,0.1668


In [17]:
# Reshape from wide (one column per term) to long (one row per
# statement/term pair).
# NOTE: this overwrites `fed_slice`, so the cell is not idempotent; re-run the
# cell that builds the wide `fed_slice` before running this one again.
fed_slice = fed_slice.stack().reset_index()
# After reset_index() the columns are 'level_0' (statement row number),
# 'level_1' (term) and 0 (the TF-IDF weight). Only those three need renaming.
fed_slice = fed_slice.rename(columns={'level_0': 'sona_no', 'level_1': 'term', 0: 'tfidf'})
fed_slice.head()

Unnamed: 0,sona_no,term,tfidf
0,0,inflation,0.239336
1,0,raise,0.0
2,0,increase,0.05694
3,0,reduce,0.0
4,0,transitory,0.0


In [18]:
# Within each statement, order terms from highest to lowest TF-IDF weight and
# keep at most the first ten.
top_tfidf = (
    fed_slice
    .sort_values(by=['sona_no', 'tfidf'], ascending=[True, False])
    .groupby('sona_no')
    .head(10)
)
top_tfidf.head()

Unnamed: 0,sona_no,term,tfidf
0,0,inflation,0.239336
7,0,employment,0.199447
2,0,increase,0.05694
1,0,raise,0.0
3,0,reduce,0.0


## Chart it

In [16]:
# Terms in this list get a red dot in the visualization. They must be among
# the terms present in `top_tfidf`, otherwise nothing is highlighted.
term_list = ['transitory', 'elevated'] # you can change this

# Add a tiny amount of noise to break ties in the per-statement term ranking.
# A fixed seed keeps the tie order, and therefore the chart, identical
# across reruns.
top_tfidf_plusRand = top_tfidf.copy()
top_tfidf_plusRand['tfidf'] = (
    top_tfidf_plusRand['tfidf']
    + np.random.default_rng(42).random(top_tfidf.shape[0]) * 0.0001
)

# Base for all layers: one row per statement (y), one cell per rank (x),
# where rank is computed per statement by descending TF-IDF weight.
base = alt.Chart(top_tfidf_plusRand).encode(
    x = 'rank:O',
    y = 'sona_no:N'
).transform_window(
    rank = "rank()",
    sort = [alt.SortField("tfidf", order="descending")],
    groupby = ["sona_no"],
)

# Heatmap layer: cell color encodes the TF-IDF weight.
heatmap = base.mark_rect().encode(
    color = 'tfidf:Q'
)

# Circle layer: red for terms in `term_list`, fully transparent otherwise.
circle = base.mark_circle(size=100).encode(
    color = alt.condition(
        alt.FieldOneOfPredicate(field='term', oneOf=term_list),
        alt.value('red'),
        alt.value('#FFFFFF00')
    )
)

# Text layer: the term itself, drawn white once the weight reaches 0.23
# (darker heatmap cells) and black below that.
text = base.mark_text(baseline='middle').encode(
    text = 'term:N',
    color = alt.condition(alt.datum.tfidf >= 0.23, alt.value('white'), alt.value('black'))
)

# Display the three superimposed layers.
(heatmap + circle + text).properties(width = 600, height=400)

## Entire statements

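Here, we do the same thing for all of the Fed's statements *without* isolating keywords. Note that the cells below are currently commented out and still refer to variables from an earlier project (`aquino_idf`, `all_aquino`); adapt them to `fed_idf` before running.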

In [17]:
# aquino_idf = aquino_idf.stack().reset_index()
# aquino_idf

In [18]:
# aquino_idf = aquino_idf.rename(columns={'level_0': 'sona_no','level_1': 'term', 0: 'tfidf'})
# aquino_idf

In [19]:
# all_aquino = aquino_idf.sort_values(by=['sona_no','tfidf'], ascending=[True,False]).groupby(['sona_no']).head(10)
# # all_aquino.head()

In [20]:
# # # Terms in this list will get a red dot in the visualization
# term_list = ['boss', 'wangwang']

# # adding a little randomness to break ties in term ranking
# all_aquino_plusRand = all_aquino.copy()
# all_aquino_plusRand['tfidf'] = all_aquino_plusRand['tfidf'] + np.random.rand(all_aquino.shape[0])*0.0001

# # base for all visualizations, with rank calculation
# base = alt.Chart(all_aquino_plusRand).encode(
#     x = 'rank:O',
#     y = 'sona_no:N'
# ).transform_window(
#     rank = "rank()",
#     sort = [alt.SortField("tfidf", order="descending")],
#     groupby = ["sona_no"],
# )

# # heatmap specification
# heatmap = base.mark_rect().encode(
#     color = 'tfidf:Q'
# )

# # red circle over terms in above list
# circle = base.mark_circle(size=100).encode(
#     color = alt.condition(
#         alt.FieldOneOfPredicate(field='term', oneOf=term_list),
#         alt.value('red'),
#         alt.value('#FFFFFF00')        
#     )
# )

# # text labels, white for darker heatmap colors
# text = base.mark_text(baseline='middle').encode(
#     text = 'term:N',
#     color = alt.condition(alt.datum.tfidf >= 0.23, alt.value('white'), alt.value('black'))
# )

# # display the three superimposed visualizations
# (heatmap + circle + text).properties(width = 600, height=400)