# Processing Federal Reserve statements

This processes all collated Federal Reserve statements from the [scraper](https://github.com/pmagtulis/fed-statement-scraper.git).

## Do all your imports

In [32]:
import pandas as pd
import numpy as np
import re
import altair as alt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import stopwordsiso as stopwords

## Read CSV

In [33]:
merged= pd.read_csv('merged.csv')
merged.tail(10)

Unnamed: 0,meetings,links,statements
44,2021-11-03,https://www.federalreserve.gov/newsevents/pres...,The Federal Reserve is committed to using its ...
45,2021-12-15,https://www.federalreserve.gov/newsevents/pres...,The Federal Reserve is committed to using its ...
46,2022-01-26,https://www.federalreserve.gov/newsevents/pres...,Indicators of economic activity and employment...
47,2022-03-16,https://www.federalreserve.gov/newsevents/pres...,Indicators of economic activity and employment...
48,2022-05-04,https://www.federalreserve.gov/newsevents/pres...,Although overall economic activity edged down ...
49,2022-06-15,https://www.federalreserve.gov/newsevents/pres...,Overall economic activity appears to have pick...
50,2022-07-27,https://www.federalreserve.gov/newsevents/pres...,Recent indicators of spending and production h...
51,2022-09-21,https://www.federalreserve.gov/newsevents/pres...,Recent indicators point to modest growth in sp...
52,2022-11-02,,
53,2022-11-02,,


## Data cleaning

We need to remove the entries with NaN for now because otherwise it won't be processed by our Vectorizer.

In [34]:
merged = merged.drop([52,53])
merged.tail()

Unnamed: 0,meetings,links,statements
47,2022-03-16,https://www.federalreserve.gov/newsevents/pres...,Indicators of economic activity and employment...
48,2022-05-04,https://www.federalreserve.gov/newsevents/pres...,Although overall economic activity edged down ...
49,2022-06-15,https://www.federalreserve.gov/newsevents/pres...,Overall economic activity appears to have pick...
50,2022-07-27,https://www.federalreserve.gov/newsevents/pres...,Recent indicators of spending and production h...
51,2022-09-21,https://www.federalreserve.gov/newsevents/pres...,Recent indicators point to modest growth in sp...


## Text analysis

Now, we can proceed with the text analysis proper. First stop, we set the parameters in the immediate cell below, most importantly the stopwords we want our analysis to disregard.

In [35]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    return text #removes all numbers

In [75]:
y_columns = ['meetings', 'statements']
BINARY=False
NGRAM_RANGE=(1,1)
MIN_DF=5 #omit words not used that much in documents.
STPWORDS=stopwords.stopwords(["en"])
STPWORDS.update(['committee', 'patrick', 'harker', 'jerome', 'powell', 'lael', 'brainard',
                'michelle', 'bowman', 'christopher', 'waller', 'lisa', 'cook','esther',
                'george', 'loretta', 'mester', 'james', 'bullard', 'john', 'williams'])
                #these are names by FOMC members which often appear in the Fed's statements.

vectorizer = CountVectorizer(
    stop_words=STPWORDS,
    ngram_range=NGRAM_RANGE,
    binary=BINARY,
    min_df=MIN_DF,
    preprocessor=preprocess_text
)

## Vectorizing

Simple counting of words that occur in a speech.

In [76]:
X = vectorizer.fit_transform(merged['statements'])
X



<52x312 sparse matrix of type '<class 'numpy.int64'>'
	with 6302 stored elements in Compressed Sparse Row format>

In [77]:
fed_vectors = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
fed_vectors.round(2)
fed_vectors.tail(15)

Unnamed: 0,accommodative,account,achieve,achieved,action,activity,actual,addition,additional,adjust,...,voted,voting,warrant,weak,weaker,weathered,weigh,wide,william,yellen
37,3,1,2,1,1,2,0,1,0,1,...,0,1,0,0,1,0,1,1,0,0
38,3,1,2,1,1,2,0,1,0,1,...,0,1,0,0,1,0,1,1,0,0
39,3,1,2,1,1,2,0,1,0,1,...,0,1,0,1,0,0,1,1,0,0
40,3,1,2,1,1,1,0,1,0,1,...,0,1,0,1,0,0,1,1,0,0
41,3,1,2,1,1,1,0,1,0,1,...,0,1,0,1,0,0,0,1,0,0
42,3,1,2,1,1,1,0,0,0,1,...,0,1,0,0,0,0,0,1,0,0
43,3,1,2,1,1,1,0,0,0,1,...,0,1,0,0,0,0,0,1,0,0
44,3,1,2,1,1,2,0,0,0,2,...,0,1,0,0,0,0,0,1,0,0
45,2,1,1,0,1,2,0,0,0,2,...,0,1,0,0,0,0,0,1,0,0
46,2,1,1,0,1,2,0,0,0,1,...,1,1,0,0,0,0,0,1,0,0


In [78]:
fed_vectors = fed_vectors.transpose() #swapping columns and row positions

In [79]:
fed_vectors

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,42,43,44,45,46,47,48,49,50,51
accommodative,2,2,2,1,1,1,1,1,1,1,...,3,3,3,2,2,0,0,0,0,0
account,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
achieve,0,0,0,0,0,0,0,0,0,0,...,2,2,2,1,1,1,1,1,1,1
achieved,0,0,0,0,0,0,0,0,0,0,...,1,1,1,0,0,0,0,0,0,0
action,1,2,1,2,1,1,1,2,1,1,...,1,1,1,1,1,2,1,2,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
weathered,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
weigh,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0
wide,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
william,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0


In [80]:
# fed_vectors.columns = ['SONA1', 'SONA2', 'SONA3', 'SONA4', 'SONA5', 'SONA6'] #rename columns
fed_vectors.sort_values(50, ascending=False).head(20)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,42,43,44,45,46,47,48,49,50,51
inflation,12,13,13,12,12,13,15,14,13,13,...,9,9,10,7,6,6,7,7,7,7
policy,5,5,5,5,5,4,3,3,3,3,...,6,6,6,5,4,4,4,3,3,3
rate,7,8,7,7,6,7,8,9,7,7,...,2,2,2,3,4,4,3,4,3,3
economic,7,7,7,6,6,7,7,7,6,7,...,3,3,5,5,4,4,4,4,3,3
monetary,3,3,3,3,3,3,3,3,3,3,...,4,4,4,3,3,4,4,3,3,3
percent,6,6,6,6,6,6,6,6,5,5,...,7,7,7,3,3,4,3,5,3,3
range,3,4,3,4,3,3,3,4,3,3,...,4,4,4,4,3,4,3,4,3,3
agency,3,3,3,3,3,0,0,0,0,0,...,1,1,3,2,1,2,2,2,2,2
federal,7,8,7,8,6,6,6,7,6,6,...,2,2,3,3,3,2,2,3,2,2
securities,4,4,4,5,3,0,0,0,0,0,...,2,2,7,5,3,2,2,2,2,2


# TF-IDF

In [81]:
vectorizer = TfidfVectorizer(
    stop_words=STPWORDS, 
    ngram_range=NGRAM_RANGE,
    binary=BINARY,
    min_df=MIN_DF,
    preprocessor=preprocess_text
)
X = vectorizer.fit_transform(merged['statements'])
fed_idf = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
#[print(x) for x in statements.sentence]
fed_idf.round(2)



Unnamed: 0,accommodative,account,achieve,achieved,action,activity,actual,addition,additional,adjust,...,voted,voting,warrant,weak,weaker,weathered,weigh,wide,william,yellen
0,0.09,0.03,0.0,0.0,0.03,0.06,0.13,0.0,0.0,0.0,...,0.0,0.03,0.06,0.0,0.0,0.0,0.0,0.03,0.06,0.07
1,0.08,0.03,0.0,0.0,0.05,0.05,0.12,0.0,0.0,0.0,...,0.0,0.05,0.06,0.0,0.0,0.0,0.0,0.03,0.06,0.06
2,0.09,0.03,0.0,0.0,0.03,0.05,0.12,0.0,0.0,0.0,...,0.0,0.03,0.06,0.0,0.0,0.0,0.0,0.03,0.06,0.07
3,0.04,0.03,0.0,0.0,0.05,0.05,0.12,0.0,0.0,0.0,...,0.0,0.05,0.06,0.0,0.0,0.0,0.0,0.03,0.06,0.06
4,0.04,0.03,0.0,0.0,0.03,0.06,0.13,0.0,0.0,0.0,...,0.0,0.03,0.06,0.0,0.0,0.0,0.0,0.03,0.06,0.07
5,0.05,0.03,0.0,0.0,0.03,0.09,0.13,0.0,0.0,0.0,...,0.0,0.03,0.07,0.0,0.0,0.0,0.0,0.03,0.06,0.07
6,0.04,0.03,0.0,0.0,0.03,0.08,0.13,0.0,0.0,0.0,...,0.0,0.03,0.06,0.0,0.0,0.0,0.0,0.03,0.06,0.07
7,0.05,0.03,0.0,0.0,0.06,0.09,0.13,0.0,0.0,0.0,...,0.0,0.06,0.07,0.0,0.0,0.0,0.0,0.03,0.06,0.07
8,0.05,0.03,0.0,0.0,0.03,0.06,0.15,0.0,0.0,0.0,...,0.0,0.03,0.07,0.0,0.0,0.0,0.0,0.03,0.07,0.08
9,0.05,0.03,0.0,0.0,0.03,0.06,0.14,0.0,0.0,0.0,...,0.0,0.03,0.07,0.0,0.0,0.0,0.0,0.03,0.07,0.0


In [82]:
fed_idf2 = fed_idf.transpose()
# fed_idf2.columns = ['SONA1', 'SONA2', 'SONA3', 'SONA4', 'SONA5', 'SONA6'] #rename columns

In [86]:
fed_idf2.sort_values(51, ascending=False).head(15)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,42,43,44,45,46,47,48,49,50,51
inflation,0.334281,0.348003,0.357265,0.321316,0.342936,0.378536,0.421888,0.408916,0.419635,0.396576,...,0.267402,0.261923,0.258317,0.213469,0.228537,0.254438,0.285565,0.284149,0.326492,0.325393
reducing,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.071081,0.0,0.0,0.11669,0.224512,0.223399,0.256689,0.255825
economic,0.191339,0.183871,0.188764,0.157644,0.168251,0.200003,0.193187,0.200622,0.190044,0.209535,...,0.087462,0.08567,0.126735,0.149617,0.1495,0.166443,0.160118,0.159325,0.1373,0.136838
goals,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.173333,0.169781,0.113025,0.133431,0.111105,0.123697,0.118997,0.118407,0.136052,0.135594
agency,0.121885,0.117128,0.120245,0.117158,0.125041,0.0,0.0,0.0,0.0,0.0,...,0.043333,0.042445,0.113025,0.088954,0.055553,0.123697,0.118997,0.118407,0.136052,0.135594
russia,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.122638,0.117978,0.117393,0.134886,0.134432
robust,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.117978,0.117393,0.134886,0.134432
highly,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.122638,0.235955,0.117393,0.134886,0.134432
upward,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.122638,0.117978,0.117393,0.134886,0.134432
returning,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.117393,0.134886,0.134432


## Looking for specific words

In this part, we are looking for specific words and see how relevant were they in the Fed's statements through time/ 

The cell below covers four words initially, ones that we believe matter to the Fed's statements.

In [90]:
fed_slice = fed_idf[['inflation', 'raise','increase','reduce']] # you can change this
fed_slice.sort_index().round(decimals=2)

Unnamed: 0,inflation,raise,increase,reduce
0,0.33,0.0,0.0,0.0
1,0.35,0.06,0.0,0.0
2,0.36,0.0,0.0,0.0
3,0.32,0.06,0.0,0.07
4,0.34,0.0,0.0,0.0
5,0.38,0.0,0.0,0.0
6,0.42,0.0,0.0,0.0
7,0.41,0.06,0.0,0.0
8,0.42,0.0,0.0,0.0
9,0.4,0.06,0.0,0.0


In [23]:
fed_slice = fed_slice.stack().reset_index()
fed_slice = fed_slice.rename(columns={'level_0': 'sona_no','level_1': 'term', 'tfidf': 'term', 0: 'tfidf'})
fed_slice.head()

Unnamed: 0,sona_no,term,tfidf
0,0,boss,0.0
1,0,wangwang,0.0
2,0,mahirap,0.010778
3,0,corrupt,0.016809
4,1,boss,0.018041


In [24]:
top_tfidf = fed_slice.sort_values(by=['sona_no','tfidf'], ascending=[True,False]).groupby(['sona_no']).head(10)
top_tfidf.head()

Unnamed: 0,sona_no,term,tfidf
3,0,corrupt,0.016809
2,0,mahirap,0.010778
0,0,boss,0.0
1,0,wangwang,0.0
5,1,wangwang,0.187694


## Chart it

In [25]:
# # Terms in this list will get a red dot in the visualization
term_list = ['boss', 'wangwang'] # you can change this

# adding a little randomness to break ties in term ranking
top_tfidf_plusRand = top_tfidf.copy()
top_tfidf_plusRand['tfidf'] = top_tfidf_plusRand['tfidf'] + np.random.rand(top_tfidf.shape[0])*0.0001

# base for all visualizations, with rank calculation
base = alt.Chart(top_tfidf_plusRand).encode(
    x = 'rank:O',
    y = 'sona_no:N'
).transform_window(
    rank = "rank()",
    sort = [alt.SortField("tfidf", order="descending")],
    groupby = ["sona_no"],
)

# heatmap specification
heatmap = base.mark_rect().encode(
    color = 'tfidf:Q'
)

# red circle over terms in above list
circle = base.mark_circle(size=100).encode(
    color = alt.condition(
        alt.FieldOneOfPredicate(field='term', oneOf=term_list),
        alt.value('red'),
        alt.value('#FFFFFF00')        
    )
)

# text labels, white for darker heatmap colors
text = base.mark_text(baseline='middle').encode(
    text = 'term:N',
    color = alt.condition(alt.datum.tfidf >= 0.23, alt.value('white'), alt.value('black'))
)

# display the three superimposed visualizations
(heatmap + circle + text).properties(width = 600, height=400)

## Entire SONAs

In here, we do the same thing for all of SONA *without* isolating key words.

In [26]:
aquino_idf = aquino_idf.stack().reset_index()
aquino_idf

Unnamed: 0,level_0,level_1,0
0,0,____________________,0.000000
1,0,_________________________,0.000000
2,0,aabang,0.000000
3,0,aabot,0.043111
4,0,aabuso,0.000000
...,...,...,...
48655,5,yuri,0.000000
48656,5,zambales,0.000000
48657,5,zamboanga,0.014210
48658,5,zone,0.000000


In [27]:
aquino_idf = aquino_idf.rename(columns={'level_0': 'sona_no','level_1': 'term', 0: 'tfidf'})
aquino_idf

Unnamed: 0,sona_no,term,tfidf
0,0,____________________,0.000000
1,0,_________________________,0.000000
2,0,aabang,0.000000
3,0,aabot,0.043111
4,0,aabuso,0.000000
...,...,...,...
48655,5,yuri,0.000000
48656,5,zambales,0.000000
48657,5,zamboanga,0.014210
48658,5,zone,0.000000


In [28]:
all_aquino = aquino_idf.sort_values(by=['sona_no','tfidf'], ascending=[True,False]).groupby(['sona_no']).head(10)
all_aquino.head()

Unnamed: 0,sona_no,term,tfidf
6220,0,pesos,0.26122
5414,0,noong,0.140109
7484,0,taon,0.129332
5300,0,natuklasan,0.121397
4187,0,mas,0.118554


In [29]:
# # Terms in this list will get a red dot in the visualization
term_list = ['boss', 'wangwang']

# adding a little randomness to break ties in term ranking
all_aquino_plusRand = all_aquino.copy()
all_aquino_plusRand['tfidf'] = all_aquino_plusRand['tfidf'] + np.random.rand(all_aquino.shape[0])*0.0001

# base for all visualizations, with rank calculation
base = alt.Chart(all_aquino_plusRand).encode(
    x = 'rank:O',
    y = 'sona_no:N'
).transform_window(
    rank = "rank()",
    sort = [alt.SortField("tfidf", order="descending")],
    groupby = ["sona_no"],
)

# heatmap specification
heatmap = base.mark_rect().encode(
    color = 'tfidf:Q'
)

# red circle over terms in above list
circle = base.mark_circle(size=100).encode(
    color = alt.condition(
        alt.FieldOneOfPredicate(field='term', oneOf=term_list),
        alt.value('red'),
        alt.value('#FFFFFF00')        
    )
)

# text labels, white for darker heatmap colors
text = base.mark_text(baseline='middle').encode(
    text = 'term:N',
    color = alt.condition(alt.datum.tfidf >= 0.23, alt.value('white'), alt.value('black'))
)

# display the three superimposed visualizations
(heatmap + circle + text).properties(width = 600, height=400)