# Processing Federal Reserve statements

This processes all collated Federal Reserve statements from the [scraper](https://github.com/pmagtulis/fed-statement-scraper.git).

## Do all your imports

In [1]:
import pandas as pd
import numpy as np
import re
import altair as alt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import stopwordsiso as stopwords

## Read CSV

In [2]:
merged= pd.read_csv('merged.csv')
merged.tail(10)

Unnamed: 0,meetings,links,statements
43,2021-09-22,https://www.federalreserve.gov/newsevents/pres...,The Federal Reserve is committed to using its ...
44,2021-11-03,https://www.federalreserve.gov/newsevents/pres...,The Federal Reserve is committed to using its ...
45,2021-12-15,https://www.federalreserve.gov/newsevents/pres...,The Federal Reserve is committed to using its ...
46,2022-01-26,https://www.federalreserve.gov/newsevents/pres...,Indicators of economic activity and employment...
47,2022-03-16,https://www.federalreserve.gov/newsevents/pres...,Indicators of economic activity and employment...
48,2022-05-04,https://www.federalreserve.gov/newsevents/pres...,Although overall economic activity edged down ...
49,2022-06-15,https://www.federalreserve.gov/newsevents/pres...,Overall economic activity appears to have pick...
50,2022-07-27,https://www.federalreserve.gov/newsevents/pres...,Recent indicators of spending and production h...
51,2022-09-21,https://www.federalreserve.gov/newsevents/pres...,Recent indicators point to modest growth in sp...
52,2022-11-02,https://www.federalreserve.gov/newsevents/pres...,Recent indicators point to modest growth in sp...


## Text analysis

Now, we can proceed with the text analysis proper. First stop, we set the parameters in the immediate cell below, most importantly the stopwords we want our analysis to disregard.

In [4]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    return text #removes all numbers

In [5]:
y_columns = ['meetings', 'statements']
BINARY=False
NGRAM_RANGE=(1,1)
MIN_DF=5 #omit words not used that much in documents.
STPWORDS=stopwords.stopwords(["en"])
STPWORDS.update(['committee', 'patrick', 'harker', 'jerome', 'powell', 'lael', 'brainard',
                'michelle', 'bowman', 'christopher', 'waller', 'lisa', 'cook','esther',
                'george', 'loretta', 'mester', 'james', 'bullard', 'john', 'williams'])
                #these are names by FOMC members which often appear in the Fed's statements.

vectorizer = CountVectorizer(
    stop_words=STPWORDS,
    ngram_range=NGRAM_RANGE,
    binary=BINARY,
    min_df=MIN_DF,
    preprocessor=preprocess_text
)

## Vectorizing

Simple counting of words that occur in a speech.

In [6]:
X = vectorizer.fit_transform(merged['statements'])
X



<53x314 sparse matrix of type '<class 'numpy.int64'>'
	with 6426 stored elements in Compressed Sparse Row format>

In [7]:
fed_vectors = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
fed_vectors.round(2)
fed_vectors.tail(15)

Unnamed: 0,accommodative,account,achieve,achieved,action,activity,actual,addition,additional,adjust,...,voted,voting,warrant,weak,weaker,weathered,weigh,wide,william,yellen
38,3,1,2,1,1,2,0,1,0,1,...,0,1,0,0,1,0,1,1,0,0
39,3,1,2,1,1,2,0,1,0,1,...,0,1,0,1,0,0,1,1,0,0
40,3,1,2,1,1,1,0,1,0,1,...,0,1,0,1,0,0,1,1,0,0
41,3,1,2,1,1,1,0,1,0,1,...,0,1,0,1,0,0,0,1,0,0
42,3,1,2,1,1,1,0,0,0,1,...,0,1,0,0,0,0,0,1,0,0
43,3,1,2,1,1,1,0,0,0,1,...,0,1,0,0,0,0,0,1,0,0
44,3,1,2,1,1,2,0,0,0,2,...,0,1,0,0,0,0,0,1,0,0
45,2,1,1,0,1,2,0,0,0,2,...,0,1,0,0,0,0,0,1,0,0
46,2,1,1,0,1,2,0,0,0,1,...,1,1,0,0,0,0,0,1,0,0
47,0,1,1,0,2,2,0,1,1,1,...,1,2,0,0,0,0,1,1,0,0


In [8]:
fed_vectors = fed_vectors.transpose() #swapping columns and row positions

In [9]:
fed_vectors

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,43,44,45,46,47,48,49,50,51,52
accommodative,2,2,2,1,1,1,1,1,1,1,...,3,3,2,2,0,0,0,0,0,0
account,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,2
achieve,0,0,0,0,0,0,0,0,0,0,...,2,2,1,1,1,1,1,1,1,1
achieved,0,0,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0
action,1,2,1,2,1,1,1,2,1,1,...,1,1,1,1,2,1,2,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
weathered,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
weigh,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,0,0,0,0
wide,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
william,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# fed_vectors.columns = ['SONA1', 'SONA2', 'SONA3', 'SONA4', 'SONA5', 'SONA6'] #rename columns
fed_vectors.sort_values(50, ascending=False).head(20)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,43,44,45,46,47,48,49,50,51,52
inflation,12,13,13,12,12,13,15,14,13,13,...,9,10,7,6,6,7,7,7,7,9
rate,7,8,7,7,6,7,8,9,7,7,...,2,2,3,4,4,3,4,3,3,3
economic,7,7,7,6,6,7,7,7,6,7,...,3,5,5,4,4,4,4,3,3,5
policy,5,5,5,5,5,4,3,3,3,3,...,6,6,5,4,4,4,3,3,3,6
monetary,3,3,3,3,3,3,3,3,3,3,...,4,4,3,3,4,4,3,3,3,6
percent,6,6,6,6,6,6,6,6,5,5,...,7,7,3,3,4,3,5,3,3,4
range,3,4,3,4,3,3,3,4,3,3,...,4,4,4,3,4,3,4,3,3,4
stance,2,2,2,2,2,2,2,2,2,2,...,3,3,2,2,3,3,2,2,2,3
agency,3,3,3,3,3,0,0,0,0,0,...,1,3,2,1,2,2,2,2,2,2
securities,4,4,4,5,3,0,0,0,0,0,...,2,7,5,3,2,2,2,2,2,2


# TF-IDF

In [11]:
vectorizer = TfidfVectorizer(
    stop_words=STPWORDS, 
    ngram_range=NGRAM_RANGE,
    binary=BINARY,
    min_df=MIN_DF,
    preprocessor=preprocess_text
)
X = vectorizer.fit_transform(merged['statements'])
fed_idf = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
#[print(x) for x in statements.sentence]
fed_idf.round(2)



Unnamed: 0,accommodative,account,achieve,achieved,action,activity,actual,addition,additional,adjust,...,voted,voting,warrant,weak,weaker,weathered,weigh,wide,william,yellen
0,0.09,0.03,0.0,0.0,0.03,0.06,0.13,0.0,0.0,0.0,...,0.0,0.03,0.06,0.0,0.0,0.0,0.0,0.03,0.06,0.07
1,0.08,0.03,0.0,0.0,0.05,0.05,0.12,0.0,0.0,0.0,...,0.0,0.05,0.06,0.0,0.0,0.0,0.0,0.03,0.06,0.07
2,0.09,0.03,0.0,0.0,0.03,0.05,0.12,0.0,0.0,0.0,...,0.0,0.03,0.06,0.0,0.0,0.0,0.0,0.03,0.06,0.07
3,0.04,0.03,0.0,0.0,0.05,0.05,0.12,0.0,0.0,0.0,...,0.0,0.05,0.06,0.0,0.0,0.0,0.0,0.03,0.06,0.07
4,0.04,0.03,0.0,0.0,0.03,0.06,0.13,0.0,0.0,0.0,...,0.0,0.03,0.06,0.0,0.0,0.0,0.0,0.03,0.06,0.07
5,0.05,0.03,0.0,0.0,0.03,0.09,0.13,0.0,0.0,0.0,...,0.0,0.03,0.07,0.0,0.0,0.0,0.0,0.03,0.06,0.07
6,0.04,0.03,0.0,0.0,0.03,0.08,0.13,0.0,0.0,0.0,...,0.0,0.03,0.06,0.0,0.0,0.0,0.0,0.03,0.06,0.07
7,0.05,0.03,0.0,0.0,0.06,0.09,0.13,0.0,0.0,0.0,...,0.0,0.06,0.07,0.0,0.0,0.0,0.0,0.03,0.06,0.07
8,0.05,0.03,0.0,0.0,0.03,0.06,0.15,0.0,0.0,0.0,...,0.0,0.03,0.07,0.0,0.0,0.0,0.0,0.03,0.07,0.08
9,0.05,0.03,0.0,0.0,0.03,0.06,0.14,0.0,0.0,0.0,...,0.0,0.03,0.07,0.0,0.0,0.0,0.0,0.03,0.07,0.0


In [12]:
fed_idf2 = fed_idf.transpose()
# fed_idf2.columns = ['SONA1', 'SONA2', 'SONA3', 'SONA4', 'SONA5', 'SONA6'] #rename columns

In [17]:
fed_idf2.sort_values(52, ascending=False).head(15)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,43,44,45,46,47,48,49,50,51,52
inflation,0.332866,0.346575,0.355984,0.319906,0.341313,0.376697,0.41983,0.407037,0.41749,0.394542,...,0.260918,0.257842,0.213139,0.228223,0.256254,0.284859,0.28331,0.325163,0.324128,0.358371
policy,0.131183,0.126079,0.129501,0.126075,0.134511,0.109629,0.079418,0.082498,0.091126,0.086117,...,0.164524,0.146326,0.143996,0.143908,0.161583,0.15396,0.114842,0.131808,0.131389,0.225974
monetary,0.075847,0.072896,0.074875,0.072894,0.077772,0.079232,0.07653,0.079498,0.087812,0.082985,...,0.105694,0.094003,0.083256,0.104006,0.155707,0.148361,0.110666,0.127014,0.12661,0.217756
reducing,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.068377,0.0,0.0,0.113259,0.215831,0.214658,0.246369,0.245585,0.21119
economic,0.190597,0.183181,0.188154,0.157007,0.167514,0.199102,0.192313,0.199771,0.189139,0.208534,...,0.085371,0.126547,0.149439,0.149347,0.16769,0.159779,0.15891,0.13679,0.136354,0.195429
percent,0.157419,0.151294,0.155402,0.15129,0.161413,0.164444,0.158837,0.164996,0.151876,0.143528,...,0.191945,0.170714,0.086398,0.107931,0.161583,0.11547,0.191404,0.131808,0.131389,0.15065
range,0.077265,0.099012,0.076275,0.099008,0.079225,0.080713,0.07796,0.107978,0.089453,0.084536,...,0.10767,0.09576,0.113083,0.10595,0.158617,0.113351,0.150312,0.129389,0.128977,0.147884
stance,0.06702,0.064412,0.066161,0.06441,0.06872,0.07001,0.067623,0.070245,0.077592,0.073327,...,0.105067,0.093445,0.073566,0.091901,0.154783,0.147481,0.097786,0.112232,0.111875,0.144309
increases,0.044752,0.043011,0.044179,0.04301,0.045888,0.046749,0.045155,0.046907,0.051812,0.048964,...,0.0,0.041599,0.0,0.0,0.068904,0.065654,0.065297,0.074943,0.074705,0.128484
attentive,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.118586,0.117941,0.135364,0.134934,0.116036


## Looking for specific words

In this part, we are looking for specific words and see how relevant were they in the Fed's statements through time/ 

The cell below covers four words initially, ones that we believe matter to the Fed's statements.

In [18]:
fed_slice = fed_idf[['inflation', 'raise','increase','reduce']] # you can change this
fed_slice.sort_index().round(decimals=2)

Unnamed: 0,inflation,raise,increase,reduce
0,0.33,0.0,0.0,0.0
1,0.35,0.06,0.0,0.0
2,0.36,0.0,0.0,0.0
3,0.32,0.06,0.0,0.07
4,0.34,0.0,0.0,0.0
5,0.38,0.0,0.0,0.0
6,0.42,0.0,0.0,0.0
7,0.41,0.06,0.0,0.0
8,0.42,0.0,0.0,0.0
9,0.39,0.06,0.0,0.0


In [19]:
fed_slice = fed_slice.stack().reset_index()
fed_slice = fed_slice.rename(columns={'level_0': 'sona_no','level_1': 'term', 'tfidf': 'term', 0: 'tfidf'})
fed_slice.head()

Unnamed: 0,sona_no,term,tfidf
0,0,inflation,0.332866
1,0,raise,0.0
2,0,increase,0.0
3,0,reduce,0.0
4,1,inflation,0.346575


In [20]:
top_tfidf = fed_slice.sort_values(by=['sona_no','tfidf'], ascending=[True,False]).groupby(['sona_no']).head(10)
top_tfidf.head()

Unnamed: 0,sona_no,term,tfidf
0,0,inflation,0.332866
1,0,raise,0.0
2,0,increase,0.0
3,0,reduce,0.0
4,1,inflation,0.346575


## Chart it

In [25]:
# # Terms in this list will get a red dot in the visualization
term_list = ['boss', 'wangwang'] # you can change this

# adding a little randomness to break ties in term ranking
top_tfidf_plusRand = top_tfidf.copy()
top_tfidf_plusRand['tfidf'] = top_tfidf_plusRand['tfidf'] + np.random.rand(top_tfidf.shape[0])*0.0001

# base for all visualizations, with rank calculation
base = alt.Chart(top_tfidf_plusRand).encode(
    x = 'rank:O',
    y = 'sona_no:N'
).transform_window(
    rank = "rank()",
    sort = [alt.SortField("tfidf", order="descending")],
    groupby = ["sona_no"],
)

# heatmap specification
heatmap = base.mark_rect().encode(
    color = 'tfidf:Q'
)

# red circle over terms in above list
circle = base.mark_circle(size=100).encode(
    color = alt.condition(
        alt.FieldOneOfPredicate(field='term', oneOf=term_list),
        alt.value('red'),
        alt.value('#FFFFFF00')        
    )
)

# text labels, white for darker heatmap colors
text = base.mark_text(baseline='middle').encode(
    text = 'term:N',
    color = alt.condition(alt.datum.tfidf >= 0.23, alt.value('white'), alt.value('black'))
)

# display the three superimposed visualizations
(heatmap + circle + text).properties(width = 600, height=400)

## Entire SONAs

In here, we do the same thing for all of SONA *without* isolating key words.

In [26]:
aquino_idf = aquino_idf.stack().reset_index()
aquino_idf

Unnamed: 0,level_0,level_1,0
0,0,____________________,0.000000
1,0,_________________________,0.000000
2,0,aabang,0.000000
3,0,aabot,0.043111
4,0,aabuso,0.000000
...,...,...,...
48655,5,yuri,0.000000
48656,5,zambales,0.000000
48657,5,zamboanga,0.014210
48658,5,zone,0.000000


In [27]:
aquino_idf = aquino_idf.rename(columns={'level_0': 'sona_no','level_1': 'term', 0: 'tfidf'})
aquino_idf

Unnamed: 0,sona_no,term,tfidf
0,0,____________________,0.000000
1,0,_________________________,0.000000
2,0,aabang,0.000000
3,0,aabot,0.043111
4,0,aabuso,0.000000
...,...,...,...
48655,5,yuri,0.000000
48656,5,zambales,0.000000
48657,5,zamboanga,0.014210
48658,5,zone,0.000000


In [28]:
all_aquino = aquino_idf.sort_values(by=['sona_no','tfidf'], ascending=[True,False]).groupby(['sona_no']).head(10)
all_aquino.head()

Unnamed: 0,sona_no,term,tfidf
6220,0,pesos,0.26122
5414,0,noong,0.140109
7484,0,taon,0.129332
5300,0,natuklasan,0.121397
4187,0,mas,0.118554


In [29]:
# # Terms in this list will get a red dot in the visualization
term_list = ['boss', 'wangwang']

# adding a little randomness to break ties in term ranking
all_aquino_plusRand = all_aquino.copy()
all_aquino_plusRand['tfidf'] = all_aquino_plusRand['tfidf'] + np.random.rand(all_aquino.shape[0])*0.0001

# base for all visualizations, with rank calculation
base = alt.Chart(all_aquino_plusRand).encode(
    x = 'rank:O',
    y = 'sona_no:N'
).transform_window(
    rank = "rank()",
    sort = [alt.SortField("tfidf", order="descending")],
    groupby = ["sona_no"],
)

# heatmap specification
heatmap = base.mark_rect().encode(
    color = 'tfidf:Q'
)

# red circle over terms in above list
circle = base.mark_circle(size=100).encode(
    color = alt.condition(
        alt.FieldOneOfPredicate(field='term', oneOf=term_list),
        alt.value('red'),
        alt.value('#FFFFFF00')        
    )
)

# text labels, white for darker heatmap colors
text = base.mark_text(baseline='middle').encode(
    text = 'term:N',
    color = alt.condition(alt.datum.tfidf >= 0.23, alt.value('white'), alt.value('black'))
)

# display the three superimposed visualizations
(heatmap + circle + text).properties(width = 600, height=400)