# Processing Federal Reserve statements

This notebook processes all of the collated Federal Reserve statements produced by the [scraper](https://github.com/pmagtulis/fed-statement-scraper.git).

## Do all your imports

In [1]:
import pandas as pd
import numpy as np
import re
import altair as alt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import stopwordsiso as stopwords

## Read CSV

In [2]:
# Load the collated statements written out by the scraper, then preview the
# ten most recent rows as a sanity check.
merged = pd.read_csv('merged.csv')
merged.tail(10)

Unnamed: 0,meetings,links,statements
44,2021-11-03,https://www.federalreserve.gov/newsevents/pres...,The Federal Reserve is committed to using its ...
45,2021-12-15,https://www.federalreserve.gov/newsevents/pres...,The Federal Reserve is committed to using its ...
46,2022-01-26,https://www.federalreserve.gov/newsevents/pres...,Indicators of economic activity and employment...
47,2022-03-16,https://www.federalreserve.gov/newsevents/pres...,Indicators of economic activity and employment...
48,2022-05-04,https://www.federalreserve.gov/newsevents/pres...,Although overall economic activity edged down ...
49,2022-06-15,https://www.federalreserve.gov/newsevents/pres...,Overall economic activity appears to have pick...
50,2022-07-27,https://www.federalreserve.gov/newsevents/pres...,Recent indicators of spending and production h...
51,2022-09-21,https://www.federalreserve.gov/newsevents/pres...,Recent indicators point to modest growth in sp...
52,2022-11-02,https://www.federalreserve.gov/newsevents/pres...,Recent indicators point to modest growth in sp...
53,2022-12-14,https://www.federalreserve.gov/newsevents/pres...,Recent indicators point to modest growth in sp...


## Text analysis

Now we can proceed with the text analysis proper. First, we set the parameters in the cell immediately below; the most important of these is the list of stop words we want the analysis to disregard.

In [3]:
def preprocess_text(text):
    """Normalize one statement before it is tokenized.

    Args:
        text: Raw statement text.

    Returns:
        The text lowercased, with every run of digits deleted. Punctuation
        and whitespace are left exactly as they were, so "2.5" becomes ".".
    """
    lowered = text.lower()
    # Strip numbers so that they never become vocabulary terms.
    return re.sub(r'\d+', '', lowered)

In [4]:
# Columns of `merged` that matter for the analysis; not referenced by any
# other cell in this notebook.
y_columns = ['meetings', 'statements']

# Vectorizer settings, reused by the TF-IDF pass further down.
BINARY = False        # keep real term counts instead of 0/1 presence flags
NGRAM_RANGE = (1, 1)  # single words only
MIN_DF = 5            # omit words that appear in fewer than 5 statements

# English stop words, extended with names of FOMC members, which often appear
# in the Fed's statements.
STPWORDS = stopwords.stopwords(["en"])
STPWORDS.update(['committee', 'patrick', 'harker', 'jerome', 'powell', 'lael', 'brainard',
                'michelle', 'bowman', 'christopher', 'waller', 'lisa', 'cook','esther',
                'george', 'loretta', 'mester', 'james', 'bullard', 'john', 'williams'])
# NOTE(review): tokens such as 'william', 'yellen' and 'philip' still show up
# in the vocabulary below and look like member names too -- consider adding them.

vectorizer = CountVectorizer(
    # STPWORDS is a set; scikit-learn's parameter validation (1.2 and later)
    # only accepts a list here, so hand it a list. Order is irrelevant.
    stop_words=sorted(STPWORDS),
    ngram_range=NGRAM_RANGE,
    binary=BINARY,
    min_df=MIN_DF,
    preprocessor=preprocess_text
)

## Vectorizing

A simple count of how often each word occurs in each statement.

In [5]:
# Learn the vocabulary from every statement and build the statement-by-term
# count matrix in one step.
X = vectorizer.fit_transform(merged['statements'])
# Show the sparse matrix summary (shape and number of stored entries).
X



<54x317 sparse matrix of type '<class 'numpy.int64'>'
	with 6555 stored elements in Compressed Sparse Row format>

In [6]:
# Expand the sparse count matrix into a labelled table: one row per statement,
# one column per vocabulary term. The values are integer counts, so no
# rounding is needed before display.
fed_vectors = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
fed_vectors.tail(15)

Unnamed: 0,accommodative,account,achieve,achieved,action,activity,actual,addition,additional,adjust,...,voting,warrant,weak,weaker,weathered,weigh,weighing,wide,william,yellen
39,3,1,2,1,1,2,0,1,0,1,...,1,0,1,0,0,1,0,1,0,0
40,3,1,2,1,1,1,0,1,0,1,...,1,0,1,0,0,1,0,1,0,0
41,3,1,2,1,1,1,0,1,0,1,...,1,0,1,0,0,0,0,1,0,0
42,3,1,2,1,1,1,0,0,0,1,...,1,0,0,0,0,0,0,1,0,0
43,3,1,2,1,1,1,0,0,0,1,...,1,0,0,0,0,0,0,1,0,0
44,3,1,2,1,1,2,0,0,0,2,...,1,0,0,0,0,0,0,1,0,0
45,2,1,1,0,1,2,0,0,0,2,...,1,0,0,0,0,0,0,1,0,0
46,2,1,1,0,1,2,0,0,0,1,...,1,0,0,0,0,0,0,1,0,0
47,0,1,1,0,2,2,0,1,1,1,...,2,0,0,0,0,1,0,1,0,0
48,0,1,1,0,1,2,0,2,1,1,...,1,0,0,0,0,1,0,1,0,0


In [7]:
# Swap rows and columns: terms become the rows, statements the columns.
# This rebinds `fed_vectors` to its own transpose, so running the cell a
# second time flips the table back -- run it exactly once after the cell
# that builds `fed_vectors`.
fed_vectors = fed_vectors.T

In [8]:
# Display the transposed count table (terms as rows, statements as columns).
fed_vectors

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,44,45,46,47,48,49,50,51,52,53
accommodative,2,2,2,1,1,1,1,1,1,1,...,3,2,2,0,0,0,0,0,0,0
account,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,2,2
achieve,0,0,0,0,0,0,0,0,0,0,...,2,1,1,1,1,1,1,1,1,1
achieved,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
action,1,2,1,2,1,1,1,2,1,1,...,1,1,1,2,1,2,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
weigh,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,0,0,0,0,0
weighing,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,1,1,1
wide,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
william,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Rank the terms by their count in column 50 and keep the 20 most frequent.
# Column labels are the row positions of `merged`; row 50 is the 2022-07-27
# meeting in the preview printed after loading the CSV.
fed_vectors.sort_values(by=50, ascending=False).head(20)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,44,45,46,47,48,49,50,51,52,53
inflation,12,13,13,12,12,13,15,14,13,13,...,10,7,6,6,7,7,7,7,9,9
rate,7,8,7,7,6,7,8,9,7,7,...,2,3,4,4,3,4,3,3,3,3
policy,5,5,5,5,5,4,3,3,3,3,...,6,5,4,4,4,3,3,3,6,6
economic,7,7,7,6,6,7,7,7,6,7,...,5,5,4,4,4,4,3,3,5,5
range,3,4,3,4,3,3,3,4,3,3,...,4,4,3,4,3,4,3,3,4,4
monetary,3,3,3,3,3,3,3,3,3,3,...,4,3,3,4,4,3,3,3,6,6
percent,6,6,6,6,6,6,6,6,5,5,...,7,3,3,4,3,5,3,3,4,4
chair,1,1,1,1,1,1,1,1,1,0,...,2,2,2,2,2,2,2,2,2,2
target,2,3,2,3,2,2,2,3,2,2,...,2,2,2,3,2,3,2,2,3,3
goals,0,0,0,0,0,0,0,0,0,0,...,3,3,2,2,2,2,2,2,2,2


## TF-IDF

In [10]:
# Repeat the vectorization with TF-IDF weighting, using the same settings as
# the count pass.
# NOTE: this rebinds `vectorizer` and `X`; after this cell those names refer
# to the TF-IDF objects, not the count-based ones built earlier.
vectorizer = TfidfVectorizer(
    # scikit-learn's parameter validation (1.2 and later) only accepts a list
    # for stop_words, so convert the STPWORDS collection. Order is irrelevant.
    stop_words=sorted(STPWORDS),
    ngram_range=NGRAM_RANGE,
    binary=BINARY,
    min_df=MIN_DF,
    preprocessor=preprocess_text
)
X = vectorizer.fit_transform(merged['statements'])

# One row per statement, one column per term, values are TF-IDF weights.
fed_idf = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
# Rounded for display only; `fed_idf` itself keeps full precision.
fed_idf.round(2)



Unnamed: 0,accommodative,account,achieve,achieved,action,activity,actual,addition,additional,adjust,...,voting,warrant,weak,weaker,weathered,weigh,weighing,wide,william,yellen
0,0.09,0.03,0.0,0.0,0.03,0.06,0.13,0.0,0.0,0.0,...,0.03,0.06,0.0,0.0,0.0,0.0,0.0,0.03,0.06,0.07
1,0.08,0.03,0.0,0.0,0.05,0.05,0.12,0.0,0.0,0.0,...,0.05,0.06,0.0,0.0,0.0,0.0,0.0,0.03,0.06,0.07
2,0.09,0.03,0.0,0.0,0.03,0.05,0.13,0.0,0.0,0.0,...,0.03,0.06,0.0,0.0,0.0,0.0,0.0,0.03,0.06,0.07
3,0.04,0.03,0.0,0.0,0.05,0.05,0.12,0.0,0.0,0.0,...,0.05,0.06,0.0,0.0,0.0,0.0,0.0,0.03,0.06,0.07
4,0.05,0.03,0.0,0.0,0.03,0.06,0.13,0.0,0.0,0.0,...,0.03,0.07,0.0,0.0,0.0,0.0,0.0,0.03,0.06,0.07
5,0.05,0.03,0.0,0.0,0.03,0.09,0.13,0.0,0.0,0.0,...,0.03,0.07,0.0,0.0,0.0,0.0,0.0,0.03,0.06,0.07
6,0.04,0.03,0.0,0.0,0.03,0.08,0.13,0.0,0.0,0.0,...,0.03,0.06,0.0,0.0,0.0,0.0,0.0,0.03,0.06,0.07
7,0.05,0.03,0.0,0.0,0.06,0.09,0.13,0.0,0.0,0.0,...,0.06,0.07,0.0,0.0,0.0,0.0,0.0,0.03,0.06,0.07
8,0.05,0.03,0.0,0.0,0.03,0.06,0.15,0.0,0.0,0.0,...,0.03,0.07,0.0,0.0,0.0,0.0,0.0,0.03,0.07,0.08
9,0.05,0.03,0.0,0.0,0.03,0.06,0.14,0.0,0.0,0.0,...,0.03,0.07,0.0,0.0,0.0,0.0,0.0,0.03,0.07,0.0


In [11]:
# Terms-as-rows view of the TF-IDF table, used for per-statement rankings.
# `fed_idf` itself is left in statement-per-row form for the later cells.
fed_idf2 = fed_idf.T

In [12]:
# The 15 highest-weighted terms in column 52. Column labels are the row
# positions of `merged`; row 52 is the 2022-11-02 meeting in the preview
# printed after loading the CSV.
fed_idf2.sort_values(by=52, ascending=False).head(15)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,44,45,46,47,48,49,50,51,52,53
inflation,0.331496,0.345154,0.354692,0.318503,0.339702,0.374872,0.417709,0.405143,0.415447,0.39254,...,0.257372,0.212795,0.227853,0.257658,0.28768,0.280007,0.320183,0.319239,0.354668,0.359637
policy,0.130776,0.125689,0.129163,0.12565,0.134013,0.109209,0.079098,0.082198,0.090772,0.085767,...,0.146208,0.143911,0.143821,0.162634,0.155644,0.113619,0.129921,0.129538,0.223866,0.227003
monetary,0.075663,0.07272,0.07473,0.072697,0.077536,0.078981,0.076272,0.079262,0.08753,0.082704,...,0.093991,0.083262,0.104013,0.156825,0.150084,0.109561,0.125281,0.124912,0.21587,0.218895
reducing,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.066031,0.0,0.0,0.110174,0.210876,0.205252,0.234701,0.23401,0.202206,0.205039
economic,0.189877,0.182492,0.187535,0.156372,0.16678,0.198205,0.191407,0.198909,0.188278,0.207546,...,0.126359,0.149249,0.149156,0.168667,0.161417,0.157111,0.13474,0.134343,0.193475,0.196186
percent,0.156931,0.150827,0.154995,0.15078,0.160815,0.163814,0.158195,0.164396,0.151287,0.142945,...,0.170576,0.086346,0.107866,0.162634,0.116733,0.189365,0.129921,0.129538,0.149244,0.151335
range,0.077051,0.098739,0.076101,0.098708,0.078958,0.08043,0.077672,0.107622,0.089136,0.084221,...,0.095715,0.113053,0.105922,0.159703,0.114629,0.148761,0.127579,0.127204,0.146554,0.148608
stance,0.066505,0.063919,0.065685,0.063898,0.068151,0.069422,0.067041,0.069669,0.076936,0.072694,...,0.092942,0.073185,0.091424,0.155075,0.148409,0.0963,0.110118,0.109793,0.142308,0.144301
increases,0.044117,0.042402,0.043573,0.042388,0.045209,0.046052,0.044473,0.046216,0.051037,0.048223,...,0.041103,0.0,0.0,0.068581,0.065633,0.063883,0.073049,0.072833,0.125869,0.127633
philip,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.117433,0.134283,0.133887,0.115691,0.117312


## Looking for specific words

In this part, we look at specific words and see how relevant they were in the Fed's statements over time.

The cell below covers four words initially, ones that we believe matter to the Fed's statements.

In [13]:
# Keep only the TF-IDF columns for the terms we want to follow over time.
# You can change this list, but every entry must be a term in the fitted
# vocabulary, otherwise the selection raises a KeyError.
fed_slice = fed_idf.loc[:, ['inflation', 'raise', 'increase', 'reduce']]
fed_slice.sort_index().round(2)

Unnamed: 0,inflation,raise,increase,reduce
0,0.33,0.0,0.0,0.0
1,0.35,0.05,0.0,0.0
2,0.35,0.0,0.0,0.0
3,0.32,0.05,0.0,0.07
4,0.34,0.0,0.0,0.0
5,0.37,0.0,0.0,0.0
6,0.42,0.0,0.0,0.0
7,0.41,0.06,0.0,0.0
8,0.42,0.0,0.0,0.0
9,0.39,0.06,0.0,0.0


In [14]:
# Reshape from wide (one column per term) to long: one row per
# (statement, term) pair with its TF-IDF weight.
# `sona_no` holds the statement's row position; the name is kept because the
# ranking and chart cells below refer to it.
# Naming the index levels and the value column directly avoids renaming
# auto-generated labels after the fact.
# NOTE: this rebinds `fed_slice`, so re-run the cell that builds the wide
# `fed_slice` before running this one again.
fed_slice = (
    fed_slice
    .stack()
    .rename_axis(['sona_no', 'term'])
    .reset_index(name='tfidf')
)
fed_slice.head()

Unnamed: 0,sona_no,term,tfidf
0,0,inflation,0.331496
1,0,raise,0.0
2,0,increase,0.0
3,0,reduce,0.0
4,1,inflation,0.345154


In [15]:
# Within each statement, order the terms from highest to lowest TF-IDF and
# keep at most the first ten.
top_tfidf = (
    fed_slice
    .sort_values(by=['sona_no', 'tfidf'], ascending=[True, False])
    .groupby(['sona_no'])
    .head(10)
)
top_tfidf.head()

Unnamed: 0,sona_no,term,tfidf
0,0,inflation,0.331496
1,0,raise,0.0
2,0,increase,0.0
3,0,reduce,0.0
4,1,inflation,0.345154


## Chart it

In [16]:
# Terms in this list will get a red dot in the visualization.
# NOTE(review): neither placeholder is one of the terms selected in the
# "Looking for specific words" cell, so no dot is drawn until this is edited.
term_list = ['boss', 'wangwang'] # you can change this

# Add a little randomness to break ties in term ranking. The generator is
# seeded so that tied terms land in the same rank order on every run.
JITTER_SEED = 42
jitter_rng = np.random.default_rng(JITTER_SEED)
top_tfidf_plusRand = top_tfidf.copy()
top_tfidf_plusRand['tfidf'] = (
    top_tfidf_plusRand['tfidf'] + jitter_rng.random(top_tfidf.shape[0]) * 0.0001
)

# Base for all layers: rank is computed per statement, highest TF-IDF first.
base = alt.Chart(top_tfidf_plusRand).encode(
    x = alt.X('rank:O', title='Rank of term within statement'),
    y = alt.Y('sona_no:N', title='Statement (row position in merged)')
).transform_window(
    rank = "rank()",
    sort = [alt.SortField("tfidf", order="descending")],
    groupby = ["sona_no"],
)

# Heatmap layer: cell colour encodes the TF-IDF weight.
heatmap = base.mark_rect().encode(
    color = 'tfidf:Q'
)

# Red circle over terms in `term_list`; fully transparent for everything else.
circle = base.mark_circle(size=100).encode(
    color = alt.condition(
        alt.FieldOneOfPredicate(field='term', oneOf=term_list),
        alt.value('red'),
        alt.value('#FFFFFF00')
    )
)

# Text labels: white on the darker heatmap cells (TF-IDF of 0.23 and above),
# black otherwise.
text = base.mark_text(baseline='middle').encode(
    text = 'term:N',
    color = alt.condition(alt.datum.tfidf >= 0.23, alt.value('white'), alt.value('black'))
)

# Display the three superimposed layers.
(heatmap + circle + text).properties(width = 600, height=400)

## Entire statements

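Here, we would do the same thing for all of the Fed's statements *without* isolating key words. Note that the cells below are currently commented out and still refer to variables from an earlier project (`aquino_idf`), so they need to be adapted to `fed_idf` before they can be run.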

In [17]:
# aquino_idf = aquino_idf.stack().reset_index()
# aquino_idf

In [18]:
# aquino_idf = aquino_idf.rename(columns={'level_0': 'sona_no','level_1': 'term', 0: 'tfidf'})
# aquino_idf

In [19]:
# all_aquino = aquino_idf.sort_values(by=['sona_no','tfidf'], ascending=[True,False]).groupby(['sona_no']).head(10)
# # all_aquino.head()

In [20]:
# # # Terms in this list will get a red dot in the visualization
# term_list = ['boss', 'wangwang']

# # adding a little randomness to break ties in term ranking
# all_aquino_plusRand = all_aquino.copy()
# all_aquino_plusRand['tfidf'] = all_aquino_plusRand['tfidf'] + np.random.rand(all_aquino.shape[0])*0.0001

# # base for all visualizations, with rank calculation
# base = alt.Chart(all_aquino_plusRand).encode(
#     x = 'rank:O',
#     y = 'sona_no:N'
# ).transform_window(
#     rank = "rank()",
#     sort = [alt.SortField("tfidf", order="descending")],
#     groupby = ["sona_no"],
# )

# # heatmap specification
# heatmap = base.mark_rect().encode(
#     color = 'tfidf:Q'
# )

# # red circle over terms in above list
# circle = base.mark_circle(size=100).encode(
#     color = alt.condition(
#         alt.FieldOneOfPredicate(field='term', oneOf=term_list),
#         alt.value('red'),
#         alt.value('#FFFFFF00')        
#     )
# )

# # text labels, white for darker heatmap colors
# text = base.mark_text(baseline='middle').encode(
#     text = 'term:N',
#     color = alt.condition(alt.datum.tfidf >= 0.23, alt.value('white'), alt.value('black'))
# )

# # display the three superimposed visualizations
# (heatmap + circle + text).properties(width = 600, height=400)