In [22]:
import os
from glob import glob
from nltk import sent_tokenize
import pandas as pd
from tqdm import tqdm
tqdm.pandas()
import numpy as np
from sklearn.cluster import KMeans
from collections import defaultdict

# Before preprocessing

In [2]:
# glob() returns matches in arbitrary, platform-dependent order. Sorting makes
# the document order (and therefore the row order of everything built from it)
# the same on every machine and every run.
filepaths = sorted(glob('./Statements/*/*.txt'))
print('Number of documents: {}'.format(len(filepaths)))

Number of documents: 213


In [3]:
def extract_sentences(doc):
    """Split a document into stripped sentences, one source line at a time.

    Each newline-separated line of ``doc`` is handed to ``sent_tokenize`` on
    its own, so no returned sentence spans a line break.

    Args:
        doc: Full document text as a single string.

    Returns:
        List of sentence strings, in document order, with leading and
        trailing whitespace removed.
    """
    return [
        sentence.strip()
        for line in doc.split('\n')
        for sentence in sent_tokenize(line)
    ]

sentence_frames = []
for filepath in tqdm(filepaths):
    # The file name without '.txt' is sliced as 4 + 2 + rest characters and
    # re-joined with dashes (presumably YYYYMMDD -> YYYY-MM-DD).
    date = os.path.basename(filepath).replace('.txt', '')
    meeting_date = '-'.join([date[:4], date[4:6], date[6:]])

    with open(filepath, 'r', encoding='utf-8-sig') as f:
        url = f.readline()  # first line is read separately and kept out of the text
        doc = f.read()

    # One row per sentence, tagged with the meeting date and its position
    # inside the document.
    one_doc_sentences_df = pd.DataFrame(
        extract_sentences(doc), columns=['sentence']
    ).assign(
        meeting_date=meeting_date,
        sentence_index=lambda frame: frame.index,
    )
    sentence_frames.append(one_doc_sentences_df)

# Stack all documents and renumber the rows 0..n-1.
df = pd.concat(sentence_frames, ignore_index=True)

print('Number of sentences: {}'.format(len(df)))
df

100%|███████████████████████████████████████████████████████████████████████████████| 213/213 [00:00<00:00, 217.85it/s]


Number of sentences: 3372


Unnamed: 0,sentence,meeting_date,sentence_index
0,"Release Date: February 4, 1994",1994-02-04,0
1,For immediate release,1994-02-04,1
2,Chairman Alan Greenspan announced today that t...,1994-02-04,2
3,The action is expected to be associated with a...,1994-02-04,3
4,The decision was taken to move toward a less a...,1994-02-04,4
...,...,...,...
3367,In assessing the appropriate stance of monetar...,2022-07-27,10
3368,The Committee would be prepared to adjust the ...,2022-07-27,11
3369,The Committee's assessments will take into acc...,2022-07-27,12
3370,Voting for the monetary policy action were Jer...,2022-07-27,13


# Preprocessing

Extract sentence embeddings using SEC-BERT
* The model was pre-trained on 260,773 10-K filings from 1993-2019, publicly available from the U.S. Securities and Exchange Commission (SEC).
* Reference: Loukas, L., Fergadiotis, M., Chalkidis, I., Spyropoulou, E., Malakasiotis, P., Androutsopoulos, I., & Paliouras, G. (2022). FiNER: Financial Numeric Entity Recognition for XBRL Tagging. arXiv preprint arXiv:2203.06482.

In [4]:
from transformers import pipeline

# Build a 'feature-extraction' pipeline on the nlpaueb/sec-bert-base checkpoint.
nlp_features = pipeline(task='feature-extraction', model='nlpaueb/sec-bert-base')

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at nlpaueb/sec-bert-base were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
def get_feature(sentence):
    """Return a single embedding vector for ``sentence``.

    Runs the module-level ``nlp_features`` pipeline on the sentence, squeezes
    singleton dimensions out of the result, and averages over axis 0 of what
    remains. Taking the mean was the pooling approach chosen here.

    Args:
        sentence: Text to embed.

    Returns:
        ``numpy.ndarray`` holding the axis-0 mean of the squeezed pipeline
        output.
    """
    squeezed_output = np.squeeze(nlp_features(sentence))
    return squeezed_output.mean(axis=0)

# Embed every sentence; progress_apply (registered by tqdm.pandas()) shows a
# progress bar while the function is applied row by row.
sentence_column = df['sentence']
df['feature'] = sentence_column.progress_apply(get_feature)

100%|██████████████████████████████████████████████████████████████████████████████| 3372/3372 [08:37<00:00,  6.52it/s]


K-Means Clustering

In [7]:
%%time
# Stack the per-sentence feature vectors into one 2-D array, one row per sentence.
arr = np.vstack(df['feature'].values)

num_cluster = 50

# n_init is pinned explicitly: its default changed from 10 to 'auto' in
# scikit-learn 1.4, and with init='k-means++' 'auto' means a single
# initialisation. Leaving it implicit lets the labels change with the installed
# version, while other cells in this notebook refer to clusters by their
# numeric index.
# NOTE(review): max_iter=5 is far below the library default, so the fit may
# stop before converging. It is kept as-is because changing it would also
# change the cluster labels.
kmeans = KMeans(
    n_clusters=num_cluster,
    init='k-means++',
    n_init=10,
    max_iter=5,
    random_state=42,
)
labels = kmeans.fit_predict(arr)

df['cluster_index'] = labels

df.head()

CPU times: total: 23.7 s
Wall time: 7.39 s


Unnamed: 0,sentence,meeting_date,sentence_index,feature,cluster_index
0,"Release Date: February 4, 1994",1994-02-04,0,"[-0.01090921875503328, 0.3668389804661274, -0....",28
1,For immediate release,1994-02-04,1,"[0.016166886687278746, 0.1350989192724228, -0....",20
2,Chairman Alan Greenspan announced today that t...,1994-02-04,2,"[0.08063143004591648, 0.015163101237983657, 0....",24
3,The action is expected to be associated with a...,1994-02-04,3,"[-0.044573451138355515, 0.04237740960988132, 0...",1
4,The decision was taken to move toward a less a...,1994-02-04,4,"[-0.0552548865920731, 0.23561716505459376, 0.4...",22


Clustering results

In [8]:
# Preview the first few sentences of every cluster, in ascending cluster order
# (groupby sorts its keys by default).
for cluster_index, cluster_rows in df.groupby('cluster_index'):
    print('Cluster index : {}'.format(cluster_index))
    print(cluster_rows['sentence'].head(), '=====\n')

Cluster index : 0
36     The Federal Reserve announced today the follow...
62     In a related move, the Federal Open Market Com...
87     The Federal Reserve today announced the follow...
91     In a related move, the Federal Open Market Com...
117    The Federal Reserve today announced the follow...
Name: sentence, dtype: object =====

Cluster index : 1
3       The action is expected to be associated with a...
12      This action is expected to be associated with ...
19      This action is expected to be associated with ...
317     As a consequence, the FOMC recognizes that the...
1221    In particular, the Committee anticipates that ...
Name: sentence, dtype: object =====

Cluster index : 2
378    Voting for the FOMC monetary policy action wer...
388    Voting for the FOMC monetary policy action wer...
400    Voting for the FOMC monetary policy action wer...
411    Voting for the FOMC monetary policy action wer...
422    Voting for the FOMC monetary policy action wer...
Name: senten

Unnecessary clusters

In [16]:
# Print every sentence of each cluster judged unnecessary, for inspection.
unnecessary_cluster_ids = [3, 4, 5, 9, 12, 13, 20, 26, 28]
for cluster_index in unnecessary_cluster_ids:
    in_cluster = df['cluster_index'] == cluster_index
    print('Cluster index : {}'.format(cluster_index))
    print(df.loc[in_cluster, 'sentence'].values)

Cluster index : 3
['Home | Press releases' 'Home | Press releases' 'Home | Press releases'
 'Home | Press releases' 'Home | Press releases' 'Home | Press releases'
 'Home | News and events' 'Home | News and events'
 'Home | News and events' 'Home | News and events'
 'Home | News and events' 'Home | News and events'
 'Home | News and events' 'Home | News and events'
 'Home | News and events' 'Home |' 'Home |' 'Home | News and'
 'Home | News and' 'Home | News and' 'Home | News and' 'Home | News and'
 'Home | News and' 'Home | News and' 'Home | News and' 'Home | News and'
 'Home | News and' 'Home | News and' 'Home | News and' 'Home | News and'
 'Home | News and' 'Home | News and' 'Home | News and' 'Home | News and'
 'Home | News and' 'Home | News and' 'Home | News and' 'Home | News and'
 'Home | News and' 'Home | News and' 'Home | News and' 'Home | News and'
 'Home | News and' 'Home | News and' 'Home | News and' 'Home | News and'
 'Home | News and']
Cluster index : 4
['Accessibility | Con

In [18]:
# Select every row whose cluster is in the list below and drop those rows from
# df in place.
clusters_to_drop = [3, 4, 5, 9, 12, 13, 20, 26, 28]
drop_mask = df['cluster_index'].isin(clusters_to_drop)
to_be_deleted_rows = df[drop_mask]
print('Drop {} rows'.format(len(to_be_deleted_rows)))
df.drop(index=to_be_deleted_rows.index, inplace=True)

Drop 335 rows


Incomplete clusters containing unfinished phrases

In [17]:
# Print every sentence of the two clusters under inspection.
incomplete_cluster_ids = [40, 45]
for cluster_index in incomplete_cluster_ids:
    in_cluster = df['cluster_index'] == cluster_index
    print('Cluster index : {}'.format(cluster_index))
    print(df.loc[in_cluster, 'sentence'].values)

Cluster index : 40
['Bank of Canada' 'Bank of England'
 'Statement from Federal Reserve Bank of New York'
 'Statement Regarding Transactions in Agency Mortgage-Backed Securities and Treasury Securities'
 'Statement Regarding Purchases of Treasury Securities and Agency Mortgage-Backed Securities'
 'Statement Regarding Purchases of Treasury Securities and Agency Mortgage-Backed Securities'
 'Statement Regarding Purchases of Treasury Securities and Agency Mortgage-Backed Securities'
 'Statement Regarding Purchases of Treasury Securities and Agency Mortgage-Backed Securities'
 'Statement Regarding Purchases of Treasury Securities and Agency Mortgage-Backed Securities'
 'Statement Regarding Purchases of Treasury Securities and Agency Mortgage-Backed Securities'
 'Statement Regarding Purchases of Treasury Securities and Agency Mortgage-Backed Securities'
 'Statement Regarding Purchases of Treasury Securities and Agency Mortgage-Backed Securities'
 'Statement Regarding Purchases of Treasury S

In [21]:
# Show all columns for the rows assigned to cluster 45.
df.loc[df['cluster_index'].eq(45)]

Unnamed: 0,sentence,meeting_date,sentence_index,feature,cluster_index
104,"Philadelphia, Cleveland, Atlanta, Minneapolis,...",1996-01-31,18,"[-0.47579996631695676, -0.0016181144433525892,...",45
123,"St. Louis, Minneapolis, Kansas City, and San F...",1998-10-15,7,"[-0.14936120708783468, 0.31271972358226774, 0....",45
244,"St. Louis, Kansas City, Dallas and San Francisco.",2001-01-03,9,"[-0.21832906135490962, 0.23408083591078008, 0....",45
291,St. Louis and San Francisco.,2001-05-15,12,"[-0.05607550260093477, 0.329028084460232, 0.15...",45
334,"New York, Cleveland, Richmond, Atlanta, St. Lo...",2001-10-02,8,"[-0.16531578059259214, -0.105171794194336, 0.1...",45
732,"Richmond, Atlanta, Chicago, St. Louis, Minneap...",2004-12-14,31,"[-0.03362124511285832, 0.1412899830182524, 0.1...",45
733,and San Francisco.,2004-12-14,32,"[-0.12179953356583913, 0.3736587253709634, 0.4...",45
898,"Richmond, Atlanta, Chicago, St. Louis, Minneap...",2005-11-01,32,"[-0.03362124511285832, 0.1412899830182524, 0.1...",45
899,and San Francisco.,2005-11-01,33,"[-0.12179953356583913, 0.3736587253709634, 0.4...",45


Concatenation

In [41]:
# Rebuild one document per meeting: order the remaining sentences by meeting
# date and position, then join each meeting's sentences with single spaces.
ordered_sentences = df.sort_values(by=['meeting_date', 'sentence_index'])
doc_df = (
    ordered_sentences
    .groupby('meeting_date')['sentence']
    .agg(' '.join)
    .to_frame(name='doc')
)
doc_df

Unnamed: 0_level_0,doc
meeting_date,Unnamed: 1_level_1
1994-02-04,Chairman Alan Greenspan announced today that t...
1994-03-22,Chairman Alan Greenspan announced today that t...
1994-04-18,Chairman Alan Greenspan announced today that t...
1994-05-17,The Federal Reserve today announced two action...
1994-08-16,The Federal Reserve announced today the follow...
...,...
2022-01-26,Indicators of economic activity and employment...
2022-03-16,Indicators of economic activity and employment...
2022-05-04,Although overall economic activity edged down ...
2022-06-15,Overall economic activity appears to have pick...


In [51]:
# Show the rebuilt 2005-11-01 statement, then only its last tokenized sentence.
statement_text = doc_df.loc['2005-11-01', 'doc']
print(statement_text, '\n=====\n')
print(sent_tokenize(statement_text)[-1])

The Federal Open Market Committee decided today to raise its target for the federal funds rate by 25 basis points to 4 percent. Elevated energy prices and hurricane-related disruptions in economic activity have temporarily depressed output and employment. However, monetary policy accommodation, coupled with robust underlying growth in productivity, is providing ongoing support to economic activity that will likely be augmented by planned rebuilding in the hurricane-affected areas. rise in energy and other costs has the potential to add to inflation pressures; however, core inflation has been relatively low in recent months and longer-term inflation expectations remain contained. The Committee perceives that, with appropriate monetary policy action, the upside and downside risks to the attainment of both sustainable growth and price stability should be kept roughly equal. With underlying inflation expected to be contained, the Committee believes that policy accommodation can be removed 

# After preprocessing

In [53]:
# Write the per-meeting documents to CSV in the working directory.
output_path = 'Statements_to_be_analyzed.csv'
doc_df.to_csv(output_path)