In [1]:
import os, re
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas()
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

In [2]:
# Input: raw speeches CSV read by pd.read_csv in the next cell.
filepath = 'data/Fed_chairs_199701-201909_491.csv'
# Output: destination of the per-period token table written at the end of the notebook.
save_filepath = 'data/preprocessed_1997Q1-2019Q3.csv'

In [3]:
# Load the raw speeches table; later cells read its 'content', 'date' and
# 'pdf_url' columns.
df = pd.read_csv(filepath)
df.head()

Unnamed: 0,content,date,key,pdf_url,short_info,speaker,title
0,Mr Greenspan discusses technology and the US e...,2000-01-17,r000117a,https://www.bis.org/review/r000117a.pdf,Remarks by Mr Alan Greenspan Chairman of the ...,Greenspan,Mr Greenspan discusses technology and the US e...
1,Mr Greenspan gives a testimony on over-the-cou...,2000-02-16,r000216a,https://www.bis.org/review/r000216a.pdf,Testimony of Mr Alan Greenspan Chairman of th...,Greenspan,Mr Greenspan gives a testimony on over-the-cou...
2,Mr Greenspan presents the Federal Reserve’s se...,2000-02-18,r000218b,https://www.bis.org/review/r000218b.pdf,Testimony of Mr Alan Greenspan Chairman of th...,Greenspan,Mr Greenspan presents the Federal Reserve's se...
3,Mr Greenspan focuses on the revolution in info...,2000-02-10,r000310a,https://www.bis.org/review/r000310a.pdf,Speech by Mr Alan Greenspan Chairman of the B...,Greenspan,Mr Greenspan focuses on the revolution in info...
4,Mr Greenspan remarks on some of the economic c...,2000-02-24,r000324a,https://www.bis.org/review/r000324a.pdf,Remarks by Mr Alan Greenspan Chairman of the ...,Greenspan,Mr Greenspan remarks on some of the economic c...


In [4]:
# Raw string so that `\d` reaches the regex engine instead of being parsed as
# an (invalid) Python string escape.
number_pattern = re.compile(r'\d+')


def get_sentences(content):
    """Split a document into a list of stripped sentences.

    Every run of digits in `content` is replaced with a single space before
    the text is handed to NLTK's `sent_tokenize`; each resulting sentence has
    its leading and trailing whitespace removed.

    Parameters
    ----------
    content : str
        Full text of one document.

    Returns
    -------
    list of str
        The sentences, in document order.
    """
    content = number_pattern.sub(' ', content)  # remove numbers
    return [sentence.strip() for sentence in sent_tokenize(content)]

# Sentence-split every document; only the 'content' column is needed, so the
# function is applied to that column directly.
df['sentences'] = df['content'].apply(get_sentences)
df.head()

Unnamed: 0,content,date,key,pdf_url,short_info,speaker,title,sentences
0,Mr Greenspan discusses technology and the US e...,2000-01-17,r000117a,https://www.bis.org/review/r000117a.pdf,Remarks by Mr Alan Greenspan Chairman of the ...,Greenspan,Mr Greenspan discusses technology and the US e...,[Mr Greenspan discusses technology and the US ...
1,Mr Greenspan gives a testimony on over-the-cou...,2000-02-16,r000216a,https://www.bis.org/review/r000216a.pdf,Testimony of Mr Alan Greenspan Chairman of th...,Greenspan,Mr Greenspan gives a testimony on over-the-cou...,[Mr Greenspan gives a testimony on over-the-co...
2,Mr Greenspan presents the Federal Reserve’s se...,2000-02-18,r000218b,https://www.bis.org/review/r000218b.pdf,Testimony of Mr Alan Greenspan Chairman of th...,Greenspan,Mr Greenspan presents the Federal Reserve's se...,[Mr Greenspan presents the Federal Reserve’s s...
3,Mr Greenspan focuses on the revolution in info...,2000-02-10,r000310a,https://www.bis.org/review/r000310a.pdf,Speech by Mr Alan Greenspan Chairman of the B...,Greenspan,Mr Greenspan focuses on the revolution in info...,[Mr Greenspan focuses on the revolution in inf...
4,Mr Greenspan remarks on some of the economic c...,2000-02-24,r000324a,https://www.bis.org/review/r000324a.pdf,Remarks by Mr Alan Greenspan Chairman of the ...,Greenspan,Mr Greenspan remarks on some of the economic c...,[Mr Greenspan remarks on some of the economic ...


In [5]:
# Corpus-specific words that get_pos_tagged_words drops after POS tagging.
# NOTE(review): 'ha' and 'bi' look like lemmatizer artifacts (e.g. 'has' -> 'ha')
# rather than real words -- confirm.
stopword_list = [
    'alan', 'bank', 'banking', 'ben', 'bernanke',
    'bi', 'board', 'central', 'federal', 'financial',
    'greenspan', 'ha', 'ii', 'janet', 'jerome',
    'market', 'mr', 'percent', 'policy', 'powell',
]
print('Number of stopwords: {}'.format(len(stopword_list)))

Number of stopwords: 20


In [6]:
# Raw strings so that `\s` is interpreted by the regex engine rather than as
# an (invalid) Python string escape.
alphabet_pattern = re.compile(r'[^a-zA-Z]')
doublespace_pattern = re.compile(r'\s+')

# Penn Treebank tags that are kept: nouns, adjectives, adverbs and verbs.
_CONTENT_POS_TAGS = frozenset([
    'NN', 'NNS', 'NNP', 'NNPS',
    'JJ', 'JJR', 'JJS',
    'RB', 'RBR', 'RBS',
    'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ',
])


def get_pos_tagged_words(content):
    """Tokenize one sentence and return its filtered (word, POS tag) pairs.

    Steps, in order:
      1. lowercase and tokenize with `word_tokenize`;
      2. lemmatize each token (no POS is passed to the lemmatizer);
      3. replace every non-letter character with a space, collapse
         whitespace and strip; tokens that end up empty are dropped;
      4. POS-tag the cleaned tokens with `pos_tag`;
      5. keep only noun/adjective/adverb/verb tags and drop any word found
         in the module-level `stopword_list`.

    Because step 3 substitutes spaces inside a token, a hyphenated token such
    as 'over-the-counter' is returned as the single word 'over the counter'.

    Parameters
    ----------
    content : str
        One sentence.

    Returns
    -------
    list of (str, str)
        (word, tag) tuples in sentence order.
    """
    lemmatizer = WordNetLemmatizer()

    words = []
    for token in word_tokenize(content.lower()):
        token = lemmatizer.lemmatize(token)
        # alphabet only: non-letters become spaces, then whitespace is normalized
        token = alphabet_pattern.sub(' ', token)
        token = doublespace_pattern.sub(' ', token).strip()
        if token != '':
            words.append(token)

    return [
        (word, tag)
        for word, tag in pos_tag(words)
        if tag in _CONTENT_POS_TAGS and word not in stopword_list
    ]

# POS-tag every sentence of every document (progress bar via tqdm.pandas()).
df['unigrams_by_sentence'] = df['sentences'].progress_apply(
    lambda sentences: [get_pos_tagged_words(sentence) for sentence in sentences]
)
df.head()

100%|██████████| 491/491 [01:09<00:00,  7.05it/s]


Unnamed: 0,content,date,key,pdf_url,short_info,speaker,title,sentences,unigrams_by_sentence
0,Mr Greenspan discusses technology and the US e...,2000-01-17,r000117a,https://www.bis.org/review/r000117a.pdf,Remarks by Mr Alan Greenspan Chairman of the ...,Greenspan,Mr Greenspan discusses technology and the US e...,[Mr Greenspan discusses technology and the US ...,"[[(discus, NN), (technology, NN), (u, JJ), (ec..."
1,Mr Greenspan gives a testimony on over-the-cou...,2000-02-16,r000216a,https://www.bis.org/review/r000216a.pdf,Testimony of Mr Alan Greenspan Chairman of th...,Greenspan,Mr Greenspan gives a testimony on over-the-cou...,[Mr Greenspan gives a testimony on over-the-co...,"[[(give, VBP), (testimony, NN), (over the coun..."
2,Mr Greenspan presents the Federal Reserve’s se...,2000-02-18,r000218b,https://www.bis.org/review/r000218b.pdf,Testimony of Mr Alan Greenspan Chairman of th...,Greenspan,Mr Greenspan presents the Federal Reserve's se...,[Mr Greenspan presents the Federal Reserve’s s...,"[[(present, VBD), (reserve, NN), (s, VBD), (se..."
3,Mr Greenspan focuses on the revolution in info...,2000-02-10,r000310a,https://www.bis.org/review/r000310a.pdf,Speech by Mr Alan Greenspan Chairman of the B...,Greenspan,Mr Greenspan focuses on the revolution in info...,[Mr Greenspan focuses on the revolution in inf...,"[[(focus, NN), (revolution, NN), (information,..."
4,Mr Greenspan remarks on some of the economic c...,2000-02-24,r000324a,https://www.bis.org/review/r000324a.pdf,Remarks by Mr Alan Greenspan Chairman of the ...,Greenspan,Mr Greenspan remarks on some of the economic c...,[Mr Greenspan remarks on some of the economic ...,"[[(remark, NN), (economic, JJ), (challenge, NN..."


In [18]:
# Spot-check the tagged tokens of a single speech, selected by its PDF URL.
is_target_speech = df['pdf_url'] == 'https://www.bis.org/review/r070216a.pdf'
print(df.loc[is_target_speech, 'unigrams_by_sentence'].values)

[list([[('s', 'NN'), ('reserve', 'NN'), ('s', 'VBD'), ('semiannual', 'JJ'), ('monetary', 'JJ'), ('report', 'NN'), ('congress', 'NN'), ('testimony', 'NN'), ('s', 'NN'), ('chairman', 'NN'), ('governor', 'NN'), ('u', 'JJ'), ('reserve', 'NN'), ('system', 'NN'), ('committee', 'NN'), ('housing', 'NN'), ('urban', 'JJ'), ('affair', 'NN'), ('u', 'JJ'), ('senate', 'NN'), ('washington', 'NN'), ('february', 'NN')], [('chairman', 'NN'), ('dodd', 'VBZ'), ('senator', 'NN'), ('shelby', 'NN'), ('other', 'JJ'), ('member', 'NN'), ('committee', 'NN'), ('i', 'NN'), ('am', 'VBP'), ('pleased', 'VBN'), ('present', 'VB'), ('reserve', 'NN'), ('s', 'VBD'), ('monetary', 'JJ'), ('report', 'NN'), ('congress', 'NN')], [('real', 'JJ'), ('activity', 'NN'), ('united', 'JJ'), ('state', 'NN'), ('expanded', 'VBD'), ('solid', 'JJ'), ('pace', 'NN'), ('pattern', 'NN'), ('growth', 'NN'), ('wa', 'NN'), ('uneven', 'JJ')], [('first quarter', 'NN'), ('rebound', 'NN'), ('weakness', 'NN'), ('associated', 'VBN'), ('effect', 'NN'), (

In [20]:
# Scratch check: zipping a list with itself shifted by one yields adjacent pairs.
list_of_words = [
    'exception', 'is', 'subprime', 'mortgage',
    'variable', 'interest', 'rate', 'delinquency',
    'rate', 'have', 'increased', 'appreciably',
]
adjacent_pairs = list(zip(list_of_words, list_of_words[1:]))
print(adjacent_pairs)

[('exception', 'is'), ('is', 'subprime'), ('subprime', 'mortgage'), ('mortgage', 'variable'), ('variable', 'interest'), ('interest', 'rate'), ('rate', 'delinquency'), ('delinquency', 'rate'), ('rate', 'have'), ('have', 'increased'), ('increased', 'appreciably')]


In [30]:
def get_bigrams(unigrams_by_sentence):
    """Return adjacent word pairs for one document.

    `unigrams_by_sentence` is a list of sentences, each a list of
    (word, pos) tuples. The sentences are concatenated into a single word
    sequence first, so a pair may join the last word of one sentence with the
    first word of the next. POS tags are discarded.

    Returns a list of (word, next_word) tuples; empty when the document has
    fewer than two words.
    """
    words = []
    for sentence in unigrams_by_sentence:
        for word, _tag in sentence:
            words.append(word)

    followers = words[1:]
    return [(first, second) for first, second in zip(words, followers)]

# One bigram list per document, built from its tagged unigrams.
df['bigrams'] = df['unigrams_by_sentence'].apply(get_bigrams)
df.head()

Unnamed: 0,content,date,key,pdf_url,short_info,speaker,title,sentences,unigrams_by_sentence,num_of_sentences,bigrams
0,Mr Greenspan discusses technology and the US e...,2000-01-17,r000117a,https://www.bis.org/review/r000117a.pdf,Remarks by Mr Alan Greenspan Chairman of the ...,Greenspan,Mr Greenspan discusses technology and the US e...,[Mr Greenspan discusses technology and the US ...,"[[(discus, NN), (technology, NN), (u, JJ), (ec...",187,"[(discus, technology), (technology, u), (u, ec..."
1,Mr Greenspan gives a testimony on over-the-cou...,2000-02-16,r000216a,https://www.bis.org/review/r000216a.pdf,Testimony of Mr Alan Greenspan Chairman of th...,Greenspan,Mr Greenspan gives a testimony on over-the-cou...,[Mr Greenspan gives a testimony on over-the-co...,"[[(give, VBP), (testimony, NN), (over the coun...",69,"[(give, testimony), (testimony, over the count..."
2,Mr Greenspan presents the Federal Reserve’s se...,2000-02-18,r000218b,https://www.bis.org/review/r000218b.pdf,Testimony of Mr Alan Greenspan Chairman of th...,Greenspan,Mr Greenspan presents the Federal Reserve's se...,[Mr Greenspan presents the Federal Reserve’s s...,"[[(present, VBD), (reserve, NN), (s, VBD), (se...",115,"[(present, reserve), (reserve, s), (s, semi an..."
3,Mr Greenspan focuses on the revolution in info...,2000-02-10,r000310a,https://www.bis.org/review/r000310a.pdf,Speech by Mr Alan Greenspan Chairman of the B...,Greenspan,Mr Greenspan focuses on the revolution in info...,[Mr Greenspan focuses on the revolution in inf...,"[[(focus, NN), (revolution, NN), (information,...",108,"[(focus, revolution), (revolution, information..."
4,Mr Greenspan remarks on some of the economic c...,2000-02-24,r000324a,https://www.bis.org/review/r000324a.pdf,Remarks by Mr Alan Greenspan Chairman of the ...,Greenspan,Mr Greenspan remarks on some of the economic c...,[Mr Greenspan remarks on some of the economic ...,"[[(remark, NN), (economic, JJ), (challenge, NN...",100,"[(remark, economic), (economic, challenge), (c..."


In [33]:
# Documents whose bigram list contains the pair ('u', 's').
contains_u_s = df['bigrams'].apply(lambda bigrams: ('u', 's') in bigrams)
df[contains_u_s]

Unnamed: 0,content,date,key,pdf_url,short_info,speaker,title,sentences,unigrams_by_sentence,num_of_sentences,bigrams
48,Alan Greenspan: Semi-annual monetary policy re...,2002-07-17,r020717a,https://www.bis.org/review/r020717a.pdf,Testimony of Mr Alan Greenspan Chairman of th...,Greenspan,Alan Greenspan: Semi-annual monetary policy re...,[Alan Greenspan: Semi-annual monetary policy r...,"[[(semi annual, JJ), (monetary, JJ), (report, ...",164,"[(semi annual, monetary), (monetary, report), ..."
161,Mr. Greenspan's remarks to the Economic Club o...,1997-12-08,r971208d,https://www.bis.org/review/r971208d.pdf,Remarks by the Chairman of the Board of the U ...,Greenspan,Mr. Greenspan's remarks to the Economic Club o...,[Mr. Greenspan's remarks to the Economic Club ...,"[[(s, VBP), (remark, NN), (economic, JJ), (clu...",188,"[(s, remark), (remark, economic), (economic, c..."
260,Ben S Bernanke: Remarks on Class Day 2008 Spee...,2008-06-06,r080606a,https://www.bis.org/review/r080606a.pdf,Speech of Mr Ben S Bernanke Chairman of the B...,Bernanke,Ben S Bernanke: Remarks on Class Day 2008,[Ben S Bernanke: Remarks on Class Day Speech...,"[[(s, NN), (remark, NN), (class, NN), (day, NN...",175,"[(s, remark), (remark, class), (class, day), (..."


In [37]:
# Print the bigrams of the first document that contains the pair ('u', 's').
contains_u_s = df['bigrams'].apply(lambda bigrams: ('u', 's') in bigrams)
first_match = df[contains_u_s].iloc[0]
print(first_match['bigrams'])

[('semi annual', 'monetary'), ('monetary', 'report'), ('report', 'u'), ('u', 'congress'), ('congress', 'testimony'), ('testimony', 'chairman'), ('chairman', 'governor'), ('governor', 'u'), ('u', 'reserve'), ('reserve', 'system'), ('system', 'occasion'), ('occasion', 'reserve'), ('reserve', 's'), ('s', 'semiannual'), ('semiannual', 'monetary'), ('monetary', 'report'), ('report', 'congress'), ('congress', 'committee'), ('committee', 'service'), ('service', 'u'), ('u', 'house'), ('house', 'representative'), ('representative', 'july'), ('july', 'i'), ('i', 'appreciate'), ('appreciate', 'opportunity'), ('opportunity', 'present'), ('present', 'reserve'), ('reserve', 's'), ('s', 'monetary'), ('monetary', 'report'), ('report', 'congress'), ('congress', 'one half'), ('one half', 'month'), ('month', 'i'), ('i', 'last'), ('last', 'testified'), ('testified', 'committee'), ('committee', 'monetary'), ('monetary', 'economy'), ('economy', 'continued'), ('continued', 'expand'), ('expand', 'largely'), (

In [7]:
def remove_outlier_sentence(df, min_tokens=3, max_tokens=180):
    """Drop sentences that have too few or too many retained tokens.

    For every row, a sentence is kept only when the length of its entry in
    'unigrams_by_sentence' lies in [min_tokens, max_tokens]. The matching
    entries of 'sentences' and 'unigrams_by_sentence' are removed together so
    the two lists stay aligned.

    The list objects stored in the two columns are edited in place, so the
    passed-in `df` is modified; the same `df` is returned for convenience.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have 'sentences' and 'unigrams_by_sentence' columns holding
        equal-length lists per row.
    min_tokens : int, default 3
        Sentences with fewer tokens than this are removed.
    max_tokens : int, default 180
        Sentences with more tokens than this are removed.

    Returns
    -------
    pandas.DataFrame
        The same `df` object.

    Raises
    ------
    ValueError
        If a row's two lists differ in length (they would be filtered out of
        step otherwise).
    """
    for sentences, unigrams in zip(df['sentences'], df['unigrams_by_sentence']):
        if len(sentences) != len(unigrams):
            raise ValueError(
                "'sentences' and 'unigrams_by_sentence' differ in length "
                "({} vs {})".format(len(sentences), len(unigrams))
            )

        # Single pass instead of repeated `del` inside an index loop.
        kept = [
            (sentence, tokens)
            for sentence, tokens in zip(sentences, unigrams)
            if min_tokens <= len(tokens) <= max_tokens
        ]

        # Slice assignment mutates the existing list objects held by the frame.
        sentences[:] = [sentence for sentence, _ in kept]
        unigrams[:] = [tokens for _, tokens in kept]
    return df

# Filter out outlier sentences, then record how many sentences each document keeps.
df = remove_outlier_sentence(df)

df['num_of_sentences'] = df['sentences'].apply(len)
total_sentences = df['num_of_sentences'].sum()
print('Number of sentences: {}'.format(total_sentences))

Number of sentences: 55458


In [8]:
# Token count of every remaining sentence across all documents.
list_of_num_of_tokens_in_a_sentence = [
    len(sent)
    for doc in df['unigrams_by_sentence'].values
    for sent in doc
]
average_tokens = np.average(list_of_num_of_tokens_in_a_sentence)
print('Average number of tokens in a sentence: {}'.format(average_tokens))

Average number of tokens in a sentence: 15.316257347902917


In [8]:
def quarterly_dict():
    """Build the month -> quarter lookup.

    Returns a dict whose keys are zero-padded month strings '01'..'12'
    (inserted in calendar order) and whose values are the quarter labels
    'Q1'..'Q4'.
    """
    # (label, first month, last month), both ends inclusive
    quarter_ranges = (
        ('Q1', 1, 3),
        ('Q2', 4, 6),
        ('Q3', 7, 9),
        ('Q4', 10, 12),
    )

    month_to_quarter = dict()
    for label, first_month, last_month in quarter_ranges:
        for month in range(first_month, last_month + 1):
            month_to_quarter[str('%02d' % month)] = label
    return month_to_quarter

# Label each document with '<year>_<quarter>', sliced from the 'date' string:
# characters 0-3 are the year and characters 5-6 the two-digit month.
period_dict = quarterly_dict()
df['grouping_period'] = df['date'].apply(
    lambda date: date[:4] + '_' + period_dict[date[5:7]]
)
df.head()

Unnamed: 0,content,date,key,pdf_url,short_info,speaker,title,sentences,unigrams_by_sentence,num_of_sentences,grouping_period
0,Mr Greenspan discusses technology and the US e...,2000-01-17,r000117a,https://www.bis.org/review/r000117a.pdf,Remarks by Mr Alan Greenspan Chairman of the ...,Greenspan,Mr Greenspan discusses technology and the US e...,[Mr Greenspan discusses technology and the US ...,"[[(discus, NN), (technology, NN), (u, JJ), (ec...",187,2000_Q1
1,Mr Greenspan gives a testimony on over-the-cou...,2000-02-16,r000216a,https://www.bis.org/review/r000216a.pdf,Testimony of Mr Alan Greenspan Chairman of th...,Greenspan,Mr Greenspan gives a testimony on over-the-cou...,[Mr Greenspan gives a testimony on over-the-co...,"[[(give, VBP), (testimony, NN), (over the coun...",69,2000_Q1
2,Mr Greenspan presents the Federal Reserve’s se...,2000-02-18,r000218b,https://www.bis.org/review/r000218b.pdf,Testimony of Mr Alan Greenspan Chairman of th...,Greenspan,Mr Greenspan presents the Federal Reserve's se...,[Mr Greenspan presents the Federal Reserve’s s...,"[[(present, VBD), (reserve, NN), (s, VBD), (se...",115,2000_Q1
3,Mr Greenspan focuses on the revolution in info...,2000-02-10,r000310a,https://www.bis.org/review/r000310a.pdf,Speech by Mr Alan Greenspan Chairman of the B...,Greenspan,Mr Greenspan focuses on the revolution in info...,[Mr Greenspan focuses on the revolution in inf...,"[[(focus, NN), (revolution, NN), (information,...",108,2000_Q1
4,Mr Greenspan remarks on some of the economic c...,2000-02-24,r000324a,https://www.bis.org/review/r000324a.pdf,Remarks by Mr Alan Greenspan Chairman of the ...,Greenspan,Mr Greenspan remarks on some of the economic c...,[Mr Greenspan remarks on some of the economic ...,"[[(remark, NN), (economic, JJ), (challenge, NN...",100,2000_Q1


In [10]:
# Concatenate, per period, the retained words of every document in that
# period into one space-separated string (POS tags are dropped).
records = []
for period in sorted(df.grouping_period.unique()):
    docs_in_period = df[df['grouping_period'] == period]['unigrams_by_sentence'].values
    words_in_period = [
        word
        for doc in docs_in_period
        for sentence in doc
        for word, _tag in sentence
    ]
    records.append((' '.join(words_in_period), period))

preprocessed_df = pd.DataFrame(records, columns=['tokens', 'period'])
print('Number of periods: {}'.format(len(preprocessed_df)))
print('Last quarter:', sorted(preprocessed_df['period'].unique())[-1])
preprocessed_df.head()

Number of periods: 91
Last quarter: 2019_Q3


Unnamed: 0,tokens,period
0,address key role remark chairman governor u re...,1997_Q1
1,highlight key aspect current economic situatio...,1997_Q2
2,present view reserve semi annual humphrey hawk...,1997_Q3
3,considers effect technological change remark c...,1997_Q4
4,s remark american economic association america...,1998_Q1


In [11]:
# Count space-separated pieces per period. A word that itself contains spaces
# (e.g. 'over the counter') therefore counts as several tokens here.
preprocessed_df['num_of_tokens'] = preprocessed_df['tokens'].str.split(' ').str.len()
total_tokens = preprocessed_df['num_of_tokens'].sum()
print('Total number of tokens: {}'.format(total_tokens))
preprocessed_df.head()

Total number of tokens: 868838


Unnamed: 0,tokens,period,num_of_tokens
0,address key role remark chairman governor u re...,1997_Q1,16246
1,highlight key aspect current economic situatio...,1997_Q2,13432
2,present view reserve semi annual humphrey hawk...,1997_Q3,6409
3,considers effect technological change remark c...,1997_Q4,15038
4,s remark american economic association america...,1998_Q1,10804


In [12]:
# Write the per-period token table to save_filepath, omitting the DataFrame index.
preprocessed_df.to_csv(save_filepath, index=False)
print('Created {}'.format(save_filepath))

Created data/preprocessed_1997Q1-2019Q3.csv
