In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
import os
import pandas as pd
from tqdm import tqdm

In [2]:
# Path of the preprocessed input CSV.
filepath = 'data/preprocessed_1997Q1-2019Q3.csv'

# Destination directory and file for the top-5 bigram TF-IDF table.
save_dir = '/media/dmlab/My Passport/DATA/hot_topic_detection_in_central_bankers_speeches'
save_filename = 'top5_hot_topics_bigram_TFIDF.csv'
save_filepath = os.path.join(save_dir, save_filename)

In [3]:
# Read the preprocessed table, drop every row whose period label starts with
# '1997', and order what remains by ascending period label.
df = pd.read_csv(filepath)
outside_1997 = df['period'].map(lambda label: label[:4] != '1997')
df = df[outside_1997].sort_values(by=['period'], ascending=True)
df.head()

Unnamed: 0,tokens,period,num_of_tokens
4,s remark american economic association america...,1998_Q1,10804
5,discus ascendance capitalism remark chairman g...,1998_Q2,6988
6,present u reserve s mid year report monetary t...,1998_Q3,9722
7,testifies private sector refinancing large hed...,1998_Q4,2091
8,testifies state u economy testimony chairman g...,1999_Q1,6197


In [4]:
# Fit a bigram-only TF-IDF model over df.tokens (one matrix row per entry).
# Each document string is split on single spaces and case is left untouched.
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(2, 2),
    tokenizer=lambda x: x.split(' '),
    lowercase=False,
    smooth_idf=True,
    use_idf=True,
)
tfidf_vec = tfidf_vectorizer.fit_transform(df.tokens.values)

# Column index -> bigram lookup. get_feature_names() was deprecated in
# scikit-learn 1.0 and removed in 1.2, so prefer get_feature_names_out() and
# only fall back to the old accessor on installs that predate it.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    # .tolist() keeps index_to_word a plain list of str, as the old API returned.
    index_to_word = tfidf_vectorizer.get_feature_names_out().tolist()
else:
    index_to_word = tfidf_vectorizer.get_feature_names()

In [5]:
# For every row of the TF-IDF matrix keep its five highest-scoring bigrams,
# tag them with that row's period, then write the combined table to CSV.
dfs = []
# tqdm wraps the array itself (not the enumerate generator) so it knows the
# total and can draw a real progress bar instead of a bare iteration counter.
for period_idx, period in enumerate(tqdm(df.period.values)):
    _X = tfidf_vec[period_idx]
    # After the transpose each row position is the vocabulary index, so the
    # surviving index labels can be mapped straight back to bigrams.
    _df = pd.DataFrame(_X.toarray().T, columns=['score'])\
                        .sort_values(by=['score'], ascending=False).iloc[:5].copy()
    _df['word'] = [index_to_word[vocab_index] for vocab_index in _df.index]
    _df['period'] = period
    dfs.append(_df)


top5_df = pd.concat(dfs)
# Rows with fewer than five non-zero bigrams would otherwise contribute
# zero-score filler entries.
top5_df = top5_df[(top5_df['score']>0)]
# Latest period first; within a period, highest score first.
top5_df = top5_df[['word', 'score', 'period']]\
                .sort_values(by=['period', 'score'], ascending=False)\
                .reset_index(drop=True)

top5_df.to_csv(save_filepath, index=False)
print('Created {}'.format(save_filepath))

87it [00:01, 64.30it/s]


Created /media/dmlab/My Passport/DATA/hot_topic_detection_in_central_bankers_speeches/top5_hot_topics_bigram_TFIDF.csv


In [6]:
# Show the top bigrams for a hand-picked set of key periods.
key_periods = ['1998_Q2', '2000_Q1', '2007_Q2']
top5_df.loc[top5_df['period'].isin(key_periods)]

Unnamed: 0,word,score,period
245,hedge fund,0.205297,2007_Q2
246,finance premium,0.140616,2007_Q2
247,external finance,0.120244,2007_Q2
248,subprime mortgage,0.11222,2007_Q2
249,lending channel,0.08437,2007_Q2
390,wealth effect,0.087818,2000_Q1
391,excess demand,0.084878,2000_Q1
392,newer technology,0.074034,2000_Q1
393,otc derivative,0.070896,2000_Q1
394,have been,0.068921,2000_Q1
