In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
import os
import pandas as pd
from tqdm import tqdm

In [2]:
# Input CSV read by the loading cell (relative to the notebook's working directory).
filepath = 'data/preprocessed_1997Q1-2019Q3.csv'

# Output directory and the CSV file that will receive the top-5 bigram table.
save_dir = '/media/dmlab/My Passport/DATA/hot_topic_detection_in_central_bankers_speeches'
save_filepath = os.path.join(
    save_dir,
    'top5_hot_topics_bigram_TFIDF.csv',
)

In [3]:
# Read the input CSV and order its rows by the `period` column (ascending).
df = pd.read_csv(filepath).sort_values(by=['period'], ascending=True)

# Sorted list of every period label in the frame, printed for a quick look.
sorted_periods = sorted(df['period'].values)
print(sorted_periods)

# Preview the first rows as the cell's rich output.
df.head()

['1997_Q1', '1997_Q2', '1997_Q3', '1997_Q4', '1998_Q1', '1998_Q2', '1998_Q3', '1998_Q4', '1999_Q1', '1999_Q2', '1999_Q3', '1999_Q4', '2000_Q1', '2000_Q2', '2000_Q3', '2000_Q4', '2001_Q1', '2001_Q2', '2001_Q3', '2001_Q4', '2002_Q1', '2002_Q2', '2002_Q3', '2002_Q4', '2003_Q1', '2003_Q2', '2003_Q3', '2003_Q4', '2004_Q1', '2004_Q2', '2004_Q3', '2004_Q4', '2005_Q1', '2005_Q2', '2005_Q3', '2005_Q4', '2006_Q1', '2006_Q2', '2006_Q3', '2006_Q4', '2007_Q1', '2007_Q2', '2007_Q3', '2007_Q4', '2008_Q1', '2008_Q2', '2008_Q3', '2008_Q4', '2009_Q1', '2009_Q2', '2009_Q3', '2009_Q4', '2010_Q1', '2010_Q2', '2010_Q3', '2010_Q4', '2011_Q1', '2011_Q2', '2011_Q3', '2011_Q4', '2012_Q1', '2012_Q2', '2012_Q3', '2012_Q4', '2013_Q1', '2013_Q2', '2013_Q3', '2013_Q4', '2014_Q1', '2014_Q2', '2014_Q3', '2014_Q4', '2015_Q1', '2015_Q2', '2015_Q3', '2015_Q4', '2016_Q1', '2016_Q2', '2016_Q3', '2016_Q4', '2017_Q1', '2017_Q2', '2017_Q3', '2017_Q4', '2018_Q1', '2018_Q2', '2018_Q3', '2018_Q4', '2019_Q1', '2019_Q2', '2019_Q3'

Unnamed: 0,tokens,period,num_of_tokens
0,address key role remark chairman governor u re...,1997_Q1,16246
1,highlight key aspect current economic situatio...,1997_Q2,13432
2,present view reserve semi annual humphrey hawk...,1997_Q3,6409
3,considers effect technological change remark c...,1997_Q4,15038
4,s remark american economic association america...,1998_Q1,10804


In [4]:
# Rank bigrams per period by TF-IDF and keep the five highest-scoring ones.
#
# For every period that has `lookback` earlier periods available, a fresh
# TfidfVectorizer is fitted on the rows of that period plus the `lookback`
# periods before it. The scores of the last row in that window are sorted and
# the top five are collected. The per-period results are then concatenated,
# filtered to positive scores, sorted and written to `save_filepath`.
lookback = 4  # number of preceding periods included in each fitting window

dfs = []
# Wrapping the list (not the enumerate object) lets tqdm see its length and
# show a total / percentage.
for period_idx, period in enumerate(tqdm(sorted_periods)):
    # The window slice below needs `lookback` earlier entries; a negative start
    # index would silently wrap around to the end of the list.
    if period_idx < lookback:
        continue

    # Bigrams only; each document is tokenised by splitting on single spaces
    # and case is left untouched.
    tfidf_vectorizer = TfidfVectorizer(ngram_range=(2, 2), tokenizer=lambda x: x.split(' '),
                                       lowercase=False, smooth_idf=True, use_idf=True)

    target_periods = sorted_periods[period_idx - lookback:period_idx + 1]
    tfidf_vec = tfidf_vectorizer.fit_transform(df[df['period'].isin(target_periods)].tokens.values)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on versions that predate it.
    if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
        index_to_word = tfidf_vectorizer.get_feature_names_out()
    else:
        index_to_word = tfidf_vectorizer.get_feature_names()

    # Last row of the window. Rows keep df's order, so this is the current
    # period only while df is sorted by period with one row per period
    # (NOTE(review): confirm the input keeps that shape).
    _X = tfidf_vec[-1]

    # One row per vocabulary entry; the index is the vocabulary position.
    _df = pd.DataFrame({'score': _X.toarray().ravel(), 'word': index_to_word})\
            .sort_values(by=['score'], ascending=False)
    _df['period'] = period

    # Spot-check output: score of 'subprime mortgage' in the two 2007 periods.
    if period in ['2007_Q1', '2007_Q2']:
        print(_df[(_df['word'] == 'subprime mortgage')])
    dfs.append(_df.iloc[:5])


top5_df = pd.concat(dfs)
top5_df = top5_df[(top5_df['score'] > 0)]
# Latest period first; within a period, highest score first.
top5_df = top5_df[['word', 'score', 'period']].sort_values(by=['period', 'score'], ascending=False)
top5_df = top5_df.reset_index(drop=True)

top5_df.to_csv(save_filepath, index=False)
print('Created {}'.format(save_filepath))

42it [00:04,  6.47it/s]

          score               word   period
38517  0.025702  subprime mortgage  2007_Q1
          score               word   period
44411  0.129482  subprime mortgage  2007_Q2


91it [00:10,  8.96it/s]


Created /media/dmlab/My Passport/DATA/hot_topic_detection_in_central_bankers_speeches/top5_hot_topics_bigram_TFIDF.csv


In [5]:
# Key periods: show the top-5 rows stored for three selected periods.
key_periods = ['1998_Q2', '2000_Q1', '2007_Q2']
top5_df.loc[top5_df['period'].isin(key_periods)]

Unnamed: 0,word,score,period
245,hedge fund,0.218501,2007_Q2
246,u s,0.149708,2007_Q2
247,subprime mortgage,0.129482,2007_Q2
248,finance premium,0.120838,2007_Q2
249,external finance,0.120838,2007_Q2
390,have been,0.134392,2000_Q1
391,good service,0.104275,2000_Q1
392,information technology,0.084413,2000_Q1
393,recent year,0.083995,2000_Q1
394,excess demand,0.079323,2000_Q1


In [6]:
# Check: the same view for the period immediately before each key period.
check_periods = ['1998_Q1', '1999_Q4', '2007_Q1']
top5_df.loc[top5_df['period'].isin(check_periods)]

Unnamed: 0,word,score,period
250,u s,0.273236,2007_Q1
251,affordable housing,0.164592,2007_Q1
252,long term,0.113848,2007_Q1
253,fed s,0.110229,2007_Q1
254,gse portfolio,0.106188,2007_Q1
395,risk management,0.144416,1999_Q4
396,supervision regulation,0.135708,1999_Q4
397,equity premium,0.133819,1999_Q4
398,internal risk,0.109489,1999_Q4
399,risk manager,0.105551,1999_Q4


In [7]:
# Rows where 'subprime mortgage' made the top 5 in 2007_Q1 or 2007_Q2.
in_2007_periods = top5_df['period'].isin(['2007_Q1', '2007_Q2'])
is_subprime = top5_df['word'] == 'subprime mortgage'
top5_df[in_2007_periods & is_subprime]

Unnamed: 0,word,score,period
247,subprime mortgage,0.129482,2007_Q2
