KeyBERT: https://github.com/MaartenGr/KeyBERT
```bash
pip install keybert
```

In [65]:
from keybert import KeyBERT
import os
import pandas as pd
from tqdm import tqdm

In [2]:
# Input CSV; later cells read its 'period' and 'document' columns.
filepath = 'data/preprocessed_sentences_for_each_period_1997Q1-2019Q3.csv'

# Raw string literal: '\D' and '\h' are not valid escape sequences, so the
# non-raw form triggers an invalid-escape warning on newer Python versions.
# The resulting string value is exactly the same as before.
# NOTE(review): this is an absolute, machine-specific location - consider
# making it configurable.
save_dir = r'C:\DATA\hot_topic_detection_in_central_bankers_speeches'
# Destination of the CSV written by the extraction cell.
save_filepath = os.path.join(save_dir, 'top5_hot_topics_bigram_KeyBERT.csv')

In [3]:
# Read the input CSV into a DataFrame.
df = pd.read_csv(filepath)

# Distinct period labels whose first four characters are not '1997',
# in ascending order; these are the periods the extraction loop walks over.
sorted_periods = sorted({label for label in df['period'] if label[:4] != '1997'})

# Index the frame by period so rows can be looked up by period label.
df = df.set_index('period')
print(sorted_periods)
df.head()

['1998_Q1', '1998_Q2', '1998_Q3', '1998_Q4', '1999_Q1', '1999_Q2', '1999_Q3', '1999_Q4', '2000_Q1', '2000_Q2', '2000_Q3', '2000_Q4', '2001_Q1', '2001_Q2', '2001_Q3', '2001_Q4', '2002_Q1', '2002_Q2', '2002_Q3', '2002_Q4', '2003_Q1', '2003_Q2', '2003_Q3', '2003_Q4', '2004_Q1', '2004_Q2', '2004_Q3', '2004_Q4', '2005_Q1', '2005_Q2', '2005_Q3', '2005_Q4', '2006_Q1', '2006_Q2', '2006_Q3', '2006_Q4', '2007_Q1', '2007_Q2', '2007_Q3', '2007_Q4', '2008_Q1', '2008_Q2', '2008_Q3', '2008_Q4', '2009_Q1', '2009_Q2', '2009_Q3', '2009_Q4', '2010_Q1', '2010_Q2', '2010_Q3', '2010_Q4', '2011_Q1', '2011_Q2', '2011_Q3', '2011_Q4', '2012_Q1', '2012_Q2', '2012_Q3', '2012_Q4', '2013_Q1', '2013_Q2', '2013_Q3', '2013_Q4', '2014_Q1', '2014_Q2', '2014_Q3', '2014_Q4', '2015_Q1', '2015_Q2', '2015_Q3', '2015_Q4', '2016_Q1', '2016_Q2', '2016_Q3', '2016_Q4', '2017_Q1', '2017_Q2', '2017_Q3', '2017_Q4', '2018_Q1', '2018_Q2', '2018_Q3', '2018_Q4', '2019_Q1', '2019_Q2', '2019_Q3']


Unnamed: 0_level_0,document
period,Unnamed: 1_level_1
1997_Q1,Mr. Greenspan addresses some key roles of a ce...
1997_Q2,Mr. Greenspan highlights some key aspects of t...
1997_Q3,Mr. Greenspan presents the views of the Federa...
1997_Q4,Mr. Greenspan considers some of the effects of...
1998_Q1,Mr. Greenspan’s remarks to the American Econom...


In [4]:
# Create the keyword-extraction model. KeyBERT() is called without arguments,
# so the library's default configuration applies.
kw_model = KeyBERT()

In [74]:
def normalize_bigrams(original_df):
    """Merge phrases that contain the same tokens in a different order.

    Phrases such as "interest rate" and "rate interest" are treated as one
    entry: the highest-scoring spelling is kept and its score becomes the sum
    of the scores of every row sharing the same tokens.

    Parameters
    ----------
    original_df : pandas.DataFrame
        Frame with a 'word' column (whitespace-separated phrase) and a
        numeric 'score' column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and one row per distinct token
        combination. Rows are ordered by the best individual score of each
        group (the order is not recomputed after scores are summed) and keep
        their original index labels.
    """
    # Work on a sorted copy so the caller's frame is left untouched.
    ranked = original_df.sort_values(by=['score'], ascending=False)
    if ranked.empty:
        return ranked

    # Order-insensitive, hashable key. Tokens never contain whitespace, so
    # joining the sorted tokens with a space identifies the token multiset.
    # (A list-valued key is unhashable and makes drop_duplicates raise
    # TypeError on many pandas versions.)
    token_key = ranked['word'].map(lambda phrase: ' '.join(sorted(phrase.split())))

    # Sum the scores of all rows sharing a key. Each row contributes exactly
    # once, so a phrase whose tokens are identical (e.g. "bank bank") is no
    # longer counted twice.
    merged_scores = ranked.groupby(token_key.to_numpy(), sort=False)['score'].transform('sum')

    # Keep the first (highest-scoring) spelling of every key. Positional
    # masks are used so the result does not depend on index uniqueness.
    is_first = ~token_key.duplicated().to_numpy()
    merged_df = ranked[is_first].copy()
    merged_df['score'] = merged_scores.to_numpy()[is_first]
    return merged_df

# Fail fast: make sure the output folder exists before the slow extraction
# loop starts, so a missing directory cannot discard the results when
# to_csv is finally called.
output_dir = os.path.dirname(save_filepath)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

dfs = []
for period in tqdm(sorted_periods):
    # Single .loc lookup (row label, column label) instead of chained indexing.
    doc = df.loc[period, 'document']
    # Ask for the 10 best two-word phrases of this period's document.
    keywords = kw_model.extract_keywords(doc, keyphrase_ngram_range=(2, 2), top_n=10)
    one_period_df = pd.DataFrame(keywords, columns=['word', 'score'])

    # Collapse phrases that only differ in token order.
    one_period_df = normalize_bigrams(one_period_df)

    one_period_df['period'] = period
    dfs.append(one_period_df)
# NOTE(review): top_n=10 is requested above and nothing here truncates the
# result to five rows per period, although the variable and the output file
# name say "top5" - confirm which is intended.
top5_df = pd.concat(dfs)

top5_df.to_csv(save_filepath, index=False)
print('Created {}'.format(save_filepath))

100%|██████████████████████████████████████████████████████████████████████████████████| 87/87 [30:55<00:00, 21.33s/it]

Created C:\DATA\hot_topic_detection_in_central_bankers_speeches\top5_hot_topics_bigram_KeyBERT.csv





In [75]:
# Key periods: show only the rows that belong to the three selected quarters.
top5_df.loc[lambda frame: frame['period'].isin(['1998_Q2', '2000_Q1', '2007_Q2'])]

Unnamed: 0,word,score,period
0,consumer markets,0.5845,1998_Q2
1,consumers shifting,0.5727,1998_Q2
2,retailing supermarkets,0.5565,1998_Q2
3,forces consumers,0.552,1998_Q2
4,consumers evolving,0.5463,1998_Q2
5,embracing market,0.5437,1998_Q2
6,drives consumers,0.5423,1998_Q2
7,market capitalism,0.542,1998_Q2
8,markets changing,0.5401,1998_Q2
9,changes consumers,0.5365,1998_Q2
