### `pke` package: https://github.com/boudinfl/pke

```bash
pip install git+https://github.com/boudinfl/pke.git

python -m spacy download en_core_web_sm
```

### Models 
* TopicalPageRank (Sterckx et al., 2015)
    - Sterckx, Lucas, Thomas Demeester, Johannes Deleu, and Chris Develder. "Topical word importance for fast keyphrase extraction." In Proceedings of the 24th International Conference on World Wide Web, pp. 121-122. 2015.
* PositionRank (Florescu and Caragea, 2017)
    - Florescu, Corina, and Cornelia Caragea. "PositionRank: An unsupervised approach to keyphrase extraction from scholarly documents." In Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pp. 1105-1115. 2017.
* MultipartiteRank (Boudin, 2018)
    - [Unsupervised Keyphrase Extraction with Multipartite Graphs (Boudin, NAACL 2018)](https://aclanthology.org/N18-2105/)

In [1]:
import pke
import os
import pandas as pd
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Input: CSV of preprocessed speech text, one row per period.
filepath = 'data/preprocessed_sentences_for_each_period_1997Q1-2019Q3.csv'

# Output directory. Raw string: in a normal literal `\D` and `\h` are invalid
# escape sequences (a warning today, an error in a future Python version).
save_dir = r'C:\DATA\hot_topic_detection_in_central_bankers_speeches'
# Template for the per-model result file; `{}` is filled with the model name.
save_filepath_format = os.path.join(save_dir, 'top5_hot_topics_{}.csv') # model name

In [3]:
# Load the per-period documents from the CSV configured above.
df = pd.read_csv(filepath)

# Distinct period labels in ascending order, leaving out every label whose
# first four characters are '1997'.
sorted_periods = sorted({label for label in df['period'] if label[:4] != '1997'})

# Index the rows by period label so later cells can look documents up by period.
df = df.set_index('period')

print(sorted_periods)
df.head()

['1998_Q1', '1998_Q2', '1998_Q3', '1998_Q4', '1999_Q1', '1999_Q2', '1999_Q3', '1999_Q4', '2000_Q1', '2000_Q2', '2000_Q3', '2000_Q4', '2001_Q1', '2001_Q2', '2001_Q3', '2001_Q4', '2002_Q1', '2002_Q2', '2002_Q3', '2002_Q4', '2003_Q1', '2003_Q2', '2003_Q3', '2003_Q4', '2004_Q1', '2004_Q2', '2004_Q3', '2004_Q4', '2005_Q1', '2005_Q2', '2005_Q3', '2005_Q4', '2006_Q1', '2006_Q2', '2006_Q3', '2006_Q4', '2007_Q1', '2007_Q2', '2007_Q3', '2007_Q4', '2008_Q1', '2008_Q2', '2008_Q3', '2008_Q4', '2009_Q1', '2009_Q2', '2009_Q3', '2009_Q4', '2010_Q1', '2010_Q2', '2010_Q3', '2010_Q4', '2011_Q1', '2011_Q2', '2011_Q3', '2011_Q4', '2012_Q1', '2012_Q2', '2012_Q3', '2012_Q4', '2013_Q1', '2013_Q2', '2013_Q3', '2013_Q4', '2014_Q1', '2014_Q2', '2014_Q3', '2014_Q4', '2015_Q1', '2015_Q2', '2015_Q3', '2015_Q4', '2016_Q1', '2016_Q2', '2016_Q3', '2016_Q4', '2017_Q1', '2017_Q2', '2017_Q3', '2017_Q4', '2018_Q1', '2018_Q2', '2018_Q3', '2018_Q4', '2019_Q1', '2019_Q2', '2019_Q3']


Unnamed: 0_level_0,document
period,Unnamed: 1_level_1
1997_Q1,Mr. Greenspan addresses some key roles of a ce...
1997_Q2,Mr. Greenspan highlights some key aspects of t...
1997_Q3,Mr. Greenspan presents the views of the Federa...
1997_Q4,Mr. Greenspan considers some of the effects of...
1998_Q1,Mr. Greenspan’s remarks to the American Econom...


In [4]:
def get_keyphrases(extractor, doc, n=5):
    """Run the keyphrase-extraction pipeline of `extractor` on one document.

    Args:
        extractor: object exposing `load_document`, `candidate_selection`,
            `candidate_weighting` and `get_n_best` (in this notebook, a
            `pke.unsupervised` model instance).
        doc: the document passed to `load_document` as `input`.
        n: number of top-ranked keyphrases to request (default 5).

    Returns:
        The result of `extractor.get_n_best(n=n)`: (keyphrase, score) tuples.
    """
    # The document is always loaded with language='en'.
    extractor.load_document(input=doc, language='en')

    # Candidate selection runs first, then candidate weighting.
    for run_stage in (extractor.candidate_selection, extractor.candidate_weighting):
        run_stage()

    return extractor.get_n_best(n=n)  # (keyphrase, score) tuples

### TopicalPageRank (Sterckx et al., 2015)
Sterckx, Lucas, Thomas Demeester, Johannes Deleu, and Chris Develder. "Topical word importance for fast keyphrase extraction." In Proceedings of the 24th International Conference on World Wide Web, pp. 121-122. 2015.

In [11]:
# Result file for this model, followed by the model instance the next cell runs.
save_filepath = save_filepath_format.format('TopicalPageRank')
extractor = pke.unsupervised.TopicalPageRank()

In [12]:
# Top-5 keyphrases for every period, using the `extractor` set up in the previous cell.
dfs = []
# Iterate the list directly: wrapping it in enumerate() hid its length from
# tqdm (no total / ETA in the progress bar) and the counter was never used.
for period in tqdm(sorted_periods):
    # One .loc lookup (row label, column) instead of chained indexing.
    doc = df.loc[period, 'document']
    keywords = get_keyphrases(extractor, doc, n=5)

    one_period_df = pd.DataFrame(keywords, columns=['word', 'score'])
    one_period_df['period'] = period
    dfs.append(one_period_df)

# Each per-period frame keeps its own index, so index values repeat across periods.
top5_df = pd.concat(dfs)

# The index is not written to the CSV.
top5_df.to_csv(save_filepath, index=False)
print('Created {}'.format(save_filepath))

# Key periods
top5_df[(top5_df['period'].isin(['1998_Q2', '2000_Q1', '2007_Q2']))]















87it [18:54, 13.04s/it]

Created C:\DATA\hot_topic_detection_in_central_bankers_speeches\top5_hot_topics_TopicalPageRank.csv





Unnamed: 0,word,score,period
0,new high-tech international financial system,0.042367,1998_Q2
1,new international financial system,0.041111,1998_Q2
2,many financial markets,0.038286,1998_Q2
3,international financial system remarks,0.037287,1998_Q2
4,major financial market disruptions,0.0366,1998_Q2
0,many new technologies,0.024574,2000_Q1
1,new financial products,0.023078,2000_Q1
2,productive capital investments,0.022437,2000_Q1
3,innovative financial products,0.022254,2000_Q1
4,competitive market economy,0.021175,2000_Q1


### PositionRank (Florescu and Caragea, 2017)
Florescu, Corina, and Cornelia Caragea. "PositionRank: An unsupervised approach to keyphrase extraction from scholarly documents." In Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pp. 1105-1115. 2017.

In [13]:
# Result file for this model, followed by the model instance the next cell runs.
save_filepath = save_filepath_format.format('PositionRank')
extractor = pke.unsupervised.PositionRank()

In [14]:
# Top-5 keyphrases for every period, using the `extractor` set up in the previous cell.
dfs = []
# Iterate the list directly: wrapping it in enumerate() hid its length from
# tqdm (no total / ETA in the progress bar) and the counter was never used.
for period in tqdm(sorted_periods):
    # One .loc lookup (row label, column) instead of chained indexing.
    doc = df.loc[period, 'document']
    keywords = get_keyphrases(extractor, doc, n=5)

    one_period_df = pd.DataFrame(keywords, columns=['word', 'score'])
    one_period_df['period'] = period
    dfs.append(one_period_df)

# Each per-period frame keeps its own index, so index values repeat across periods.
top5_df = pd.concat(dfs)

# The index is not written to the CSV.
top5_df.to_csv(save_filepath, index=False)
print('Created {}'.format(save_filepath))

# Key periods
top5_df[(top5_df['period'].isin(['1998_Q2', '2000_Q1', '2007_Q2']))]







87it [13:45,  9.48s/it]

Created C:\DATA\hot_topic_detection_in_central_bankers_speeches\top5_hot_topics_PositionRank.csv





Unnamed: 0,word,score,period
0,mr. alan greenspan,0.069165,1998_Q2
1,mr. greenspan,0.062879,1998_Q2
2,free market systems,0.048051,1998_Q2
3,market capitalism remarks,0.046764,1998_Q2
4,mr. chairman,0.046381,1998_Q2
0,mr alan greenspan,0.074618,2000_Q1
1,mr greenspan,0.066422,2000_Q1
2,many new technologies,0.03892,2000_Q1
3,new technologies,0.033363,2000_Q1
4,us economy remarks,0.033355,2000_Q1


### MultipartiteRank (Boudin, 2018)
[Unsupervised Keyphrase Extraction with Multipartite Graphs (Boudin, NAACL 2018)](https://aclanthology.org/N18-2105/)

In [15]:
# Result file for this model, followed by the model instance the next cell runs.
save_filepath = save_filepath_format.format('MultipartiteRank')
extractor = pke.unsupervised.MultipartiteRank()

In [17]:
# Top-5 keyphrases for every period, using the `extractor` set up in the previous cell.
dfs = []
# Iterate the list directly: wrapping it in enumerate() hid its length from
# tqdm (no total / ETA in the progress bar) and the counter was never used.
for period in tqdm(sorted_periods):
    # One .loc lookup (row label, column) instead of chained indexing.
    doc = df.loc[period, 'document']
    keywords = get_keyphrases(extractor, doc, n=5)

    one_period_df = pd.DataFrame(keywords, columns=['word', 'score'])
    one_period_df['period'] = period
    dfs.append(one_period_df)

# Each per-period frame keeps its own index, so index values repeat across periods.
top5_df = pd.concat(dfs)

# The index is not written to the CSV.
top5_df.to_csv(save_filepath, index=False)
print('Created {}'.format(save_filepath))

# Key periods
top5_df[(top5_df['period'].isin(['1998_Q2', '2000_Q1', '2007_Q2']))]

3it [11:59, 239.92s/it]


Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "C:\Users\Jihye Park\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3378, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\Jihye Park\AppData\Local\Temp\ipykernel_21832\3222944803.py", line 4, in <module>
    keywords = get_keyphrases(extractor, doc, n=5)
  File "C:\Users\Jihye Park\AppData\Local\Temp\ipykernel_21832\3198141450.py", line 5, in get_keyphrases
    extractor.candidate_weighting()
  File "C:\Users\Jihye Park\anaconda3\lib\site-packages\pke\unsupervised\graph_based\multipartiterank.py", line 231, in candidate_weighting
    self.build_topic_graph()
  File "C:\Users\Jihye Park\anaconda3\lib\site-packages\pke\unsupervised\graph_based\multipartiterank.py", line 154, in build_topic_graph
    weights.append(1.0 / gap)
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\Jihye Park\anaconda3\lib\site