In [1]:
import os, copy
import pandas as pd
from collections import Counter
from tqdm import tqdm

In [2]:
# Input CSV, resolved relative to the notebook's working directory.
filepath = 'data/preprocessed_1997Q1-2019Q3.csv'

# Directory that receives every candidates_*.csv written below.
# NOTE(review): this is a machine-specific absolute path; consider making it
# configurable before sharing the notebook.
save_dir = '/media/dmlab/My Passport/DATA/hot_topic_detection_in_central_bankers_speeches'
# exist_ok=True creates the directory if needed without the check-then-create
# race of a separate os.path.exists() test.
os.makedirs(save_dir, exist_ok=True)
# Filled in later with (candidate type, score type), e.g. ('bigram', 'frequency').
save_filepath_format = os.path.join(save_dir, 'candidates_{}_{}.csv')

In [3]:
# Load the preprocessed corpus and key the rows by their 'period' label.
df = pd.read_csv(filepath).set_index('period')

# Period labels in ascending order; the look-back logic below indexes into this list.
sorted_periods = sorted(df.index)
print(sorted_periods)
df.head()

['1997_Q1', '1997_Q2', '1997_Q3', '1997_Q4', '1998_Q1', '1998_Q2', '1998_Q3', '1998_Q4', '1999_Q1', '1999_Q2', '1999_Q3', '1999_Q4', '2000_Q1', '2000_Q2', '2000_Q3', '2000_Q4', '2001_Q1', '2001_Q2', '2001_Q3', '2001_Q4', '2002_Q1', '2002_Q2', '2002_Q3', '2002_Q4', '2003_Q1', '2003_Q2', '2003_Q3', '2003_Q4', '2004_Q1', '2004_Q2', '2004_Q3', '2004_Q4', '2005_Q1', '2005_Q2', '2005_Q3', '2005_Q4', '2006_Q1', '2006_Q2', '2006_Q3', '2006_Q4', '2007_Q1', '2007_Q2', '2007_Q3', '2007_Q4', '2008_Q1', '2008_Q2', '2008_Q3', '2008_Q4', '2009_Q1', '2009_Q2', '2009_Q3', '2009_Q4', '2010_Q1', '2010_Q2', '2010_Q3', '2010_Q4', '2011_Q1', '2011_Q2', '2011_Q3', '2011_Q4', '2012_Q1', '2012_Q2', '2012_Q3', '2012_Q4', '2013_Q1', '2013_Q2', '2013_Q3', '2013_Q4', '2014_Q1', '2014_Q2', '2014_Q3', '2014_Q4', '2015_Q1', '2015_Q2', '2015_Q3', '2015_Q4', '2016_Q1', '2016_Q2', '2016_Q3', '2016_Q4', '2017_Q1', '2017_Q2', '2017_Q3', '2017_Q4', '2018_Q1', '2018_Q2', '2018_Q3', '2018_Q4', '2019_Q1', '2019_Q2', '2019_Q3']

Unnamed: 0_level_0,tokens,num_of_tokens
period,Unnamed: 1_level_1,Unnamed: 2_level_1
1997_Q1,address key role remark chairman governor u re...,16246
1997_Q2,highlight key aspect current economic situatio...,13432
1997_Q3,present view reserve semi annual humphrey hawk...,6409
1997_Q4,considers effect technological change remark c...,15038
1998_Q1,s remark american economic association america...,10804


In [4]:
def concat_values_for_reversed_order_of_key(counter):
    """Merge the counts of mirrored pair keys, in place.

    When both ``(a, b)`` and ``(b, a)`` are present, their counts are summed
    under whichever orientation has the larger count (on a tie, the
    orientation met first in iteration order wins) and the other key is
    removed. Pairs without a mirror are left untouched, and so are self-pairs
    such as ``(a, a)``: a self-pair is its own mirror, so "merging" it would
    double its count and then delete it.

    Args:
        counter: mapping keyed by 2-tuples (a ``collections.Counter`` in this
            notebook) whose values are counts. Modified in place.

    Returns:
        None.
    """
    # Snapshot the keys: entries are removed while we walk over them.
    for one, two in list(counter):
        if one == two:
            continue

        forward, backward = (one, two), (two, one)
        # Nothing to merge if either orientation is absent (never seen, or
        # already folded into its mirror earlier in this loop).
        if forward not in counter or backward not in counter:
            continue

        if counter[forward] >= counter[backward]:
            counter[forward] += counter.pop(backward)
        else:
            counter[backward] += counter.pop(forward)
            
def get_bigram_counter(tokens):
    """Count adjacent token pairs, then fold mirrored pairs together.

    Args:
        tokens: sliceable sequence of tokens (a list of strings in this notebook).

    Returns:
        A ``Counter`` keyed by ``(token, next_token)`` tuples, after
        ``concat_values_for_reversed_order_of_key`` has been applied to it.
    """
    adjacent_pairs = zip(tokens, tokens[1:])
    pair_counts = Counter()
    pair_counts.update(adjacent_pairs)
    concat_values_for_reversed_order_of_key(pair_counts)
    return pair_counts

def get_15_collocation_counter(tokens, window=15):
    """Count token pairs that co-occur within a sliding window.

    Every token is paired with each of the following ``window - 1`` tokens
    (offsets 1 .. window-1), so the default of 15 covers pairs spanning 2 to
    15 consecutive tokens. Mirrored pairs are then folded together by
    ``concat_values_for_reversed_order_of_key``.

    Args:
        tokens: sliceable sequence of tokens (a list of strings in this notebook).
        window: window size in tokens; defaults to 15, the value that was
            previously hard-coded. Values below 2 yield an empty counter.

    Returns:
        A ``Counter`` keyed by ``(earlier_token, later_token)`` tuples.
    """
    collocation_counter = Counter()
    for offset in range(1, window):
        # zip stops at the shorter sequence, so pairs never run past the end.
        collocation_counter.update(zip(tokens, tokens[offset:]))
    concat_values_for_reversed_order_of_key(collocation_counter)
    return collocation_counter

def get_previous_period(current_period, how_many_quarters):
    """Return the period label ``how_many_quarters`` positions before ``current_period``.

    Positions are taken from the module-level ``sorted_periods`` list.

    Args:
        current_period: a label present in ``sorted_periods``.
        how_many_quarters: how many positions to step back.

    Returns:
        The label found ``how_many_quarters`` entries earlier in ``sorted_periods``.

    Raises:
        Exception: if ``current_period`` is one of the first four periods.
        ValueError: if ``current_period`` is not in ``sorted_periods``, or if
            stepping back would go past the first period.
    """
    current_index = sorted_periods.index(current_period)
    if current_index < 4:
        raise Exception('Keyword extraction is only available from the 1998_Q1')
    if how_many_quarters > current_index:
        # A negative list index would silently wrap around to the END of the
        # list and hand back a future period instead of failing.
        raise ValueError(
            'Cannot go back {} quarters from {}: only {} earlier periods exist'.format(
                how_many_quarters, current_period, current_index))
    return sorted_periods[current_index - how_many_quarters]

In [5]:
# One bigram counter per period, keyed by the period label.
dictionary_value_is_bigram_counter = {
    period: get_bigram_counter(df.loc[period]['tokens'].split(' '))
    for period in tqdm(sorted_periods)
}

# Bigram & Frequency
# One (period, "token token", count) row per bigram per period.
records = [
    (period, ' '.join(tuple_key), current_freq)
    for period in tqdm(sorted_periods)
    for tuple_key, current_freq in dictionary_value_is_bigram_counter[period].items()
]

save_filepath = save_filepath_format.format('bigram', 'frequency')
candidates_df = pd.DataFrame(records, columns=['period', 'word', 'score'])
candidates_df.to_csv(save_filepath, index=False)
print('Created {}'.format(save_filepath))

# Bigram & Emergence score
# emergence = current frequency / mean over the look-back quarters of (previous frequency + 1)
N_PREVIOUS_QUARTERS = 4

records = []
for period_position, period in enumerate(tqdm(sorted_periods)):
    # get_previous_period raises for the first four periods (no full look-back
    # window). Skip them explicitly rather than wrapping the loop in a bare
    # `except`, which would also hide genuine errors.
    if period_position < N_PREVIOUS_QUARTERS:
        continue

    counter = dictionary_value_is_bigram_counter[period]
    # Resolve the look-back counters once per period, not once per bigram.
    previous_counters = [
        dictionary_value_is_bigram_counter[get_previous_period(period, i)]
        for i in range(1, N_PREVIOUS_QUARTERS + 1)
    ]

    for tuple_key, current_freq in counter.items():
        # Each period keeps whichever orientation of a pair was more frequent,
        # so an earlier quarter may hold the same pair reversed; count both.
        # The set collapses a self-pair such as ('a', 'a') to a single key.
        lookup_keys = {tuple_key, tuple_key[::-1]}
        # Counter returns 0 for unseen keys; the +1 per quarter keeps the
        # denominator positive. Divide once, after summing, to get a true mean
        # (dividing inside the loop weighted the quarters geometrically).
        denominator = sum(
            sum(previous_counter[key] for key in lookup_keys) + 1
            for previous_counter in previous_counters
        ) / N_PREVIOUS_QUARTERS
        emergence_score = current_freq / denominator

        records.append((period, ' '.join(tuple_key), emergence_score))

candidates_df = pd.DataFrame(records, columns=['period', 'word', 'score'])

save_filepath = save_filepath_format.format('bigram', 'emergence')
candidates_df.to_csv(save_filepath, index=False)
print('Created {}'.format(save_filepath))

100%|██████████| 91/91 [00:01<00:00, 69.03it/s]
100%|██████████| 91/91 [00:00<00:00, 331.73it/s]


Created /media/dmlab/My Passport/DATA/hot_topic_detection_in_central_bankers_speeches/candidates_bigram_frequency.csv


100%|██████████| 91/91 [00:06<00:00, 14.60it/s]


Created /media/dmlab/My Passport/DATA/hot_topic_detection_in_central_bankers_speeches/candidates_bigram_emergence.csv


In [6]:
# One windowed-collocation counter per period, keyed by the period label.
dictionary_value_is_collocation_counter = {
    period: get_15_collocation_counter(df.loc[period]['tokens'].split(' '))
    for period in tqdm(sorted_periods)
}

# Collocation & Frequency
# One (period, "token token", count) row per collocation per period.
records = [
    (period, ' '.join(tuple_key), current_freq)
    for period in tqdm(sorted_periods)
    for tuple_key, current_freq in dictionary_value_is_collocation_counter[period].items()
]

save_filepath = save_filepath_format.format('collocation', 'frequency')
candidates_df = pd.DataFrame(records, columns=['period', 'word', 'score'])
candidates_df.to_csv(save_filepath, index=False)
print('Created {}'.format(save_filepath))

# Collocation & Emergence score
# emergence = current frequency / mean over the look-back quarters of (previous frequency + 1)
N_PREVIOUS_QUARTERS = 4

records = []
for period_position, period in enumerate(tqdm(sorted_periods)):
    # get_previous_period raises for the first four periods (no full look-back
    # window). Skip them explicitly rather than wrapping the loop in a bare
    # `except`, which would also hide genuine errors.
    if period_position < N_PREVIOUS_QUARTERS:
        continue

    counter = dictionary_value_is_collocation_counter[period]
    # Resolve the look-back counters once per period, not once per collocation.
    previous_counters = [
        dictionary_value_is_collocation_counter[get_previous_period(period, i)]
        for i in range(1, N_PREVIOUS_QUARTERS + 1)
    ]

    for tuple_key, current_freq in counter.items():
        # Each period keeps whichever orientation of a pair was more frequent,
        # so an earlier quarter may hold the same pair reversed; count both.
        # The set collapses a self-pair such as ('a', 'a') to a single key.
        lookup_keys = {tuple_key, tuple_key[::-1]}
        # Counter returns 0 for unseen keys; the +1 per quarter keeps the
        # denominator positive. Divide once, after summing, to get a true mean
        # (dividing inside the loop weighted the quarters geometrically).
        denominator = sum(
            sum(previous_counter[key] for key in lookup_keys) + 1
            for previous_counter in previous_counters
        ) / N_PREVIOUS_QUARTERS
        emergence_score = current_freq / denominator

        records.append((period, ' '.join(tuple_key), emergence_score))

candidates_df = pd.DataFrame(records, columns=['period', 'word', 'score'])

save_filepath = save_filepath_format.format('collocation', 'emergence')
candidates_df.to_csv(save_filepath, index=False)
print('Created {}'.format(save_filepath))

100%|██████████| 91/91 [00:17<00:00,  5.24it/s]
100%|██████████| 91/91 [00:02<00:00, 30.96it/s]


Created /media/dmlab/My Passport/DATA/hot_topic_detection_in_central_bankers_speeches/candidates_collocation_frequency.csv


100%|██████████| 91/91 [01:12<00:00,  1.25it/s]


Created /media/dmlab/My Passport/DATA/hot_topic_detection_in_central_bankers_speeches/candidates_collocation_emergence.csv
