Reference
* https://radimrehurek.com/gensim/models/ldamodel.html
* https://medium.com/analytics-vidhya/topic-modeling-using-gensim-lda-in-python-48eaa2344920

In [1]:
import os
import pandas as pd
import gensim
from pprint import pprint

import pyLDAvis
from pyLDAvis import gensim_models
pyLDAvis.enable_notebook()

In [2]:
filepath = 'data/preprocessed_speeches_1997Q1-2019Q3.csv'

In [4]:
df = pd.read_csv(filepath)
df

Unnamed: 0,tokens,period
0,discus technology u economy remark chairman go...,2000_Q1
1,give testimony over the counter derivative tes...,2000_Q1
2,present reserve s semi annual report economy m...,2000_Q1
3,focus revolution information technology implic...,2000_Q1
4,remark economic challenge facing united state ...,2000_Q1
...,...,...
486,release delivery a m edt a m cdt june opening ...,2019_Q2
487,h economic outlook monetary review speech h ch...,2019_Q2
488,h welcoming remark stress testing discussion r...,2019_Q3
489,h monetary post crisis era speech h chair gove...,2019_Q3


In [27]:
for target_period in ['1998_Q2', '2000_Q1', '2007_Q2']:
    print('==={}==='.format(target_period))
    list_of_tokens_for_all_documents = [doc.split(' ') for doc in df[df['period']==target_period]['tokens']]
    '''
    `list_of_tokens_for_all_documents` is equivalent to `gensim.test.utils.common_texts`

    [['human', 'interface', 'computer'],
     ['survey', 'user', 'computer', 'system', 'response', 'time'],
     ['eps', 'user', 'interface', 'system'],
     ['system', 'human', 'system', 'eps'],
     ['user', 'response', 'time'],
     ['trees'],
     ['graph', 'trees'],
     ['graph', 'minors', 'trees'],
     ['graph', 'minors', 'survey']]
    '''

    id2word = gensim.corpora.dictionary.Dictionary(list_of_tokens_for_all_documents)
    corpus = [id2word.doc2bow(doc) for doc in list_of_tokens_for_all_documents]

    '''
    `corpus` structure example
    Gensim creates unique id for each word in the document. Its mapping of word_id and word_frequency. 
    For example, (4,15) indicates, word_id 4 occurs 15 times in the document.

    [[(0, 1), (1, 1), (2, 1)],
     [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)],
     [(2, 1), (5, 1), (7, 1), (8, 1)],
     [(1, 1), (5, 2), (8, 1)],
     [(3, 1), (6, 1), (7, 1)],
     [(9, 1)],
     [(9, 1), (10, 1)],
     [(9, 1), (10, 1), (11, 1)],
     [(4, 1), (10, 1), (11, 1)]]
    '''

    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                               id2word=id2word,
                                               num_topics=5, 
                                               random_state=100,
                                               update_every=1,
                                               chunksize=100,
                                               passes=10,
                                               alpha='auto',
                                               per_word_topics=True)

    pprint(lda_model.print_topics())

    # pyLDAvis
    vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)

    save_filepath = './LDA_{}.html'.format(target_period)
    pyLDAvis.save_html(vis, save_filepath)
    print('Created {}'.format(save_filepath))

  and should_run_async(code)


===1998_Q2===
[(0,
  '0.018*"have" + 0.016*"is" + 0.015*"imf" + 0.010*"not" + 0.009*"crisis" + '
  '0.009*"been" + 0.008*"economy" + 0.008*"system" + 0.008*"asian" + '
  '0.007*"be"'),
 (1,
  '0.001*"have" + 0.001*"is" + 0.001*"be" + 0.001*"not" + 0.001*"are" + '
  '0.001*"system" + 0.001*"been" + 0.001*"risk" + 0.001*"economy" + '
  '0.001*"more"'),
 (2,
  '0.017*"is" + 0.013*"economy" + 0.011*"not" + 0.010*"system" + 0.010*"are" + '
  '0.008*"have" + 0.007*"be" + 0.006*"consumer" + 0.006*"state" + '
  '0.006*"planning"'),
 (3,
  '0.023*"have" + 0.016*"been" + 0.011*"growth" + 0.010*"be" + '
  '0.010*"increase" + 0.009*"economy" + 0.009*"price" + 0.009*"year" + '
  '0.009*"is" + 0.008*"not"'),
 (4,
  '0.017*"system" + 0.015*"risk" + 0.015*"be" + 0.015*"is" + 0.010*"have" + '
  '0.010*"not" + 0.009*"capital" + 0.007*"more" + 0.006*"are" + '
  '0.006*"crisis"')]


  default_term_info = default_term_info.sort_values(


Created ./LDA_1998_Q2.html
===2000_Q1===
[(0,
  '0.015*"is" + 0.013*"have" + 0.012*"be" + 0.011*"are" + 0.009*"year" + '
  '0.008*"budget" + 0.008*"not" + 0.007*"economy" + 0.006*"price" + '
  '0.006*"been"'),
 (1,
  '0.013*"is" + 0.012*"be" + 0.012*"have" + 0.012*"are" + 0.010*"technology" + '
  '0.009*"capital" + 0.009*"not" + 0.008*"growth" + 0.007*"information" + '
  '0.007*"business"'),
 (2,
  '0.002*"is" + 0.001*"are" + 0.001*"have" + 0.001*"be" + 0.001*"not" + '
  '0.001*"technology" + 0.001*"business" + 0.001*"capital" + 0.001*"growth" + '
  '0.001*"increase"'),
 (3,
  '0.001*"be" + 0.001*"is" + 0.001*"are" + 0.001*"year" + 0.001*"have" + '
  '0.001*"economy" + 0.001*"economic" + 0.001*"capital" + 0.001*"growth" + '
  '0.001*"not"'),
 (4,
  '0.013*"have" + 0.013*"worker" + 0.012*"are" + 0.012*"is" + 0.012*"business" '
  '+ 0.010*"community" + 0.007*"be" + 0.007*"not" + 0.007*"economic" + '
  '0.007*"s"')]


  default_term_info = default_term_info.sort_values(


Created ./LDA_2000_Q1.html
===2007_Q2===
[(0,
  '0.001*"have" + 0.001*"s" + 0.001*"mortgage" + 0.001*"are" + 0.001*"is" + '
  '0.001*"trade" + 0.001*"be" + 0.000*"risk" + 0.000*"borrower" + '
  '0.000*"subprime"'),
 (1,
  '0.021*"risk" + 0.013*"fund" + 0.012*"have" + 0.011*"are" + 0.010*"investor" '
  '+ 0.010*"hedge" + 0.008*"be" + 0.008*"s" + 0.008*"is" + 0.006*"regulation"'),
 (2,
  '0.012*"s" + 0.012*"i" + 0.010*"is" + 0.009*"wa" + 0.007*"not" + '
  '0.007*"today" + 0.007*"washington" + 0.007*"school" + 0.007*"robinson" + '
  '0.006*"education"'),
 (3,
  '0.027*"trade" + 0.016*"s" + 0.013*"job" + 0.012*"is" + 0.012*"u" + '
  '0.008*"are" + 0.007*"more" + 0.007*"have" + 0.007*"firm" + 0.006*"service"'),
 (4,
  '0.012*"credit" + 0.012*"have" + 0.012*"mortgage" + 0.008*"be" + '
  '0.008*"borrower" + 0.008*"lending" + 0.008*"are" + 0.008*"s" + 0.007*"cra" '
  '+ 0.007*"loan"')]


  default_term_info = default_term_info.sort_values(


Created ./LDA_2007_Q2.html
