# LDA using Gensim

This notebook, goes through performing LDA on the text documents of the Climate Change PDFs using Gensim.

In this notebook, we will:
- identify the files and come up with a file list
- load the stopwords
- functionally get the documents
- normalize the documents
- create the dictionary and corpus
- create id2word
- initialize and train the model
- print out top topics
- determine the coherence score and perplexity
- create and save a visualization of topics using pyLDAvis
- test how an external document fits into the model

In [1]:
#imports
import codecs
import pandas as pd
import numpy as np
import os
import json
import time
import csv
import sklearn
import nltk
nltk.download('all')
import unicodedata
from nltk.corpus import PlaintextCorpusReader
from nltk.tokenize import WordPunctTokenizer
from nltk import pos_tag, sent_tokenize, wordpunct_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import CountVectorizer
from gensim.corpora import Dictionary
from sklearn.base import BaseEstimator, TransformerMixin
from pprint import pprint
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim.models.callbacks import PerplexityMetric
from gensim.models.callbacks import CoherenceMetric
from gensim.models.coherencemodel import CoherenceModel
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
import pyLDAvis.gensim
import pyLDAvis
from gensim.models import TfidfModel

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     /Users/rahimjiwa/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /Users/rahimjiwa/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /Users/rahimjiwa/nltk_data...
[nltk_data]    |   Package biocreative_ppi is already up-to-date!
[nltk_data]    | Downloading package brown to
[nltk_data]    |     /Users/rahimjiwa/nltk_data...
[nltk_data]    |   Package brown is already up-to-date!
[nltk_data]    | Downloading package brown_tei to
[nltk_data]    |     /Users/rahimjiwa/nltk_data...
[nltk_data]    |   Package brown_tei is already up-to-date!
[nltk_data]    | Downloading package cess_cat to
[nltk_data]    |     /Users/rahimjiwa/nltk_data...
[nltk_data]    |   Package cess_cat is already up-

[nltk_data]    |   Package product_reviews_2 is already up-to-date!
[nltk_data]    | Downloading package pros_cons to
[nltk_data]    |     /Users/rahimjiwa/nltk_data...
[nltk_data]    |   Package pros_cons is already up-to-date!
[nltk_data]    | Downloading package qc to
[nltk_data]    |     /Users/rahimjiwa/nltk_data...
[nltk_data]    |   Package qc is already up-to-date!
[nltk_data]    | Downloading package reuters to
[nltk_data]    |     /Users/rahimjiwa/nltk_data...
[nltk_data]    |   Package reuters is already up-to-date!
[nltk_data]    | Downloading package rte to
[nltk_data]    |     /Users/rahimjiwa/nltk_data...
[nltk_data]    |   Package rte is already up-to-date!
[nltk_data]    | Downloading package semcor to
[nltk_data]    |     /Users/rahimjiwa/nltk_data...
[nltk_data]    |   Package semcor is already up-to-date!
[nltk_data]    | Downloading package senseval to
[nltk_data]    |     /Users/rahimjiwa/nltk_data...
[nltk_data]    |   Package senseval is already up-to-date!
[nlt

In [2]:
# let's start by running over the folder where our files are, we can use the plaintextcorpusreader to obtain the fileids
reader = PlaintextCorpusReader('tika_alldocs','.*txt')
# trying with added new docs

In [3]:
# create a list of fileids
file_list = reader.fileids()
file_list

['104890-2017_Climate_Change_Strategy.txt',
 '20170125-en.txt',
 '2017BernardSoubryPolicyBrief.txt',
 '81363.txt',
 "A Practitioners Guide to ClimateChange Adaptation in Ontario's Ecosystems Ver 1 2011.txt",
 'A_Canadian_Opportunity_-_Tackling_climate_change_by_switching_to_clean_power.txt',
 'A_Residential_Guide_to_Flood_Prevention_and_Recovery.txt',
 'Adapting to Climate Change in Coastal Communities In Canada White Paper.txt',
 'Adapting_to_Climate_Change_a_Risk_Based_Guide_for_Local_Governments_EN.txt',
 'Adapting_to_a_Changing_Climate_in_NS.txt',
 'Ahead-of-the-Storm-1.txt',
 'AtlanticRegionAdaptationScienceActivitiesReport.txt',
 'BC-Agriculture-Climate-Change-Action-Plan.txt',
 'Bruce_(2006)_AdaptingtoClimateChange_ARisk-basedGuideforONMunicipalities.txt',
 'CAT_research_plan_2015.txt',
 'CCCR_FULLREPORT-EN-FINAL.txt',
 'ClimatRisk-E-ACCESSIBLE.txt',
 'Climate Change Adaptation - A Priorities Plan for Canada (2012).txt',
 'Climate Change and Energy Sector.txt',
 'Climatecommunic

In [4]:
# load the stopwords text file
with open('stopwords.txt','r') as f:
    # create a list for stopwords
    stopwords = []
    # itearate through the text files, replacing any new line unicode characters
    for i in list(f):
        temp = str(i).replace('\n','')
        # append the stopwords to the list
        stopwords.append(temp)

In [5]:
print(stopwords)

['Stopwords and Punctuation:', '.', ',', 'and', 'the', 'of', 'to', '-', '/', 'in', '(', ')', 'a', 'for', ':', 'is', 'on', 'or', 'that', 'are', 'be', ').', '1', 'as', 'with', '://', '.,', 'from', '2', 'by', 'www', 'http', 'ca', '[', ']', '...', "'", '\\uf0b7', ';', 'at', ').', 'pdf', 'climate', 'change', 'canada', 'en', '):', 'so', 'html', 'do', '.;', 'com', 'canadian', 'impacts', 'adaptation']


In [6]:
# change directory to directory with files
os.chdir('/Users/rahimjiwa/Documents/DataScience/UofT3666_AppliedNLP/Final_Testings/tika_alldocs/')
# trying with added new docs

In [7]:
# create a list for documents
docs = []
# iterate through file list
for i in file_list:
    # load the documents
    text = PlaintextCorpusReader(root=os.getcwd(),fileids=i)
    # create a list of the words in the documents
    try:
        i_doc = list(text.words())
        # create a new list for document
        new_i_doc = []
        # for each word in the documents
        for j in i_doc:
            #check if the length of the word is greater than 2 and that it is not a stopword
            if len(j) > 2 and j.lower() not in stopwords:
                # append to word to the new list of words
                new_i_doc.append(j)
        # append list of words to documents
        docs.append(new_i_doc)
        # print the file name
        print(i)
    except ValueError:
        pass

104890-2017_Climate_Change_Strategy.txt
20170125-en.txt
2017BernardSoubryPolicyBrief.txt
81363.txt
A Practitioners Guide to ClimateChange Adaptation in Ontario's Ecosystems Ver 1 2011.txt
A_Canadian_Opportunity_-_Tackling_climate_change_by_switching_to_clean_power.txt
A_Residential_Guide_to_Flood_Prevention_and_Recovery.txt
Adapting to Climate Change in Coastal Communities In Canada White Paper.txt
Adapting_to_Climate_Change_a_Risk_Based_Guide_for_Local_Governments_EN.txt
Adapting_to_a_Changing_Climate_in_NS.txt
Ahead-of-the-Storm-1.txt
AtlanticRegionAdaptationScienceActivitiesReport.txt
BC-Agriculture-Climate-Change-Action-Plan.txt
CAT_research_plan_2015.txt
CCCR_FULLREPORT-EN-FINAL.txt
ClimatRisk-E-ACCESSIBLE.txt
Climate Change and Energy Sector.txt
ClimatecommunicationsintheNetherlands.txt
Coastal Flooding Ferryland, NL.txt
CoucilReport_july4_EN_Web.txt
En56-226-2008-eng.txt
Environment-and-Renewable-Energy-Workshop-Presentation-Abstracts-final.txt
FBC_WaterGuide_FINAL.txt
FloodReco

In [8]:
# take a look at the output of the documents
docs

[['Saskatchewan',
  'Prairie',
  'Resilience',
  'Made',
  'Saskatchewan',
  'Strategy',
  'Saskatchewan',
  'Saskatchewan',
  'people',
  'pragmatic',
  'resourceful',
  'innovative',
  'Throughout',
  'our',
  'history',
  'have',
  'faced',
  'complex',
  'challenging',
  'problems',
  'imposed',
  'geography',
  'Our',
  'population',
  'spread',
  'over',
  'vast',
  'land',
  'has',
  'taught',
  'self',
  'reliance',
  'resilience',
  'learned',
  'when',
  'faced',
  'challenge',
  'solve',
  'Today',
  'face',
  'global',
  'challenge',
  'once',
  'again',
  'our',
  'province',
  'motivated',
  'develop',
  'effective',
  'response',
  'Our',
  'industries',
  'heavily',
  'dependent',
  'fossil',
  'fuels',
  'produce',
  'energy',
  'food',
  'fertilizer',
  'products',
  'commodities',
  'needed',
  'around',
  'world',
  'come',
  'made',
  'Saskatchewan',
  'solutions',
  'encourage',
  'action',
  'meet',
  'challenges',
  'posed',
  'When',
  'faced',
  'complex',
  '

In [9]:
# check how many documents are being used
len(docs)

77

In [10]:
# define a normalizer class
class TextNormalizer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
    
    # define function for punctuation marks
    def is_punct(self, token):
        return all(unicodedata.category(char).startswith('P') for char in token)
    
    def normalize(self, document):
        return [
            # here you could return a lemmatized token instead
            self.lemmatize(token.lower())
            for sentence in document
            for token in sentence
            # check if a punctuation mark
            if not self.is_punct(token)
        ]
    
    # create a lemmatizer function
    def lemmatize(self, token):
        return self.lemmatizer.lemmatize(token)
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, documents):
        return [''.join(self.normalize(doc)) for doc in documents]

In [11]:
# initialize normalizer class
norm = TextNormalizer()

In [12]:
# create a list of the normalized documents
docs_normalized = []
# iterate through each document and apply the normalizer
for i in docs:
    docs_normalized.append(norm.fit_transform(i))

# take a look at the normalized documents
print(docs_normalized[:100])

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [13]:
# create a dictionary of the documents.
dictionary = Dictionary(docs_normalized)

# filter out words present in less than 3 documents or in more than 90% of the documents.
dictionary.filter_extremes(no_below=3, no_above=0.9)

2019-12-16 16:11:42,267 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2019-12-16 16:11:43,120 : INFO : built Dictionary(39554 unique tokens: ['000', '13462', '1980s', '1990', '1e13']...) from 77 documents (total 1135688 corpus positions)
2019-12-16 16:11:43,227 : INFO : discarding 27058 tokens: [('13462', 1), ('1e13', 1), ('5mt', 1), ('9485', 1), ('about', 74), ('all', 73), ('also', 73), ('areas', 71), ('backstop', 2), ('been', 70)]...
2019-12-16 16:11:43,228 : INFO : keeping 12496 tokens which were in no less than 3 and no more than 69 (=90.0%) documents
2019-12-16 16:11:43,258 : INFO : resulting dictionary: Dictionary(12496 unique tokens: ['000', '1980s', '1990', '2005', '2008']...)


In [14]:
# create a bow representation of the documents.
corpus = [dictionary.doc2bow(doc) for doc in docs_normalized]

In [15]:
# check the number of tokens and the number of documents
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

Number of unique tokens: 12496
Number of documents: 77


In [16]:
%%time
# index word dictionary
temp = dictionary[0]  
id2word = dictionary.id2token

top_terms = [5,10,15,20]
number_topics = [2,3,4,5,6,7,10]

for i in number_topics:
    for j in top_terms:
    # create the model
        model = LdaModel(
            corpus=corpus,
            id2word=id2word,
            alpha='auto',
            eta='auto',
            num_topics=i,
            eval_every=1,
            random_state=42
        )
        # find the top 10 words for each topic
        top_topics = model.top_topics(corpus, topn=j)
        # print the top 3 topics, and the top 10 words within those topics
        # also includes the topics coherence scores
        print('Number of topics: ' + str(i) + ' Number of Terms: ' + str(j))
        pprint(top_topics)
        # read a new file not trained to the model
        new_doc = PlaintextCorpusReader(root='/Users/rahimjiwa/Documents/DataScience/UofT3666_AppliedNLP/Final_Testings',fileids='random_pdf.txt')
        # get a list of words
        new_doc = list(new_doc.words())
        # create a bow representations of the new documents
        new_doc_bow = dictionary.doc2bow(new_doc)
        # get the document distributions
        print('Random Document Distribution')
        print(model.get_document_topics(new_doc_bow))
        # get the coherence score
        coherence_model = CoherenceModel(model=model, corpus=corpus, coherence='u_mass')
        print("Coherence: " + str(coherence_model.get_coherence()))
        # get the perplexity score
        print("Perplexity: " + str(model.log_perplexity(corpus)))

2019-12-16 16:11:43,885 : INFO : using autotuned alpha, starting with [0.5, 0.5]
2019-12-16 16:11:43,897 : INFO : using serial LDA version on this node
2019-12-16 16:11:43,928 : INFO : running online (single-pass) LDA training, 2 topics, 1 passes over the supplied corpus of 77 documents, updating model once every 77 documents, evaluating perplexity every 77 documents, iterating 50x with a convergence threshold of 0.001000
2019-12-16 16:11:45,681 : INFO : -9.811 per-word bound, 898.2 perplexity estimate based on a held-out corpus of 77 documents with 959067 words
2019-12-16 16:11:45,682 : INFO : PROGRESS: pass 0, at document #77/77
2019-12-16 16:11:45,991 : INFO : optimized alpha [0.869983, 0.91217685]
2019-12-16 16:11:45,996 : INFO : topic #0 (0.870): 0.004*"planning" + 0.004*"land" + 0.004*"energy" + 0.004*"development" + 0.003*"plan" + 0.003*"management" + 0.003*"level" + 0.003*"sea" + 0.003*"community" + 0.003*"coastal"
2019-12-16 16:11:45,997 : INFO : topic #1 (0.912): 0.006*"energ

Number of topics: 2 Number of Terms: 5
[([(0.004147297, 'planning'),
   (0.003976375, 'land'),
   (0.003585681, 'energy'),
   (0.003559576, 'development'),
   (0.0034465855, 'plan')],
  -0.09536137343039641),
 ([(0.005554169, 'energy'),
   (0.0040934845, 'changes'),
   (0.0038522764, 'report'),
   (0.0037370163, 'emissions'),
   (0.0037185382, 'sea')],
  -0.19855591246416965)]
Random Document Distribution
[(0, 0.10143141), (1, 0.8985686)]
Coherence: -0.14222640404144926


2019-12-16 16:11:47,390 : INFO : -8.066 per-word bound, 267.9 perplexity estimate based on a held-out corpus of 77 documents with 959067 words
2019-12-16 16:11:47,391 : INFO : using autotuned alpha, starting with [0.5, 0.5]
2019-12-16 16:11:47,398 : INFO : using serial LDA version on this node
2019-12-16 16:11:47,418 : INFO : running online (single-pass) LDA training, 2 topics, 1 passes over the supplied corpus of 77 documents, updating model once every 77 documents, evaluating perplexity every 77 documents, iterating 50x with a convergence threshold of 0.001000


Perplexity: -8.065665893539961


2019-12-16 16:11:48,792 : INFO : -9.811 per-word bound, 898.2 perplexity estimate based on a held-out corpus of 77 documents with 959067 words
2019-12-16 16:11:48,794 : INFO : PROGRESS: pass 0, at document #77/77
2019-12-16 16:11:49,093 : INFO : optimized alpha [0.869983, 0.91217685]
2019-12-16 16:11:49,097 : INFO : topic #0 (0.870): 0.004*"planning" + 0.004*"land" + 0.004*"energy" + 0.004*"development" + 0.003*"plan" + 0.003*"management" + 0.003*"level" + 0.003*"sea" + 0.003*"community" + 0.003*"coastal"
2019-12-16 16:11:49,099 : INFO : topic #1 (0.912): 0.006*"energy" + 0.004*"changes" + 0.004*"report" + 0.004*"emissions" + 0.004*"sea" + 0.003*"future" + 0.003*"management" + 0.003*"level" + 0.003*"development" + 0.003*"land"
2019-12-16 16:11:49,100 : INFO : topic diff=1.522116, rho=1.000000


Number of topics: 2 Number of Terms: 10
[([(0.005554169, 'energy'),
   (0.0040934845, 'changes'),
   (0.0038522764, 'report'),
   (0.0037370163, 'emissions'),
   (0.0037185382, 'sea'),
   (0.0033258446, 'future'),
   (0.0033047143, 'management'),
   (0.003303176, 'level'),
   (0.0032368281, 'development'),
   (0.0031536152, 'land')],
  -0.08980969599230433),
 ([(0.004147297, 'planning'),
   (0.003976375, 'land'),
   (0.003585681, 'energy'),
   (0.003559576, 'development'),
   (0.0034465855, 'plan'),
   (0.0033540376, 'management'),
   (0.003252176, 'level'),
   (0.003134172, 'sea'),
   (0.0031166258, 'community'),
   (0.0030462486, 'coastal')],
  -0.17480850678403198)]
Random Document Distribution
[(0, 0.10143141), (1, 0.8985686)]
Coherence: -0.14222640404144926


2019-12-16 16:11:50,189 : INFO : -8.066 per-word bound, 267.9 perplexity estimate based on a held-out corpus of 77 documents with 959067 words
2019-12-16 16:11:50,192 : INFO : using autotuned alpha, starting with [0.5, 0.5]
2019-12-16 16:11:50,197 : INFO : using serial LDA version on this node
2019-12-16 16:11:50,205 : INFO : running online (single-pass) LDA training, 2 topics, 1 passes over the supplied corpus of 77 documents, updating model once every 77 documents, evaluating perplexity every 77 documents, iterating 50x with a convergence threshold of 0.001000


Perplexity: -8.065665893539961


2019-12-16 16:11:51,084 : INFO : -9.811 per-word bound, 898.2 perplexity estimate based on a held-out corpus of 77 documents with 959067 words
2019-12-16 16:11:51,085 : INFO : PROGRESS: pass 0, at document #77/77
2019-12-16 16:11:51,357 : INFO : optimized alpha [0.869983, 0.91217685]
2019-12-16 16:11:51,362 : INFO : topic #0 (0.870): 0.004*"planning" + 0.004*"land" + 0.004*"energy" + 0.004*"development" + 0.003*"plan" + 0.003*"management" + 0.003*"level" + 0.003*"sea" + 0.003*"community" + 0.003*"coastal"
2019-12-16 16:11:51,365 : INFO : topic #1 (0.912): 0.006*"energy" + 0.004*"changes" + 0.004*"report" + 0.004*"emissions" + 0.004*"sea" + 0.003*"future" + 0.003*"management" + 0.003*"level" + 0.003*"development" + 0.003*"land"
2019-12-16 16:11:51,366 : INFO : topic diff=1.522116, rho=1.000000


Number of topics: 2 Number of Terms: 15
[([(0.005554169, 'energy'),
   (0.0040934845, 'changes'),
   (0.0038522764, 'report'),
   (0.0037370163, 'emissions'),
   (0.0037185382, 'sea'),
   (0.0033258446, 'future'),
   (0.0033047143, 'management'),
   (0.003303176, 'level'),
   (0.0032368281, 'development'),
   (0.0031536152, 'land'),
   (0.0031317167, 'coastal'),
   (0.0030861602, 'planning'),
   (0.0029496849, 'research'),
   (0.0028741304, 'government'),
   (0.0027247134, 'infrastructure')],
  -0.11813570047859617),
 ([(0.004147297, 'planning'),
   (0.003976375, 'land'),
   (0.003585681, 'energy'),
   (0.003559576, 'development'),
   (0.0034465855, 'plan'),
   (0.0033540376, 'management'),
   (0.003252176, 'level'),
   (0.003134172, 'sea'),
   (0.0031166258, 'community'),
   (0.0030462486, 'coastal'),
   (0.0029501598, 'transportation'),
   (0.0029324335, 'report'),
   (0.0029297755, 'government'),
   (0.0028913422, 'risks'),
   (0.0028579743, 'emissions')],
  -0.177652462869605)]
Ran

2019-12-16 16:11:52,488 : INFO : -8.066 per-word bound, 267.9 perplexity estimate based on a held-out corpus of 77 documents with 959067 words
2019-12-16 16:11:52,489 : INFO : using autotuned alpha, starting with [0.5, 0.5]
2019-12-16 16:11:52,494 : INFO : using serial LDA version on this node
2019-12-16 16:11:52,502 : INFO : running online (single-pass) LDA training, 2 topics, 1 passes over the supplied corpus of 77 documents, updating model once every 77 documents, evaluating perplexity every 77 documents, iterating 50x with a convergence threshold of 0.001000


Perplexity: -8.065665893539961


2019-12-16 16:11:53,446 : INFO : -9.811 per-word bound, 898.2 perplexity estimate based on a held-out corpus of 77 documents with 959067 words
2019-12-16 16:11:53,447 : INFO : PROGRESS: pass 0, at document #77/77
2019-12-16 16:11:53,692 : INFO : optimized alpha [0.869983, 0.91217685]
2019-12-16 16:11:53,697 : INFO : topic #0 (0.870): 0.004*"planning" + 0.004*"land" + 0.004*"energy" + 0.004*"development" + 0.003*"plan" + 0.003*"management" + 0.003*"level" + 0.003*"sea" + 0.003*"community" + 0.003*"coastal"
2019-12-16 16:11:53,698 : INFO : topic #1 (0.912): 0.006*"energy" + 0.004*"changes" + 0.004*"report" + 0.004*"emissions" + 0.004*"sea" + 0.003*"future" + 0.003*"management" + 0.003*"level" + 0.003*"development" + 0.003*"land"
2019-12-16 16:11:53,699 : INFO : topic diff=1.522116, rho=1.000000


Number of topics: 2 Number of Terms: 20
[([(0.005554169, 'energy'),
   (0.0040934845, 'changes'),
   (0.0038522764, 'report'),
   (0.0037370163, 'emissions'),
   (0.0037185382, 'sea'),
   (0.0033258446, 'future'),
   (0.0033047143, 'management'),
   (0.003303176, 'level'),
   (0.0032368281, 'development'),
   (0.0031536152, 'land'),
   (0.0031317167, 'coastal'),
   (0.0030861602, 'planning'),
   (0.0029496849, 'research'),
   (0.0028741304, 'government'),
   (0.0027247134, 'infrastructure'),
   (0.0026502262, 'flood'),
   (0.0026377656, 'information'),
   (0.002567921, 'sector'),
   (0.0025541617, 'global'),
   (0.0024220452, 'environment')],
  -0.13747426291003004),
 ([(0.004147297, 'planning'),
   (0.003976375, 'land'),
   (0.003585681, 'energy'),
   (0.003559576, 'development'),
   (0.0034465855, 'plan'),
   (0.0033540376, 'management'),
   (0.003252176, 'level'),
   (0.003134172, 'sea'),
   (0.0031166258, 'community'),
   (0.0030462486, 'coastal'),
   (0.0029501598, 'transportation

2019-12-16 16:11:54,840 : INFO : -8.066 per-word bound, 267.9 perplexity estimate based on a held-out corpus of 77 documents with 959067 words
2019-12-16 16:11:54,841 : INFO : using autotuned alpha, starting with [0.33333334, 0.33333334, 0.33333334]
2019-12-16 16:11:54,846 : INFO : using serial LDA version on this node
2019-12-16 16:11:54,855 : INFO : running online (single-pass) LDA training, 3 topics, 1 passes over the supplied corpus of 77 documents, updating model once every 77 documents, evaluating perplexity every 77 documents, iterating 50x with a convergence threshold of 0.001000


Perplexity: -8.065665893539961


2019-12-16 16:11:55,849 : INFO : -9.871 per-word bound, 936.2 perplexity estimate based on a held-out corpus of 77 documents with 959067 words
2019-12-16 16:11:55,850 : INFO : PROGRESS: pass 0, at document #77/77
2019-12-16 16:11:56,146 : INFO : optimized alpha [0.5889133, 0.6060606, 0.61193645]
2019-12-16 16:11:56,151 : INFO : topic #0 (0.589): 0.004*"land" + 0.004*"planning" + 0.004*"development" + 0.004*"energy" + 0.003*"report" + 0.003*"coastal" + 0.003*"risks" + 0.003*"emissions" + 0.003*"plan" + 0.003*"sea"
2019-12-16 16:11:56,153 : INFO : topic #1 (0.606): 0.006*"energy" + 0.004*"report" + 0.004*"emissions" + 0.004*"changes" + 0.004*"sea" + 0.004*"coastal" + 0.003*"development" + 0.003*"land" + 0.003*"future" + 0.003*"research"
2019-12-16 16:11:56,155 : INFO : topic #2 (0.612): 0.005*"energy" + 0.005*"management" + 0.004*"planning" + 0.004*"level" + 0.004*"changes" + 0.004*"sea" + 0.003*"infrastructure" + 0.003*"future" + 0.003*"land" + 0.003*"government"
2019-12-16 16:11:56,157

Number of topics: 3 Number of Terms: 5
[([(0.004631676, 'energy'),
   (0.0045930427, 'management'),
   (0.0039982614, 'planning'),
   (0.0039370125, 'level'),
   (0.003597411, 'changes')],
  -0.04898606535110876),
 ([(0.0041694823, 'land'),
   (0.0039146347, 'planning'),
   (0.003820342, 'development'),
   (0.0035005992, 'energy'),
   (0.0034255798, 'report')],
  -0.11699232978137983),
 ([(0.005660018, 'energy'),
   (0.0044247587, 'report'),
   (0.004079819, 'emissions'),
   (0.0040084627, 'changes'),
   (0.0036828027, 'sea')],
  -0.17502112247940768)]
Random Document Distribution
[(0, 0.04762689), (1, 0.6231852), (2, 0.32918796)]
Coherence: -0.13066012318477496


2019-12-16 16:11:57,059 : INFO : -8.077 per-word bound, 270.1 perplexity estimate based on a held-out corpus of 77 documents with 959067 words
2019-12-16 16:11:57,060 : INFO : using autotuned alpha, starting with [0.33333334, 0.33333334, 0.33333334]
2019-12-16 16:11:57,066 : INFO : using serial LDA version on this node
2019-12-16 16:11:57,073 : INFO : running online (single-pass) LDA training, 3 topics, 1 passes over the supplied corpus of 77 documents, updating model once every 77 documents, evaluating perplexity every 77 documents, iterating 50x with a convergence threshold of 0.001000


Perplexity: -8.077154265521893


2019-12-16 16:11:57,967 : INFO : -9.871 per-word bound, 936.2 perplexity estimate based on a held-out corpus of 77 documents with 959067 words
2019-12-16 16:11:57,967 : INFO : PROGRESS: pass 0, at document #77/77
2019-12-16 16:11:58,262 : INFO : optimized alpha [0.5889133, 0.6060606, 0.61193645]
2019-12-16 16:11:58,267 : INFO : topic #0 (0.589): 0.004*"land" + 0.004*"planning" + 0.004*"development" + 0.004*"energy" + 0.003*"report" + 0.003*"coastal" + 0.003*"risks" + 0.003*"emissions" + 0.003*"plan" + 0.003*"sea"
2019-12-16 16:11:58,270 : INFO : topic #1 (0.606): 0.006*"energy" + 0.004*"report" + 0.004*"emissions" + 0.004*"changes" + 0.004*"sea" + 0.004*"coastal" + 0.003*"development" + 0.003*"land" + 0.003*"future" + 0.003*"research"
2019-12-16 16:11:58,272 : INFO : topic #2 (0.612): 0.005*"energy" + 0.005*"management" + 0.004*"planning" + 0.004*"level" + 0.004*"changes" + 0.004*"sea" + 0.003*"infrastructure" + 0.003*"future" + 0.003*"land" + 0.003*"government"
2019-12-16 16:11:58,273

Number of topics: 3 Number of Terms: 10
[([(0.004631676, 'energy'),
   (0.0045930427, 'management'),
   (0.0039982614, 'planning'),
   (0.0039370125, 'level'),
   (0.003597411, 'changes'),
   (0.0035456615, 'sea'),
   (0.0033018345, 'infrastructure'),
   (0.0031888986, 'future'),
   (0.003135145, 'land'),
   (0.0029807717, 'government')],
  -0.11312160203614283),
 ([(0.005660018, 'energy'),
   (0.0044247587, 'report'),
   (0.004079819, 'emissions'),
   (0.0040084627, 'changes'),
   (0.0036828027, 'sea'),
   (0.0036512404, 'coastal'),
   (0.003472614, 'development'),
   (0.003346625, 'land'),
   (0.003226643, 'future'),
   (0.0030616391, 'research')],
  -0.12724002054886982),
 ([(0.0041694823, 'land'),
   (0.0039146347, 'planning'),
   (0.003820342, 'development'),
   (0.0035005992, 'energy'),
   (0.0034255798, 'report'),
   (0.003383334, 'coastal'),
   (0.0032530655, 'risks'),
   (0.0031254895, 'emissions'),
   (0.0030975, 'plan'),
   (0.0030801585, 'sea')],
  -0.1981268823302916)]
Ran

2019-12-16 16:11:59,475 : INFO : -8.077 per-word bound, 270.1 perplexity estimate based on a held-out corpus of 77 documents with 959067 words
2019-12-16 16:11:59,477 : INFO : using autotuned alpha, starting with [0.33333334, 0.33333334, 0.33333334]
2019-12-16 16:11:59,485 : INFO : using serial LDA version on this node
2019-12-16 16:11:59,502 : INFO : running online (single-pass) LDA training, 3 topics, 1 passes over the supplied corpus of 77 documents, updating model once every 77 documents, evaluating perplexity every 77 documents, iterating 50x with a convergence threshold of 0.001000


Perplexity: -8.077154265521893


2019-12-16 16:12:01,013 : INFO : -9.871 per-word bound, 936.2 perplexity estimate based on a held-out corpus of 77 documents with 959067 words
2019-12-16 16:12:01,014 : INFO : PROGRESS: pass 0, at document #77/77
2019-12-16 16:12:01,424 : INFO : optimized alpha [0.5889133, 0.6060606, 0.61193645]
2019-12-16 16:12:01,429 : INFO : topic #0 (0.589): 0.004*"land" + 0.004*"planning" + 0.004*"development" + 0.004*"energy" + 0.003*"report" + 0.003*"coastal" + 0.003*"risks" + 0.003*"emissions" + 0.003*"plan" + 0.003*"sea"
2019-12-16 16:12:01,431 : INFO : topic #1 (0.606): 0.006*"energy" + 0.004*"report" + 0.004*"emissions" + 0.004*"changes" + 0.004*"sea" + 0.004*"coastal" + 0.003*"development" + 0.003*"land" + 0.003*"future" + 0.003*"research"
2019-12-16 16:12:01,432 : INFO : topic #2 (0.612): 0.005*"energy" + 0.005*"management" + 0.004*"planning" + 0.004*"level" + 0.004*"changes" + 0.004*"sea" + 0.003*"infrastructure" + 0.003*"future" + 0.003*"land" + 0.003*"government"
2019-12-16 16:12:01,435

Number of topics: 3 Number of Terms: 15
[([(0.005660018, 'energy'),
   (0.0044247587, 'report'),
   (0.004079819, 'emissions'),
   (0.0040084627, 'changes'),
   (0.0036828027, 'sea'),
   (0.0036512404, 'coastal'),
   (0.003472614, 'development'),
   (0.003346625, 'land'),
   (0.003226643, 'future'),
   (0.0030616391, 'research'),
   (0.002999301, 'level'),
   (0.002961773, 'global'),
   (0.0028861132, 'sector'),
   (0.0028813006, 'planning'),
   (0.002793181, 'government')],
  -0.11680503740252812),
 ([(0.004631676, 'energy'),
   (0.0045930427, 'management'),
   (0.0039982614, 'planning'),
   (0.0039370125, 'level'),
   (0.003597411, 'changes'),
   (0.0035456615, 'sea'),
   (0.0033018345, 'infrastructure'),
   (0.0031888986, 'future'),
   (0.003135145, 'land'),
   (0.0029807717, 'government'),
   (0.0028847293, 'development'),
   (0.0028840324, 'plan'),
   (0.0027220563, 'emissions'),
   (0.0027190235, 'information'),
   (0.0026366594, 'transportation')],
  -0.1372296712388793),
 ([(0.

2019-12-16 16:12:02,991 : INFO : -8.077 per-word bound, 270.1 perplexity estimate based on a held-out corpus of 77 documents with 959067 words
2019-12-16 16:12:02,993 : INFO : using autotuned alpha, starting with [0.33333334, 0.33333334, 0.33333334]
2019-12-16 16:12:02,996 : INFO : using serial LDA version on this node
2019-12-16 16:12:03,014 : INFO : running online (single-pass) LDA training, 3 topics, 1 passes over the supplied corpus of 77 documents, updating model once every 77 documents, evaluating perplexity every 77 documents, iterating 50x with a convergence threshold of 0.001000


Perplexity: -8.077154265521893


2019-12-16 16:12:04,175 : INFO : -9.871 per-word bound, 936.2 perplexity estimate based on a held-out corpus of 77 documents with 959067 words
2019-12-16 16:12:04,176 : INFO : PROGRESS: pass 0, at document #77/77
2019-12-16 16:12:04,465 : INFO : optimized alpha [0.5889133, 0.6060606, 0.61193645]
2019-12-16 16:12:04,472 : INFO : topic #0 (0.589): 0.004*"land" + 0.004*"planning" + 0.004*"development" + 0.004*"energy" + 0.003*"report" + 0.003*"coastal" + 0.003*"risks" + 0.003*"emissions" + 0.003*"plan" + 0.003*"sea"
2019-12-16 16:12:04,474 : INFO : topic #1 (0.606): 0.006*"energy" + 0.004*"report" + 0.004*"emissions" + 0.004*"changes" + 0.004*"sea" + 0.004*"coastal" + 0.003*"development" + 0.003*"land" + 0.003*"future" + 0.003*"research"
2019-12-16 16:12:04,479 : INFO : topic #2 (0.612): 0.005*"energy" + 0.005*"management" + 0.004*"planning" + 0.004*"level" + 0.004*"changes" + 0.004*"sea" + 0.003*"infrastructure" + 0.003*"future" + 0.003*"land" + 0.003*"government"
2019-12-16 16:12:04,481

Number of topics: 3 Number of Terms: 20
[([(0.005660018, 'energy'),
   (0.0044247587, 'report'),
   (0.004079819, 'emissions'),
   (0.0040084627, 'changes'),
   (0.0036828027, 'sea'),
   (0.0036512404, 'coastal'),
   (0.003472614, 'development'),
   (0.003346625, 'land'),
   (0.003226643, 'future'),
   (0.0030616391, 'research'),
   (0.002999301, 'level'),
   (0.002961773, 'global'),
   (0.0028861132, 'sector'),
   (0.0028813006, 'planning'),
   (0.002793181, 'government'),
   (0.0027378316, 'flood'),
   (0.002682801, 'management'),
   (0.0025248579, 'information'),
   (0.0024675464, 'risks'),
   (0.0024586075, 'environment')],
  -0.12195427385033793),
 ([(0.0041694823, 'land'),
   (0.0039146347, 'planning'),
   (0.003820342, 'development'),
   (0.0035005992, 'energy'),
   (0.0034255798, 'report'),
   (0.003383334, 'coastal'),
   (0.0032530655, 'risks'),
   (0.0031254895, 'emissions'),
   (0.0030975, 'plan'),
   (0.0030801585, 'sea'),
   (0.0029789936, 'community'),
   (0.0029367271, '

2019-12-16 16:12:05,463 : INFO : -8.077 per-word bound, 270.1 perplexity estimate based on a held-out corpus of 77 documents with 959067 words
2019-12-16 16:12:05,464 : INFO : using autotuned alpha, starting with [0.25, 0.25, 0.25, 0.25]
2019-12-16 16:12:05,467 : INFO : using serial LDA version on this node
2019-12-16 16:12:05,478 : INFO : running online (single-pass) LDA training, 4 topics, 1 passes over the supplied corpus of 77 documents, updating model once every 77 documents, evaluating perplexity every 77 documents, iterating 50x with a convergence threshold of 0.001000


Perplexity: -8.077154265521893


2019-12-16 16:12:06,369 : INFO : -9.913 per-word bound, 964.2 perplexity estimate based on a held-out corpus of 77 documents with 959067 words
2019-12-16 16:12:06,374 : INFO : PROGRESS: pass 0, at document #77/77
2019-12-16 16:12:06,691 : INFO : optimized alpha [0.44068837, 0.45419693, 0.45948315, 0.46236175]
2019-12-16 16:12:06,697 : INFO : topic #0 (0.441): 0.004*"land" + 0.004*"planning" + 0.004*"development" + 0.004*"coastal" + 0.003*"report" + 0.003*"emissions" + 0.003*"plan" + 0.003*"energy" + 0.003*"community" + 0.003*"risks"
2019-12-16 16:12:06,699 : INFO : topic #1 (0.454): 0.005*"energy" + 0.004*"report" + 0.004*"coastal" + 0.004*"changes" + 0.004*"emissions" + 0.004*"development" + 0.003*"sea" + 0.003*"future" + 0.003*"land" + 0.003*"level"
2019-12-16 16:12:06,700 : INFO : topic #2 (0.459): 0.005*"management" + 0.004*"energy" + 0.004*"level" + 0.004*"planning" + 0.004*"changes" + 0.004*"infrastructure" + 0.003*"future" + 0.003*"sea" + 0.003*"transportation" + 0.003*"land"
20

Number of topics: 4 Number of Terms: 5
[([(0.005056925, 'management'),
   (0.0042777886, 'energy'),
   (0.0039296765, 'level'),
   (0.0039178343, 'planning'),
   (0.0036909892, 'changes')],
  -0.06044735855699408),
 ([(0.005397008, 'energy'),
   (0.004099413, 'sea'),
   (0.003916738, 'land'),
   (0.0036034773, 'report'),
   (0.0034277441, 'planning')],
  -0.08247348883991885),
 ([(0.0042842235, 'land'),
   (0.004051525, 'planning'),
   (0.004003564, 'development'),
   (0.0039036395, 'coastal'),
   (0.0034452328, 'report')],
  -0.1734912466850911),
 ([(0.0054746573, 'energy'),
   (0.004362802, 'report'),
   (0.004082027, 'coastal'),
   (0.003988131, 'changes'),
   (0.0039199595, 'emissions')],
  -0.17860044481375584)]
Random Document Distribution
[(0, 0.010210701), (1, 0.34604564), (2, 0.184408), (3, 0.45933563)]
Coherence: -0.14761306030487037


2019-12-16 16:12:07,647 : INFO : -8.095 per-word bound, 273.5 perplexity estimate based on a held-out corpus of 77 documents with 959067 words
2019-12-16 16:12:07,648 : INFO : using autotuned alpha, starting with [0.25, 0.25, 0.25, 0.25]
2019-12-16 16:12:07,653 : INFO : using serial LDA version on this node
2019-12-16 16:12:07,663 : INFO : running online (single-pass) LDA training, 4 topics, 1 passes over the supplied corpus of 77 documents, updating model once every 77 documents, evaluating perplexity every 77 documents, iterating 50x with a convergence threshold of 0.001000


Perplexity: -8.095141557522131


2019-12-16 16:12:08,569 : INFO : -9.913 per-word bound, 964.2 perplexity estimate based on a held-out corpus of 77 documents with 959067 words
2019-12-16 16:12:08,570 : INFO : PROGRESS: pass 0, at document #77/77
2019-12-16 16:12:08,927 : INFO : optimized alpha [0.44068837, 0.45419693, 0.45948315, 0.46236175]
2019-12-16 16:12:08,937 : INFO : topic #0 (0.441): 0.004*"land" + 0.004*"planning" + 0.004*"development" + 0.004*"coastal" + 0.003*"report" + 0.003*"emissions" + 0.003*"plan" + 0.003*"energy" + 0.003*"community" + 0.003*"risks"
2019-12-16 16:12:08,942 : INFO : topic #1 (0.454): 0.005*"energy" + 0.004*"report" + 0.004*"coastal" + 0.004*"changes" + 0.004*"emissions" + 0.004*"development" + 0.003*"sea" + 0.003*"future" + 0.003*"land" + 0.003*"level"
2019-12-16 16:12:08,946 : INFO : topic #2 (0.459): 0.005*"management" + 0.004*"energy" + 0.004*"level" + 0.004*"planning" + 0.004*"changes" + 0.004*"infrastructure" + 0.003*"future" + 0.003*"sea" + 0.003*"transportation" + 0.003*"land"
20

Number of topics: 4 Number of Terms: 10
[([(0.005397008, 'energy'),
   (0.004099413, 'sea'),
   (0.003916738, 'land'),
   (0.0036034773, 'report'),
   (0.0034277441, 'planning'),
   (0.0034032862, 'emissions'),
   (0.003375774, 'changes'),
   (0.0033082773, 'government'),
   (0.0032880101, 'development'),
   (0.003133398, 'research')],
  -0.10416910866188338),
 ([(0.0054746573, 'energy'),
   (0.004362802, 'report'),
   (0.004082027, 'coastal'),
   (0.003988131, 'changes'),
   (0.0039199595, 'emissions'),
   (0.0035018078, 'development'),
   (0.0034972501, 'sea'),
   (0.0032986235, 'future'),
   (0.0031128647, 'land'),
   (0.0031020911, 'level')],
  -0.11182869624152841),
 ([(0.005056925, 'management'),
   (0.0042777886, 'energy'),
   (0.0039296765, 'level'),
   (0.0039178343, 'planning'),
   (0.0036909892, 'changes'),
   (0.0036104312, 'infrastructure'),
   (0.0033215801, 'future'),
   (0.0032050188, 'sea'),
   (0.0030119994, 'transportation'),
   (0.0028660856, 'land')],
  -0.15124901

2019-12-16 16:12:10,355 : INFO : -8.095 per-word bound, 273.5 perplexity estimate based on a held-out corpus of 77 documents with 959067 words
2019-12-16 16:12:10,356 : INFO : using autotuned alpha, starting with [0.25, 0.25, 0.25, 0.25]
2019-12-16 16:12:10,360 : INFO : using serial LDA version on this node
2019-12-16 16:12:10,373 : INFO : running online (single-pass) LDA training, 4 topics, 1 passes over the supplied corpus of 77 documents, updating model once every 77 documents, evaluating perplexity every 77 documents, iterating 50x with a convergence threshold of 0.001000


Perplexity: -8.095141557522131


2019-12-16 16:12:11,400 : INFO : -9.913 per-word bound, 964.2 perplexity estimate based on a held-out corpus of 77 documents with 959067 words
2019-12-16 16:12:11,401 : INFO : PROGRESS: pass 0, at document #77/77
2019-12-16 16:12:11,733 : INFO : optimized alpha [0.44068837, 0.45419693, 0.45948315, 0.46236175]
2019-12-16 16:12:11,740 : INFO : topic #0 (0.441): 0.004*"land" + 0.004*"planning" + 0.004*"development" + 0.004*"coastal" + 0.003*"report" + 0.003*"emissions" + 0.003*"plan" + 0.003*"energy" + 0.003*"community" + 0.003*"risks"
2019-12-16 16:12:11,745 : INFO : topic #1 (0.454): 0.005*"energy" + 0.004*"report" + 0.004*"coastal" + 0.004*"changes" + 0.004*"emissions" + 0.004*"development" + 0.003*"sea" + 0.003*"future" + 0.003*"land" + 0.003*"level"
2019-12-16 16:12:11,748 : INFO : topic #2 (0.459): 0.005*"management" + 0.004*"energy" + 0.004*"level" + 0.004*"planning" + 0.004*"changes" + 0.004*"infrastructure" + 0.003*"future" + 0.003*"sea" + 0.003*"transportation" + 0.003*"land"
20

Number of topics: 4 Number of Terms: 15
[([(0.005397008, 'energy'),
   (0.004099413, 'sea'),
   (0.003916738, 'land'),
   (0.0036034773, 'report'),
   (0.0034277441, 'planning'),
   (0.0034032862, 'emissions'),
   (0.003375774, 'changes'),
   (0.0033082773, 'government'),
   (0.0032880101, 'development'),
   (0.003133398, 'research'),
   (0.0030533646, 'level'),
   (0.0027716232, 'future'),
   (0.0026710236, 'environment'),
   (0.00264897, 'risks'),
   (0.0026142823, 'information')],
  -0.0964962372012774),
 ([(0.0054746573, 'energy'),
   (0.004362802, 'report'),
   (0.004082027, 'coastal'),
   (0.003988131, 'changes'),
   (0.0039199595, 'emissions'),
   (0.0035018078, 'development'),
   (0.0034972501, 'sea'),
   (0.0032986235, 'future'),
   (0.0031128647, 'land'),
   (0.0031020911, 'level'),
   (0.0030591101, 'global'),
   (0.0030410087, 'flood'),
   (0.002943826, 'management'),
   (0.0029432487, 'planning'),
   (0.0029048135, 'sector')],
  -0.13408196268192088),
 ([(0.005056925, 'man

2019-12-16 16:12:12,913 : INFO : -8.095 per-word bound, 273.5 perplexity estimate based on a held-out corpus of 77 documents with 959067 words
2019-12-16 16:12:12,914 : INFO : using autotuned alpha, starting with [0.25, 0.25, 0.25, 0.25]
2019-12-16 16:12:12,919 : INFO : using serial LDA version on this node
2019-12-16 16:12:12,929 : INFO : running online (single-pass) LDA training, 4 topics, 1 passes over the supplied corpus of 77 documents, updating model once every 77 documents, evaluating perplexity every 77 documents, iterating 50x with a convergence threshold of 0.001000


Perplexity: -8.095141557522131


2019-12-16 16:12:13,986 : INFO : -9.913 per-word bound, 964.2 perplexity estimate based on a held-out corpus of 77 documents with 959067 words
2019-12-16 16:12:13,986 : INFO : PROGRESS: pass 0, at document #77/77
2019-12-16 16:12:14,274 : INFO : optimized alpha [0.44068837, 0.45419693, 0.45948315, 0.46236175]
2019-12-16 16:12:14,286 : INFO : topic #0 (0.441): 0.004*"land" + 0.004*"planning" + 0.004*"development" + 0.004*"coastal" + 0.003*"report" + 0.003*"emissions" + 0.003*"plan" + 0.003*"energy" + 0.003*"community" + 0.003*"risks"
2019-12-16 16:12:14,287 : INFO : topic #1 (0.454): 0.005*"energy" + 0.004*"report" + 0.004*"coastal" + 0.004*"changes" + 0.004*"emissions" + 0.004*"development" + 0.003*"sea" + 0.003*"future" + 0.003*"land" + 0.003*"level"
2019-12-16 16:12:14,289 : INFO : topic #2 (0.459): 0.005*"management" + 0.004*"energy" + 0.004*"level" + 0.004*"planning" + 0.004*"changes" + 0.004*"infrastructure" + 0.003*"future" + 0.003*"sea" + 0.003*"transportation" + 0.003*"land"
20

Number of topics: 4 Number of Terms: 20
[([(0.005397008, 'energy'),
   (0.004099413, 'sea'),
   (0.003916738, 'land'),
   (0.0036034773, 'report'),
   (0.0034277441, 'planning'),
   (0.0034032862, 'emissions'),
   (0.003375774, 'changes'),
   (0.0033082773, 'government'),
   (0.0032880101, 'development'),
   (0.003133398, 'research'),
   (0.0030533646, 'level'),
   (0.0027716232, 'future'),
   (0.0026710236, 'environment'),
   (0.00264897, 'risks'),
   (0.0026142823, 'information'),
   (0.0025095674, 'assessment'),
   (0.0024361939, 'ice'),
   (0.0023584266, 'management'),
   (0.0023476006, '2011'),
   (0.0023170419, 'public')],
  -0.13543233978084349),
 ([(0.0042842235, 'land'),
   (0.004051525, 'planning'),
   (0.004003564, 'development'),
   (0.0039036395, 'coastal'),
   (0.0034452328, 'report'),
   (0.0033004717, 'emissions'),
   (0.0032592977, 'plan'),
   (0.0032570506, 'energy'),
   (0.0032035704, 'community'),
   (0.003170887, 'risks'),
   (0.0030642815, 'flood'),
   (0.00301597

2019-12-16 16:12:15,459 : INFO : -8.095 per-word bound, 273.5 perplexity estimate based on a held-out corpus of 77 documents with 959067 words
2019-12-16 16:12:15,460 : INFO : using autotuned alpha, starting with [0.2, 0.2, 0.2, 0.2, 0.2]
2019-12-16 16:12:15,464 : INFO : using serial LDA version on this node
2019-12-16 16:12:15,475 : INFO : running online (single-pass) LDA training, 5 topics, 1 passes over the supplied corpus of 77 documents, updating model once every 77 documents, evaluating perplexity every 77 documents, iterating 50x with a convergence threshold of 0.001000


Perplexity: -8.095141557522131


2019-12-16 16:12:16,619 : INFO : -9.950 per-word bound, 988.9 perplexity estimate based on a held-out corpus of 77 documents with 959067 words
2019-12-16 16:12:16,620 : INFO : PROGRESS: pass 0, at document #77/77
2019-12-16 16:12:16,871 : INFO : optimized alpha [0.35788614, 0.365349, 0.37119335, 0.37187704, 0.36003608]
2019-12-16 16:12:16,879 : INFO : topic #0 (0.358): 0.004*"land" + 0.004*"coastal" + 0.004*"planning" + 0.004*"development" + 0.004*"energy" + 0.003*"report" + 0.003*"plan" + 0.003*"risks" + 0.003*"transportation" + 0.003*"emissions"
2019-12-16 16:12:16,880 : INFO : topic #1 (0.365): 0.006*"energy" + 0.004*"report" + 0.004*"coastal" + 0.004*"changes" + 0.004*"emissions" + 0.004*"development" + 0.003*"land" + 0.003*"future" + 0.003*"global" + 0.003*"sea"
2019-12-16 16:12:16,881 : INFO : topic #2 (0.371): 0.005*"management" + 0.004*"energy" + 0.004*"planning" + 0.004*"level" + 0.004*"infrastructure" + 0.004*"changes" + 0.003*"future" + 0.003*"transportation" + 0.003*"sea" +

Number of topics: 5 Number of Terms: 5
[([(0.0043366402, 'emissions'),
   (0.0042816713, 'sea'),
   (0.0041759536, 'energy'),
   (0.003965057, 'development'),
   (0.0039052812, 'level')],
  -0.06301319556119493),
 ([(0.005350163, 'management'),
   (0.0042796624, 'energy'),
   (0.004035368, 'planning'),
   (0.0039085117, 'level'),
   (0.0038915032, 'infrastructure')],
  -0.08251816990861086),
 ([(0.005250148, 'energy'),
   (0.004008893, 'sea'),
   (0.003948369, 'land'),
   (0.0036980982, 'report'),
   (0.0036134361, 'changes')],
  -0.09099597524846695),
 ([(0.0042636977, 'land'),
   (0.0039397357, 'coastal'),
   (0.0039183404, 'planning'),
   (0.0038133408, 'development'),
   (0.003559269, 'energy')],
  -0.12676717566332257),
 ([(0.0057347408, 'energy'),
   (0.004295709, 'report'),
   (0.00417842, 'coastal'),
   (0.0038711925, 'changes'),
   (0.0038443164, 'emissions')],
  -0.17860044481375584)]
Random Document Distribution
[(0, 0.010758162), (1, 0.2834281), (2, 0.106275216), (3, 0.4133

2019-12-16 16:12:17,914 : INFO : -8.106 per-word bound, 275.6 perplexity estimate based on a held-out corpus of 77 documents with 959067 words
2019-12-16 16:12:17,915 : INFO : using autotuned alpha, starting with [0.2, 0.2, 0.2, 0.2, 0.2]
2019-12-16 16:12:17,920 : INFO : using serial LDA version on this node
2019-12-16 16:12:17,932 : INFO : running online (single-pass) LDA training, 5 topics, 1 passes over the supplied corpus of 77 documents, updating model once every 77 documents, evaluating perplexity every 77 documents, iterating 50x with a convergence threshold of 0.001000


Perplexity: -8.106432019927071


2019-12-16 16:12:18,887 : INFO : -9.950 per-word bound, 988.9 perplexity estimate based on a held-out corpus of 77 documents with 959067 words
2019-12-16 16:12:18,887 : INFO : PROGRESS: pass 0, at document #77/77
2019-12-16 16:12:19,139 : INFO : optimized alpha [0.35788614, 0.365349, 0.37119335, 0.37187704, 0.36003608]
2019-12-16 16:12:19,146 : INFO : topic #0 (0.358): 0.004*"land" + 0.004*"coastal" + 0.004*"planning" + 0.004*"development" + 0.004*"energy" + 0.003*"report" + 0.003*"plan" + 0.003*"risks" + 0.003*"transportation" + 0.003*"emissions"
2019-12-16 16:12:19,147 : INFO : topic #1 (0.365): 0.006*"energy" + 0.004*"report" + 0.004*"coastal" + 0.004*"changes" + 0.004*"emissions" + 0.004*"development" + 0.003*"land" + 0.003*"future" + 0.003*"global" + 0.003*"sea"
2019-12-16 16:12:19,150 : INFO : topic #2 (0.371): 0.005*"management" + 0.004*"energy" + 0.004*"planning" + 0.004*"level" + 0.004*"infrastructure" + 0.004*"changes" + 0.003*"future" + 0.003*"transportation" + 0.003*"sea" +

Number of topics: 5 Number of Terms: 10
[([(0.005250148, 'energy'),
   (0.004008893, 'sea'),
   (0.003948369, 'land'),
   (0.0036980982, 'report'),
   (0.0036134361, 'changes'),
   (0.0034625905, 'research'),
   (0.0034031617, 'government'),
   (0.0033128029, 'planning'),
   (0.0032759977, 'emissions'),
   (0.0031003386, 'development')],
  -0.10600582387824345),
 ([(0.0043366402, 'emissions'),
   (0.0042816713, 'sea'),
   (0.0041759536, 'energy'),
   (0.003965057, 'development'),
   (0.0039052812, 'level'),
   (0.0037071065, 'planning'),
   (0.003421588, 'report'),
   (0.003270676, 'changes'),
   (0.0030541343, 'flood'),
   (0.0029776064, 'land')],
  -0.11287230421476724),
 ([(0.0057347408, 'energy'),
   (0.004295709, 'report'),
   (0.00417842, 'coastal'),
   (0.0038711925, 'changes'),
   (0.0038443164, 'emissions'),
   (0.0035371168, 'development'),
   (0.003261291, 'land'),
   (0.0032418927, 'future'),
   (0.0031887267, 'global'),
   (0.0031563556, 'sea')],
  -0.14308365349310684),
 

2019-12-16 16:12:20,432 : INFO : -8.106 per-word bound, 275.6 perplexity estimate based on a held-out corpus of 77 documents with 959067 words
2019-12-16 16:12:20,433 : INFO : using autotuned alpha, starting with [0.2, 0.2, 0.2, 0.2, 0.2]
2019-12-16 16:12:20,438 : INFO : using serial LDA version on this node
2019-12-16 16:12:20,451 : INFO : running online (single-pass) LDA training, 5 topics, 1 passes over the supplied corpus of 77 documents, updating model once every 77 documents, evaluating perplexity every 77 documents, iterating 50x with a convergence threshold of 0.001000


Perplexity: -8.106432019927071


2019-12-16 16:12:21,302 : INFO : -9.950 per-word bound, 988.9 perplexity estimate based on a held-out corpus of 77 documents with 959067 words
2019-12-16 16:12:21,303 : INFO : PROGRESS: pass 0, at document #77/77
2019-12-16 16:12:21,569 : INFO : optimized alpha [0.35788614, 0.365349, 0.37119335, 0.37187704, 0.36003608]
2019-12-16 16:12:21,577 : INFO : topic #0 (0.358): 0.004*"land" + 0.004*"coastal" + 0.004*"planning" + 0.004*"development" + 0.004*"energy" + 0.003*"report" + 0.003*"plan" + 0.003*"risks" + 0.003*"transportation" + 0.003*"emissions"
2019-12-16 16:12:21,579 : INFO : topic #1 (0.365): 0.006*"energy" + 0.004*"report" + 0.004*"coastal" + 0.004*"changes" + 0.004*"emissions" + 0.004*"development" + 0.003*"land" + 0.003*"future" + 0.003*"global" + 0.003*"sea"
2019-12-16 16:12:21,580 : INFO : topic #2 (0.371): 0.005*"management" + 0.004*"energy" + 0.004*"planning" + 0.004*"level" + 0.004*"infrastructure" + 0.004*"changes" + 0.003*"future" + 0.003*"transportation" + 0.003*"sea" +

Number of topics: 5 Number of Terms: 15
[([(0.005350163, 'management'),
   (0.0042796624, 'energy'),
   (0.004035368, 'planning'),
   (0.0039085117, 'level'),
   (0.0038915032, 'infrastructure'),
   (0.0036723039, 'changes'),
   (0.0033784236, 'future'),
   (0.0033573247, 'transportation'),
   (0.003192899, 'sea'),
   (0.0030613276, 'land'),
   (0.0029150713, 'government'),
   (0.0028871358, 'plan'),
   (0.0028188957, 'community'),
   (0.0027631125, 'development'),
   (0.0027443562, 'information')],
  -0.11072221153085807),
 ([(0.005250148, 'energy'),
   (0.004008893, 'sea'),
   (0.003948369, 'land'),
   (0.0036980982, 'report'),
   (0.0036134361, 'changes'),
   (0.0034625905, 'research'),
   (0.0034031617, 'government'),
   (0.0033128029, 'planning'),
   (0.0032759977, 'emissions'),
   (0.0031003386, 'development'),
   (0.0029397816, 'level'),
   (0.0027983496, 'future'),
   (0.0026618303, 'environment'),
   (0.0026541809, 'risks'),
   (0.0026385866, 'ice')],
  -0.12563414462696448),


2019-12-16 16:12:22,623 : INFO : -8.106 per-word bound, 275.6 perplexity estimate based on a held-out corpus of 77 documents with 959067 words
2019-12-16 16:12:22,624 : INFO : using autotuned alpha, starting with [0.2, 0.2, 0.2, 0.2, 0.2]
2019-12-16 16:12:22,629 : INFO : using serial LDA version on this node
2019-12-16 16:12:22,641 : INFO : running online (single-pass) LDA training, 5 topics, 1 passes over the supplied corpus of 77 documents, updating model once every 77 documents, evaluating perplexity every 77 documents, iterating 50x with a convergence threshold of 0.001000


Perplexity: -8.106432019927071


2019-12-16 16:12:23,492 : INFO : -9.950 per-word bound, 988.9 perplexity estimate based on a held-out corpus of 77 documents with 959067 words
2019-12-16 16:12:23,493 : INFO : PROGRESS: pass 0, at document #77/77
2019-12-16 16:12:23,757 : INFO : optimized alpha [0.35788614, 0.365349, 0.37119335, 0.37187704, 0.36003608]
2019-12-16 16:12:23,764 : INFO : topic #0 (0.358): 0.004*"land" + 0.004*"coastal" + 0.004*"planning" + 0.004*"development" + 0.004*"energy" + 0.003*"report" + 0.003*"plan" + 0.003*"risks" + 0.003*"transportation" + 0.003*"emissions"
2019-12-16 16:12:23,766 : INFO : topic #1 (0.365): 0.006*"energy" + 0.004*"report" + 0.004*"coastal" + 0.004*"changes" + 0.004*"emissions" + 0.004*"development" + 0.003*"land" + 0.003*"future" + 0.003*"global" + 0.003*"sea"
2019-12-16 16:12:23,769 : INFO : topic #2 (0.371): 0.005*"management" + 0.004*"energy" + 0.004*"planning" + 0.004*"level" + 0.004*"infrastructure" + 0.004*"changes" + 0.003*"future" + 0.003*"transportation" + 0.003*"sea" +

Number of topics: 5 Number of Terms: 20
[([(0.0043366402, 'emissions'),
   (0.0042816713, 'sea'),
   (0.0041759536, 'energy'),
   (0.003965057, 'development'),
   (0.0039052812, 'level'),
   (0.0037071065, 'planning'),
   (0.003421588, 'report'),
   (0.003270676, 'changes'),
   (0.0030541343, 'flood'),
   (0.0029776064, 'land'),
   (0.0029547731, 'future'),
   (0.002709202, 'coastal'),
   (0.002514384, 'management'),
   (0.002418995, 'ice'),
   (0.0024016104, 'regional'),
   (0.002390752, 'risks'),
   (0.0023262363, 'information'),
   (0.0023072602, 'flooding'),
   (0.002306168, 'government'),
   (0.0022883865, 'environment')],
  -0.13737503365377307),
 ([(0.005350163, 'management'),
   (0.0042796624, 'energy'),
   (0.004035368, 'planning'),
   (0.0039085117, 'level'),
   (0.0038915032, 'infrastructure'),
   (0.0036723039, 'changes'),
   (0.0033784236, 'future'),
   (0.0033573247, 'transportation'),
   (0.003192899, 'sea'),
   (0.0030613276, 'land'),
   (0.0029150713, 'government'),
  

2019-12-16 16:12:24,952 : INFO : -8.106 per-word bound, 275.6 perplexity estimate based on a held-out corpus of 77 documents with 959067 words
2019-12-16 16:12:24,953 : INFO : using autotuned alpha, starting with [0.16666667, 0.16666667, 0.16666667, 0.16666667, 0.16666667, 0.16666667]
2019-12-16 16:12:24,957 : INFO : using serial LDA version on this node
2019-12-16 16:12:24,970 : INFO : running online (single-pass) LDA training, 6 topics, 1 passes over the supplied corpus of 77 documents, updating model once every 77 documents, evaluating perplexity every 77 documents, iterating 50x with a convergence threshold of 0.001000


Perplexity: -8.106432019927071


2019-12-16 16:12:25,977 : INFO : -9.981 per-word bound, 1010.7 perplexity estimate based on a held-out corpus of 77 documents with 959067 words
2019-12-16 16:12:25,978 : INFO : PROGRESS: pass 0, at document #77/77
2019-12-16 16:12:26,236 : INFO : optimized alpha [0.30054817, 0.30381817, 0.30877817, 0.3107337, 0.30158848, 0.30852026]
2019-12-16 16:12:26,245 : INFO : topic #0 (0.301): 0.005*"land" + 0.004*"planning" + 0.004*"coastal" + 0.004*"development" + 0.004*"report" + 0.003*"plan" + 0.003*"community" + 0.003*"management" + 0.003*"energy" + 0.003*"level"
2019-12-16 16:12:26,246 : INFO : topic #4 (0.302): 0.004*"sea" + 0.004*"energy" + 0.004*"level" + 0.004*"emissions" + 0.004*"development" + 0.004*"planning" + 0.003*"report" + 0.003*"changes" + 0.003*"land" + 0.003*"flood"
2019-12-16 16:12:26,248 : INFO : topic #5 (0.309): 0.006*"energy" + 0.005*"emissions" + 0.004*"development" + 0.003*"future" + 0.003*"changes" + 0.003*"planning" + 0.003*"government" + 0.003*"report" + 0.003*"mana

Number of topics: 6 Number of Terms: 5
[([(0.0059093367, 'energy'),
   (0.0046728975, 'emissions'),
   (0.0035821458, 'development'),
   (0.003418936, 'future'),
   (0.0034153666, 'changes')],
  -0.04507122645422164),
 ([(0.0042631356, 'sea'),
   (0.004097028, 'energy'),
   (0.0040351404, 'level'),
   (0.0040320917, 'emissions'),
   (0.0040268125, 'development')],
  -0.09058423145738773),
 ([(0.005165023, 'energy'),
   (0.0041810474, 'sea'),
   (0.004152091, 'land'),
   (0.0037663437, 'research'),
   (0.0035948139, 'report')],
  -0.1122023594716568),
 ([(0.005667839, 'management'),
   (0.0042760787, 'planning'),
   (0.0040314565, 'level'),
   (0.00395201, 'energy'),
   (0.003655946, 'infrastructure')],
  -0.11579098877666358),
 ([(0.004623084, 'land'),
   (0.004090662, 'planning'),
   (0.0040536067, 'coastal'),
   (0.0038069289, 'development'),
   (0.003514571, 'report')],
  -0.14014208583678361),
 ([(0.0052656922, 'energy'),
   (0.0045225755, 'report'),
   (0.0041588848, 'changes'),
 

2019-12-16 16:12:27,208 : INFO : -8.118 per-word bound, 277.8 perplexity estimate based on a held-out corpus of 77 documents with 959067 words
2019-12-16 16:12:27,209 : INFO : using autotuned alpha, starting with [0.16666667, 0.16666667, 0.16666667, 0.16666667, 0.16666667, 0.16666667]
2019-12-16 16:12:27,213 : INFO : using serial LDA version on this node
2019-12-16 16:12:27,227 : INFO : running online (single-pass) LDA training, 6 topics, 1 passes over the supplied corpus of 77 documents, updating model once every 77 documents, evaluating perplexity every 77 documents, iterating 50x with a convergence threshold of 0.001000


Perplexity: -8.117790847395906


2019-12-16 16:12:28,117 : INFO : -9.981 per-word bound, 1010.7 perplexity estimate based on a held-out corpus of 77 documents with 959067 words
2019-12-16 16:12:28,118 : INFO : PROGRESS: pass 0, at document #77/77
2019-12-16 16:12:28,408 : INFO : optimized alpha [0.30054817, 0.30381817, 0.30877817, 0.3107337, 0.30158848, 0.30852026]
2019-12-16 16:12:28,417 : INFO : topic #0 (0.301): 0.005*"land" + 0.004*"planning" + 0.004*"coastal" + 0.004*"development" + 0.004*"report" + 0.003*"plan" + 0.003*"community" + 0.003*"management" + 0.003*"energy" + 0.003*"level"
2019-12-16 16:12:28,419 : INFO : topic #4 (0.302): 0.004*"sea" + 0.004*"energy" + 0.004*"level" + 0.004*"emissions" + 0.004*"development" + 0.004*"planning" + 0.003*"report" + 0.003*"changes" + 0.003*"land" + 0.003*"flood"
2019-12-16 16:12:28,421 : INFO : topic #5 (0.309): 0.006*"energy" + 0.005*"emissions" + 0.004*"development" + 0.003*"future" + 0.003*"changes" + 0.003*"planning" + 0.003*"government" + 0.003*"report" + 0.003*"mana

Number of topics: 6 Number of Terms: 10
[([(0.005165023, 'energy'),
   (0.0041810474, 'sea'),
   (0.004152091, 'land'),
   (0.0037663437, 'research'),
   (0.0035948139, 'report'),
   (0.0034473059, 'changes'),
   (0.0033704792, 'planning'),
   (0.0032359706, 'government'),
   (0.003100147, 'level'),
   (0.0030576182, 'development')],
  -0.07511503119507655),
 ([(0.004623084, 'land'),
   (0.004090662, 'planning'),
   (0.0040536067, 'coastal'),
   (0.0038069289, 'development'),
   (0.003514571, 'report'),
   (0.003240385, 'plan'),
   (0.003150651, 'community'),
   (0.003144437, 'management'),
   (0.0031436621, 'energy'),
   (0.0030874873, 'level')],
  -0.10277665070909843),
 ([(0.0059093367, 'energy'),
   (0.0046728975, 'emissions'),
   (0.0035821458, 'development'),
   (0.003418936, 'future'),
   (0.0034153666, 'changes'),
   (0.0033534179, 'planning'),
   (0.0033204383, 'government'),
   (0.003236939, 'report'),
   (0.0029653374, 'management'),
   (0.0028557428, 'flood')],
  -0.1142320

2019-12-16 16:12:29,427 : INFO : -8.118 per-word bound, 277.8 perplexity estimate based on a held-out corpus of 77 documents with 959067 words
2019-12-16 16:12:29,428 : INFO : using autotuned alpha, starting with [0.16666667, 0.16666667, 0.16666667, 0.16666667, 0.16666667, 0.16666667]
2019-12-16 16:12:29,434 : INFO : using serial LDA version on this node
2019-12-16 16:12:29,446 : INFO : running online (single-pass) LDA training, 6 topics, 1 passes over the supplied corpus of 77 documents, updating model once every 77 documents, evaluating perplexity every 77 documents, iterating 50x with a convergence threshold of 0.001000


Perplexity: -8.117790847395906


2019-12-16 16:12:30,442 : INFO : -9.981 per-word bound, 1010.7 perplexity estimate based on a held-out corpus of 77 documents with 959067 words
2019-12-16 16:12:30,442 : INFO : PROGRESS: pass 0, at document #77/77
2019-12-16 16:12:30,796 : INFO : optimized alpha [0.30054817, 0.30381817, 0.30877817, 0.3107337, 0.30158848, 0.30852026]
2019-12-16 16:12:30,808 : INFO : topic #0 (0.301): 0.005*"land" + 0.004*"planning" + 0.004*"coastal" + 0.004*"development" + 0.004*"report" + 0.003*"plan" + 0.003*"community" + 0.003*"management" + 0.003*"energy" + 0.003*"level"
2019-12-16 16:12:30,809 : INFO : topic #4 (0.302): 0.004*"sea" + 0.004*"energy" + 0.004*"level" + 0.004*"emissions" + 0.004*"development" + 0.004*"planning" + 0.003*"report" + 0.003*"changes" + 0.003*"land" + 0.003*"flood"
2019-12-16 16:12:30,811 : INFO : topic #5 (0.309): 0.006*"energy" + 0.005*"emissions" + 0.004*"development" + 0.003*"future" + 0.003*"changes" + 0.003*"planning" + 0.003*"government" + 0.003*"report" + 0.003*"mana

Number of topics: 6 Number of Terms: 15
[([(0.005165023, 'energy'),
   (0.0041810474, 'sea'),
   (0.004152091, 'land'),
   (0.0037663437, 'research'),
   (0.0035948139, 'report'),
   (0.0034473059, 'changes'),
   (0.0033704792, 'planning'),
   (0.0032359706, 'government'),
   (0.003100147, 'level'),
   (0.0030576182, 'development'),
   (0.0028882697, 'emissions'),
   (0.0027210175, 'future'),
   (0.0026978562, 'risks'),
   (0.002633907, 'environment'),
   (0.0026248777, 'information')],
  -0.09969160571081331),
 ([(0.005667839, 'management'),
   (0.0042760787, 'planning'),
   (0.0040314565, 'level'),
   (0.00395201, 'energy'),
   (0.003655946, 'infrastructure'),
   (0.0036432934, 'changes'),
   (0.003275966, 'land'),
   (0.003255391, 'future'),
   (0.0032115763, 'sea'),
   (0.0029713789, 'plan'),
   (0.0029501764, 'information'),
   (0.0028410552, 'transportation'),
   (0.002827594, 'government'),
   (0.0027848617, 'community'),
   (0.002779951, 'development')],
  -0.12626451497170138)

2019-12-16 16:12:32,052 : INFO : -8.118 per-word bound, 277.8 perplexity estimate based on a held-out corpus of 77 documents with 959067 words
2019-12-16 16:12:32,054 : INFO : using autotuned alpha, starting with [0.16666667, 0.16666667, 0.16666667, 0.16666667, 0.16666667, 0.16666667]
2019-12-16 16:12:32,059 : INFO : using serial LDA version on this node
2019-12-16 16:12:32,075 : INFO : running online (single-pass) LDA training, 6 topics, 1 passes over the supplied corpus of 77 documents, updating model once every 77 documents, evaluating perplexity every 77 documents, iterating 50x with a convergence threshold of 0.001000


Perplexity: -8.117790847395906


2019-12-16 16:12:33,000 : INFO : -9.981 per-word bound, 1010.7 perplexity estimate based on a held-out corpus of 77 documents with 959067 words
2019-12-16 16:12:33,001 : INFO : PROGRESS: pass 0, at document #77/77
2019-12-16 16:12:33,261 : INFO : optimized alpha [0.30054817, 0.30381817, 0.30877817, 0.3107337, 0.30158848, 0.30852026]
2019-12-16 16:12:33,271 : INFO : topic #0 (0.301): 0.005*"land" + 0.004*"planning" + 0.004*"coastal" + 0.004*"development" + 0.004*"report" + 0.003*"plan" + 0.003*"community" + 0.003*"management" + 0.003*"energy" + 0.003*"level"
2019-12-16 16:12:33,272 : INFO : topic #4 (0.302): 0.004*"sea" + 0.004*"energy" + 0.004*"level" + 0.004*"emissions" + 0.004*"development" + 0.004*"planning" + 0.003*"report" + 0.003*"changes" + 0.003*"land" + 0.003*"flood"
2019-12-16 16:12:33,274 : INFO : topic #5 (0.309): 0.006*"energy" + 0.005*"emissions" + 0.004*"development" + 0.003*"future" + 0.003*"changes" + 0.003*"planning" + 0.003*"government" + 0.003*"report" + 0.003*"mana

Number of topics: 6 Number of Terms: 20
[([(0.0059093367, 'energy'),
   (0.0046728975, 'emissions'),
   (0.0035821458, 'development'),
   (0.003418936, 'future'),
   (0.0034153666, 'changes'),
   (0.0033534179, 'planning'),
   (0.0033204383, 'government'),
   (0.003236939, 'report'),
   (0.0029653374, 'management'),
   (0.0028557428, 'flood'),
   (0.002686238, 'coastal'),
   (0.002685864, 'communities'),
   (0.0026798, 'sea'),
   (0.0026728753, 'infrastructure'),
   (0.002592771, 'land'),
   (0.0025493624, 'transportation'),
   (0.0025400252, 'level'),
   (0.0025340898, 'resources'),
   (0.0024737718, 'community'),
   (0.0024650625, 'plan')],
  -0.13113603804549012),
 ([(0.0052656922, 'energy'),
   (0.0045225755, 'report'),
   (0.0041588848, 'changes'),
   (0.0041587036, 'coastal'),
   (0.003551493, 'sea'),
   (0.003540167, 'emissions'),
   (0.003387212, 'land'),
   (0.0033355912, 'sector'),
   (0.003287069, 'global'),
   (0.0032774387, 'development'),
   (0.003201859, 'research'),
   

2019-12-16 16:12:34,358 : INFO : -8.118 per-word bound, 277.8 perplexity estimate based on a held-out corpus of 77 documents with 959067 words
2019-12-16 16:12:34,360 : INFO : using autotuned alpha, starting with [0.14285715, 0.14285715, 0.14285715, 0.14285715, 0.14285715, 0.14285715, 0.14285715]
2019-12-16 16:12:34,366 : INFO : using serial LDA version on this node
2019-12-16 16:12:34,389 : INFO : running online (single-pass) LDA training, 7 topics, 1 passes over the supplied corpus of 77 documents, updating model once every 77 documents, evaluating perplexity every 77 documents, iterating 50x with a convergence threshold of 0.001000


Perplexity: -8.117790847395906


2019-12-16 16:12:35,441 : INFO : -10.012 per-word bound, 1032.6 perplexity estimate based on a held-out corpus of 77 documents with 959067 words
2019-12-16 16:12:35,442 : INFO : PROGRESS: pass 0, at document #77/77
2019-12-16 16:12:35,734 : INFO : optimized alpha [0.260733, 0.26313332, 0.2664994, 0.2662589, 0.26166806, 0.2668909, 0.25493205]
2019-12-16 16:12:35,745 : INFO : topic #6 (0.255): 0.005*"energy" + 0.004*"changes" + 0.004*"level" + 0.003*"planning" + 0.003*"future" + 0.003*"emissions" + 0.003*"development" + 0.003*"global" + 0.003*"sea" + 0.003*"management"
2019-12-16 16:12:35,747 : INFO : topic #0 (0.261): 0.005*"land" + 0.004*"planning" + 0.004*"coastal" + 0.004*"development" + 0.003*"plan" + 0.003*"transportation" + 0.003*"report" + 0.003*"risks" + 0.003*"community" + 0.003*"management"
2019-12-16 16:12:35,748 : INFO : topic #3 (0.266): 0.005*"energy" + 0.004*"land" + 0.004*"sea" + 0.004*"research" + 0.004*"report" + 0.003*"government" + 0.003*"changes" + 0.003*"planning" 

Number of topics: 7 Number of Terms: 5
[([(0.0048915124, 'energy'),
   (0.0038385517, 'changes'),
   (0.0036019199, 'level'),
   (0.0034646885, 'planning'),
   (0.0033384697, 'future')],
  -0.04594501513812171),
 ([(0.005758422, 'energy'),
   (0.0044910274, 'emissions'),
   (0.003507216, 'development'),
   (0.0035044213, 'planning'),
   (0.0034760726, 'changes')],
  -0.046047105992628054),
 ([(0.0043537715, 'sea'),
   (0.0040994943, 'energy'),
   (0.0039131646, 'level'),
   (0.0038860624, 'development'),
   (0.003788087, 'emissions')],
  -0.11402430204127215),
 ([(0.0055099158, 'management'),
   (0.004065344, 'level'),
   (0.0040149065, 'planning'),
   (0.00391829, 'energy'),
   (0.003756183, 'infrastructure')],
  -0.1143311088345483),
 ([(0.0047889003, 'land'),
   (0.0041604405, 'planning'),
   (0.0041430774, 'coastal'),
   (0.0038985303, 'development'),
   (0.003478226, 'plan')],
  -0.12383891254968223),
 ([(0.005147052, 'energy'),
   (0.0041588917, 'land'),
   (0.0041115237, 'sea'),

2019-12-16 16:12:36,871 : INFO : -8.139 per-word bound, 281.9 perplexity estimate based on a held-out corpus of 77 documents with 959067 words
2019-12-16 16:12:36,872 : INFO : using autotuned alpha, starting with [0.14285715, 0.14285715, 0.14285715, 0.14285715, 0.14285715, 0.14285715, 0.14285715]
2019-12-16 16:12:36,877 : INFO : using serial LDA version on this node
2019-12-16 16:12:36,891 : INFO : running online (single-pass) LDA training, 7 topics, 1 passes over the supplied corpus of 77 documents, updating model once every 77 documents, evaluating perplexity every 77 documents, iterating 50x with a convergence threshold of 0.001000


Perplexity: -8.138974248742516


2019-12-16 16:12:37,845 : INFO : -10.012 per-word bound, 1032.6 perplexity estimate based on a held-out corpus of 77 documents with 959067 words
2019-12-16 16:12:37,846 : INFO : PROGRESS: pass 0, at document #77/77
2019-12-16 16:12:38,110 : INFO : optimized alpha [0.260733, 0.26313332, 0.2664994, 0.2662589, 0.26166806, 0.2668909, 0.25493205]
2019-12-16 16:12:38,121 : INFO : topic #6 (0.255): 0.005*"energy" + 0.004*"changes" + 0.004*"level" + 0.003*"planning" + 0.003*"future" + 0.003*"emissions" + 0.003*"development" + 0.003*"global" + 0.003*"sea" + 0.003*"management"
2019-12-16 16:12:38,122 : INFO : topic #0 (0.261): 0.005*"land" + 0.004*"planning" + 0.004*"coastal" + 0.004*"development" + 0.003*"plan" + 0.003*"transportation" + 0.003*"report" + 0.003*"risks" + 0.003*"community" + 0.003*"management"
2019-12-16 16:12:38,124 : INFO : topic #3 (0.266): 0.005*"energy" + 0.004*"land" + 0.004*"sea" + 0.004*"research" + 0.004*"report" + 0.003*"government" + 0.003*"changes" + 0.003*"planning" 

Number of topics: 7 Number of Terms: 10
[([(0.005147052, 'energy'),
   (0.0041588917, 'land'),
   (0.0041115237, 'sea'),
   (0.0037909248, 'research'),
   (0.0037159617, 'report'),
   (0.003382163, 'government'),
   (0.0033652866, 'changes'),
   (0.0033220618, 'planning'),
   (0.0031245816, 'development'),
   (0.003043856, 'level')],
  -0.0790434780349464),
 ([(0.005758422, 'energy'),
   (0.0044910274, 'emissions'),
   (0.003507216, 'development'),
   (0.0035044213, 'planning'),
   (0.0034760726, 'changes'),
   (0.0033690864, 'government'),
   (0.0033531832, 'future'),
   (0.003321301, 'report'),
   (0.0030374245, 'management'),
   (0.0029445114, 'sea')],
  -0.12447664441779938),
 ([(0.005167844, 'energy'),
   (0.004560959, 'report'),
   (0.0042559756, 'coastal'),
   (0.0039678537, 'changes'),
   (0.003806141, 'emissions'),
   (0.003642561, 'land'),
   (0.0034930862, 'sea'),
   (0.0034530798, 'development'),
   (0.0032714321, 'research'),
   (0.0032224855, 'management')],
  -0.12596117

2019-12-16 16:12:39,144 : INFO : -8.139 per-word bound, 281.9 perplexity estimate based on a held-out corpus of 77 documents with 959067 words
2019-12-16 16:12:39,145 : INFO : using autotuned alpha, starting with [0.14285715, 0.14285715, 0.14285715, 0.14285715, 0.14285715, 0.14285715, 0.14285715]
2019-12-16 16:12:39,150 : INFO : using serial LDA version on this node
2019-12-16 16:12:39,163 : INFO : running online (single-pass) LDA training, 7 topics, 1 passes over the supplied corpus of 77 documents, updating model once every 77 documents, evaluating perplexity every 77 documents, iterating 50x with a convergence threshold of 0.001000


Perplexity: -8.138974248742516


2019-12-16 16:12:40,045 : INFO : -10.012 per-word bound, 1032.6 perplexity estimate based on a held-out corpus of 77 documents with 959067 words
2019-12-16 16:12:40,046 : INFO : PROGRESS: pass 0, at document #77/77
2019-12-16 16:12:40,314 : INFO : optimized alpha [0.260733, 0.26313332, 0.2664994, 0.2662589, 0.26166806, 0.2668909, 0.25493205]
2019-12-16 16:12:40,324 : INFO : topic #6 (0.255): 0.005*"energy" + 0.004*"changes" + 0.004*"level" + 0.003*"planning" + 0.003*"future" + 0.003*"emissions" + 0.003*"development" + 0.003*"global" + 0.003*"sea" + 0.003*"management"
2019-12-16 16:12:40,326 : INFO : topic #0 (0.261): 0.005*"land" + 0.004*"planning" + 0.004*"coastal" + 0.004*"development" + 0.003*"plan" + 0.003*"transportation" + 0.003*"report" + 0.003*"risks" + 0.003*"community" + 0.003*"management"
2019-12-16 16:12:40,328 : INFO : topic #3 (0.266): 0.005*"energy" + 0.004*"land" + 0.004*"sea" + 0.004*"research" + 0.004*"report" + 0.003*"government" + 0.003*"changes" + 0.003*"planning" 

Number of topics: 7 Number of Terms: 15
[([(0.005147052, 'energy'),
   (0.0041588917, 'land'),
   (0.0041115237, 'sea'),
   (0.0037909248, 'research'),
   (0.0037159617, 'report'),
   (0.003382163, 'government'),
   (0.0033652866, 'changes'),
   (0.0033220618, 'planning'),
   (0.0031245816, 'development'),
   (0.003043856, 'level'),
   (0.0029394927, 'emissions'),
   (0.002822554, 'risks'),
   (0.0027981128, 'information'),
   (0.0026794693, 'future'),
   (0.0026561534, 'environment')],
  -0.10083219751896408),
 ([(0.005167844, 'energy'),
   (0.004560959, 'report'),
   (0.0042559756, 'coastal'),
   (0.0039678537, 'changes'),
   (0.003806141, 'emissions'),
   (0.003642561, 'land'),
   (0.0034930862, 'sea'),
   (0.0034530798, 'development'),
   (0.0032714321, 'research'),
   (0.0032224855, 'management'),
   (0.0031424095, 'sector'),
   (0.0031109934, 'level'),
   (0.0031105564, 'global'),
   (0.003106013, 'future'),
   (0.0029844323, 'planning')],
  -0.10653062504282988),
 ([(0.005509915

2019-12-16 16:12:41,342 : INFO : -8.139 per-word bound, 281.9 perplexity estimate based on a held-out corpus of 77 documents with 959067 words
2019-12-16 16:12:41,343 : INFO : using autotuned alpha, starting with [0.14285715, 0.14285715, 0.14285715, 0.14285715, 0.14285715, 0.14285715, 0.14285715]
2019-12-16 16:12:41,348 : INFO : using serial LDA version on this node
2019-12-16 16:12:41,361 : INFO : running online (single-pass) LDA training, 7 topics, 1 passes over the supplied corpus of 77 documents, updating model once every 77 documents, evaluating perplexity every 77 documents, iterating 50x with a convergence threshold of 0.001000


Perplexity: -8.138974248742516


2019-12-16 16:12:42,249 : INFO : -10.012 per-word bound, 1032.6 perplexity estimate based on a held-out corpus of 77 documents with 959067 words
2019-12-16 16:12:42,249 : INFO : PROGRESS: pass 0, at document #77/77
2019-12-16 16:12:42,596 : INFO : optimized alpha [0.260733, 0.26313332, 0.2664994, 0.2662589, 0.26166806, 0.2668909, 0.25493205]
2019-12-16 16:12:42,606 : INFO : topic #6 (0.255): 0.005*"energy" + 0.004*"changes" + 0.004*"level" + 0.003*"planning" + 0.003*"future" + 0.003*"emissions" + 0.003*"development" + 0.003*"global" + 0.003*"sea" + 0.003*"management"
2019-12-16 16:12:42,607 : INFO : topic #0 (0.261): 0.005*"land" + 0.004*"planning" + 0.004*"coastal" + 0.004*"development" + 0.003*"plan" + 0.003*"transportation" + 0.003*"report" + 0.003*"risks" + 0.003*"community" + 0.003*"management"
2019-12-16 16:12:42,609 : INFO : topic #3 (0.266): 0.005*"energy" + 0.004*"land" + 0.004*"sea" + 0.004*"research" + 0.004*"report" + 0.003*"government" + 0.003*"changes" + 0.003*"planning" 

Number of topics: 7 Number of Terms: 20
[([(0.005758422, 'energy'),
   (0.0044910274, 'emissions'),
   (0.003507216, 'development'),
   (0.0035044213, 'planning'),
   (0.0034760726, 'changes'),
   (0.0033690864, 'government'),
   (0.0033531832, 'future'),
   (0.003321301, 'report'),
   (0.0030374245, 'management'),
   (0.0029445114, 'sea'),
   (0.0029365271, 'transportation'),
   (0.0028902944, 'infrastructure'),
   (0.002853777, 'coastal'),
   (0.002839456, 'flood'),
   (0.002786566, 'land'),
   (0.002755732, 'risks'),
   (0.0027093778, 'plan'),
   (0.002688941, 'communities'),
   (0.0026066322, 'level'),
   (0.0025596593, 'resources')],
  -0.13742077048467657),
 ([(0.0047889003, 'land'),
   (0.0041604405, 'planning'),
   (0.0041430774, 'coastal'),
   (0.0038985303, 'development'),
   (0.003478226, 'plan'),
   (0.0034672534, 'transportation'),
   (0.003445935, 'report'),
   (0.0033655085, 'risks'),
   (0.003262818, 'community'),
   (0.0032414037, 'management'),
   (0.0031953526, 'ener

2019-12-16 16:12:43,736 : INFO : -8.139 per-word bound, 281.9 perplexity estimate based on a held-out corpus of 77 documents with 959067 words
2019-12-16 16:12:43,737 : INFO : using autotuned alpha, starting with [0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]
2019-12-16 16:12:43,741 : INFO : using serial LDA version on this node
2019-12-16 16:12:43,760 : INFO : running online (single-pass) LDA training, 10 topics, 1 passes over the supplied corpus of 77 documents, updating model once every 77 documents, evaluating perplexity every 77 documents, iterating 50x with a convergence threshold of 0.001000


Perplexity: -8.138974248742516


2019-12-16 16:12:44,784 : INFO : -10.098 per-word bound, 1096.3 perplexity estimate based on a held-out corpus of 77 documents with 959067 words
2019-12-16 16:12:44,785 : INFO : PROGRESS: pass 0, at document #77/77
2019-12-16 16:12:45,079 : INFO : optimized alpha [0.18233542, 0.185698, 0.18769206, 0.18845245, 0.1848827, 0.18705593, 0.18017241, 0.18571228, 0.18557808, 0.18660522]
2019-12-16 16:12:45,095 : INFO : topic #6 (0.180): 0.005*"energy" + 0.004*"changes" + 0.004*"level" + 0.003*"development" + 0.003*"future" + 0.003*"planning" + 0.003*"emissions" + 0.003*"global" + 0.003*"sea" + 0.003*"land"
2019-12-16 16:12:45,097 : INFO : topic #0 (0.182): 0.004*"land" + 0.004*"development" + 0.004*"planning" + 0.004*"report" + 0.003*"energy" + 0.003*"transportation" + 0.003*"coastal" + 0.003*"plan" + 0.003*"risks" + 0.003*"emissions"
2019-12-16 16:12:45,098 : INFO : topic #5 (0.187): 0.006*"energy" + 0.005*"emissions" + 0.004*"development" + 0.004*"report" + 0.003*"changes" + 0.003*"governmen

Number of topics: 10 Number of Terms: 5
[([(0.0050891335, 'energy'),
   (0.0037768981, 'changes'),
   (0.0037038194, 'level'),
   (0.0034565334, 'development'),
   (0.0032889016, 'future')],
  -0.05095500132435933),
 ([(0.0061802324, 'energy'),
   (0.0050408123, 'emissions'),
   (0.0037383963, 'development'),
   (0.0035114456, 'report'),
   (0.003447032, 'changes')],
  -0.05775143224864513),
 ([(0.0052146586, 'management'),
   (0.0044046254, 'energy'),
   (0.00400896, 'planning'),
   (0.0039019084, 'level'),
   (0.0037185587, 'changes')],
  -0.06190723849910936),
 ([(0.005374173, 'land'),
   (0.00514556, 'coastal'),
   (0.005022374, 'energy'),
   (0.004723662, 'planning'),
   (0.0047023804, 'level')],
  -0.08411194466184696),
 ([(0.0052862675, 'energy'),
   (0.0043129525, 'land'),
   (0.0038770984, 'sea'),
   (0.0037922806, 'report'),
   (0.0036311473, 'planning')],
  -0.10760493166800948),
 ([(0.0042203846, 'sea'),
   (0.0041288645, 'energy'),
   (0.0041002594, 'development'),
   (0.0

2019-12-16 16:12:46,282 : INFO : -8.194 per-word bound, 292.9 perplexity estimate based on a held-out corpus of 77 documents with 959067 words
2019-12-16 16:12:46,283 : INFO : using autotuned alpha, starting with [0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]
2019-12-16 16:12:46,289 : INFO : using serial LDA version on this node
2019-12-16 16:12:46,310 : INFO : running online (single-pass) LDA training, 10 topics, 1 passes over the supplied corpus of 77 documents, updating model once every 77 documents, evaluating perplexity every 77 documents, iterating 50x with a convergence threshold of 0.001000


Perplexity: -8.194296510330643


2019-12-16 16:12:49,412 : INFO : -10.098 per-word bound, 1096.3 perplexity estimate based on a held-out corpus of 77 documents with 959067 words
2019-12-16 16:12:49,413 : INFO : PROGRESS: pass 0, at document #77/77
2019-12-16 16:12:49,880 : INFO : optimized alpha [0.18233542, 0.185698, 0.18769206, 0.18845245, 0.1848827, 0.18705593, 0.18017241, 0.18571228, 0.18557808, 0.18660522]
2019-12-16 16:12:49,896 : INFO : topic #6 (0.180): 0.005*"energy" + 0.004*"changes" + 0.004*"level" + 0.003*"development" + 0.003*"future" + 0.003*"planning" + 0.003*"emissions" + 0.003*"global" + 0.003*"sea" + 0.003*"land"
2019-12-16 16:12:49,897 : INFO : topic #0 (0.182): 0.004*"land" + 0.004*"development" + 0.004*"planning" + 0.004*"report" + 0.003*"energy" + 0.003*"transportation" + 0.003*"coastal" + 0.003*"plan" + 0.003*"risks" + 0.003*"emissions"
2019-12-16 16:12:49,898 : INFO : topic #5 (0.187): 0.006*"energy" + 0.005*"emissions" + 0.004*"development" + 0.004*"report" + 0.003*"changes" + 0.003*"governmen

Number of topics: 10 Number of Terms: 10
[([(0.0042568767, 'sea'),
   (0.0040918984, 'report'),
   (0.003735604, 'emissions'),
   (0.0036406913, 'energy'),
   (0.0035631943, 'risks'),
   (0.0034528512, 'development'),
   (0.003451777, 'infrastructure'),
   (0.003442626, 'management'),
   (0.0032847226, 'land'),
   (0.0030990536, 'changes')],
  -0.08179195225201402),
 ([(0.0052862675, 'energy'),
   (0.0043129525, 'land'),
   (0.0038770984, 'sea'),
   (0.0037922806, 'report'),
   (0.0036311473, 'planning'),
   (0.0034816898, 'research'),
   (0.0034496654, 'government'),
   (0.0034382474, 'development'),
   (0.003397386, 'changes'),
   (0.0030684005, 'level')],
  -0.08656116526701671),
 ([(0.005374173, 'land'),
   (0.00514556, 'coastal'),
   (0.005022374, 'energy'),
   (0.004723662, 'planning'),
   (0.0047023804, 'level'),
   (0.0046558864, 'sea'),
   (0.0044960403, 'management'),
   (0.0037167498, 'changes'),
   (0.00320481, 'future'),
   (0.0030464665, 'community')],
  -0.09396206537842

2019-12-16 16:12:50,931 : INFO : -8.194 per-word bound, 292.9 perplexity estimate based on a held-out corpus of 77 documents with 959067 words
2019-12-16 16:12:50,932 : INFO : using autotuned alpha, starting with [0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]
2019-12-16 16:12:50,937 : INFO : using serial LDA version on this node
2019-12-16 16:12:50,954 : INFO : running online (single-pass) LDA training, 10 topics, 1 passes over the supplied corpus of 77 documents, updating model once every 77 documents, evaluating perplexity every 77 documents, iterating 50x with a convergence threshold of 0.001000


Perplexity: -8.194296510330643


2019-12-16 16:12:51,892 : INFO : -10.098 per-word bound, 1096.3 perplexity estimate based on a held-out corpus of 77 documents with 959067 words
2019-12-16 16:12:51,893 : INFO : PROGRESS: pass 0, at document #77/77
2019-12-16 16:12:52,188 : INFO : optimized alpha [0.18233542, 0.185698, 0.18769206, 0.18845245, 0.1848827, 0.18705593, 0.18017241, 0.18571228, 0.18557808, 0.18660522]
2019-12-16 16:12:52,202 : INFO : topic #6 (0.180): 0.005*"energy" + 0.004*"changes" + 0.004*"level" + 0.003*"development" + 0.003*"future" + 0.003*"planning" + 0.003*"emissions" + 0.003*"global" + 0.003*"sea" + 0.003*"land"
2019-12-16 16:12:52,203 : INFO : topic #0 (0.182): 0.004*"land" + 0.004*"development" + 0.004*"planning" + 0.004*"report" + 0.003*"energy" + 0.003*"transportation" + 0.003*"coastal" + 0.003*"plan" + 0.003*"risks" + 0.003*"emissions"
2019-12-16 16:12:52,205 : INFO : topic #5 (0.187): 0.006*"energy" + 0.005*"emissions" + 0.004*"development" + 0.004*"report" + 0.003*"changes" + 0.003*"governmen

Number of topics: 10 Number of Terms: 15
[([(0.0042568767, 'sea'),
   (0.0040918984, 'report'),
   (0.003735604, 'emissions'),
   (0.0036406913, 'energy'),
   (0.0035631943, 'risks'),
   (0.0034528512, 'development'),
   (0.003451777, 'infrastructure'),
   (0.003442626, 'management'),
   (0.0032847226, 'land'),
   (0.0030990536, 'changes'),
   (0.0030316547, 'future'),
   (0.0030281488, 'planning'),
   (0.0029966012, 'health'),
   (0.0029590551, 'information'),
   (0.0028457686, 'research')],
  -0.08745236583346197),
 ([(0.005374173, 'land'),
   (0.00514556, 'coastal'),
   (0.005022374, 'energy'),
   (0.004723662, 'planning'),
   (0.0047023804, 'level'),
   (0.0046558864, 'sea'),
   (0.0044960403, 'management'),
   (0.0037167498, 'changes'),
   (0.00320481, 'future'),
   (0.0030464665, 'community'),
   (0.002861485, 'government'),
   (0.0028312178, 'development'),
   (0.002807629, 'environment'),
   (0.0027247884, 'report'),
   (0.002702929, 'emissions')],
  -0.10858433315515417),
 ([(

2019-12-16 16:12:53,388 : INFO : -8.194 per-word bound, 292.9 perplexity estimate based on a held-out corpus of 77 documents with 959067 words
2019-12-16 16:12:53,389 : INFO : using autotuned alpha, starting with [0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]
2019-12-16 16:12:53,392 : INFO : using serial LDA version on this node
2019-12-16 16:12:53,412 : INFO : running online (single-pass) LDA training, 10 topics, 1 passes over the supplied corpus of 77 documents, updating model once every 77 documents, evaluating perplexity every 77 documents, iterating 50x with a convergence threshold of 0.001000


Perplexity: -8.194296510330643


2019-12-16 16:12:54,317 : INFO : -10.098 per-word bound, 1096.3 perplexity estimate based on a held-out corpus of 77 documents with 959067 words
2019-12-16 16:12:54,318 : INFO : PROGRESS: pass 0, at document #77/77
2019-12-16 16:12:54,609 : INFO : optimized alpha [0.18233542, 0.185698, 0.18769206, 0.18845245, 0.1848827, 0.18705593, 0.18017241, 0.18571228, 0.18557808, 0.18660522]
2019-12-16 16:12:54,622 : INFO : topic #6 (0.180): 0.005*"energy" + 0.004*"changes" + 0.004*"level" + 0.003*"development" + 0.003*"future" + 0.003*"planning" + 0.003*"emissions" + 0.003*"global" + 0.003*"sea" + 0.003*"land"
2019-12-16 16:12:54,623 : INFO : topic #0 (0.182): 0.004*"land" + 0.004*"development" + 0.004*"planning" + 0.004*"report" + 0.003*"energy" + 0.003*"transportation" + 0.003*"coastal" + 0.003*"plan" + 0.003*"risks" + 0.003*"emissions"
2019-12-16 16:12:54,625 : INFO : topic #5 (0.187): 0.006*"energy" + 0.005*"emissions" + 0.004*"development" + 0.004*"report" + 0.003*"changes" + 0.003*"governmen

Number of topics: 10 Number of Terms: 20
[([(0.0043884604, 'research'),
   (0.004029547, 'energy'),
   (0.0036049061, 'transportation'),
   (0.003591945, 'forest'),
   (0.003479897, 'changes'),
   (0.0034423464, 'government'),
   (0.0033330107, 'management'),
   (0.0032313599, 'coastal'),
   (0.0031589514, 'planning'),
   (0.0030093123, 'emissions'),
   (0.0029744778, 'report'),
   (0.002923624, 'sea'),
   (0.0028200778, 'infrastructure'),
   (0.0027928592, 'land'),
   (0.0027553109, 'community'),
   (0.002754537, 'level'),
   (0.002693396, 'future'),
   (0.002683115, 'assessment'),
   (0.0026585422, 'ice'),
   (0.0026392005, 'resources')],
  -0.12204075629009935),
 ([(0.0042568767, 'sea'),
   (0.0040918984, 'report'),
   (0.003735604, 'emissions'),
   (0.0036406913, 'energy'),
   (0.0035631943, 'risks'),
   (0.0034528512, 'development'),
   (0.003451777, 'infrastructure'),
   (0.003442626, 'management'),
   (0.0032847226, 'land'),
   (0.0030990536, 'changes'),
   (0.0030316547, 'futur

2019-12-16 16:12:55,740 : INFO : -8.194 per-word bound, 292.9 perplexity estimate based on a held-out corpus of 77 documents with 959067 words


Perplexity: -8.194296510330643
CPU times: user 2min 6s, sys: 2.43 s, total: 2min 8s
Wall time: 1min 11s


In [17]:
# create the model
model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    alpha='auto',
    eta='auto',
    num_topics=3,
    eval_every=1,
    random_state=42
)

2019-12-16 16:12:55,750 : INFO : using autotuned alpha, starting with [0.33333334, 0.33333334, 0.33333334]
2019-12-16 16:12:55,759 : INFO : using serial LDA version on this node
2019-12-16 16:12:55,767 : INFO : running online (single-pass) LDA training, 3 topics, 1 passes over the supplied corpus of 77 documents, updating model once every 77 documents, evaluating perplexity every 77 documents, iterating 50x with a convergence threshold of 0.001000
2019-12-16 16:12:56,691 : INFO : -9.871 per-word bound, 936.2 perplexity estimate based on a held-out corpus of 77 documents with 959067 words
2019-12-16 16:12:56,691 : INFO : PROGRESS: pass 0, at document #77/77
2019-12-16 16:12:56,971 : INFO : optimized alpha [0.5889133, 0.6060606, 0.61193645]
2019-12-16 16:12:56,976 : INFO : topic #0 (0.589): 0.004*"land" + 0.004*"planning" + 0.004*"development" + 0.004*"energy" + 0.003*"report" + 0.003*"coastal" + 0.003*"risks" + 0.003*"emissions" + 0.003*"plan" + 0.003*"sea"
2019-12-16 16:12:56,979 : INF

In [18]:
# find the top 10 words for each topic
top_topics = model.top_topics(corpus, topn=10)

In [19]:
# print the top 3 topics, and the top 10 words within those topics
# also includes the topics coherence scores
pprint(top_topics)

[([(0.004631676, 'energy'),
   (0.0045930427, 'management'),
   (0.0039982614, 'planning'),
   (0.0039370125, 'level'),
   (0.003597411, 'changes'),
   (0.0035456615, 'sea'),
   (0.0033018345, 'infrastructure'),
   (0.0031888986, 'future'),
   (0.003135145, 'land'),
   (0.0029807717, 'government')],
  -0.11312160203614283),
 ([(0.005660018, 'energy'),
   (0.0044247587, 'report'),
   (0.004079819, 'emissions'),
   (0.0040084627, 'changes'),
   (0.0036828027, 'sea'),
   (0.0036512404, 'coastal'),
   (0.003472614, 'development'),
   (0.003346625, 'land'),
   (0.003226643, 'future'),
   (0.0030616391, 'research')],
  -0.12724002054886982),
 ([(0.0041694823, 'land'),
   (0.0039146347, 'planning'),
   (0.003820342, 'development'),
   (0.0035005992, 'energy'),
   (0.0034255798, 'report'),
   (0.003383334, 'coastal'),
   (0.0032530655, 'risks'),
   (0.0031254895, 'emissions'),
   (0.0030975, 'plan'),
   (0.0030801585, 'sea')],
  -0.1981268823302916)]


In [20]:
# prepare inputs for the visualization
tfidf_model = TfidfModel(corpus)
corpus_tfidf = tfidf_model[corpus]

2019-12-16 16:12:57,088 : INFO : collecting document frequencies
2019-12-16 16:12:57,093 : INFO : PROGRESS: processing document #0
2019-12-16 16:12:57,165 : INFO : calculating IDF weights for 77 documents and 12496 features (158296 matrix non-zeros)


In [21]:
# initialize the visualization
viz = pyLDAvis.gensim.prepare(model, corpus_tfidf, dictionary)
# display the visualization
pyLDAvis.display(viz)

2019-12-16 16:12:59,392 : INFO : NumExpr defaulting to 4 threads.
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [22]:
# save the visualization.
pyLDAvis.save_html(viz, 'gensim_climate_change_lda_model_tika_alldocs.html')

In [23]:
# read a new file not trained to the model
new_doc = PlaintextCorpusReader(root='/Users/rahimjiwa/Documents/DataScience/UofT3666_AppliedNLP/Final_Testings',fileids='random_pdf.txt')
# get a list of words
new_doc = list(new_doc.words())
# create a bow representations of the new documents
new_doc_bow = dictionary.doc2bow(new_doc)
# get the document distributions
print(model.get_document_topics(new_doc_bow))

[(0, 0.058845174), (1, 0.66916853), (2, 0.2719863)]


In [24]:
# get the coherence score
coherence_model = CoherenceModel(model=model, corpus=corpus, coherence='u_mass')
print("Coherence: " + str(coherence_model.get_coherence()))


Coherence: -0.13066012318477496


In [25]:
# get the perplexity score
print("Perplexity: " + str(model.log_perplexity(corpus)))

2019-12-16 16:19:06,076 : INFO : -8.077 per-word bound, 270.1 perplexity estimate based on a held-out corpus of 77 documents with 959067 words


Perplexity: -8.077112493561172
