# LDA using Gensim

This notebook, goes through performing LDA on the text documents of the Climate Change PDFs using Gensim.

In this notebook, we will:
- identify the files and come up with a file list
- load the stopwords
- functionally get the documents
- normalize the documents
- create the dictionary and corpus
- create id2word
- initialize and train the model
- print out top topics
- determine the coherence score and perplexity
- create and save a visualization of topics using pyLDAvis
- test how an external document fits into the model

In [1]:
#imports
import codecs
import pandas as pd
import numpy as np
import os
import json
import time
import csv
import sklearn
import nltk
nltk.download('all')
import unicodedata
from nltk.corpus import PlaintextCorpusReader
from nltk.tokenize import WordPunctTokenizer
from nltk import pos_tag, sent_tokenize, wordpunct_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import CountVectorizer
from gensim.corpora import Dictionary
from sklearn.base import BaseEstimator, TransformerMixin
from pprint import pprint
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim.models.callbacks import PerplexityMetric
from gensim.models.callbacks import CoherenceMetric
from gensim.models.coherencemodel import CoherenceModel
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
import pyLDAvis.gensim
import pyLDAvis
from gensim.models import TfidfModel

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     /Users/rahimjiwa/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /Users/rahimjiwa/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /Users/rahimjiwa/nltk_data...
[nltk_data]    |   Package biocreative_ppi is already up-to-date!
[nltk_data]    | Downloading package brown to
[nltk_data]    |     /Users/rahimjiwa/nltk_data...
[nltk_data]    |   Package brown is already up-to-date!
[nltk_data]    | Downloading package brown_tei to
[nltk_data]    |     /Users/rahimjiwa/nltk_data...
[nltk_data]    |   Package brown_tei is already up-to-date!
[nltk_data]    | Downloading package cess_cat to
[nltk_data]    |     /Users/rahimjiwa/nltk_data...
[nltk_data]    |   Package cess_cat is already up-

[nltk_data]    |   Package timit is already up-to-date!
[nltk_data]    | Downloading package toolbox to
[nltk_data]    |     /Users/rahimjiwa/nltk_data...
[nltk_data]    |   Package toolbox is already up-to-date!
[nltk_data]    | Downloading package treebank to
[nltk_data]    |     /Users/rahimjiwa/nltk_data...
[nltk_data]    |   Package treebank is already up-to-date!
[nltk_data]    | Downloading package twitter_samples to
[nltk_data]    |     /Users/rahimjiwa/nltk_data...
[nltk_data]    |   Package twitter_samples is already up-to-date!
[nltk_data]    | Downloading package udhr to
[nltk_data]    |     /Users/rahimjiwa/nltk_data...
[nltk_data]    |   Package udhr is already up-to-date!
[nltk_data]    | Downloading package udhr2 to
[nltk_data]    |     /Users/rahimjiwa/nltk_data...
[nltk_data]    |   Package udhr2 is already up-to-date!
[nltk_data]    | Downloading package unicode_samples to
[nltk_data]    |     /Users/rahimjiwa/nltk_data...
[nltk_data]    |   Package unicode_samples i

In [2]:
# let's start by running over the folder where our files are, we can use the plaintextcorpusreader to obtain the fileids
reader = PlaintextCorpusReader('tikatext','.*txt')
# trying with added new docs

In [3]:
# create a list of fileids
file_list = reader.fileids()
file_list

['Ahead-of-the-Storm-1.txt',
 'ClimatRisk-E-ACCESSIBLE.txt',
 'En56-226-2008-eng.txt',
 'FBC_WaterGuide_FINAL.txt',
 'FloodRecovery-e.txt',
 'Guide-Building-Sustainable-and-Resilient-Communities-with-Asset-Management-EN.txt',
 'Guidebook-2016.txt',
 'HP5-122-2017-eng.txt',
 'Protect_Your_Home_From_Flooding_Brochure.txt',
 'Report on Effects of a Changing Climate to the US Department of Defense.txt',
 'Spring_Flood_Fact_Sheet.txt',
 'Synthesis_Eng.txt',
 'Urban_Forests_Guide.txt',
 "VOLUME 1 Canada's Changing Climate 2018.txt",
 'VOLUME 2 Preparing for Change 2018.txt',
 'VOLUME 3 Creating Resilient Communities 2018.txt',
 'VOLUME 4 Facing Rising Waters 2018.txt',
 'Vancouver-Climate-Change-Adaptation-Strategy-2012-11-07.txt',
 'Vulnerability Guidebook_June2_EN.txt',
 'WCEL_climate_change_FINAL.txt',
 'Windsor Climate Change Adaptation Plan.txt',
 'builders_guide_2010_final.txt',
 'ccp_impactonpeople.txt',
 'climate_data_discussion_primer.txt',
 'coastal_flooded_land_guidelines.txt',
 '

In [4]:
# load the stopwords text file
with open('stopwords.txt','r') as f:
    # create a list for stopwords
    stopwords = []
    # itearate through the text files, replacing any new line unicode characters
    for i in list(f):
        temp = str(i).replace('\n','')
        # append the stopwords to the list
        stopwords.append(temp)

In [5]:
print(stopwords)

['Stopwords and Punctuation:', '.', ',', 'and', 'the', 'of', 'to', '-', '/', 'in', '(', ')', 'a', 'for', ':', 'is', 'on', 'or', 'that', 'are', 'be', ').', '1', 'as', 'with', '://', '.,', 'from', '2', 'by', 'www', 'http', 'ca', '[', ']', '...', "'", '\\uf0b7', ';', 'at', ').', 'pdf', 'climate', 'change', 'canada', 'en', '):', 'so', 'html', 'do', '.;', 'com', 'canadian', 'impacts', 'adaptation']


In [6]:
# change directory to directory with files
os.chdir('/Users/rahimjiwa/Documents/DataScience/UofT3666_AppliedNLP/Final_Testings/tikatext/')
# trying with added new docs

In [7]:
# create a list for documents
docs = []
# iterate through file list
for i in file_list:
    # load the documents
    text = PlaintextCorpusReader(root=os.getcwd(),fileids=i)
    # create a list of the words in the documents
    i_doc = list(text.words())
    # create a new list for document
    new_i_doc = []
    # for each word in the documents
    for j in i_doc:
        #check if the length of the word is greater than 2 and that it is not a stopword
        if len(j) > 2 and j.lower() not in stopwords:
            # append to word to the new list of words
            new_i_doc.append(j)
    # append list of words to documents
    docs.append(new_i_doc)
    # print the file name
    print(i)

Ahead-of-the-Storm-1.txt
ClimatRisk-E-ACCESSIBLE.txt
En56-226-2008-eng.txt
FBC_WaterGuide_FINAL.txt
FloodRecovery-e.txt
Guide-Building-Sustainable-and-Resilient-Communities-with-Asset-Management-EN.txt
Guidebook-2016.txt
HP5-122-2017-eng.txt
Protect_Your_Home_From_Flooding_Brochure.txt
Report on Effects of a Changing Climate to the US Department of Defense.txt
Spring_Flood_Fact_Sheet.txt
Synthesis_Eng.txt
Urban_Forests_Guide.txt
VOLUME 1 Canada's Changing Climate 2018.txt
VOLUME 2 Preparing for Change 2018.txt
VOLUME 3 Creating Resilient Communities 2018.txt
VOLUME 4 Facing Rising Waters 2018.txt
Vancouver-Climate-Change-Adaptation-Strategy-2012-11-07.txt
Vulnerability Guidebook_June2_EN.txt
WCEL_climate_change_FINAL.txt
Windsor Climate Change Adaptation Plan.txt
builders_guide_2010_final.txt
ccp_impactonpeople.txt
climate_data_discussion_primer.txt
coastal_flooded_land_guidelines.txt
env-yukon-state-play-analysis-climate-change-impacts-adaptation.txt
final_climate_change_and_health_ba

In [8]:
# take a look at the output of the documents
docs

[['Natalia',
  'Moudrak',
  'Blair',
  'Feltmate',
  'Intact',
  'Centre',
  'AHEAD',
  'STORM',
  'SUPPORTED',
  'Developing',
  'Flood',
  'Resilience',
  'Guidance',
  'Commercial',
  'Real',
  'Estate',
  'October',
  '2019',
  'Ahead',
  'Storm',
  'Developing',
  'Flood',
  'Resilience',
  'Guidance',
  'Commercial',
  'Real',
  'Estate2',
  'AHEAD',
  'STORM',
  'Ahead',
  'Storm',
  'Developing',
  'Flood',
  'Resilience',
  'Guidance',
  'Commercial',
  'Real',
  'Estate',
  'About',
  'Intact',
  'Centre',
  'Intact',
  'Centre',
  'Intact',
  'Centre',
  'applied',
  'research',
  'centre',
  'University',
  'Waterloo',
  'Intact',
  'Centre',
  'was',
  'founded',
  '2015',
  'gift',
  'Intact',
  'Financial',
  'Corporation',
  'largest',
  'property',
  'casualty',
  'insurer',
  'Intact',
  'Centre',
  'helps',
  'homeowners',
  'communities',
  'businesses',
  'reduce',
  'risks',
  'associated',
  'extreme',
  'weather',
  'events',
  'additional',
  'information',
  '

In [9]:
# check how many documents are being used
len(docs)

35

In [10]:
# define a normalizer class
class TextNormalizer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
    
    # define function for punctuation marks
    def is_punct(self, token):
        return all(unicodedata.category(char).startswith('P') for char in token)
    
    def normalize(self, document):
        return [
            # here you could return a lemmatized token instead
            token.lower()
            for sentence in document
            for token in sentence
            # check if a punctuation mark
            if not self.is_punct(token)
        ]
    
    # create a lemmatizer function
    def lemmatize(self, token):
        return self.lemmatizer.lemmatize(token)
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, documents):
        return [''.join(self.normalize(doc)) for doc in documents]

In [11]:
# initialize normalizer class
norm = TextNormalizer()

In [12]:
# create a list of the normalized documents
docs_normalized = []
# iterate through each document and apply the normalizer
for i in docs:
    docs_normalized.append(norm.fit_transform(i))

# take a look at the normalized documents
print(docs_normalized)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [13]:
# create a dictionary of the documents.
dictionary = Dictionary(docs_normalized)

# filter out words present in less than 3 documents or in more than 90% of the documents.
dictionary.filter_extremes(no_below=3, no_above=0.9)

2019-12-15 02:20:04,844 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2019-12-15 02:20:05,325 : INFO : built Dictionary(24508 unique tokens: ['', '000', '02210', '0e3', '100']...) from 35 documents (total 522623 corpus positions)
2019-12-15 02:20:05,487 : INFO : discarding 16505 tokens: [('02210', 1), ('0e3', 1), ('1001', 1), ('10017', 1), ('1004124542', 1), ('1004159387', 1), ('11205', 1), ('1983', 1), ('202015', 2), ('2021', 2)]...
2019-12-15 02:20:05,489 : INFO : keeping 8003 tokens which were in no less than 3 and no more than 31 (=90.0%) documents
2019-12-15 02:20:05,531 : INFO : resulting dictionary: Dictionary(8003 unique tokens: ['', '000', '100', '102', '113']...)


In [14]:
# create a bow representation of the documents.
corpus = [dictionary.doc2bow(doc) for doc in docs_normalized]

In [15]:
# check the number of tokens and the number of documents
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

Number of unique tokens: 8003
Number of documents: 35


In [16]:
# index word dictionary
temp = dictionary[0]  
id2word = dictionary.id2token
# create the model
model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    alpha='auto',
    eta='auto',
    num_topics=3,
    eval_every=1,
    random_state=42
)

2019-12-15 02:20:05,810 : INFO : using autotuned alpha, starting with [0.33333334, 0.33333334, 0.33333334]
2019-12-15 02:20:05,815 : INFO : using serial LDA version on this node
2019-12-15 02:20:05,827 : INFO : running online (single-pass) LDA training, 3 topics, 1 passes over the supplied corpus of 35 documents, updating model once every 35 documents, evaluating perplexity every 35 documents, iterating 50x with a convergence threshold of 0.001000
2019-12-15 02:20:06,317 : INFO : -9.432 per-word bound, 690.8 perplexity estimate based on a held-out corpus of 35 documents with 421662 words
2019-12-15 02:20:06,318 : INFO : PROGRESS: pass 0, at document #35/35
2019-12-15 02:20:06,495 : INFO : optimized alpha [0.6270602, 0.5856732, 0.60293716]
2019-12-15 02:20:06,499 : INFO : topic #0 (0.627): 0.005*"flood" + 0.005*"transportation" + 0.005*"sea" + 0.005*"level" + 0.005*"management" + 0.005*"infrastructure" + 0.004*"coastal" + 0.004*"future" + 0.004*"land" + 0.004*"development"
2019-12-15 02

In [17]:
# find the top 10 words for each topic
top_topics = model.top_topics(corpus, topn=10)

In [18]:
# print the top 3 topics, and the top 10 words within those topics
# also includes the topics coherence scores
pprint(top_topics)

[([(0.0053720153, 'flood'),
   (0.005234428, 'transportation'),
   (0.0049099647, 'sea'),
   (0.004871954, 'level'),
   (0.004605627, 'management'),
   (0.0045272624, 'infrastructure'),
   (0.0042316196, 'coastal'),
   (0.004010092, 'future'),
   (0.0039098933, 'land'),
   (0.0037432138, 'development')],
  -0.09846210343243858),
 ([(0.0053904555, 'coastal'),
   (0.0049055982, 'infrastructure'),
   (0.0042941193, 'management'),
   (0.003977475, 'risks'),
   (0.003809963, 'level'),
   (0.0037406697, 'transportation'),
   (0.0036307513, 'government'),
   (0.0036014856, 'changes'),
   (0.0033598999, 'sea'),
   (0.0031809667, 'development')],
  -0.11557995310937298),
 ([(0.005551861, 'management'),
   (0.005320358, 'sea'),
   (0.004429398, 'coastal'),
   (0.004234185, 'level'),
   (0.0042142803, 'transportation'),
   (0.0041705975, 'land'),
   (0.0039760373, 'development'),
   (0.0037914393, 'future'),
   (0.00367159, 'changes'),
   (0.0036065676, '')],
  -0.12853881189469024)]


In [19]:
# prepare inputs for the visualization
tfidf_model = TfidfModel(corpus)
corpus_tfidf = tfidf_model[corpus]

2019-12-15 02:20:06,609 : INFO : collecting document frequencies
2019-12-15 02:20:06,613 : INFO : PROGRESS: processing document #0
2019-12-15 02:20:06,655 : INFO : calculating IDF weights for 35 documents and 8003 features (70965 matrix non-zeros)


In [20]:
# initialize the visualization
viz = pyLDAvis.gensim.prepare(model, corpus_tfidf, dictionary)
# display the visualization
pyLDAvis.display(viz)

2019-12-15 02:20:07,924 : INFO : NumExpr defaulting to 4 threads.
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [21]:
# save the visualization.
pyLDAvis.save_html(viz, 'gensim_climate_change_lda_model_tikatext.html')

In [22]:
# read a new file not trained to the model
new_doc = PlaintextCorpusReader(root='/Users/rahimjiwa/Documents/DataScience/UofT3666_AppliedNLP/Final_Testings',fileids='random_pdf.txt')
# get a list of words
new_doc = list(new_doc.words())
# create a bow representations of the new documents
new_doc_bow = dictionary.doc2bow(new_doc)
# get the document distributions
print(model.get_document_topics(new_doc_bow))

[(0, 0.25012463), (1, 0.52804875), (2, 0.2218266)]


In [23]:
# get the coherence score
coherence_model = CoherenceModel(model=model, corpus=corpus, coherence='u_mass')
print("Coherence: " + str(coherence_model.get_coherence()))


Coherence: -0.1642578877380345


In [24]:
# get the perplexity score
print("Perplexity: " + str(model.log_perplexity(corpus)))

2019-12-15 02:24:44,603 : INFO : -7.919 per-word bound, 242.0 perplexity estimate based on a held-out corpus of 35 documents with 421662 words


Perplexity: -7.918767266679678
