# LDA using Gensim

This notebook, goes through performing LDA on the text documents of the Climate Change PDFs using Gensim.

In this notebook, we will:
- identify the files and come up with a file list
- load the stopwords
- functionally get the documents
- normalize the documents
- create the dictionary and corpus
- create id2word
- initialize and train the model
- print out top topics
- determine the coherence score and perplexity
- create and save a visualization of topics using pyLDAvis
- test how an external document fits into the model

In [1]:
#imports
import codecs
import pandas as pd
import numpy as np
import os
import json
import time
import csv
import sklearn
import nltk
nltk.download('all')
import unicodedata
from nltk.corpus import PlaintextCorpusReader
from nltk.tokenize import WordPunctTokenizer
from nltk import pos_tag, sent_tokenize, wordpunct_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import CountVectorizer
from gensim.corpora import Dictionary
from sklearn.base import BaseEstimator, TransformerMixin
from pprint import pprint
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim.models.callbacks import PerplexityMetric
from gensim.models.callbacks import CoherenceMetric
from gensim.models.coherencemodel import CoherenceModel
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
import pyLDAvis.gensim
import pyLDAvis
from gensim.models import TfidfModel

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     /Users/rahimjiwa/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /Users/rahimjiwa/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /Users/rahimjiwa/nltk_data...
[nltk_data]    |   Package biocreative_ppi is already up-to-date!
[nltk_data]    | Downloading package brown to
[nltk_data]    |     /Users/rahimjiwa/nltk_data...
[nltk_data]    |   Package brown is already up-to-date!
[nltk_data]    | Downloading package brown_tei to
[nltk_data]    |     /Users/rahimjiwa/nltk_data...
[nltk_data]    |   Package brown_tei is already up-to-date!
[nltk_data]    | Downloading package cess_cat to
[nltk_data]    |     /Users/rahimjiwa/nltk_data...
[nltk_data]    |   Package cess_cat is already up-

[nltk_data]    |   Package swadesh is already up-to-date!
[nltk_data]    | Downloading package switchboard to
[nltk_data]    |     /Users/rahimjiwa/nltk_data...
[nltk_data]    |   Package switchboard is already up-to-date!
[nltk_data]    | Downloading package timit to
[nltk_data]    |     /Users/rahimjiwa/nltk_data...
[nltk_data]    |   Package timit is already up-to-date!
[nltk_data]    | Downloading package toolbox to
[nltk_data]    |     /Users/rahimjiwa/nltk_data...
[nltk_data]    |   Package toolbox is already up-to-date!
[nltk_data]    | Downloading package treebank to
[nltk_data]    |     /Users/rahimjiwa/nltk_data...
[nltk_data]    |   Package treebank is already up-to-date!
[nltk_data]    | Downloading package twitter_samples to
[nltk_data]    |     /Users/rahimjiwa/nltk_data...
[nltk_data]    |   Package twitter_samples is already up-to-date!
[nltk_data]    | Downloading package udhr to
[nltk_data]    |     /Users/rahimjiwa/nltk_data...
[nltk_data]    |   Package udhr is alre

In [2]:
# let's start by running over the folder where our files are, we can use the plaintextcorpusreader to obtain the fileids
reader = PlaintextCorpusReader('tikatextcombined','.*txt')
# trying with added new docs

In [3]:
# create a list of fileids
file_list = reader.fileids()
file_list

['20170125-en.txt',
 '2017BernardSoubryPolicyBrief.txt',
 'A_Canadian_Opportunity_-_Tackling_climate_change_by_switching_to_clean_power.txt',
 'Adapting_to_a_Changing_Climate_in_NS.txt',
 'Ahead-of-the-Storm-1.txt',
 'AtlanticRegionAdaptationScienceActivitiesReport.txt',
 'BC-Agriculture-Climate-Change-Action-Plan.txt',
 'CAT_research_plan_2015.txt',
 'CCCR_FULLREPORT-EN-FINAL.txt',
 'ClimatRisk-E-ACCESSIBLE.txt',
 'En56-226-2008-eng.txt',
 'FBC_WaterGuide_FINAL.txt',
 'FloodRecovery-e.txt',
 'Guide-Building-Sustainable-and-Resilient-Communities-with-Asset-Management-EN.txt',
 'Guidebook-2016.txt',
 'HP5-122-2017-eng.txt',
 'IPCC_SRREN_Ch01.txt',
 'London_tech.txt',
 'PB_Are_the_Dutch_going_green.txt',
 'Perspectives on Climate Change Action in Canada English.txt',
 'Protect_Your_Home_From_Flooding_Brochure.txt',
 'Report on Effects of a Changing Climate to the US Department of Defense.txt',
 'Spring_Flood_Fact_Sheet.txt',
 'Synthesis_Eng.txt',
 'Tr045.txt',
 'UK-CCRA-2017-Synthesis-Re

In [4]:
# load the stopwords text file
with open('stopwords.txt','r') as f:
    # create a list for stopwords
    stopwords = []
    # itearate through the text files, replacing any new line unicode characters
    for i in list(f):
        temp = str(i).replace('\n','')
        # append the stopwords to the list
        stopwords.append(temp)

In [5]:
print(stopwords)

['Stopwords and Punctuation:', '.', ',', 'and', 'the', 'of', 'to', '-', '/', 'in', '(', ')', 'a', 'for', ':', 'is', 'on', 'or', 'that', 'are', 'be', ').', '1', 'as', 'with', '://', '.,', 'from', '2', 'by', 'www', 'http', 'ca', '[', ']', '...', "'", '\\uf0b7', ';', 'at', ').', 'pdf', 'climate', 'change', 'canada', 'en', '):', 'so', 'html', 'do', '.;', 'com', 'canadian', 'impacts', 'adaptation']


In [6]:
# change directory to directory with files
os.chdir('/Users/rahimjiwa/Documents/DataScience/UofT3666_AppliedNLP/Final_Testings/tikatextcombined/')
# trying with added new docs

In [7]:
# create a list for documents
docs = []
# iterate through file list
for i in file_list:
    # load the documents
    text = PlaintextCorpusReader(root=os.getcwd(),fileids=i)
    # create a list of the words in the documents
    i_doc = list(text.words())
    # create a new list for document
    new_i_doc = []
    # for each word in the documents
    for j in i_doc:
        #check if the length of the word is greater than 2 and that it is not a stopword
        if len(j) > 2 and j.lower() not in stopwords:
            # append to word to the new list of words
            new_i_doc.append(j)
    # append list of words to documents
    docs.append(new_i_doc)
    # print the file name
    print(i)

20170125-en.txt
2017BernardSoubryPolicyBrief.txt
A_Canadian_Opportunity_-_Tackling_climate_change_by_switching_to_clean_power.txt
Adapting_to_a_Changing_Climate_in_NS.txt
Ahead-of-the-Storm-1.txt
AtlanticRegionAdaptationScienceActivitiesReport.txt
BC-Agriculture-Climate-Change-Action-Plan.txt
CAT_research_plan_2015.txt
CCCR_FULLREPORT-EN-FINAL.txt
ClimatRisk-E-ACCESSIBLE.txt
En56-226-2008-eng.txt
FBC_WaterGuide_FINAL.txt
FloodRecovery-e.txt
Guide-Building-Sustainable-and-Resilient-Communities-with-Asset-Management-EN.txt
Guidebook-2016.txt
HP5-122-2017-eng.txt
IPCC_SRREN_Ch01.txt
London_tech.txt
PB_Are_the_Dutch_going_green.txt
Perspectives on Climate Change Action in Canada English.txt
Protect_Your_Home_From_Flooding_Brochure.txt
Report on Effects of a Changing Climate to the US Department of Defense.txt
Spring_Flood_Fact_Sheet.txt
Synthesis_Eng.txt
Tr045.txt
UK-CCRA-2017-Synthesis-Report-Committee-on-Climate-Change.txt
Urban_Forests_Guide.txt
VOLUME 1 Canada's Changing Climate 2018.t

In [8]:
# take a look at the output of the documents
docs

[['Clean',
  'Growth',
  'PAN',
  'FRAMEWORK',
  'Plan',
  'Address',
  'Grow',
  'Economy',
  'version',
  'Cat',
  'En4',
  '294',
  '2016E',
  'ISBN',
  '978',
  '660',
  '07023',
  'Photos',
  'Thinkstock',
  '2016',
  'Aussi',
  'disponible',
  'français',
  'PAN',
  'FRAMEWORK',
  'Clean',
  'Growth',
  'Plan',
  'Address',
  'Grow',
  'Economy',
  'FOREWORD',
  'Pan',
  'Framework',
  'Clean',
  'Growth',
  'presented',
  'here',
  'our',
  'collective',
  'plan',
  'grow',
  'our',
  'economy',
  'while',
  'reducing',
  'emissions',
  'building',
  'resilience',
  'adapt',
  'changing',
  'will',
  'help',
  'transition',
  'strong',
  'diverse',
  'competitive',
  'economy',
  'foster',
  'job',
  'creation',
  'new',
  'technologies',
  'exports',
  'provide',
  'healthy',
  'environment',
  'our',
  'children',
  'grandchildren',
  'Pan',
  'Framework',
  'both',
  'commitment',
  'world',
  'will',
  'its',
  'part',
  'plan',
  'meet',
  'needs',
  'Canadians',
  'have',


In [9]:
# check how many documents are being used
len(docs)

56

In [10]:
# define a normalizer class
class TextNormalizer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
    
    # define function for punctuation marks
    def is_punct(self, token):
        return all(unicodedata.category(char).startswith('P') for char in token)
    
    def normalize(self, document):
        return [
            # here you could return a lemmatized token instead
            token.lower()
            for sentence in document
            for token in sentence
            # check if a punctuation mark
            if not self.is_punct(token)
        ]
    
    # create a lemmatizer function
    def lemmatize(self, token):
        return self.lemmatizer.lemmatize(token)
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, documents):
        return [''.join(self.normalize(doc)) for doc in documents]

In [11]:
# initialize normalizer class
norm = TextNormalizer()

In [12]:
# create a list of the normalized documents
docs_normalized = []
# iterate through each document and apply the normalizer
for i in docs:
    docs_normalized.append(norm.fit_transform(i))

# take a look at the normalized documents
print(docs_normalized)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [13]:
# create a dictionary of the documents.
dictionary = Dictionary(docs_normalized)

# filter out words present in less than 3 documents or in more than 90% of the documents.
dictionary.filter_extremes(no_below=3, no_above=0.9)

2019-12-15 02:03:40,544 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2019-12-15 02:03:41,245 : INFO : built Dictionary(35146 unique tokens: ['', '000', '07023', '100', '10027']...) from 56 documents (total 950187 corpus positions)
2019-12-15 02:03:41,303 : INFO : discarding 24181 tokens: [('07023', 1), ('2016e', 2), ('27086', 1), ('2mw', 1), ('3154', 2), ('315climateenglishreducedsize10', 1), ('36283', 1), ('50001', 2), ('523', 2), ('558', 1)]...
2019-12-15 02:03:41,304 : INFO : keeping 10965 tokens which were in no less than 3 and no more than 50 (=90.0%) documents
2019-12-15 02:03:41,331 : INFO : resulting dictionary: Dictionary(10965 unique tokens: ['', '000', '100', '10027', '110']...)


In [14]:
# create a bow representation of the documents.
corpus = [dictionary.doc2bow(doc) for doc in docs_normalized]

In [15]:
# check the number of tokens and the number of documents
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

Number of unique tokens: 10965
Number of documents: 56


In [16]:
# index word dictionary
temp = dictionary[0]  
id2word = dictionary.id2token
# create the model
model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    alpha='auto',
    eta='auto',
    num_topics=3,
    eval_every=1,
    random_state=42
)

2019-12-15 02:03:41,711 : INFO : using autotuned alpha, starting with [0.33333334, 0.33333334, 0.33333334]
2019-12-15 02:03:41,717 : INFO : using serial LDA version on this node
2019-12-15 02:03:41,733 : INFO : running online (single-pass) LDA training, 3 topics, 1 passes over the supplied corpus of 56 documents, updating model once every 56 documents, evaluating perplexity every 56 documents, iterating 50x with a convergence threshold of 0.001000
2019-12-15 02:03:42,502 : INFO : -9.742 per-word bound, 856.6 perplexity estimate based on a held-out corpus of 56 documents with 786045 words
2019-12-15 02:03:42,503 : INFO : PROGRESS: pass 0, at document #56/56
2019-12-15 02:03:42,702 : INFO : optimized alpha [0.6154381, 0.6088842, 0.5863802]
2019-12-15 02:03:42,707 : INFO : topic #0 (0.615): 0.005*"emissions" + 0.004*"development" + 0.004*"energy" + 0.004*"changes" + 0.004*"management" + 0.004*"future" + 0.004*"risks" + 0.004*"report" + 0.003*"infrastructure" + 0.003*"transportation"
2019-

In [17]:
# find the top 10 words for each topic
top_topics = model.top_topics(corpus, topn=10)

In [18]:
# print the top 3 topics, and the top 10 words within those topics
# also includes the topics coherence scores
pprint(top_topics)

[([(0.004416227, 'energy'),
   (0.004301955, 'sea'),
   (0.0041628396, 'local'),
   (0.0039565014, 'changes'),
   (0.003833714, 'land'),
   (0.0036392733, 'report'),
   (0.0035467532, 'research'),
   (0.0034543115, 'emissions'),
   (0.003360343, 'government'),
   (0.0031983773, 'future')],
  -0.08435206985256147),
 ([(0.00451812, 'emissions'),
   (0.0044105616, 'development'),
   (0.004219287, 'energy'),
   (0.0040020016, 'changes'),
   (0.0038823118, 'management'),
   (0.0037134914, 'future'),
   (0.0036080242, 'risks'),
   (0.003539546, 'report'),
   (0.0033220325, 'infrastructure'),
   (0.0030308308, 'transportation')],
  -0.09376667704954592),
 ([(0.0050185476, 'sea'),
   (0.004231277, 'report'),
   (0.0041005397, 'energy'),
   (0.0036841661, 'flood'),
   (0.0034437878, 'management'),
   (0.0033724422, 'coastal'),
   (0.0032880555, 'land'),
   (0.0032516045, 'development'),
   (0.0032026565, 'transportation'),
   (0.0030936964, 'changes')],
  -0.11112625537166283)]


In [19]:
# prepare inputs for the visualization
tfidf_model = TfidfModel(corpus)
corpus_tfidf = tfidf_model[corpus]

2019-12-15 02:03:42,781 : INFO : collecting document frequencies
2019-12-15 02:03:42,784 : INFO : PROGRESS: processing document #0
2019-12-15 02:03:42,820 : INFO : calculating IDF weights for 56 documents and 10965 features (121906 matrix non-zeros)


In [20]:
# initialize the visualization
viz = pyLDAvis.gensim.prepare(model, corpus_tfidf, dictionary)
# display the visualization
pyLDAvis.display(viz)

2019-12-15 02:03:44,724 : INFO : NumExpr defaulting to 4 threads.
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [21]:
# save the visualization.
pyLDAvis.save_html(viz, 'gensim_climate_change_lda_model.html')

In [22]:
# read a new file not trained to the model
new_doc = PlaintextCorpusReader(root='/Users/rahimjiwa/Documents/DataScience/UofT3666_AppliedNLP/Final_Testings',fileids='random_pdf.txt')
# get a list of words
new_doc = list(new_doc.words())
# create a bow representations of the new documents
new_doc_bow = dictionary.doc2bow(new_doc)
# get the document distributions
print(model.get_document_topics(new_doc_bow))

[(0, 0.24605206), (1, 0.12830782), (2, 0.62564015)]


In [23]:
# get the coherence score
coherence_model = CoherenceModel(model=model, corpus=corpus, coherence='u_mass')
print("Coherence: " + str(coherence_model.get_coherence()))


Coherence: -0.15028962290524578


In [24]:
# get the perplexity score
print("Perplexity: " + str(model.log_perplexity(corpus)))

2019-12-15 02:10:32,203 : INFO : -8.036 per-word bound, 262.5 perplexity estimate based on a held-out corpus of 56 documents with 786045 words


Perplexity: -8.03618820716205
