In [1]:
import os

# I saved all PDF files under the PDF folder in my env directory
pdf_docs_path = os.path.join("PDF")
document = "protect-your-home-from-snow-ice-storms.pdf"
one_pdf_path = os.path.join(pdf_docs_path, document)


In [2]:
import pdftotext

with open(one_pdf_path, "rb") as f:
    pdf = pdftotext.PDF(f)
    
print("This file has", len(pdf), "pages.")

This file has 16 pages.


In [3]:
pdf[0]

'Protect your home from\nSnow & ice storms\nDesigned for safer living ® is a program endorsed\nby Canada’s insurers to promote disaster-resilient homes.\n'

In [4]:
import pandas as pd
df = pd.DataFrame(columns=['File', 'Page', 'Text'])

In [5]:
# Spit out pages
i=0
for page in pdf:
    i+=1
    df.loc[i] = [document,i,page]

In [6]:
df

Unnamed: 0,File,Page,Text
1,protect-your-home-from-snow-ice-storms.pdf,1,Protect your home from\nSnow & ice storms\nDes...
2,protect-your-home-from-snow-ice-storms.pdf,2,for Catastrophic Loss Reduction\nThe Institute...
3,protect-your-home-from-snow-ice-storms.pdf,3,for homeowners\nWinter weather represents a si...
4,protect-your-home-from-snow-ice-storms.pdf,4,Understand your home’s vulnerabili...
5,protect-your-home-from-snow-ice-storms.pdf,5,These actions will not guarantee the safety of...
6,protect-your-home-from-snow-ice-storms.pdf,6,"Snow can collect behind roof obstructions, suc..."
7,protect-your-home-from-snow-ice-storms.pdf,7,The first step is to carefully remove any loos...
8,protect-your-home-from-snow-ice-storms.pdf,8,...
9,protect-your-home-from-snow-ice-storms.pdf,9,"Also, make sure underlayment is installed in r..."
10,protect-your-home-from-snow-ice-storms.pdf,10,Frozen Pipes\nFrozen pipes are one of the most...


In [7]:
# Now let's supercharge the damn thing
with os.scandir(pdf_docs_path) as entries:
    for entry in entries:
        if entry.name != ".DS_Store":
            print(entry.name)

81363.pdf
A Practitioners Guide to ClimateChange Adaptation in Ontario's Ecosystems Ver 1 2011.pdf
A_Residential_Guide_to_Flood_Prevention_and_Recovery.pdf
adapt_bulletin-adapt1-eng.pdf
Adapting to Climate Change in Coastal Communities In Canada White Paper.pdf
ANINTROENGLISH_EC.PDF
Bruce_(2006)_AdaptingtoClimateChange_ARisk-basedGuideforONMunicipalities.pdf
builders_guide_2010_final.pdf
ccp_impactonpeople.pdf
Climate Change Adaptation - A Priorities Plan for Canada_2012.pdf
climate_data_discussion_primer.pdf
ClimatRisk-E-ACCESSIBLE.pdf
Coastal Flooding Ferryland, NL.pdf
coastal_flooded_land_guidelines.pdf
En56-226-2008-eng.pdf
env-yukon-state-play-analysis-climate-change-impacts-adaptation.pdf
FBC_WaterGuide_FINAL.pdf
final_climate_change_and_health_backgrounder_overview.pdf
Floodproofing.pdf
FloodRecovery-e.pdf
Guide-Building-Sustainable-and-Resilient-Communities-with-Asset-Management-EN.pdf
Guidebook-2016.pdf
health_facilit-instal_sante-eng.pdf
HP5-122-2017-eng.pdf
landuse-e.pdf
mun

In [8]:
# Nice, now we can iterate through all files...
import re

df = pd.DataFrame(columns=['File', 'Page', 'Text'])
acPage = 0

with os.scandir(pdf_docs_path) as entries:
    for entry in entries:
        if entry.name != ".DS_Store":
            # Extract text and add to the datastore
            document = entry.name
            one_pdf_path = os.path.join(pdf_docs_path, document)
            
            with open(one_pdf_path, "rb") as f:
                try:
                    pdf = pdftotext.PDF(f)
                    i=0
                    for page in pdf:
                        i+=1
                        #contents = page.replace("\n", "")
                        contents = re.sub(r"[^a-zA-Z0-9:.,!?%$@]+", ' ', page)
                        if contents != "":
                            acPage+=1
                            df.loc[acPage] = [document,i,contents]
                except:
                    print("Error on document",document)
            
            

Error on document Coastal Flooding Ferryland, NL.pdf


In [29]:
df

Unnamed: 0,File,Page,Text
1,81363.pdf,1,Climate Change Mitigation:\na Strategic Approa...
2,81363.pdf,2,Toronto and Region Conservation (TRCA) has ...
3,81363.pdf,3,An Overview of\nGetting to Carbon Neutral: A G...
4,81363.pdf,4,Getting to Carbon Neutral does not dictate w...
5,81363.pdf,5,Tens steps to carbon neutrality\n Getting...
...,...,...,...
2437,Windsor Climate Change Adaptation Plan.pdf,17,Climate ...
2438,Windsor Climate Change Adaptation Plan.pdf,18,Climate C...
2439,Windsor Climate Change Adaptation Plan.pdf,19,Climate Cha...
2440,Windsor Climate Change Adaptation Plan.pdf,20,Climate...


In [31]:
df.to_excel("all-pages.xlsx")

### Corpus Generator

Pre-Processing:
Tokenization: Split the text into sentences and the sentences into words. Lowercase the words and remove punctuation.
Words that have fewer than 3 characters are removed.
All stopwords are removed.
Lemmatization — words in third person are changed to first person and verbs in past and future tenses are changed into present.
Stemming — words are reduced to their root form.

In [50]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer 
from nltk.stem.porter import *
import numpy as np
np.random.seed(42)
import nltk
from nltk.stem.lancaster import LancasterStemmer
st = LancasterStemmer()
#nltk.download('wordnet')

In [51]:
# Lemmatization function
def lemmatize_stemming(text):
    return st.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [53]:
# Apply lemmatization and stemming to the document
processed_docs = df['Text'].map(preprocess)

In [54]:
processed_docs

1         [clim, chang, mitig, strategic, approach, city]
2       [toronto, reg, conserv, trca, trust, sourc, in...
3       [overview, get, carbon, neut, guid, canad, mun...
4       [get, carbon, neut, dict, reduc, opt, best, im...
5       [ten, step, carbon, neut, get, carbon, neut, o...
                              ...                        
2437    [clim, chang, adapt, plan, med, risk, impact, ...
2438    [clim, chang, adapt, plan, tabl, pot, med, pos...
2439    [clim, chang, adapt, plan, in, flow, monit, en...
2440    [clim, chang, adapt, plan, gen, develop, clear...
2441    [ref, expert, panel, clim, chang, adapt, adapt...
Name: Text, Length: 2441, dtype: object

In [57]:
dictionary = gensim.corpora.Dictionary(processed_docs)
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 30:
        break

0 approach
1 chang
2 city
3 clim
4 mitig
5 strategic
6 adapt
7 bas
8 canad
9 carbon
10 comb
11 commun
12 company
13 conserv
14 consult
15 design
16 enco
17 environ
18 footprint
19 fund
20 fut
21 get
22 glen
23 group
24 guid
25 individ
26 inform
27 infrastruct
28 liv
29 municip
30 neut


In [58]:
# Remove extremes from the dictionary
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

For each document we create a dictionary reporting how many
words and how many times those words appear. Save this to ‘bow_corpus’, then check our selected document earlier.

In [59]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

In [62]:
bow_doc_12 = bow_corpus[13]
for i in range(len(bow_doc_12)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_12[i][0], 
                                               dictionary[bow_doc_12[i][0]], 
bow_doc_12[i][1]))

Word 19 ("guid") appears 4 time.
Word 22 ("infrastruct") appears 1 time.
Word 48 ("act") appears 2 time.
Word 81 ("gen") appears 2 time.
Word 102 ("plan") appears 2 time.
Word 106 ("process") appears 1 time.
Word 116 ("suit") appears 1 time.
Word 122 ("util") appears 1 time.
Word 214 ("pop") appears 1 time.
Word 216 ("poss") appears 1 time.
Word 219 ("priorit") appears 1 time.
Word 222 ("rat") appears 1 time.
Word 250 ("temp") appears 1 time.
Word 270 ("com") appears 1 time.
Word 286 ("fact") appears 1 time.
Word 312 ("ontario") appears 6 time.
Word 350 ("vary") appears 1 time.
Word 372 ("crit") appears 1 time.
Word 451 ("rapid") appears 1 time.
Word 491 ("mat") appears 1 time.
Word 500 ("valu") appears 1 time.
Word 503 ("adopt") appears 1 time.
Word 529 ("ecosystem") appears 1 time.
Word 530 ("practit") appears 1 time.
Word 531 ("vert") appears 1 time.
Word 540 ("min") appears 1 time.
Word 550 ("continu") appears 1 time.
Word 555 ("expert") appears 1 time.
Word 564 ("vuln") appears 4 

In [63]:
from gensim import corpora, models
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
from pprint import pprint
for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.37054438827005176),
 (1, 0.42497066072705464),
 (2, 0.4883576063083897),
 (3, 0.6660357664596728)]


In [155]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=20, id2word=dictionary, passes=2, workers=2)

In [156]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.015*"http" + 0.014*"transport" + 0.011*"wat" + 0.010*"environ" + 0.010*"plan" + 0.009*"man" + 0.009*"retriev" + 0.009*"act" + 0.008*"commun" + 0.008*"report"
Topic: 1 
Words: 0.014*"plan" + 0.013*"develop" + 0.010*"risk" + 0.010*"act" + 0.009*"coast" + 0.008*"man" + 0.008*"nat" + 0.007*"loc" + 0.007*"model" + 0.007*"land"
Topic: 2 
Words: 0.011*"plan" + 0.011*"govern" + 0.011*"environ" + 0.010*"retriev" + 0.009*"http" + 0.009*"reg" + 0.009*"coast" + 0.009*"municip" + 0.009*"brit" + 0.009*"columb"
Topic: 3 
Words: 0.012*"level" + 0.010*"increas" + 0.008*"storm" + 0.008*"nat" + 0.007*"area" + 0.007*"risk" + 0.007*"wat" + 0.007*"ris" + 0.006*"year" + 0.006*"temp"
Topic: 4 
Words: 0.013*"loc" + 0.013*"govern" + 0.012*"risk" + 0.011*"transport" + 0.010*"reg" + 0.009*"commun" + 0.008*"vuln" + 0.008*"man" + 0.007*"act" + 0.007*"develop"
Topic: 5 
Words: 0.017*"assess" + 0.016*"forest" + 0.014*"scenario" + 0.013*"vuln" + 0.011*"project" + 0.010*"inform" + 0.010*"fut" + 0.009

In [84]:
processed_docs[10]

['clim',
 'chang',
 'act',
 'strategy',
 'nee',
 'restrict',
 'siz',
 'loc',
 'municip',
 'cas',
 'study',
 'guid',
 'cov',
 'expery',
 'larg',
 'med',
 'smal',
 'commun',
 'select',
 'canad',
 'nor',
 'americ',
 'world',
 'wid',
 'array',
 'pract',
 'opt',
 'avail',
 'municip',
 'attempt',
 'reduc',
 'carbon',
 'footprint',
 'get',
 'carbon',
 'neut',
 'let',
 'municip',
 'select',
 'best',
 'fit',
 'individ',
 'econom',
 'soc',
 'environ',
 'condit',
 'build',
 'strong',
 'econom',
 'cas',
 'act',
 'provid',
 'environ',
 'rat',
 'act',
 'get',
 'carbon',
 'neut',
 'lay',
 'sery',
 'bas',
 'formula',
 'estim',
 'guidelin',
 'calc',
 'reduc',
 'accru',
 'select',
 'opt',
 'exampl',
 'estim',
 'guidelin',
 'predict',
 'expect',
 'reduc',
 'inst',
 'kilomet',
 'light',
 'rail',
 'construct',
 'gas',
 'plant',
 'process',
 'ton',
 'solid',
 'wast',
 'serv',
 'hect',
 'municip',
 'district',
 'energy',
 'scheme',
 'municip',
 'calc',
 'pot',
 'reduc',
 'achiev',
 'guid',
 'show',
 'us',
 '

In [152]:
df.loc[10]['Text']

' A climate change action strategy need not be restricted by the size or location of a municipality. The case studies in the guide cover the experiences of large, medium and smaller communities selected from across Canada, North America and around the world. There is a wide array of practical options available to a municipality attempting to reduce its carbon footprint. Getting to Carbon Neutral lets a municipality select the mix that best fits its individual economic, social and environmental conditions. Building a strong economic case for action To provide the environmental rationale for action, Getting to Carbon Neutral lays out a series of basic formulae Estimation Guidelines for calculating the GHG reductions that accrue to each of the selected options. For example, these Estimation Guidelines can be used to predict the expected GHG reductions from: installing X kilometres of light rail constructing a gasification plant to process Y tonnes of solid waste or servicing Z hectares of

In [145]:
lda_model[bow_corpus[10]]

[(0, 0.02391171),
 (1, 0.02395),
 (2, 0.023921039),
 (3, 0.023868334),
 (4, 0.023920579),
 (5, 0.85648894),
 (6, 0.023939386)]

In [142]:
np.amax(lda_model[bow_corpus[index]],axis=0)[0]

0.0

In [157]:
newArray=""
for index in range (1,len(bow_corpus)):
    newArray=np.append(newArray,np.amax(lda_model[bow_corpus[index]],axis=0)[0])


In [135]:
len(newArray),len(bow_corpus)

(2441, 2441)

In [158]:
df['lda-topic']=newArray

In [159]:
df

Unnamed: 0,File,Page,Text,lda-topic
1,81363.pdf,1,Climate Change Mitigation: a Strategic Approac...,
2,81363.pdf,2,Toronto and Region Conservation TRCA has beco...,7.0
3,81363.pdf,3,An Overview of Getting to Carbon Neutral: A Gu...,11.0
4,81363.pdf,4,Getting to Carbon Neutral does not dictate wh...,9.0
5,81363.pdf,5,Tens steps to carbon neutrality Getting to Ca...,11.0
...,...,...,...,...
2437,Windsor Climate Change Adaptation Plan.pdf,17,Climate Change Adaptation Plan 2012 Medium Lo...,8.0
2438,Windsor Climate Change Adaptation Plan.pdf,18,Climate Change Adaptation Plan 2012 Table 5: ...,8.0
2439,Windsor Climate Change Adaptation Plan.pdf,19,Climate Change Adaptation Plan 2012 Initiate ...,8.0
2440,Windsor Climate Change Adaptation Plan.pdf,20,Climate Change Adaptation Plan 2012 General D...,7.0


In [160]:
df.to_excel("all-pages-lda.xlsx")