In [1]:
import os

# I saved all PDF files under the PDF folder in my env directory
pdf_docs_path = os.path.join("PDF")
one_pdf_path = os.path.join(pdf_docs_path,"protect-your-home-from-snow-ice-storms.pdf")


In [2]:
# Trying a better result with pdftotext
import pdftotext

with open(one_pdf_path, "rb") as f:
    pdf = pdftotext.PDF(f)
    
print("This file has", len(pdf), "pages.")

This file has 16 pages.


In [6]:
# PDFtoText is much better than PyPDF, with automatic ligature conversion!
# Now we can get rid of new lines and stray spaces

import re

docText = ""
for page in pdf:
    docText = docText + re.sub('\s+', ' ', page).strip()

In [37]:
from collections import Counter
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

nlp = spacy.load("en_core_web_sm")

import string

# Loads text with linguistic annotations from Spacy
my_doc = nlp(docText)

filteredDoc = []

for sentence in my_doc.sents:
    for word in sentence:
        if not(word.is_stop) and (word.pos_=='NOUN' or word.pos_=='PROPN'):
            filteredDoc.append(word.text.lower())

nounsFreqDistribution = Counter(filteredDoc)
    

In [38]:
nounsFreqDistribution

Counter({'home': 35,
         'snow': 33,
         'ice': 48,
         'storms': 2,
         'living': 9,
         'program': 6,
         'canada': 10,
         'insurers': 6,
         'disaster': 12,
         'homes.for': 1,
         'loss': 12,
         'reduction': 4,
         'institute': 6,
         'catastrophic': 4,
         'iclr': 5,
         'world': 2,
         'class': 1,
         'centre': 2,
         'multidisciplinary': 1,
         'prevention': 3,
         'research': 3,
         'communication': 2,
         'profit': 1,
         'insurance': 3,
         'industry': 1,
         'university': 2,
         'western': 2,
         'ontario': 6,
         'mission': 1,
         'life': 1,
         'property': 7,
         'weather': 25,
         'earthquakes': 2,
         'identification': 2,
         'support': 2,
         'actions': 5,
         'society': 1,
         'capacity': 3,
         'anticipate': 1,
         'disasters': 3,
         'mandate': 1,
         'increase': 

In [33]:
listOfWords=""
for word in nounsFreqDistribution.most_common(50):
    listOfWords = listOfWords + word[0] + ", "
print (listOfWords)

roof, ice, home, snow, water, weather, damage, winter, pipes, homeowners, attic, heat, house, disaster, dams, air, Canada, living, temperature, insulation, homes, drainage, loss, property, steps, temperatures, collapse, Ontario, sources, C, Homeowners, program, insurers, Loss, Institute, ICLR, actions, homeowner, leaks, areas, snaps, slope, Ice, penetrations, systems, debris, thermostat, Reduction, Catastrophic, snowfall, 


Now, what if we devise a dictionary of relevant nouns that apply to disaster types, and constrain the results to that list? Let's try with this...

In [34]:
dictionary = ["snow","change","climate","heatwave","adaptation","tornado","water","icestorm","risk","impact","level","community","land","management","planning","development","http","plan","infrastructure","sea","event","action","vulnerability","flood","assessment","storm","temperature","rise","resource","weather","strategy","damage","effect","precipitation","hazard","ice","protection","home","flooding","erosion","environment","emission","al","winter","heat","forest","wind","mitigation","emergency","coast","shoreline","greenhouse","elevation","carbon","wave","dike","wetland","disaster","conservation","reduction","fire","rain","drainage","ground","power","stormwater","roof","rainfall","extreme","wildfire","reference","vegetation","threat","drought","disease","coastline","sewer","nature","neutral","neutrality"]

for distWord in nounsFreqDistribution.most_common(50):
    if distWord[0] in dictionary:
        print(distWord)

('roof', 57)
('ice', 43)
('home', 34)
('snow', 30)
('water', 26)
('weather', 25)
('damage', 24)
('winter', 23)
('heat', 14)
('disaster', 11)
('temperature', 9)
('drainage', 8)


In [44]:
# Defining a function for frequent word extraction, returning a simple string:
def frequentClimateWordsExtractor(text):
    from collections import Counter
    import spacy
    from spacy.lang.en.stop_words import STOP_WORDS
    import string
    
    # Dictionary of relevant words
    dictionary = ["snow","change","climate","heatwave","adaptation","tornado","water","icestorm","risk","impact","level","community","land","management","planning","development","http","plan","infrastructure","sea","event","action","vulnerability","flood","assessment","storm","temperature", "low","rise","resource","weather","strategy","damage","effect","precipitation","hazard","ice","protection","home","flooding","erosion","environment","emission","al","winter","heat","forest","wind","mitigation","emergency","coast","shoreline","greenhouse","elevation","carbon","wave","dike","wetland","disaster","conservation","reduction","fire","rain","drainage","ground","power","stormwater","roof","rainfall","extreme","wildfire","reference","vegetation","threat","drought","disease","coastline","sewer","nature","neutral","neutrality"]

    # Loads text with linguistic annotations from Spacy
    my_doc = nlp(text)

    filteredDoc = []
    filteredList = []
    
    # Returns a list with relevant words filtered by the dictionary
    for sentence in my_doc.sents:
        for word in sentence:
            if not(word.is_stop) and (word.pos_=='NOUN' or word.pos_=='PROPN'):
                filteredDoc.append(word.text.lower())

    nounsFreqDistribution = Counter(filteredDoc)
    
    listOfWords=""
    for word in nounsFreqDistribution.most_common(300):
        if word[0] in dictionary:
            listOfWords = listOfWords + word[0] + ", "
            filteredList.append(word[0])
    return listOfWords, filteredList

In [45]:
frequentClimateWordsExtractor("Snow may change the chances of a tornado")

('snow, tornado, ', ['snow', 'tornado'])

### Processing all documents:

In [46]:
# Now let's extract all words from all documents:
import re
import pandas as pd
import pdftotext

#allPagesCorpus = ""

df = pd.DataFrame(columns=['File', 'Page', 'Text', 'Keywords'])
acPage = 0

with os.scandir(pdf_docs_path) as entries:
    for entry in entries:
        if entry.name != ".DS_Store":
            # Extract text and add to the datastore
            document = entry.name
            one_pdf_path = os.path.join(pdf_docs_path, document)
            
            with open(one_pdf_path, "rb") as f:
                try:
                    pdf = pdftotext.PDF(f)
                    i=0
                    for page in pdf:
                        i+=1
                        contents = re.sub(r"[^a-zA-Z0-9:.,!?%$@]+", ' ', page)
                        if contents != "":
                            #allPagesCorpus = allPagesCorpus + contents
                            acPage+=1
                            keywords, wordList = frequentClimateWordsExtractor(contents)
                            df.loc[acPage] = [document,i,contents,keywords]
                except:
                    print("Error on document",document)

In [47]:
df.to_excel("all-pages-freqdist.xlsx")

In [48]:
# Trying a simple rule-based approach to disaster classification
def disasterType(key_arr):
    # Undefined to start with
    
    disaster_class = "Undefined"
    numDetected = 0

    if "carbon" in key_arr and ("neutral" in key_arr or "neutrality" in key_arr):
        disaster_class = "Carbon Neutrality"
        numDetected+=1
        
    if "adaptation" in key_arr and ("change" in key_arr or "plan" in key_arr):
        disaster_class = "Climate Change Adaptation"
        numDetected+=1
        
    if "drought" in key_arr:
        disaster_class = "Drought"
        numDetected+=1
        
    if "flood" in key_arr or "flooding" in key_arr or "rainfall" in key_arr or "stormwater" in key_arr or ("sea" in key_arr and "level" in key_arr and "rise" in key_arr):
        disaster_class = "Flooding"
        numDetected+=1
        
    if "heat" in key_arr or "heatwave" in key_arr:
        disaster_class = "Heatwave"
        numDetected+=1
        
    if "mitigation" in key_arr:
        disaster_class = "Mitigation"
        numDetected+=1
        
    if "wind" in key_arr or "tornado" in key_arr:
        disaster_class = "Severe Wind"
        numDetected+=1
        
    if "snow" in key_arr or "snowstorm" in key_arr:
        disaster_class = "Snowstorm"
        numDetected+=1

    if "temperature" in key_arr and "low" in key_arr:
        disaster_class = "Low Temperatures"
        numDetected+=1


    if "fire" in key_arr or "wildfire" in key_arr:
        disaster_class = "Wildfire"
        numDetected+=1
        
    if numDetected > 1:
        disaster_class = "Multiple"
        
    if "http" in key_arr or "al" in key_arr or "reference" in key_arr:
        disaster_class = "References"
    
    return disaster_class

In [49]:
# Now exporting again with disaster class:
import re
import pandas as pd
import pdftotext

#allPagesCorpus = ""

df = pd.DataFrame(columns=['File', 'Page', 'Text', 'Keywords', "Disaster"])
acPage = 0

with os.scandir(pdf_docs_path) as entries:
    for entry in entries:
        if entry.name != ".DS_Store":
            # Extract text and add to the datastore
            document = entry.name
            one_pdf_path = os.path.join(pdf_docs_path, document)
            
            with open(one_pdf_path, "rb") as f:
                try:
                    pdf = pdftotext.PDF(f)
                    i=0
                    for page in pdf:
                        i+=1
                        contents = re.sub(r"[^a-zA-Z0-9:.,!?%$@]+", ' ', page)
                        if contents != "":
                            #allPagesCorpus = allPagesCorpus + contents
                            acPage+=1
                            keywords, wordList = frequentClimateWordsExtractor(contents)
                            disasterClass = disasterType(wordList)
                            df.loc[acPage] = [document,i,contents,keywords, disasterClass]
                except:
                    print("Error on document",document)

In [219]:
df.to_excel("all-pages-freqdist-tagged.xlsx")

In [55]:
# Defining the imperative sentence extractor function
def impSentenceExtractor(someText):
    import spacy
    nlp = spacy.load("en_core_web_sm")
    
    doc = nlp(someText)
    impSentList=[]
    # Extract sentences from block of text
    
    for sentence in doc.sents:
        if sentence[0].pos_=='VERB' and (sentence[0].tag_=="VB" or sentence[0].tag_=="VBG"):
            impSentList.append(sentence.text)
    
    return impSentList

In [56]:
sents = impSentenceExtractor("Do this right. Protect your basement. Keep doors open. Look for any form of penetration between your attic and roof, such as a vent.")

In [57]:
sents

['Protect your basement.',
 'Keep doors open.',
 'Look for any form of penetration between your attic and roof, such as a vent.']

### Processing all documents, classifying every page and outputting imperative sentences for every page

In [58]:
# Now exporting again with disaster class AND imperative sentences
import re
import pandas as pd
import pdftotext

#allPagesCorpus = ""

df = pd.DataFrame(columns=['File', 'Page', 'Text', 'Keywords', "Disaster", "Actions"])
acPage = 0

with os.scandir(pdf_docs_path) as entries:
    for entry in entries:
        if entry.name != ".DS_Store":   
            # Extract text and add to the datastore
            document = entry.name
            one_pdf_path = os.path.join(pdf_docs_path, document)
            
            with open(one_pdf_path, "rb") as f:
                try:
                    pdf = pdftotext.PDF(f)
                    i=0
                    for page in pdf:
                        i+=1
                        contents = re.sub(r"[^a-zA-Z0-9:.,!?%$@]+", ' ', page)
                        if contents != "":
                            #allPagesCorpus = allPagesCorpus + contents
                            acPage+=1
                            keywords, wordList = frequentClimateWordsExtractor(contents)
                            disasterClass = disasterType(wordList)
                            impSents = impSentenceExtractor(contents)
                            df.loc[acPage] = [document,i,contents,keywords, disasterClass, impSents]
                except:
                    print("Error on document",document)

In [60]:
df.to_excel("all-pages-freqdist-tagged-impsents.xlsx")

### And just for fun, exporting all to a JSON object on the fly

In [296]:
acPage = 0
jsonFiles = []

with os.scandir(pdf_docs_path) as entries:
    for entry in entries:
        if entry.name != ".DS_Store":
        #if entry.name == "protect-your-home-from-basement-flooding.pdf":   
            # Extract text and add to the datastore
            document = entry.name
            one_pdf_path = os.path.join(pdf_docs_path, document)

            
            with open(one_pdf_path, "rb") as f:
                try:
                    pdf = pdftotext.PDF(f)
                    i=0
                    jsonObject = {}
                    jsonObject['file'] = document
                    jsonObject['pages'] = []
                    
                    for page in pdf:
                        pageContent = {}
                        i+=1
                        contents = re.sub(r"[^a-zA-Z0-9:.,!?%$@]+", ' ', page)
                        if contents != "":
                            #allPagesCorpus = allPagesCorpus + contents
                            acPage+=1
                            keywords, wordList = frequentClimateWordsExtractor(contents)
                            disasterClass = disasterType(wordList)
                            impSents = impSentenceExtractor(contents)
                            pageContent['page'] = i
                            #pageContent['text'] = contents
                            pageContent['disasterType'] = disasterClass
                            pageContent['actions'] = impSents
                            
                            # Only output if there are actions and disaster type is not Undefined
                            if len(impSents) != 0 and disasterClass != "Undefined" and disasterClass != "References":
                                jsonObject['pages'].append(pageContent)
                    
                    jsonFiles.append(jsonObject)
                except:
                    print("Error on document",document)

In [298]:
# Save JSON to file
f=open("all-actions.json", 'w', encoding="utf-8")
f.write(str(jsonFiles))

343604