In [1]:
import spacy
import numpy as np
from correlation import *
from spacy import displacy

# Load the spaCy pipeline "en_core_web_sm"; `nlp` is used further down to parse the report text.
nlp = spacy.load("en_core_web_sm")

In [2]:
from textutils import importer
import json

# Load the annual report through the project's TextImporter helper.
annual_report = importer.TextImporter("../texts/AnnualReport2017-2018.txt")

# Read the South Africa NDC keyword file. The encoding is passed explicitly so
# the JSON is decoded the same way regardless of the platform's default locale.
with open("../ndc_keywords/ndc_south_africa.json", encoding="utf-8") as f:
    ndc_keywords = json.load(f)

# Keep only the keyword list stored under the 'climate change' key.
climate_keywords = ndc_keywords['climate change']

In [3]:
# Build the token-level and span-level correlators from the same keyword list;
# they differ only in class and label. The second argument (0.7) is presumably
# a score threshold -- TODO confirm against the correlation module.
token_correlator, span_correlator = [
    correlator_cls(climate_keywords, 0.7, label)
    for correlator_cls, label in (
        (TokenArrayCorrelator, "CLIMATE_TOKEN"),
        (SpanCorrelator, "CLIMATE_SPAN"),
    )
]

# The generic correlator only takes the keyword list.
generic_correlator = KeywordCorrelator(climate_keywords)


In [4]:
# Show the loaded climate-change keywords for reference.
print(climate_keywords)

['adaption program', 'adaption projects', 'resilience', 'institutional capacity', 'response', 'budget reprioritisation', 'development', 'planner', 'regulator practitioners', 'geographical circumstances', 'land use scheme', 'authorisation system', 'spatial land use management act', 'spluma']


# First, explore the generic correlator functionality

In [5]:
# One sentence that touches on the NDC keywords and one unrelated control sentence.
sentence_to_correlate = "We need to adapt our project to be more resillient to geographical circumstances"
unrelated_sentence = "The next time the leaders will meet in paris"

# Score both sentences in a single call, then print the resulting scores.
sentence_scores = generic_correlator([sentence_to_correlate, unrelated_sentence])
print(sentence_scores)

[0.4462148  0.05690929]


We can see that the sentence that uses some of the NDC keywords gets a much higher correlation score (about 0.45) than the unrelated sentence (about 0.06).

# Now, let's see how we can integrate the output of our embedder into spaCy, an NLP library

In [6]:
# Run the full annual-report text through the spaCy pipeline to obtain a document object.
report_text = annual_report.text
doc = nlp(report_text)

In [7]:
from n_gram_correlation import NGramCorrelateSpacy

# Clear any entities already on the document before correlating
# (presumably so that only the correlator's spans remain -- confirm).
doc.ents = []

# Correlate 4-gram spans of the document against the climate keywords.
n_gram_cor = NGramCorrelateSpacy(climate_keywords, 0.7, "CLIMATE_N")
n_gram_cor.correlate_spans(doc, 4)

# Timing notes:
# - Correlating 4-gram spans originally took about 1m20s.
# - A later run took only about 3 seconds; the reason for the speed-up is unknown.
# TODO: revise the algorithm to embed many spans in a single batch.


In [9]:
# Print the entity spans currently attached to the document.
print(doc.ents)

(Budget Office
SADC, the
African Development, Budget Office
(, Development, and the, development of the project, the development of the, for
Sustainable Development, 
Economic Development,, and
Transcription System, budget slowed
the, budget advice.
, Budget Reports
2015/16, Budget Actual
amounts)
