In [1]:
import spacy
from spacy.tokens import Span
from policyanalysis.nlptools.correlation import KeywordCorrelator, entity_correlation_tagger, TokenArrayCorrelator
# Load spaCy's small English pipeline; every `nlp(...)` call in the cells below goes through it.
nlp = spacy.load("en_core_web_sm")
# TF-Hub address of the Universal Sentence Encoder, version 4.
# NOTE(review): not referenced by any of the cells reviewed here -- presumably
# consumed elsewhere (or left over); confirm before removing.
tfhub_url = "https://tfhub.dev/google/universal-sentence-encoder/4"


In [2]:
# Toy document used to try the token-array correlators by hand.
doc = nlp("The quick brown fox jumped over the lazy dog. Another lazy dog went by. What happened next? We do not know.")
# First four tokens as a plain list (a Span iterates over its tokens).
tokens = list(doc[0:4])
print(tokens)
# Positional arguments: reference phrase(s), a numeric threshold (0.35), entity label.
# NOTE(review): the exact meaning of the arguments is defined in
# policyanalysis.nlptools.correlation -- confirm there. The recorded output shows
# three-token entities for four-token inputs, which may point to an off-by-one
# inside TokenArrayCorrelator; verify.
corr = TokenArrayCorrelator(["Brown foxes are quick"], 0.35, "FAST")
slow_corr = TokenArrayCorrelator(["some animals are lazy"], 0.35, "SLOW")
corr(doc, tokens)
slow_corr(doc, list(doc[4:8]))
# Plain loop rather than a list comprehension: the comprehension was evaluated
# only for its print side effect and made the cell echo a list of None values.
for ent in doc.ents:
    print(ent.text, ent.label_)


[The, quick, brown, fox]
The quick brown FAST
jumped over the SLOW


[None, None]

In [3]:

# Register two span-level correlators under the names "quick_foxes" and
# "lazy_animals"; the cells below read them back as `span._.quick_foxes` and
# `span._.lazy_animals`.
KeywordCorrelator.add_span_subject_correlator("quick_foxes", ["Brown foxes are quick"])
KeywordCorrelator.add_span_subject_correlator("lazy_animals", ['some animals are lazy'])
# NOTE(review): neither of the two names below is used by the cells reviewed
# here -- the later calls to get_n_gram_tuples pass a literal 4, not n_gram_size.
# Confirm whether they are still needed.
threegram_sugg = spacy.pipeline.spancat.build_ngram_suggester([3])
n_gram_size = 7

def get_n_gram_tuples(size, doc_len):
    """Return the index pairs of every sliding window of ``size`` items.

    Args:
        size: Width of each window.
        doc_len: Length of the sequence the windows slide over.

    Returns:
        A list of ``(start, end)`` tuples with ``end == start + size``, one per
        start position from 0 up to and including ``doc_len - size``. The list
        is empty when ``size`` is larger than ``doc_len``.
    """
    windows = []
    last_start = doc_len - size
    for start in range(last_start + 1):
        windows.append((start, start + size))
    return windows

# Show the text of two four-token spans next to the value of the correlator
# attribute registered for each of them above.
print(doc[0:4].text, doc[0:4]._.quick_foxes)
print(doc[4:8].text, doc[4:8]._.lazy_animals)


The quick brown fox 0.6648871
jumped over the lazy 0.361911


In [4]:
from typing import List

In [5]:
# Clear the entities assigned so far so the tagging in the next cell starts
# from an empty entity list.
# (Alternative kept for reference -- append a hand-made span instead of clearing:
#  doc.ents = list(doc.ents) + [Span(doc, 0, 3, label="GOOD")])
doc.ents = []
# One Span per 4-token window, sliding one token at a time over the document.
n_gram_spans = [doc[start:stop] for start, stop in get_n_gram_tuples(4, len(doc))]

In [6]:
# Run the tagger over the 4-token windows twice: once against the "quick_foxes"
# attribute with label "FAST", once against "lazy_animals" with label "SLOW".
# NOTE(review): 0.35 is presumably the minimum score for a span to be tagged --
# confirm against entity_correlation_tagger's signature.
entity_correlation_tagger(doc, n_gram_spans, 0.35, 'quick_foxes', "FAST")
entity_correlation_tagger(doc, n_gram_spans, 0.35, 'lazy_animals', "SLOW")

In [7]:
# List every tagged entity with its label and both correlator scores.
# A plain loop is used instead of a list comprehension: the comprehension was
# evaluated only for its print side effect and made the cell echo a list of
# None values after the real output.
for ent in doc.ents:
    print(ent.text, ent.label_, ent._.quick_foxes, ent._.lazy_animals)

The quick brown fox FAST 0.6648871 0.16049126
jumped over the lazy SLOW 0.039034665 0.361911
dog. Another lazy SLOW 0.15132414 0.5753756


[None, None, None]

In [8]:
from spacy import displacy

# Render the entity highlights twice: first for the span from token 3 to the
# end of the document, then for the whole document.
displacy.render(doc[3:], style="ent")
displacy.render(doc, style='ent')

In [9]:
import json
from policyanalysis.nlptools import importer
# Source text for the real-document experiment; its `.text` attribute is fed to
# the spaCy pipeline in a later cell.
exampledoc = importer.TextImporter("texts/AnnualReport2017-2018.txt")
# Keyword lists, looked up by topic name in later cells (e.g. 'climate change').
# The encoding is passed explicitly: without it open() decodes with the
# platform's locale encoding, which can mis-read or reject a UTF-8 JSON file on
# systems whose default is not UTF-8.
with open("ndc_keywords/ndc_south_africa.json", encoding="utf-8") as f:
    kwds = json.load(f)

In [10]:
# Register a span correlator named "climate_corr" from the 'climate change'
# keyword list; later cells read it back as `span._.climate_corr`.
KeywordCorrelator.add_span_subject_correlator("climate_corr", keywords=kwds['climate change'])

In [11]:
# Parse the imported report; from here on `doc` refers to it rather than to the
# toy sentence used earlier.
doc = nlp(exampledoc.text)
# One Span per 4-token window, sliding one token at a time over the report.
n_gram_spans = [doc[start:stop] for start, stop in get_n_gram_tuples(4, len(doc))]


In [12]:
# Clear the document's entity list before the correlation taggers run.
doc.ents = []
# Keep the count as the cell's last expression so the notebook displays it;
# placed before the assignment, its value was computed and silently discarded.
len(n_gram_spans)

In [13]:
# n_gram_spans holds a 4-token window starting at every token, so taking every
# fourth one leaves consecutive, non-overlapping windows (starts 0, 4, 8, ...).
# Those windows are passed to the tagger with the "climate_corr" attribute,
# the value 0.7 and the label "CLIMATE".
entity_correlation_tagger(doc, n_gram_spans[::4], 0.7, 'climate_corr', "CLIMATE")

In [14]:
# Spot-check the "climate_corr" value stored on one tagged entity.
# NOTE(review): the index 9 is hard-coded; this raises IndexError whenever
# fewer than ten entities are present on the document.
doc.ents[9]._.climate_corr

0.7091955

In [15]:
# Same procedure for the 'early warning' keyword list: register a correlator
# named "warning_corr", then run the tagger over every fourth 4-token window
# with the value 0.7 and the label "WARNING".
KeywordCorrelator.add_span_subject_correlator('warning_corr', keywords=kwds['early warning'])
entity_correlation_tagger(doc, n_gram_spans[::4], 0.7,  'warning_corr', "WARNING")
# Last expression of the cell, so the notebook displays the document's entities.
doc.ents



(Budget Office
 SADC,
 the
 African Development,
 Budget Office
 (,
 Development, and the,
 development of the project,
 the development of the,
 for
 Sustainable Development,
 
 Economic Development,,
 and
 Transcription System,
 budget slowed
 the,
 budget advice.,
 Budget Reports
 2015/16,
 Budget Actual
 amounts)

In [16]:
from spacy import displacy

# Render every entity together with up to 20 tokens of context on each side.
for e in doc.ents:
    # Clamp the left edge at 0: for an entity within the first 20 tokens,
    # `e.start - 20` is negative, and a negative slice start counts from the
    # end of the document, yielding a wrong or empty window instead of the
    # entity's context. An end index past the document is harmless for a slice.
    displacy.render(doc[max(e.start - 20, 0):e.end + 20], style='ent')

In [17]:
# Rebind `doc` to a short sentence for the token-picking experiment in the next
# cell. NOTE: this replaces the document that `doc` referred to before this cell.
doc = nlp("hello world this sentence is full of stop words an apple disappeared")

In [18]:
# Pick three tokens by position and keep them in a list.
toks = [doc[position] for position in (2, 6, 8)]

# Join the tokens' string forms with single spaces; as the last expression of
# the cell, the resulting string is displayed.
" ".join(map(str, toks))

'this of words'

In [None]:
from policyanalysis.nlptools.correlation import SpanCorrelator, TokenArrayCorrelator

# Keyword list for the 'climate change' topic from the loaded JSON.
climate_kwds = kwds['climate change']
# Build a SpanCorrelator from those keywords.
# NOTE(review): 0.7 and "CLIMATE" mirror the value and label passed to
# entity_correlation_tagger earlier -- presumably threshold and entity label;
# confirm against SpanCorrelator's signature.
span_correlate = SpanCorrelator(climate_kwds, 0.7, "CLIMATE")