In [None]:
import spacy
import numpy as np
from correlation import *
from spacy import displacy

# Load the "en_core_web_sm" spaCy pipeline once; the cells below call `nlp(...)`
# to turn raw text into the documents that get correlated and rendered.
nlp = spacy.load("en_core_web_sm")

In [None]:
from textutils import importer
import json

# Wrap the annual report text file; its `.text` attribute is parsed at the end
# of this cell.
annual_report = importer.TextImporter("../texts/AnnualReport2017-2018.txt")

# Read the NDC keyword groups. The encoding is pinned to UTF-8 so that parsing
# does not depend on the platform's default locale encoding.
with open("../ndc_keywords/ndc_south_africa.json", encoding="utf-8") as f:
    ndc_keywords = json.load(f)

# Keyword lists, one per topic that is correlated against the report below.
climate_keywords = ndc_keywords['climate change']
policy = ["policy", "integrate", "implement", "committee", "consultation"]
food = ["nutritions", "diets", "farm", "agriculture", "ecology"]
ndc_national_adaption_plan = [
    "nap",
    "sector plan",
    "nccrp",
    "vulnerable sector",
    "geographic vulnerability",
]
ndc_early_warning = [
    "system",
    "vulnerability",
    "needs",
    "assessment",
    "network",
    "weather",
    "earth",
    "observation",
    "academic",
    "community",
]

# Run the full report text through the spaCy pipeline.
doc = nlp(annual_report.text)

# Running NGram correlation on a document

NGrams are all possible subsections of a document of length N. NGramCorrelateSpacy is a class that has the
ability to break a document up into all possible defined NGrams, then correlate them against a 
group of keywords to flag for review.

Before running any entity tagging operation, the current document's entity list should be cleared, as spaCy 
prohibits overlapping entities.

In this example, we will initialize an NGramCorrelateSpacy object with a list of climate change keywords
and set it to tag entities if their correlation surpasses a given value. If the correlation for an NGram 
surpasses 0.7, as seen in this example, it will be tagged with the flag CLIMATE_N. 

We start the correlation process by passing the correlator the document, and the size of NGrams we want to analyze.
Using N=4 will correlate all possible spans of length 4.

After correlation of the possible 4-Grams, we display passages surrounding five of the flagged entities.

In [None]:
from n_gram_correlation import NGramCorrelateSpacy
# Correlation value an n-gram must surpass before it is tagged, and the n-gram
# length in tokens. Both are shared by every correlator in this cell, so they
# are named once here instead of being repeated per call.
NGRAM_THRESHOLD = 0.7
NGRAM_SIZE = 4

# One correlator per keyword group; the last argument is the tag name.
ngram_climate_corr = NGramCorrelateSpacy(climate_keywords, NGRAM_THRESHOLD, "CLIMATE")
ngram_policy_corr = NGramCorrelateSpacy(policy, NGRAM_THRESHOLD, "POLICY")
ngram_food_corr = NGramCorrelateSpacy(food, NGRAM_THRESHOLD, "FOOD")
ngram_adaption_corr = NGramCorrelateSpacy(ndc_national_adaption_plan, NGRAM_THRESHOLD, "ADAPTION")
ngram_warning_corr = NGramCorrelateSpacy(ndc_early_warning, NGRAM_THRESHOLD, "WARNING")

# Clear the entity list before tagging, since overlapping entities are not allowed.
doc.ents = []

# Run the correlators in the same order as before: climate, policy, food,
# adaption, warning.
for ngram_correlator in (
    ngram_climate_corr,
    ngram_policy_corr,
    ngram_food_corr,
    ngram_adaption_corr,
    ngram_warning_corr,
):
    ngram_correlator.correlate_spans(doc, NGRAM_SIZE)

In [None]:

# Highlight colour per entity label; the keys are the tag names handed to the
# n-gram correlators.
colors = {
    'CLIMATE': "green",
    'WARNING': "red",
    "POLICY": "yellow",
    "FOOD": "orange",
    "ADAPTION": "pink"
}

# Number of tokens of surrounding context rendered on each side of an entity.
CONTEXT_TOKENS = 10

for e in doc.ents:
    # Clamp the window start at 0. A negative start index is interpreted as an
    # offset from the END of the document, so an entity within the first
    # CONTEXT_TOKENS tokens would otherwise render an empty or wrong passage.
    # An end index past the document length is already truncated by slicing.
    window_start = max(e.start - CONTEXT_TOKENS, 0)
    window_end = e.end + CONTEXT_TOKENS
    displacy.render(doc[window_start:window_end], style='ent', options={'colors': colors})
# To render the whole document instead of per-entity passages:
# displacy.render(doc, style='ent', options={"colors": colors})

# Other correlation tools

## KeywordCorrelator
The KeywordCorrelator class is a simple tool to analyze the correlation of any set of strings
to a static list of keywords. This class is initialized with a list of keywords that we are
interested in. We can then make calls on an instance of this class with a list of strings, 
and it will return a list of coefficients (cosine similarities) between the embeddings 
of each sentence and the embeddings of the keywords. The returned similarities
are the maximum cosine similarity found amongst a given sentence and all the keyword embeddings.

This class is used as the basis for all other correlators.

## SpanCorrelator
The SpanCorrelator is a slightly more intelligent version of the generic correlator. It features
Spacy integration, allowing it to mark provided sections of a document with tags if the sections
score high enough against the embeddings of the initialized keywords. 

## TokenArrayCorrelator
The TokenArrayCorrelator is a specialized version of the SpanCorrelator. It can be passed an array of token arrays
which do not have to be consecutive, although the position of tokens in a given array must be in
an ascending order by location in the document. It operates similarly to SpanCorrelator, embedding
the strings created by the selections of tokens against initialized keywords, then tagging 
entities in the document which include the token subsets if their cosine similarities 
pass a certain threshold.

In [None]:
# Build one instance of each correlator described above, all on the climate
# keywords. The two tagging correlators also receive 0.4 (presumably the
# similarity threshold a passage must pass to be tagged -- confirm against the
# class definitions) and the label to tag with; the generic correlator takes
# only the keywords.
token_correlator = TokenArrayCorrelator(climate_keywords, 0.4, "CLIMATE_TOKEN")
span_correlator = SpanCorrelator(climate_keywords, 0.4, "CLIMATE_SPAN")
generic_correlator = KeywordCorrelator(climate_keywords)


# First, explore the generic correlator functionality

In [None]:
# Two probe sentences: one written around the keywords, one off-topic.
sentence_to_correlate = "We need to adapt our project to be more resillient to geographical circumstances."
unrelated_sentence = "The next time the leaders will meet in paris"

# Score both sentences against the climate keywords and show the coefficients.
probe_sentences = [sentence_to_correlate, unrelated_sentence]
probe_scores = generic_correlator(probe_sentences)
print(probe_scores)

We can see that the sentence which uses some of the NDC keywords has a much higher similarity score than the unrelated sentence.

# Now, let's see how we can integrate the output of our embedder into spaCy, an NLP library

In [None]:
# Parse both probe sentences as a single document (this rebinds `doc`).
combined_text = " ".join((sentence_to_correlate, unrelated_sentence))
doc = nlp(combined_text)

# Start from an empty entity list, then let the span correlator tag whole sentences.
doc.ents = []
doc_sentences = list(doc.sents)
span_correlator(doc, doc_sentences)

In [None]:
# Render the document with its current entities highlighted.
displacy.render(doc, style='ent')

In [None]:
# Lower-cased words to drop from each sentence before correlating.
words_to_omit = {'to', 'the', 'our', 'we'}

# Build one token list per sentence, keeping only tokens not in the omit set.
tokens_filtered = []
for sentence in doc.sents:
    kept_tokens = [token for token in sentence if token.text.lower() not in words_to_omit]
    tokens_filtered.append(kept_tokens)


In [None]:
# Rebuild one space-separated string per passage from the tokens that were kept.
filtered_to_strings = [" ".join(map(str, passage)) for passage in tokens_filtered]

# See that stopwords have been removed
print(filtered_to_strings)

# Notice that by removing some stop words, we have boosted the score in the first sentence
filtered_scores = generic_correlator(filtered_to_strings)
print(filtered_scores)

In [None]:
# Reset the entity list before tagging again.
doc.ents = []

# Correlate the filtered token lists and tag matches in the document.
token_correlator(doc, tokens_filtered)

# Show the result, colouring the token-correlator label blue.
token_entity_colors = {"CLIMATE_TOKEN": "blue"}
displacy.render(doc, style='ent', options={"colors": token_entity_colors})