In [1]:
import tensorflow_hub as hub
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [2]:
# Load the Universal Sentence Encoder (v4) module from TF Hub. `embed` is
# called in later cells with a list of strings to obtain their embeddings.
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")


In [5]:
# Sanity check: embed one unrelated sentence and two near-paraphrases, then
# display the matrix of pairwise inner products between the embeddings.
sample_sentences = [
    "The quick brown fox jumps over the lazy dog.",
    "I want to get an embedding",
    "I want to find an embedding",
]
embeddings = embed(sample_sentences)
np.inner(embeddings, embeddings)


tf.Tensor(
[[-3.1330168e-02 -6.3386343e-02 -1.6075011e-02 ... -3.2427795e-02
  -4.5757405e-02  5.3704556e-02]
 [ 6.3447729e-02 -7.4907422e-02  1.8337060e-02 ... -6.3235313e-03
  -8.7126193e-04 -3.7472900e-02]
 [ 4.2843256e-02 -4.4590678e-02 -2.4835004e-03 ...  3.7239727e-02
   9.6465948e-05 -3.2832187e-02]], shape=(3, 512), dtype=float32)


array([[ 1.0000001 , -0.03166059, -0.02791631],
       [-0.03166059,  1.0000001 ,  0.9253975 ],
       [-0.02791631,  0.9253975 ,  1.0000001 ]], dtype=float32)

In [None]:
# From https://colab.research.google.com/github/tensorflow/hub/blob/master/examples/colab/semantic_similarity_with_tf_hub_universal_encoder.ipynb#scrollTo=h1FFCTKm7ba4

def plot_similarity(labels, features, rotation):
  """Draw a heatmap of the pairwise inner products of `features`.

  Args:
    labels: Tick labels used for both the x and y axes of the heatmap.
    features: Embeddings to compare; `np.inner` is taken of this with itself.
    rotation: Rotation applied to the x-axis tick labels.
  """
  similarity = np.inner(features, features)

  # Enlarge fonts before drawing so the tick labels stay readable.
  sns.set(font_scale=1.2)

  heatmap_options = dict(
      xticklabels=labels,
      yticklabels=labels,
      vmin=0,  # colour scale is pinned to the range [0, 1]
      vmax=1,
      cmap="YlOrRd",
  )
  ax = sns.heatmap(similarity, **heatmap_options)
  ax.set_xticklabels(labels, rotation=rotation)
  ax.set_title("Semantic Textual Similarity")

def run_and_plot(messages_):
  """Embed `messages_` and plot their pairwise similarity as a heatmap.

  The messages themselves are used as the tick labels, and the x-axis labels
  are rotated by 90.
  """
  plot_similarity(messages_, embed(messages_), 90)

In [None]:
# Compare a few short statements against each other in a similarity heatmap.
statements = [
    "Sustainability is our top goal",
    "We will combat climate change taking the following",
    "Coal is good",
    "Women's rights are human rights",
]
run_and_plot(statements)

In [3]:
from nlptools.correlation import KeywordCorrelateSpacy

In [4]:
import json
from nlptools import importer

In [None]:
# TODO
# - Filter out root words from the correlation list
# - Figure out what the correlation number means
# - Find a minimum correlation number such that the meaning is still preserved


In [5]:
import spacy
from nlptools import importer
from spacy.tokens import Token
from spacy.language import Language
# Load spaCy's "en_core_web_sm" pipeline; the keyword-correlation component
# is added to this `nlp` object in a later cell.
nlp = spacy.load("en_core_web_sm")

In [6]:
# Wrap the annual-report text file; its `.text` attribute is what gets passed
# to the spaCy pipeline further down.
exampledoc = importer.TextImporter("../texts/AnnualReport2017-2018.txt")

In [7]:
# Load the NDC keyword lists. JSON is UTF-8 by specification, so pin the
# encoding instead of relying on the platform default.
with open("../ndc_keywords/ndc_south_africa.json", encoding="utf-8") as f:
    kwds = json.load(f)

print(kwds['climate change'])

# spaCy refuses to add a second component under a name that is already in the
# pipeline, so drop any instance left over from a previous run of this cell.
if "kwd_correlate_factory" in nlp.pipe_names:
    nlp.remove_pipe("kwd_correlate_factory")

# Add the keyword-correlation component configured with the 'climate change'
# keywords. Presumably `correlation_tag` names the token extension that is
# read later as `token._.kwd_correlate` -- confirm against nlptools.
nlp.add_pipe(
    "kwd_correlate_factory",
    config={
        "tf_model": "https://tfhub.dev/google/universal-sentence-encoder/4",
        "keywords": kwds['climate change'],
        "correlation_tag": "kwd_correlate",
    },
)

['adaption program', 'adaption projects', 'resilience', 'institutional capacity', 'response', 'budget reprioritisation', 'development', 'planner', 'regulator practitioners', 'geographical circumstances', 'land use scheme', 'authorisation system', 'spatial land use management act', 'spluma']


<nlptools.correlation.KeywordCorrelateSpacy at 0x17b3cf645e0>

In [8]:
# Run the spaCy pipeline (with the keyword-correlation component added above)
# over the full report text.
doc = nlp(exampledoc.text)

In [12]:
# Minimum keyword-correlation score a token needs in order to be listed.
thresh = 0.5

# Print every token whose score exceeds the threshold, in document order.
for token in doc:
    # Read the custom extension once per token instead of once for the
    # comparison and again for the message.
    score = token._.kwd_correlate
    if score > thresh:
        print(f"{token.text}: {score}")

enduring: 0.5766626596450806
eport: 0.5190252661705017
Secretary: 0.5134242177009583
SECRETARY: 0.5134242177009583
Strengthening: 0.609574556350708
Accountability: 0.681199312210083
Building: 0.5129121541976929
Capacity: 0.647070586681366
Strengthening: 0.609574556350708
Institutional: 0.704737663269043
Capacity: 0.647070586681366
Development: 0.9999998807907104
Plan: 0.5336602330207825
Programme: 0.6636916399002075
Programme: 0.6636916399002075
Programme: 0.6636916399002075
Programme: 0.6636916399002075
Programme: 0.6636916399002075
Programme: 0.6636916399002075
Institutional: 0.704737663269043
Improvement: 0.5251588821411133
Programme: 0.6636916399002075
Programme: 0.6636916399002075
Improvement: 0.5251588821411133
Schedule: 0.5952224731445312
Budget: 0.7207304239273071
Budget: 0.7207304239273071
Budget: 0.7207304239273071
Programme: 0.6636916399002075
Programme: 0.6636916399002075
Improvement: 0.5251588821411133
Improvement: 0.5251588821411133
Programme: 0.6636916399002075
Programme