In [5]:
from operator import itemgetter
from typing import Tuple, Dict, List, Union
import spacy
from spacy.tokens import Span, Token
from spacy.tokens.underscore import Underscore
import benepar
from benepar.integrations.spacy_extensions import ConstituentData, get_constituent
from benepar.spacy_plugin import BeneparComponent



In [63]:
from spacy.symbols import nsubj, nsubjpass, csubj, csubjpass, acl, agent, \
    dobj, iobj, nmod, attr, oprd, pobj, advcl, ccomp, acomp, prep, \
    xcomp, \
    NOUN, ADJ, VERB, PROPN

# Dependency labels (compared against `token.dep`) that make a child of the
# root count as a subject-side phrase in get_relevant_nps.
SUBJ_RELATIONS = {
    nsubj,
    nsubjpass,
    csubj,
    csubjpass,
    acl,
}
# Dependency labels that make a child of the root count as an object-side
# phrase in get_relevant_nps.
OBJ_RELATIONS = {
    dobj,
    iobj,
    nmod,
    ccomp,
    acomp,
    pobj,
    prep,
}

# Every dependency label of interest, whichever side it belongs to.
RELEVANT_DEP_TAGS = SUBJ_RELATIONS.union(OBJ_RELATIONS)

# Part-of-speech tags (compared against `token.pos`) a token must carry to be
# kept by get_relevant_tokens.
RELEVANT_POS_TAGS = {
    NOUN,
    ADJ,
    VERB,
    PROPN,
}



# !python -m spacy download en_core_web_sm

In [33]:
# benepar.download("benepar_en3")

In [34]:
# benepar_path = "/Users/ronpick/workspace/zero-shot-stance/models/benepar_en3"
# benepar_component = BeneparComponent("benepar_en3")
# benepar_component

In [35]:
# nlp = spacy.load('en')
# NOTE: despite its name, `parser` is the object returned by spacy.load for
# the small English model (the saved output shows a spacy.lang.en.English
# instance); later cells call it directly on raw text.
parser = spacy.load("en_core_web_sm")
# parser = nlp.add_pipe("benepar", config={"model": "benepar_en3"})
# parser = nlp.add_pipe("parser")
parser

<spacy.lang.en.English at 0x1686ad430>

In [47]:
# text = "The time for action is now. It is never too late to do something."
# text = "I totally agree with this premise. As a younger person I was against Nuclear power (I was in college during 3 mile island) but now it seems that nuclear should be in the mix. Fission technology is better, and will continue to get better if we actively promote its development. The prospect of fusion energy also needs to be explored. If it's good enough for the sun and the stars, it's good enough for me."

# Sample texts to experiment on. In the visible cells only `text3` is passed
# to the pipeline; the others are alternatives to swap in by hand.
text1 = "Regulation of corporations has been subverted by corporations. States that incorporate corporations are not " \
       "equipped to regulate corporations that are rich enough to influence elections, are rich enough to muster a " \
       "legal team that can bankrupt the state. Money from corporations and their principals cannot be permitted in " \
       "the political process if democracy is to survive."

text2 = "Absolutely it's needs to be defined and regulated in its use, as currently the word 'natural' " \
       "when used on food products is totally confusing and meaningless. Clearly they are trying to imply the item is " \
       "'healthy' or possibly 'organic', but when you see food 'manufacturers' like Frito-Lay or Campbell's with " \
       "products labelled 'natural', that alone should set off alarms that all is not what it seems. ;-)"

text3 = "America will never be a truly great country until health care is provided for all for little to no cost. " \
        "We pay for public education whether we want to or not, we pay for wars that cost trillions. " \
        "The U.S. can afford health care for all. Just do it."

text4 = "While do like the 99 cent rack on my kindle book store, there's a 50/50 chance that what I look at is " \
        "self-published because before digital publishing, no one would touch it- and for good reason. A good amount " \
        "of it is really really bad. So that said, publishers offer the value added-ness, if you will, but setting a " \
        "standard that makes for an enjoyable read. I hope Amazon, if they decide to eat up the older publishers, " \
        "hires those with generations of wisdom and allows those with it to exercise it in helping me make sure that " \
        "my time spent reading is worth my while."

# Unlike the texts above, this one is a triple-quoted literal, so the line
# breaks below are part of the string.
text5 = """Obesity is NOT a "life-style choice." I have not eaten in a fast food restaurant in over 5 years.
I eat no sugar at all -- no baked goods, no candy, no sodas, no jam or honey, no syrup (let alone "a pound of double
stuff Oreos every day"). I eat no red meat -- just fish and lean, skinless poultry. I eat no flour, just whole grains.
I eat fresh leafy vegetables and fruits. I eat very little dairy. I limit my calories to between 1500 and 1800 a day.
I exercise. And yet, at 5' 7", I weigh 215 pounds, and have for many years. Those of you who accuse me of a lack of
"self-discipline" or of "gluttony" are ignorant bigots who should be ashamed of yourselves."""

In [69]:
from typing import Dict, List, Union
from operator import itemgetter

# Bare annotations: no value is assigned here. They only declare the expected
# types of the `t` and `sent` loop variables used in this notebook.
t: Token
sent: Span
def get_relevant_tokens(root: Token) -> Dict[int, Token]:
    """Gather the tokens of the subtree under ``root`` that have a relevant POS.

    The walk covers ``root`` itself and everything reachable through
    ``children``, in pre-order. A token is kept when its ``pos`` is a member
    of ``RELEVANT_POS_TAGS``.

    Args:
        root: Token at the top of the subtree to scan.

    Returns:
        Mapping from each kept token's index ``i`` to the token itself.
    """
    kept: Dict[int, Token] = {}
    # Explicit stack instead of recursion. Children are pushed in reverse so
    # that they are popped, and therefore inserted, in their original order.
    pending = [root]
    while pending:
        node = pending.pop()
        if node.pos in RELEVANT_POS_TAGS:
            kept[node.i] = node
        pending.extend(reversed(list(node.children)))
    return kept


def get_chunks(tokens_by_position: Dict[int, Token]) -> List[str]:
    """Group position-adjacent tokens into space-joined chunk strings.

    Tokens are visited in ascending position order. A token whose position
    directly follows the previously visited one continues the current chunk;
    any gap closes the current chunk and starts a new one. A verb that
    continues a run is not appended, but it does not break the run either.

    A closed chunk is reported unless it is a single lone verb. The same rule
    is applied to the last chunk, so a trailing lone verb is dropped exactly
    like one in the middle of the sequence.

    Args:
        tokens_by_position: Mapping from token position to token. Each token
            must expose ``pos`` and ``text``.

    Returns:
        The chunk strings, in order of appearance.
    """
    chunks: List[str] = []

    def emit(chunk_tokens) -> None:
        # Single reporting rule shared by the in-loop and the final flush.
        if not chunk_tokens:
            return
        if len(chunk_tokens) == 1 and chunk_tokens[0].pos == VERB:
            return
        chunks.append(" ".join(tok.text for tok in chunk_tokens))

    current_chunk = []
    prev_i = None
    for i in sorted(tokens_by_position):
        token = tokens_by_position[i]
        if prev_i is not None and i == prev_i + 1:
            # Adjacent to the previous token: extend the run, skipping verbs.
            if token.pos != VERB:
                current_chunk.append(token)
        else:
            # Gap in positions: close the finished chunk and start a new one.
            emit(current_chunk)
            current_chunk = [token]
        prev_i = i

    emit(current_chunk)
    return chunks


def get_relevant_nps(root: Token) -> List[str]:
    """Collect chunk strings from the subject/object children of ``root``.

    For every direct child whose ``dep`` is in ``SUBJ_RELATIONS`` or
    ``OBJ_RELATIONS``, the relevant tokens of that child's subtree are
    gathered with ``get_relevant_tokens`` and grouped with ``get_chunks``.
    If that yields nothing and ``root`` has an ``xcomp`` child, the search is
    repeated from that child (the last one, if there are several).

    Args:
        root: Token whose direct children are inspected.

    Returns:
        The chunk strings found, in child order. May be empty.
    """
    nps: List[str] = []
    xcomp_root: Union[Token, None] = None
    for c in root.children:
        if c.dep in SUBJ_RELATIONS or c.dep in OBJ_RELATIONS:
            nps.extend(get_chunks(get_relevant_tokens(c)))
        elif c.dep == xcomp:
            xcomp_root = c

    # `nps` only grows in the loop above, so an empty list means no child
    # produced a chunk; only then is the open clausal complement explored.
    if not nps and xcomp_root is not None:
        nps.extend(get_relevant_nps(xcomp_root))

    return nps


# Run the pipeline on text3 and, for each sentence, print the sentence
# followed by the chunks extracted starting from its dependency root.
doc = parser(text3)
for sent in doc.sents:
    print(sent)
    # print([(t.text, t.pos_, f"{t.head.text} -> {t.dep_}") for t in sent])
    nps = get_relevant_nps(sent.root)
    print(nps)


America will never be a truly great country until health care is provided for all for little to no cost.
['America']
We pay for public education whether we want to or not, we pay for wars that cost trillions.
['public education']
The U.S. can afford health care for all.
['U.S.', 'health care']
Just do it.
[]


In [70]:
# Collect every noun chunk that contains at least one token whose dependency
# label is in RELEVANT_DEP_TAGS, printing a per-token trace along the way.
# NOTE(review): this reuses `doc` from an earlier cell. The saved output of
# this cell shows sentences from text2 although the visible cell above parses
# text3, so the output is stale relative to the code; re-run top to bottom.
topics = []
for sent in doc.sents:
    print(sent)
    print(" ".join([f"({t.text}, {t.pos_}, {t.dep_})" for t in sent]))
    for chunk in sent.noun_chunks:
        for token in chunk:
            print(f"\t\t{token.head.text}-->{token.text}, {token.dep_}")
            if token.dep in RELEVANT_DEP_TAGS:
                # One matching token is enough: record the chunk once and
                # move on to the next chunk.
                topics.append(chunk)
                print(f"\t{chunk}")
                # print([(c.lemma_, c.dep_) for c in chunk])
                break

print(topics)
    #     print("\t".join(map(str, [chunk.text , chunk.lemma_, chunk.start, chunk.end])))
    # print()
# span: Span = chunks[0]
# span.start, span. end, span.text

Absolutely it's needs to be defined and regulated in its use, as currently the word 'natural' when used on food products is totally confusing and meaningless.
(Absolutely, ADV, advmod) (it, PRON, nsubj) ('s, AUX, ROOT) (needs, NOUN, attr) (to, PART, aux) (be, AUX, auxpass) (defined, VERB, xcomp) (and, CCONJ, cc) (regulated, VERB, conj) (in, ADP, prep) (its, PRON, poss) (use, NOUN, pobj) (,, PUNCT, punct) (as, ADP, mark) (currently, ADV, pcomp) (the, DET, det) (word, NOUN, dep) (', PUNCT, punct) (natural, ADJ, amod) (', PUNCT, punct) (when, ADV, advmod) (used, VERB, advcl) (on, ADP, prep) (food, NOUN, compound) (products, NOUN, pobj) (is, VERB, dep) (totally, ADV, advmod) (confusing, ADJ, acomp) (and, CCONJ, cc) (meaningless, ADJ, conj) (., PUNCT, punct)
		's-->it, nsubj
	it
		's-->needs, attr
		use-->its, poss
		in-->use, pobj
	its use
		products-->food, compound
		on-->products, pobj
	food products
Clearly they are trying to imply the item is 'healthy' or possibly 'organic', but when 

In [None]:
# Print the dependency label of every token in `sent`, i.e. whatever sentence
# the most recent `for sent in doc.sents` loop left bound.
for token in sent:
    print(token.dep_)

In [None]:
# NOTE(review): `_.parse_string`, `_.labels` and `_.children` are presumably
# the Span extensions registered by benepar. Every `add_pipe("benepar", ...)`
# line visible in this notebook is commented out, so confirm the pipeline
# actually contains benepar before running this cell.
print(sent._.parse_string)
# (S (NP (NP (DT The) (NN time)) (PP (IN for) (NP (NN action)))) (VP (VBZ is) (ADVP (RB now))) (. .))
print(sent._.labels)
# ('S',)
print(list(sent._.children))
# The time for action
# The time for action

In [None]:
# Not the last expression of the cell, so this list is built and discarded.
list(sent.noun_chunks)
# Keep a handle on the sentence's `_` accessor to inspect it.
u: Underscore = sent._
# Last expression: displayed as the cell output.
u.span_extensions

In [None]:
# Step down two levels through the first child each time, printing what is
# found. `next(...)` is applied directly, so `_.children` is expected to be
# an iterator.
# NOTE(review): presumably needs benepar in the pipeline; confirm.
c = next(sent._.children)
print(c._.parse_string)
print(c._.labels)
c = next(c._.children)
print(c._.parse_string)

In [None]:
# for (start, end, label_index) in zip(con.starts, con.ends, con.labels):
#     label = con.label_vocab[label_index]
#     print(start, end, label)


In [None]:
def extract_from_np(np_span: Span) -> list:
    """Flatten a noun-phrase span into the texts of the pieces it contains.

    A span without children is returned as a one-element list holding its
    text. Otherwise each child is handled according to its first label:

    * no labels: the child's text is taken as is;
    * ``"PP"``: the child is handed to ``get_NPs``;
    * anything else: the child is flattened recursively.

    Args:
        np_span: Span exposing ``_.children`` and ``_.labels``, and whose
            children do the same.

    Returns:
        List of strings in left-to-right order.
    """
    children = list(np_span._.children)
    if not children:
        # `.text` states the intent directly; the previous `repr(...)` only
        # worked because the span's __repr__ happens to return its text.
        return [np_span.text]

    tokens = []
    for child in children:
        labels = child._.labels
        if len(labels) == 0:
            tokens.append(child.text)
        elif labels[0] == "PP":
            tokens.extend(get_NPs(child))
        else:
            tokens.extend(extract_from_np(child))

    return tokens



def get_NPs(span: Span) -> list:
    """Collect the noun-phrase pieces found anywhere below ``span``.

    Children without labels are ignored. A child whose first label is
    ``"NP"`` is handed to ``extract_from_np``; any other labelled child is
    searched recursively.

    Args:
        span: Span exposing ``_.children``, whose children expose
            ``_.labels``.

    Returns:
        List of strings produced by ``extract_from_np``, in left-to-right
        order.
    """
    nps = []
    for child in span._.children:
        labels = child._.labels
        if len(labels) == 0:
            continue

        if labels[0] == "NP":
            # extract_from_np already walks the whole NP subtree (it calls
            # back into get_NPs for nested PPs). Scanning the same NP again
            # here would report every nested phrase a second time.
            nps.extend(extract_from_np(child))
        else:
            nps.extend(get_NPs(child))

    return nps

In [None]:
# Print each sentence of `doc` followed by the pieces get_NPs extracts.
# NOTE(review): get_NPs reads `_.children` / `_.labels`, presumably benepar's
# Span extensions; confirm the pipeline includes benepar before running.
for sent in doc.sents:
    print(sent)
    nps = get_NPs(sent)
    print(nps)