In [2]:
# https://github.com/nikitakit/self-attentive-parser
import spacy, benepar
from spacy import displacy
import nltk

In [3]:
benepar.download('benepar_en3')

[nltk_data] Downloading package benepar_en3 to
[nltk_data]     /home/zhengzhang/nltk_data...
[nltk_data]   Package benepar_en3 is already up-to-date!


True

In [4]:
nlp = spacy.load('en_core_web_md')
if spacy.__version__.startswith('2'):
    nlp.add_pipe(benepar.BeneparComponent("benepar_en3"))
else:
    nlp.add_pipe("benepar", config={"model": "benepar_en3"})
doc = nlp("This particular location has a good check in deal. We came here near Christmas time to buy some presents and we had a good experience.")
sent = list(doc.sents)[0]
print(sent._.parse_string)

(S (NP (DT This) (JJ particular) (NN location)) (VP (VBZ has) (NP (NP (DT a) (JJ good) (NN check)) (PP (IN in) (NP (NN deal))))) (. .))




In [5]:
for c in sent._.constituents:
    print(c, c._.labels, c._.parse_string, c._.children, c._.children)

This particular location has a good check in deal. ('S',) (S (NP (DT This) (JJ particular) (NN location)) (VP (VBZ has) (NP (NP (DT a) (JJ good) (NN check)) (PP (IN in) (NP (NN deal))))) (. .)) <generator object get_child_spans at 0x7f752905ac10> <generator object get_child_spans at 0x7f752905ac80>
This particular location ('NP',) (NP (DT This) (JJ particular) (NN location)) <generator object get_child_spans at 0x7f752905aba0> <generator object get_child_spans at 0x7f752905ac80>
This () (DT This) <generator object get_child_spans at 0x7f752905ac10> <generator object get_child_spans at 0x7f752905ac80>
particular () (JJ particular) <generator object get_child_spans at 0x7f752905aba0> <generator object get_child_spans at 0x7f752905ac80>
location () (NN location) <generator object get_child_spans at 0x7f752905ac10> <generator object get_child_spans at 0x7f752905ac80>
has a good check in deal ('VP',) (VP (VBZ has) (NP (NP (DT a) (JJ good) (NN check)) (PP (IN in) (NP (NN deal))))) <generator

In [6]:
from pprint import pprint

def enumerateSort(span):
    if len(span._.labels) == 0:
        # leaf node
        return [span._.parse_string]
    
    partial_enumerated = []
    partial_enumerated.append(span._.labels[0])

    sub_enumerated = []
    for c in span._.children:
        if len(sub_enumerated) == 0:
            sub_enumerated.extend(enumerateSort(c))
            continue
        else:
            right_side = enumerateSort(c)
            left_side = sub_enumerated
            sub_enumerated = [ a + " " + b for a in left_side for b in right_side ]
    
    partial_enumerated.extend(sub_enumerated)
    # print(f"span: {span._.labels}, partial_enumerated: {partial_enumerated}")
    return partial_enumerated


In [7]:
pprint(enumerateSort(sent))

['S',
 'NP VP (. .)',
 'NP (VBZ has) NP (. .)',
 'NP (VBZ has) NP PP (. .)',
 'NP (VBZ has) NP (IN in) NP (. .)',
 'NP (VBZ has) (DT a) (JJ good) (NN check) PP (. .)',
 'NP (VBZ has) (DT a) (JJ good) (NN check) (IN in) NP (. .)',
 '(DT This) (JJ particular) (NN location) VP (. .)',
 '(DT This) (JJ particular) (NN location) (VBZ has) NP (. .)',
 '(DT This) (JJ particular) (NN location) (VBZ has) NP PP (. .)',
 '(DT This) (JJ particular) (NN location) (VBZ has) NP (IN in) NP (. .)',
 '(DT This) (JJ particular) (NN location) (VBZ has) (DT a) (JJ good) (NN '
 'check) PP (. .)',
 '(DT This) (JJ particular) (NN location) (VBZ has) (DT a) (JJ good) (NN '
 'check) (IN in) NP (. .)']


In [17]:
import re

SyntaxMapping = {
    "NP": "A",
    "NN": "B",
    "CC": "C",
    "CD": "D",
    "DT": "E",
    "EX": "F",
    "FW": "G",
    "IN": "H",
    "JJ": "I",
    "JJR": "J",
    "JJS": "K",
    "LS": "L",
    "MD": "M",
    "NNS": "N",
    "NNP": "O",
    "NNPS": "P",
    "PDT": "Q",
    "POS": "R",
    "PRP": "S",
    "PRP$": "T",
    "RB": "U",
    "RBR": "V",
    "RBS": "W",
    "RP": "X",
    "TO": "Y",
    "UH": "Z",
    "VB": "a",
    "VBG": "b",
    "VBD": "c",
    "VBN": "d",
    "VBP": "e",
    "VBZ": "f",
    "WDT": "g",
    "WP": "h",
    "WRB": "i",
    "VP": "j"
}

enumerated = enumerateSort(sent)
new_enumerated = []
for enum in enumerated:
    new_enum = re.sub(r"\((\w+) (\w+)\)", r"\1", enum)
    new_enum = re.sub(r"\(\. \.\)", r"", new_enum)
    new_enumerated.append(new_enum)
    print(new_enum)

S
NP VP 
NP VBZ NP 
NP VBZ NP PP 
NP VBZ NP IN NP 
NP VBZ DT JJ NN PP 
NP VBZ DT JJ NN IN NP 
DT JJ NN VP 
DT JJ NN VBZ NP 
DT JJ NN VBZ NP PP 
DT JJ NN VBZ NP IN NP 
DT JJ NN VBZ DT JJ NN PP 
DT JJ NN VBZ DT JJ NN IN NP 


In [22]:
reg_exps = []
for enum in new_enumerated[1:]:
    reg_exp = ""
    print(enum)
    for e in enum.strip().split(" "):
        print(e)
        reg_exp += SyntaxMapping[e]
    reg_exps.append(reg_exp)

pprint(reg_exps)

NP VP 
NP
VP



KeyError: ''

In [12]:
displacy.serve(doc, style="det")

ValueError: [E087] Unknown displaCy style: det.