⊕ [Stanford CoreNLP API in NLTK · nltk/nltk Wiki](https://github.com/nltk/nltk/wiki/Stanford-CoreNLP-API-in-NLTK)


In [1]:
from nltk.parse import CoreNLPParser

# Client for a CoreNLP server reached over HTTP on localhost:9004; the server
# has to be running already.
# NOTE(review): the input below is German, so the server is presumably started
# with German models -- confirm.
parser = CoreNLPParser(url='http://localhost:9004')

# Materialise the parses so the notebook displays them as the cell result.
[tree for tree in parser.raw_parse('Ich bin schwanger')]

[Tree('ROOT', [Tree('NUR', [Tree('S', [Tree('PPER', ['Ich']), Tree('VAFIN', ['bin']), Tree('AP', [Tree('ADJD', ['schwanger'])])])])])]

In [2]:
# Same server endpoint, but configured to return part-of-speech tags.
pos_tagger = CoreNLPParser(url='http://localhost:9004', tagtype='pos')

# The tagger takes a list of tokens; the cell result is a list of (token, tag) pairs.
pos_tagger.tag(['Ich', 'bin', 'schwanger'])

[('Ich', 'PPER'), ('bin', 'VAFIN'), ('schwanger', 'ADJD')]

In [4]:
# Same server endpoint, configured to return named-entity labels.
ner_tagger = CoreNLPParser(url='http://localhost:9004', tagtype='ner')

# Whitespace splitting leaves the final token as 'Berlin.', yet the result lists
# 'Berlin' and '.' separately -- so the output need not align one-to-one with
# the token list passed in.
tokens = 'Donald Trump besuchte Angela Merkel in Berlin.'.split()
ner_tagger.tag(tokens)

[('Donald', 'PERSON'),
 ('Trump', 'PERSON'),
 ('besuchte', 'O'),
 ('Angela', 'PERSON'),
 ('Merkel', 'PERSON'),
 ('in', 'O'),
 ('Berlin', 'LOCATION'),
 ('.', 'O')]

In [6]:
# A second NER example, reusing the tagger created earlier in the notebook.
# In the recorded output 'Silicon' and 'Valley' are labelled ORGANIZATION.
text = 'Shenzhen ist das Silicon Valley für Hardware-Firmen'
ner_tagger.tag(text.split())

[('Shenzhen', 'LOCATION'),
 ('ist', 'O'),
 ('das', 'O'),
 ('Silicon', 'ORGANIZATION'),
 ('Valley', 'ORGANIZATION'),
 ('für', 'O'),
 ('Hardware-Firmen', 'O')]

In [17]:
from pynlp import StanfordCoreNLP

# Annotators requested from the pipeline, in order.
# Alternatives that were tried and are kept here for reference:
#   * the same list with 'quote' inserted before 'openie'
#     ('tokenize, ssplit, pos, lemma, ner, entitymentions, coref, sentiment, quote, openie')
#   * a lighter pipeline: 'tokenize, ssplit, pos, lemma, ner'
#   * 'tokenize, ssplit, pos, lemma, ner, depparse, coref'
# Note that 'quote' is NOT part of the active list below.
annotators = ', '.join([
    'tokenize',
    'ssplit',
    'pos',
    'lemma',
    'ner',
    'entitymentions',
    'coref',
    'sentiment',
    'openie',
])

# Extra pipeline properties passed through alongside the annotator list.
options = {'openie.resolve_coref': True}

nlp = StanfordCoreNLP(annotators=annotators, options=options)

In [18]:
# Sample news paragraph, written one sentence per list entry and re-joined with
# single spaces so the result is one continuous string.
text = ' '.join([
    'GOP Sen. Rand Paul was assaulted in his home in Bowling Green, Kentucky, on Friday, '
    'according to Kentucky State Police.',
    "State troopers responded to a call to the senator's residence at 3:21 p.m. Friday.",
    'Police arrested a man named Rene Albert Boucher, who they allege '
    '"intentionally assaulted" Paul, causing him "minor injury".',
    'Boucher, 59, of Bowling Green was charged with one count of fourth-degree assault.',
    'As of Saturday afternoon, he was being held in the Warren County Regional Jail '
    'on a $5,000 bond.',
])

# Run the pipeline over the paragraph; printing the result echoes the input text.
document = nlp(text)
print(document)

GOP Sen. Rand Paul was assaulted in his home in Bowling Green, Kentucky, on Friday, according to Kentucky State Police. State troopers responded to a call to the senator's residence at 3:21 p.m. Friday. Police arrested a man named Rene Albert Boucher, who they allege "intentionally assaulted" Paul, causing him "minor injury". Boucher, 59, of Bowling Green was charged with one count of fourth-degree assault. As of Saturday afternoon, he was being held in the Warren County Regional Jail on a $5,000 bond.


In [19]:
# Iterating over the document yields its sentences; number them from zero.
for index, sentence in enumerate(document):
    print('{} ){}'.format(index, sentence))


0 )GOP Sen. Rand Paul was assaulted in his home in Bowling Green, Kentucky, on Friday, according to Kentucky State Police. 
1 )State troopers responded to a call to the senator's residence at 3:21 p.m. Friday. 
2 )Police arrested a man named Rene Albert Boucher, who they allege "intentionally assaulted" Paul, causing him "minor injury". 
3 )Boucher, 59, of Bowling Green was charged with one count of fourth-degree assault. 
4 )As of Saturday afternoon, he was being held in the Warren County Regional Jail on a $5,000 bond.


In [20]:
# Text of every entity mention in the document whose type is PERSON.
# The recorded output also contains pronouns ('his', 'him', 'he').
list(map(str, filter(lambda mention: mention.type == 'PERSON', document.entities)))


['Rand Paul', 'his', 'Rene Albert Boucher', 'Paul', 'him', 'Boucher', 'he']

In [21]:
# Keep a handle on the first sentence; later cells reuse `first_sentence`.
first_sentence = document[0]

# One line per entity mention in that sentence: its text followed by its type.
for entity in first_sentence.entities:
    print('{} ({})'.format(entity, entity.type))

GOP (ORGANIZATION)
Rand Paul (PERSON)
Bowling Green (CITY)
Kentucky (STATE_OR_PROVINCE)
Friday (DATE)
Kentucky State Police (ORGANIZATION)
his (PERSON)


In [22]:
# Show the tokens of the first sentence whose POS tag contains 'VB'
# (VBD, VBN and VBG in the recorded output).
# A Sentence object iterates over Token objects.

for token in first_sentence:
    if 'VB' not in token.pos:
        continue
    print('{} {}'.format(token, token.pos))

was VBD
assaulted VBN
according VBG


In [23]:
# Using the same words, let's see the lemmas: each 'VB'-tagged token of the
# first sentence is printed next to its lemma.

for token in first_sentence:
    if 'VB' in token.pos:
        print('{} -> {}'.format(token, token.lemma))

was -> be
assaulted -> assault
according -> accord


In [24]:
# Coreference resolution
# Let's use pynlp to find the first CorefChain in the text.
# `chain` is kept as a notebook variable; a later cell reads `chain.referent`.

chain = document.coref_chains[0]
# Printing the chain shows the sentences involved, with each mention
# bracketed and followed by its id.
print(chain)

((State troopers))-[id=13] responded to a call to the senator's residence at 3:21 p.m. Friday. 
Police arrested a man named Rene Albert Boucher, who (they)-[id=16] allege "intentionally assaulted" Paul, causing him "minor injury". 



In [26]:
# The referent is the mention the rest of the chain points back to.
ref = chain.referent
print('Coreference: {}\n'.format(ref))

# Dump the referent's descriptive attributes, one "name: value" line each.
for attr in ('type', 'number', 'animacy', 'gender'):
    print('{}: {}'.format(attr, getattr(ref, attr)))

# Note that we can also index coreferences by id
# assert chain[4].is_referent
# NOTE(review): the chain printed earlier in this notebook shows ids 13 and 16,
# so the id 4 in the example above looks stale for this run -- confirm the id
# before enabling the assert.

Coreference: State troopers

type: NOMINAL
number: PLURAL
animacy: ANIMATE
gender: UNKNOWN


In [27]:
# Show the quotes attached to the annotated document.
# NOTE(review): the active `annotators` string in this notebook does not include
# 'quote' (only a commented-out variant does), which is presumably why this
# prints [] even though the text contains quoted spans -- confirm.
print(document.quotes)


[]


In [28]:
# Persist the annotated document to disk as raw bytes.
#
# Serialise first, then open the file: open(..., 'wb') truncates the target as
# soon as it is opened, so converting up front means a failure inside
# to_bytes() cannot leave an empty or clobbered annotation.dat behind.
payload = document.to_bytes()
with open('annotation.dat', 'wb') as file:
    file.write(payload)

In [29]:
# Restore a pynlp document from disk: read the saved bytes back and hand them
# to the Document.from_bytes class method.
# Note: this rebinds `document`, replacing the object produced by the pipeline.

from pynlp import Document

with open('annotation.dat', 'rb') as file:
    raw_annotation = file.read()
document = Document.from_bytes(raw_annotation)