In [1]:
# The original stanfordcorenlp Python wrapper does not expose CoreNLP's
# relation annotator, so I added it in my own fork (installation steps below).
# !pip install stanfordcorenlp

### Installing my forked CoreNLP Python wrapper with relation support (testing)
```bash
pip uninstall stanfordcorenlp
git clone https://github.com/suisuiwudi/stanford-corenlp
cd stanford-corenlp
python setup.py bdist_wheel --universal
pip install dist/stanfordcorenlp-*.whl
```

In [2]:
from stanfordcorenlp import StanfordCoreNLP
# Path to the CoreNLP distribution directory, relative to the working directory.
nlp = StanfordCoreNLP('../stanford-corenlp-full-2018-02-27') 
# NOTE: the CoreNLP server used by this wrapper needs root privileges, so the
# notebook server itself has to be started as root while it is running,
# e.g. `sudo jupyter notebook`.

In [3]:
# Sample sentence used to try out each annotator exposed by the wrapper.
sentence = 'Guangdong University of Foreign Studies is located in Guangzhou.'
# Each call below annotates the same sentence and prints the raw result.
print('Tokenize:', nlp.word_tokenize(sentence))
print('Part of Speech:', nlp.pos_tag(sentence))
print('Named Entities:', nlp.ner(sentence))
print('Constituency Parsing:', nlp.parse(sentence))
print('Dependency Parsing:', nlp.dependency_parse(sentence))

Tokenize: ['Guangdong', 'University', 'of', 'Foreign', 'Studies', 'is', 'located', 'in', 'Guangzhou', '.']
Part of Speech: [('Guangdong', 'NNP'), ('University', 'NNP'), ('of', 'IN'), ('Foreign', 'NNP'), ('Studies', 'NNPS'), ('is', 'VBZ'), ('located', 'JJ'), ('in', 'IN'), ('Guangzhou', 'NNP'), ('.', '.')]
Named Entities: [('Guangdong', 'ORGANIZATION'), ('University', 'ORGANIZATION'), ('of', 'ORGANIZATION'), ('Foreign', 'ORGANIZATION'), ('Studies', 'ORGANIZATION'), ('is', 'O'), ('located', 'O'), ('in', 'O'), ('Guangzhou', 'CITY'), ('.', 'O')]
Constituency Parsing: (ROOT
  (S
    (NP
      (NP (NNP Guangdong) (NNP University))
      (PP (IN of)
        (NP (NNP Foreign) (NNPS Studies))))
    (VP (VBZ is)
      (ADJP (JJ located)
        (PP (IN in)
          (NP (NNP Guangzhou)))))
    (. .)))
Dependency Parsing: [('ROOT', 0, 7), ('compound', 2, 1), ('nsubjpass', 7, 2), ('case', 5, 3), ('compound', 5, 4), ('nmod', 2, 5), ('auxpass', 7, 6), ('case', 9, 8), ('nmod', 7, 9), ('punct', 7, 10)]

### Generate dataset from CoNLL 2003 (using spacy code):

In [4]:
def _consume_os(tags):
    ## reference: https://github.com/explosion/spaCy/blob/c7d53348d7c0474852dc5ebe5794f2816ef7eb01/spacy/gold.pyx
    while tags and tags[0] == 'O':
        yield tags.pop(0)


def _consume_ent(tags):
    if not tags:
        return []
    tag = tags.pop(0)
    target_in = 'I' + tag[1:]
    target_last = 'L' + tag[1:]
    length = 1
    while tags and tags[0] in {target_in, target_last}:
        length += 1
        tags.pop(0)
    label = tag[2:]
    if length == 1:
        return ['U-' + label]
    else:
        start = 'B-' + label
        end = 'L-' + label
        middle = ['I-%s' % label for _ in range(1, length - 1)]
        return [start] + middle + [end]
    
def iob_to_biluo(tags):
    """Convert a sequence of IOB entity tags into BILUO tags.

    Args:
        tags: Iterable of IOB tag strings. It is copied into a new list first,
            so the caller's sequence is not modified.

    Returns:
        A list of BILUO tag strings, built by alternately draining runs of
        'O' tags and single entity spans from the working copy.
    """
    out = []
    tags = list(tags)
    while tags:
        out.extend(_consume_os(tags))
        out.extend(_consume_ent(tags))
    return out

def read_conll_ner(input_path):
    """Read a CoNLL-style NER file into a nested document structure.

    The text is split into documents on the ``-DOCSTART- -X- O O`` marker and
    into sentences on blank lines. The whitespace-separated columns of each
    sentence are unpacked as word, tag, chunk and IOB entity label; the entity
    labels are converted with ``iob_to_biluo``.

    Args:
        input_path: Path of the UTF-8 encoded input file.

    Returns:
        A list with one dict per document, shaped as
        ``{'id': <index>, 'paragraphs': [{'sentences': [...]}]}``. Each
        sentence is ``{'tokens': [{'orth': word, 'tag': tag, 'ner': label}]}``.

    Raises:
        ValueError: If the zipped columns of a sentence cannot be unpacked
            into exactly four fields.
    """
    ## reference: https://github.com/explosion/spaCy/blob/master/spacy/cli/converters/conll_ner2json.py
    # Context manager so the file handle is closed deterministically.
    with open(input_path, 'r', encoding='utf-8') as conll_file:
        text = conll_file.read()
    delimit_docs = '-DOCSTART- -X- O O'
    output_docs = []
    for doc in text.strip().split(delimit_docs):
        doc = doc.strip()
        if not doc:
            continue
        output_doc = []
        for sent in doc.split('\n\n'):
            sent = sent.strip()
            if not sent:
                continue
            lines = [line.strip() for line in sent.split('\n') if line.strip()]
            # The chunk column is read but not used in the output.
            words, tags, chunks, iob_ents = zip(*[line.split() for line in lines])
            biluo_ents = iob_to_biluo(iob_ents)
            output_doc.append({'tokens': [
                {'orth': w, 'tag': tag, 'ner': ent} for (w, tag, ent) in
                zip(words, tags, biluo_ents)
            ]})
        output_docs.append({
            'id': len(output_docs),
            'paragraphs': [{'sentences': output_doc}]
        })
    return output_docs

In [5]:
# Parse the CoNLL 2003 test file (path relative to the working directory)
# into the list-of-documents structure returned by read_conll_ner.
test_data = read_conll_ner('CoNLL - 2003/en/test.txt')

In [6]:
# Inspect the sentence at index 1 of the first document: one dict per token
# with its surface form ('orth'), tag ('tag') and BILUO entity label ('ner').
print(test_data[0]['paragraphs'][0]['sentences'][1])

{'tokens': [{'ner': 'O', 'tag': 'NN', 'orth': 'SOCCER'}, {'ner': 'O', 'tag': ':', 'orth': '-'}, {'ner': 'U-LOC', 'tag': 'NNP', 'orth': 'JAPAN'}, {'ner': 'O', 'tag': 'VB', 'orth': 'GET'}, {'ner': 'O', 'tag': 'NNP', 'orth': 'LUCKY'}, {'ner': 'O', 'tag': 'NNP', 'orth': 'WIN'}, {'ner': 'O', 'tag': ',', 'orth': ','}, {'ner': 'U-PER', 'tag': 'NNP', 'orth': 'CHINA'}, {'ner': 'O', 'tag': 'IN', 'orth': 'IN'}, {'ner': 'O', 'tag': 'DT', 'orth': 'SURPRISE'}, {'ner': 'O', 'tag': 'NN', 'orth': 'DEFEAT'}, {'ner': 'O', 'tag': '.', 'orth': '.'}]}


### Extract a sentence (index 1 of the first document) from the CoNLL test data

In [7]:
# Collect the surface forms of the sentence at index 1 of the first document.
tokens = [token['orth'] for token in test_data[0]['paragraphs'][0]['sentences'][1]['tokens']]
# Re-join the tokens with single spaces to get a plain string for the annotators.
sentence = ' '.join(tokens)

In [8]:

# Annotate the CoNLL sentence with CoreNLP: POS tags, named entities,
# constituency parse and dependency parse.
print('Part of Speech:', nlp.pos_tag(sentence))
print('Named Entities:', nlp.ner(sentence))
print('Constituency Parsing:', nlp.parse(sentence))
print('Dependency Parsing:', nlp.dependency_parse(sentence))

Part of Speech: [('SOCCER', 'NN'), ('-', ':'), ('JAPAN', 'NNP'), ('GET', 'VBP'), ('LUCKY', 'JJ'), ('WIN', 'NN'), (',', ','), ('CHINA', 'NNP'), ('IN', 'IN'), ('SURPRISE', 'NNP'), ('DEFEAT', 'NNP'), ('.', '.')]
Named Entities: [('SOCCER', 'O'), ('-', 'O'), ('JAPAN', 'COUNTRY'), ('GET', 'O'), ('LUCKY', 'O'), ('WIN', 'O'), (',', 'O'), ('CHINA', 'COUNTRY'), ('IN', 'O'), ('SURPRISE', 'O'), ('DEFEAT', 'O'), ('.', 'O')]
Constituency Parsing: (ROOT
  (FRAG
    (NP (NN SOCCER))
    (: -)
    (S
      (NP (NNP JAPAN))
      (VP (VBP GET)
        (NP
          (NP (JJ LUCKY) (NN WIN))
          (, ,)
          (NP
            (NP (NNP CHINA))
            (PP (IN IN)
              (NP (NNP SURPRISE) (NNP DEFEAT)))))))
    (. .)))
Dependency Parsing: [('ROOT', 0, 1), ('punct', 1, 2), ('nsubj', 4, 3), ('dep', 1, 4), ('amod', 6, 5), ('dobj', 4, 6), ('punct', 4, 7), ('dep', 4, 8), ('case', 11, 9), ('compound', 11, 10), ('nmod', 8, 11), ('punct', 1, 12)]


In [9]:
# Call the relation annotator on the same sentence and print its raw result.
# NOTE(review): `relation` is presumably the method added in the forked wrapper
# mentioned at the top of the notebook; confirm.
print('Relation Extractor:', nlp.relation(sentence))

Relation Extractor: {'sentences': [{'enhancedDependencies': [{'dependentGloss': 'SOCCER', 'dep': 'ROOT', 'dependent': 1, 'governorGloss': 'ROOT', 'governor': 0}, {'dependentGloss': '-', 'dep': 'punct', 'dependent': 2, 'governorGloss': 'SOCCER', 'governor': 1}, {'dependentGloss': 'JAPAN', 'dep': 'nsubj', 'dependent': 3, 'governorGloss': 'GET', 'governor': 4}, {'dependentGloss': 'GET', 'dep': 'appos', 'dependent': 4, 'governorGloss': 'SOCCER', 'governor': 1}, {'dependentGloss': 'LUCKY', 'dep': 'amod', 'dependent': 5, 'governorGloss': 'WIN', 'governor': 6}, {'dependentGloss': 'WIN', 'dep': 'dobj', 'dependent': 6, 'governorGloss': 'GET', 'governor': 4}, {'dependentGloss': ',', 'dep': 'punct', 'dependent': 7, 'governorGloss': 'WIN', 'governor': 6}, {'dependentGloss': 'CHINA', 'dep': 'appos', 'dependent': 8, 'governorGloss': 'WIN', 'governor': 6}, {'dependentGloss': 'IN', 'dep': 'case', 'dependent': 9, 'governorGloss': 'DEFEAT', 'governor': 11}, {'dependentGloss': 'SURPRISE', 'dep': 'compoun

I still need to figure out how to print the relation extractor output from the CoreNLP Java server.