In [1]:
# importing all necessary packages
import os, glob, itertools

### Understanding File Structure

In [2]:
# Print every non-hidden entry found directly inside the dataset root directory.
root_entries = os.listdir("/Users/rbalasubramaniam/OntoNotes-5.0-NER-BIO/")
for entry in root_entries:
    if entry.startswith("."):
        continue  # skip hidden dot-entries
    print(entry)

conll-formatted-ontonotes-5.0
onto.train.ner.sample
README.md
onto.development.ner.sample
onto.test.ner.sample
agg.py


Right away we notice the **conll-formatted-ontonotes-5.0** folder. Let's collect the major folders of interest (the sub-folders of its `data/` directory) in a list called **tags**.

In [3]:
# Gather (and echo) the names of the non-hidden entries under data/;
# `tags` is the list the later cells iterate over.
tags = []
data_entries = os.listdir("/Users/rbalasubramaniam/OntoNotes-5.0-NER-BIO/conll-formatted-ontonotes-5.0/data/")
for entry in data_entries:
    if entry.startswith("."):
        continue  # skip hidden dot-entries
    print(entry)
    tags.append(entry)

test
development
conll-2012-test
train


Now we will build a dictionary that records where the files ending with the extension **.gold_conll** live. It has two levels: the outer key is an entry of **tags**, the inner key is the path of a directory that contains such files, and the value is the list of those file names.

In [9]:
from collections import defaultdict
# Two-level mapping filled in by the following cell:
#   entry of `tags` -> {directory path -> [names of .gold_conll files in that directory]}
# defaultdict(dict) creates the inner dict automatically the first time a tag key is accessed.
dict_gold_conll = defaultdict(dict)

In [10]:
# Walk each tag's folder and remember, per directory, the .gold_conll files it holds.
# Directories without any such file are not recorded.
for tag_name in tags:
    tag_dir = '/Users/rbalasubramaniam/OntoNotes-5.0-NER-BIO/conll-formatted-ontonotes-5.0/data/' + tag_name
    for dirpath, _subdirs, filenames in os.walk(tag_dir):
        gold_files = [name for name in filenames if name.endswith(".gold_conll")]
        if gold_files:
            dict_gold_conll[tag_name][dirpath] = gold_files

In [11]:
# For each tag, report how many directories were recorded as containing .gold_conll files.
for tag_name in tags:
    print("dirs in {}:".format(tag_name))
    recorded_dirs = dict_gold_conll[tag_name]
    print("we have {} dirs in gold_conll files".format(len(recorded_dirs)))

dirs in test:
we have 151 dirs in gold_conll files
dirs in development:
we have 154 dirs in gold_conll files
dirs in conll-2012-test:
we have 37 dirs in gold_conll files
dirs in train:
we have 187 dirs in gold_conll files


### Understanding the file contents

Let's see what the contents of these files are. We will use the files from only one directory (the first one recorded for the **test** folder) to do this.

In [27]:
# Dump every line of every .gold_conll file in the first directory recorded for
# "test", showing each line before and after splitting on whitespace, and collect
# the number of fields per line in `all_lengths`.
all_lengths = []
for conll_dir in list(dict_gold_conll["test"])[:1]:
    print("root in {}:".format(conll_dir))
    for cur_file in dict_gold_conll["test"][conll_dir]:
        # Decode explicitly instead of relying on the platform's locale default
        # (assumes the corpus files are UTF-8 -- TODO confirm against the dataset docs).
        with open(os.path.join(conll_dir, cur_file), 'r', encoding='utf-8') as f:
            print(cur_file)
            # Iterate the handle lazily; readlines() would load the whole file first.
            for line in f:
                # Collapse every run of whitespace (including the trailing newline)
                # into single spaces.
                normalized = ' '.join(line.split())
                # Split on the single space. A blank line yields [''] (length 1) here;
                # a bare split() would give [] instead, so the two-step form is kept
                # to leave the recorded lengths unchanged.
                fields = normalized.split(" ")
                all_lengths.append(len(fields))
                print("length: {}".format(len(fields)))
                print("before: {}".format(normalized))
                print("after: {}".format(fields))

root in /Users/rbalasubramaniam/OntoNotes-5.0-NER-BIO/conll-formatted-ontonotes-5.0/data/test/data/english/annotations/mz/sinorama/10:
ectb_1029.gold_conll
length: 5
before: #begin document (mz/sinorama/10/ectb_1029); part 000
after: ['#begin', 'document', '(mz/sinorama/10/ectb_1029);', 'part', '000']
length: 12
before: mz/sinorama/10/ectb_1029 0 0 Powerful JJ (TOP(NP(NP(NP* - - - - (WORK_OF_ART* -
after: ['mz/sinorama/10/ectb_1029', '0', '0', 'Powerful', 'JJ', '(TOP(NP(NP(NP*', '-', '-', '-', '-', '(WORK_OF_ART*', '-']
length: 12
before: mz/sinorama/10/ectb_1029 0 1 Tools NNS *) - - - - * -
after: ['mz/sinorama/10/ectb_1029', '0', '1', 'Tools', 'NNS', '*)', '-', '-', '-', '-', '*', '-']
length: 12
before: mz/sinorama/10/ectb_1029 0 2 for IN (PP* - - - - * -
after: ['mz/sinorama/10/ectb_1029', '0', '2', 'for', 'IN', '(PP*', '-', '-', '-', '-', '*', '-']
length: 12
before: mz/sinorama/10/ectb_1029 0 3 Biotechnology NN (NP*))) - - - - * -
after: ['mz/sinorama/10/ectb_1029', '0', '3', 'Bi

before: mz/sinorama/10/ectb_1029 1 24 to TO (S(VP* - - - - * * * * -
after: ['mz/sinorama/10/ectb_1029', '1', '24', 'to', 'TO', '(S(VP*', '-', '-', '-', '-', '*', '*', '*', '*', '-']
length: 15
before: mz/sinorama/10/ectb_1029 1 25 make VB (VP* make 01 2 - * * * (V*) -
after: ['mz/sinorama/10/ectb_1029', '1', '25', 'make', 'VB', '(VP*', 'make', '01', '2', '-', '*', '*', '*', '(V*)', '-']
length: 15
before: mz/sinorama/10/ectb_1029 1 26 biochips NNS (NP*)))) - - - - * * * (ARG1*) -
after: ['mz/sinorama/10/ectb_1029', '1', '26', 'biochips', 'NNS', '(NP*))))', '-', '-', '-', '-', '*', '*', '*', '(ARG1*)', '-']
length: 15
before: mz/sinorama/10/ectb_1029 1 27 : : * - - - - * * * * -
after: ['mz/sinorama/10/ectb_1029', '1', '27', ':', ':', '*', '-', '-', '-', '-', '*', '*', '*', '*', '-']
length: 15
before: mz/sinorama/10/ectb_1029 1 28 U NNP (NP(NP(NML* - - - - (ORG* * * * -
after: ['mz/sinorama/10/ectb_1029', '1', '28', 'U', 'NNP', '(NP(NP(NML*', '-', '-', '-', '-', '(ORG*', '*', '*', '*'

length: 20
before: mz/sinorama/10/ectb_1029 2 39 A16 NNP *) - - - - * * * * * * * * *) -
after: ['mz/sinorama/10/ectb_1029', '2', '39', 'A16', 'NNP', '*)', '-', '-', '-', '-', '*', '*', '*', '*', '*', '*', '*', '*', '*)', '-']
length: 20
before: mz/sinorama/10/ectb_1029 2 40 , , * - - - - * * * * * * * * * -
after: ['mz/sinorama/10/ectb_1029', '2', '40', ',', ',', '*', '-', '-', '-', '-', '*', '*', '*', '*', '*', '*', '*', '*', '*', '-']
length: 20
before: mz/sinorama/10/ectb_1029 2 41 which WDT (SBAR(WHNP*) - - - - * * * * * * * * (R-ARG1*) -
after: ['mz/sinorama/10/ectb_1029', '2', '41', 'which', 'WDT', '(SBAR(WHNP*)', '-', '-', '-', '-', '*', '*', '*', '*', '*', '*', '*', '*', '(R-ARG1*)', '-']
length: 20
before: mz/sinorama/10/ectb_1029 2 42 is VBZ (S(VP* be 03 - - * * * * * * * (V*) * -
after: ['mz/sinorama/10/ectb_1029', '2', '42', 'is', 'VBZ', '(S(VP*', 'be', '03', '-', '-', '*', '*', '*', '*', '*', '*', '*', '(V*)', '*', '-']
length: 20
before: mz/sinorama/10/ectb_1029 2 43 eas

after: ['mz/sinorama/10/ectb_1029', '3', '13', 'far', 'RB', '*)', '-', '-', '-', '-', '*', '*', '*)', '*', '-']
length: 15
before: mz/sinorama/10/ectb_1029 3 14 established VBN * establish 01 - - * * (V*) * -
after: ['mz/sinorama/10/ectb_1029', '3', '14', 'established', 'VBN', '*', 'establish', '01', '-', '-', '*', '*', '(V*)', '*', '-']
length: 15
before: mz/sinorama/10/ectb_1029 3 15 in IN (PP* - - - - * * (ARGM-LOC* * -
after: ['mz/sinorama/10/ectb_1029', '3', '15', 'in', 'IN', '(PP*', '-', '-', '-', '-', '*', '*', '(ARGM-LOC*', '*', '-']
length: 15
before: mz/sinorama/10/ectb_1029 3 16 Taiwan NNP (NP*)))))) - - - - (GPE) * *) *) (24)|11)
after: ['mz/sinorama/10/ectb_1029', '3', '16', 'Taiwan', 'NNP', '(NP*))))))', '-', '-', '-', '-', '(GPE)', '*', '*)', '*)', '(24)|11)']
length: 15
before: mz/sinorama/10/ectb_1029 3 17 still RB (ADVP*) - - - - * * * (ARGM-TMP*) -
after: ['mz/sinorama/10/ectb_1029', '3', '17', 'still', 'RB', '(ADVP*)', '-', '-', '-', '-', '*', '*', '*', '(ARGM-TMP*)

length: 15
before: mz/sinorama/10/ectb_1029 3 16 to TO (VP* - - - - * * * * -
after: ['mz/sinorama/10/ectb_1029', '3', '16', 'to', 'TO', '(VP*', '-', '-', '-', '-', '*', '*', '*', '*', '-']
length: 15
before: mz/sinorama/10/ectb_1029 3 17 sign VB (VP* sign 02 1 - * * * (V*) -
after: ['mz/sinorama/10/ectb_1029', '3', '17', 'sign', 'VB', '(VP*', 'sign', '02', '1', '-', '*', '*', '*', '(V*)', '-']
length: 15
before: mz/sinorama/10/ectb_1029 3 18 technology NN (NP(NP(NP* technology - 1 - * * * (ARG1* -
after: ['mz/sinorama/10/ectb_1029', '3', '18', 'technology', 'NN', '(NP(NP(NP*', 'technology', '-', '1', '-', '*', '*', '*', '(ARG1*', '-']
length: 15
before: mz/sinorama/10/ectb_1029 3 19 transfer NN * - - - - * * * * -
after: ['mz/sinorama/10/ectb_1029', '3', '19', 'transfer', 'NN', '*', '-', '-', '-', '-', '*', '*', '*', '*', '-']
length: 15
before: mz/sinorama/10/ectb_1029 3 20 agreements NNS *) - - - - * * * * -
after: ['mz/sinorama/10/ectb_1029', '3', '20', 'agreements', 'NNS', '*)', '

length: 15
before: mz/sinorama/10/ectb_1029 4 24 in IN (PP* - - - - * * * * -
after: ['mz/sinorama/10/ectb_1029', '4', '24', 'in', 'IN', '(PP*', '-', '-', '-', '-', '*', '*', '*', '*', '-']
length: 15
before: mz/sinorama/10/ectb_1029 4 25 miniaturized VBN (NP* miniaturize 01 - - * * * (V*) -
after: ['mz/sinorama/10/ectb_1029', '4', '25', 'miniaturized', 'VBN', '(NP*', 'miniaturize', '01', '-', '-', '*', '*', '*', '(V*)', '-']
length: 15
before: mz/sinorama/10/ectb_1029 4 26 form NN *))))))))))) form - 4 - * *) *) * -
after: ['mz/sinorama/10/ectb_1029', '4', '26', 'form', 'NN', '*)))))))))))', 'form', '-', '4', '-', '*', '*)', '*)', '*', '-']
length: 15
before: mz/sinorama/10/ectb_1029 4 27 . . *)) - - - - * * * * -
after: ['mz/sinorama/10/ectb_1029', '4', '27', '.', '.', '*))', '-', '-', '-', '-', '*', '*', '*', '*', '-']
length: 1
before: 
after: ['']
length: 15
before: mz/sinorama/10/ectb_1029 4 0 Meanwhile RB (TOP(S(S(ADVP*) - - - - * * (ARGM-TMP*) * -
after: ['mz/sinorama/10/ectb_1

before: mz/sinorama/10/ectb_1029 5 19 biochips NNS (NP*))) - - - - * * * (ARG1*) *) -
after: ['mz/sinorama/10/ectb_1029', '5', '19', 'biochips', 'NNS', '(NP*)))', '-', '-', '-', '-', '*', '*', '*', '(ARG1*)', '*)', '-']
length: 16
before: mz/sinorama/10/ectb_1029 5 20 , , * - - - - * * * * * -
after: ['mz/sinorama/10/ectb_1029', '5', '20', ',', ',', '*', '-', '-', '-', '-', '*', '*', '*', '*', '*', '-']
length: 16
before: mz/sinorama/10/ectb_1029 5 21 they PRP (NP*) - - - - * * * (ARG0*) (ARG0*) (15)
after: ['mz/sinorama/10/ectb_1029', '5', '21', 'they', 'PRP', '(NP*)', '-', '-', '-', '-', '*', '*', '*', '(ARG0*)', '(ARG0*)', '(15)']
length: 16
before: mz/sinorama/10/ectb_1029 5 22 can MD (VP* - - - - * * * * (ARGM-MOD*) -
after: ['mz/sinorama/10/ectb_1029', '5', '22', 'can', 'MD', '(VP*', '-', '-', '-', '-', '*', '*', '*', '*', '(ARGM-MOD*)', '-']
length: 16
before: mz/sinorama/10/ectb_1029 5 23 obtain VB (VP* obtain 01 1 - * * * * (V*) -
after: ['mz/sinorama/10/ectb_1029', '5', '23',

length: 16
before: mz/sinorama/10/ectb_1029 5 11 out RP (PRT*) - - - - * * * * * -
after: ['mz/sinorama/10/ectb_1029', '5', '11', 'out', 'RP', '(PRT*)', '-', '-', '-', '-', '*', '*', '*', '*', '*', '-']
length: 16
before: mz/sinorama/10/ectb_1029 5 12 research NN (NP* research - 1 - * (ARG1* * * * -
after: ['mz/sinorama/10/ectb_1029', '5', '12', 'research', 'NN', '(NP*', 'research', '-', '1', '-', '*', '(ARG1*', '*', '*', '*', '-']
length: 16
before: mz/sinorama/10/ectb_1029 5 13 and CC * - - - - * * * * * -
after: ['mz/sinorama/10/ectb_1029', '5', '13', 'and', 'CC', '*', '-', '-', '-', '-', '*', '*', '*', '*', '*', '-']
length: 16
before: mz/sinorama/10/ectb_1029 5 14 development NN *)) - - - - * *) * * * -
after: ['mz/sinorama/10/ectb_1029', '5', '14', 'development', 'NN', '*))', '-', '-', '-', '-', '*', '*)', '*', '*', '*', '-']
length: 16
before: mz/sinorama/10/ectb_1029 5 15 , , * - - - - * * * * * -
after: ['mz/sinorama/10/ectb_1029', '5', '15', ',', ',', '*', '-', '-', '-', '-',

after: ['mz/sinorama/10/ectb_1019', '0', '0', 'Genteel', 'JJ', '(TOP(NP*', '-', '-', '-', '-', '*', '-']
length: 12
before: mz/sinorama/10/ectb_1019 0 1 Journalis NN * - - - - * -
after: ['mz/sinorama/10/ectb_1019', '0', '1', 'Journalis', 'NN', '*', '-', '-', '-', '-', '*', '-']
length: 12
before: mz/sinorama/10/ectb_1019 0 2 -- : *)) - - - - * -
after: ['mz/sinorama/10/ectb_1019', '0', '2', '--', ':', '*))', '-', '-', '-', '-', '*', '-']
length: 1
before: 
after: ['']
length: 12
before: mz/sinorama/10/ectb_1019 0 0 Thomas NNP (TOP(NP* - - - - (PERSON* (1
after: ['mz/sinorama/10/ectb_1019', '0', '0', 'Thomas', 'NNP', '(TOP(NP*', '-', '-', '-', '-', '(PERSON*', '(1']
length: 12
before: mz/sinorama/10/ectb_1019 0 1 Lu NNP *)) - - - - *) 1)
after: ['mz/sinorama/10/ectb_1019', '0', '1', 'Lu', 'NNP', '*))', '-', '-', '-', '-', '*)', '1)']
length: 1
before: 
after: ['']
length: 13
before: mz/sinorama/10/ectb_1019 0 0 -LRB- -LRB- (TOP(FRAG* - - - - * * -
after: ['mz/sinorama/10/ectb_1019', '0

length: 1
before: 
after: ['']
length: 13
before: mz/sinorama/10/ectb_1019 1 0 Funnily RB (TOP(S(ADVP* - - - - * (ARGM-ADV* -
after: ['mz/sinorama/10/ectb_1019', '1', '0', 'Funnily', 'RB', '(TOP(S(ADVP*', '-', '-', '-', '-', '*', '(ARGM-ADV*', '-']
length: 13
before: mz/sinorama/10/ectb_1019 1 1 enough RB *) - - - - * *) -
after: ['mz/sinorama/10/ectb_1019', '1', '1', 'enough', 'RB', '*)', '-', '-', '-', '-', '*', '*)', '-']
length: 13
before: mz/sinorama/10/ectb_1019 1 2 , , * - - - - * * -
after: ['mz/sinorama/10/ectb_1019', '1', '2', ',', ',', '*', '-', '-', '-', '-', '*', '*', '-']
length: 13
before: mz/sinorama/10/ectb_1019 1 3 an DT (NP* - - - - * (ARG1* -
after: ['mz/sinorama/10/ectb_1019', '1', '3', 'an', 'DT', '(NP*', '-', '-', '-', '-', '*', '(ARG1*', '-']
length: 13
before: mz/sinorama/10/ectb_1019 1 4 unassuming JJ * - - - - * * -
after: ['mz/sinorama/10/ectb_1019', '1', '4', 'unassuming', 'JJ', '*', '-', '-', '-', '-', '*', '*', '-']
length: 13
before: mz/sinorama/10/ectb_

length: 14
before: mz/sinorama/10/ectb_1019 2 2 was VBD (VP* be 01 1 - * (V*) * -
after: ['mz/sinorama/10/ectb_1019', '2', '2', 'was', 'VBD', '(VP*', 'be', '01', '1', '-', '*', '(V*)', '*', '-']
length: 14
before: mz/sinorama/10/ectb_1019 2 3 a DT (NP(NP* - - - - * (ARG2* * -
after: ['mz/sinorama/10/ectb_1019', '2', '3', 'a', 'DT', '(NP(NP*', '-', '-', '-', '-', '*', '(ARG2*', '*', '-']
length: 14
before: mz/sinorama/10/ectb_1019 2 4 universally RB (ADJP* - - - - * * (ARGM-ADV*) -
after: ['mz/sinorama/10/ectb_1019', '2', '4', 'universally', 'RB', '(ADJP*', '-', '-', '-', '-', '*', '*', '(ARGM-ADV*)', '-']
length: 14
before: mz/sinorama/10/ectb_1019 2 5 admired VBN *) admire 01 1 - * * (V*) -
after: ['mz/sinorama/10/ectb_1019', '2', '5', 'admired', 'VBN', '*)', 'admire', '01', '1', '-', '*', '*', '(V*)', '-']
length: 14
before: mz/sinorama/10/ectb_1019 2 6 beauty NN *) - - - - * * (ARG1*) -
after: ['mz/sinorama/10/ectb_1019', '2', '6', 'beauty', 'NN', '*)', '-', '-', '-', '-', '*', '*',

before: mz/sinorama/10/ectb_1049 0 18 as IN (SBAR* - - - - * * * * (ARGM-ADV* * * * * * -
after: ['mz/sinorama/10/ectb_1049', '0', '18', 'as', 'IN', '(SBAR*', '-', '-', '-', '-', '*', '*', '*', '*', '(ARGM-ADV*', '*', '*', '*', '*', '*', '-']
length: 21
before: mz/sinorama/10/ectb_1049 0 19 old JJ (S(S(NP* - - - - * * * * * * (ARG0* * * * (39
after: ['mz/sinorama/10/ectb_1049', '0', '19', 'old', 'JJ', '(S(S(NP*', '-', '-', '-', '-', '*', '*', '*', '*', '*', '*', '(ARG0*', '*', '*', '*', '(39']
length: 21
before: mz/sinorama/10/ectb_1049 0 20 shops NNS *) shop - 1 - * * * * * * *) * * * 39)
after: ['mz/sinorama/10/ectb_1049', '0', '20', 'shops', 'NNS', '*)', 'shop', '-', '1', '-', '*', '*', '*', '*', '*', '*', '*)', '*', '*', '*', '39)']
length: 21
before: mz/sinorama/10/ectb_1049 0 21 have VBP (VP* have 01 - - * * * * * (V*) * * * * -
after: ['mz/sinorama/10/ectb_1049', '0', '21', 'have', 'VBP', '(VP*', 'have', '01', '-', '-', '*', '*', '*', '*', '*', '(V*)', '*', '*', '*', '*', '-']
l

length: 17
before: mz/sinorama/10/ectb_1049 1 4 known VBN * know 01 2 - * (V*) * * * * -
after: ['mz/sinorama/10/ectb_1049', '1', '4', 'known', 'VBN', '*', 'know', '01', '2', '-', '*', '(V*)', '*', '*', '*', '*', '-']
length: 17
before: mz/sinorama/10/ectb_1049 1 5 as IN (PP* - - - - * (ARG2* * * * * -
after: ['mz/sinorama/10/ectb_1049', '1', '5', 'as', 'IN', '(PP*', '-', '-', '-', '-', '*', '(ARG2*', '*', '*', '*', '*', '-']
length: 17
before: mz/sinorama/10/ectb_1049 1 6 Tanshui NNP (NP*)))) - - - - (GPE) *) *) * * * 52)
after: ['mz/sinorama/10/ectb_1049', '1', '6', 'Tanshui', 'NNP', '(NP*))))', '-', '-', '-', '-', '(GPE)', '*)', '*)', '*', '*', '*', '52)']
length: 17
before: mz/sinorama/10/ectb_1049 1 7 was VBD (VP* be 01 4 - * * (V*) * * * -
after: ['mz/sinorama/10/ectb_1049', '1', '7', 'was', 'VBD', '(VP*', 'be', '01', '4', '-', '*', '*', '(V*)', '*', '*', '*', '-']
length: 17
before: mz/sinorama/10/ectb_1049 1 8 at IN (PP* - - - - * * (ARG2* * * * -
after: ['mz/sinorama/10/ectb_1

after: ['mz/sinorama/10/ectb_1049', '1', '13', 'and', 'CC', '*', '-', '-', '-', '-', '*', '*', '*', '*', '*', '*', '-']
length: 17
before: mz/sinorama/10/ectb_1049 1 14 because IN (SBAR* - - - - * * * * * * -
after: ['mz/sinorama/10/ectb_1049', '1', '14', 'because', 'IN', '(SBAR*', '-', '-', '-', '-', '*', '*', '*', '*', '*', '*', '-']
length: 17
before: mz/sinorama/10/ectb_1049 1 15 the DT (S(NP* - - - - * * (ARG0* (ARG0* * * -
after: ['mz/sinorama/10/ectb_1049', '1', '15', 'the', 'DT', '(S(NP*', '-', '-', '-', '-', '*', '*', '(ARG0*', '(ARG0*', '*', '*', '-']
length: 17
before: mz/sinorama/10/ectb_1049 1 16 Japanese NNPS *) - - - - (NORP) * *) *) * * -
after: ['mz/sinorama/10/ectb_1049', '1', '16', 'Japanese', 'NNPS', '*)', '-', '-', '-', '-', '(NORP)', '*', '*)', '*)', '*', '*', '-']
length: 17
before: mz/sinorama/10/ectb_1049 1 17 invested VBD (VP* invest 01 1 - * * (V*) * * * -
after: ['mz/sinorama/10/ectb_1049', '1', '17', 'invested', 'VBD', '(VP*', 'invest', '01', '1', '-', '*',

length: 24
before: mz/sinorama/10/ectb_1049 2 55 university NN *) - - - - * * * * * * * * * * * * *) 0)
after: ['mz/sinorama/10/ectb_1049', '2', '55', 'university', 'NN', '*)', '-', '-', '-', '-', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*)', '0)']
length: 24
before: mz/sinorama/10/ectb_1049 2 56 could MD (VP* - - - - * * * * * * * * * * * * (ARGM-MOD*) -
after: ['mz/sinorama/10/ectb_1049', '2', '56', 'could', 'MD', '(VP*', '-', '-', '-', '-', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '(ARGM-MOD*)', '-']
length: 24
before: mz/sinorama/10/ectb_1049 2 57 set VB (VP* set 01 7 - * * * * * * * * * * * * (V*) -
after: ['mz/sinorama/10/ectb_1049', '2', '57', 'set', 'VB', '(VP*', 'set', '01', '7', '-', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '(V*)', '-']
length: 24
before: mz/sinorama/10/ectb_1049 2 58 its PRP$ (NP* - - - - * * * * * * * * * * * * (ARG1* (0)
after: ['mz/sinorama/10/ectb_1049', '2', '58', 'its', 'PRP$', '(NP*', '-', '-',

before: mz/sinorama/10/ectb_1049 3 29 antiques NNS (NP*) - - - - * * * * * * * -
after: ['mz/sinorama/10/ectb_1049', '3', '29', 'antiques', 'NNS', '(NP*)', '-', '-', '-', '-', '*', '*', '*', '*', '*', '*', '*', '-']
length: 18
before: mz/sinorama/10/ectb_1049 3 30 , , * - - - - * * * * * * * -
after: ['mz/sinorama/10/ectb_1049', '3', '30', ',', ',', '*', '-', '-', '-', '-', '*', '*', '*', '*', '*', '*', '*', '-']
length: 18
before: mz/sinorama/10/ectb_1049 3 31 and CC * - - - - * * * * * * * -
after: ['mz/sinorama/10/ectb_1049', '3', '31', 'and', 'CC', '*', '-', '-', '-', '-', '*', '*', '*', '*', '*', '*', '*', '-']
length: 18
before: mz/sinorama/10/ectb_1049 3 32 snack NN (NP* - - - - * * * * * * * -
after: ['mz/sinorama/10/ectb_1049', '3', '32', 'snack', 'NN', '(NP*', '-', '-', '-', '-', '*', '*', '*', '*', '*', '*', '*', '-']
length: 18
before: mz/sinorama/10/ectb_1049 3 33 foods NNS *)))))))))))) food - 1 - * * * *) *) *) *) -
after: ['mz/sinorama/10/ectb_1049', '3', '33', 'foods',

length: 17
before: mz/sinorama/10/ectb_1049 4 8 over IN (PP* - - - - * (ARGM-LOC* * * * * -
after: ['mz/sinorama/10/ectb_1049', '4', '8', 'over', 'IN', '(PP*', '-', '-', '-', '-', '*', '(ARGM-LOC*', '*', '*', '*', '*', '-']
length: 17
before: mz/sinorama/10/ectb_1049 4 9 her PRP (NP*))))) - - - - * *) * *) * * (3)
after: ['mz/sinorama/10/ectb_1049', '4', '9', 'her', 'PRP', '(NP*)))))', '-', '-', '-', '-', '*', '*)', '*', '*)', '*', '*', '(3)']
length: 17
before: mz/sinorama/10/ectb_1049 4 10 , , * - - - - * * * * * * -
after: ['mz/sinorama/10/ectb_1049', '4', '10', ',', ',', '*', '-', '-', '-', '-', '*', '*', '*', '*', '*', '*', '-']
length: 17
before: mz/sinorama/10/ectb_1049 4 11 are VBP (VP* be 01 1 - * * * (V*) * * -
after: ['mz/sinorama/10/ectb_1049', '4', '11', 'are', 'VBP', '(VP*', 'be', '01', '1', '-', '*', '*', '*', '(V*)', '*', '*', '-']
length: 17
before: mz/sinorama/10/ectb_1049 4 12 fearful JJ (ADJP* - - - - * * * (ARG2* * * -
after: ['mz/sinorama/10/ectb_1049', '4', '12',

before: mz/sinorama/10/ectb_1049 6 4 bus NN * bus - 1 - * * -
after: ['mz/sinorama/10/ectb_1049', '6', '4', 'bus', 'NN', '*', 'bus', '-', '1', '-', '*', '*', '-']
length: 13
before: mz/sinorama/10/ectb_1049 6 5 service NN *) service - 13 - * * -
after: ['mz/sinorama/10/ectb_1049', '6', '5', 'service', 'NN', '*)', 'service', '-', '13', '-', '*', '*', '-']
length: 13
before: mz/sinorama/10/ectb_1049 6 6 , , * - - - - * * -
after: ['mz/sinorama/10/ectb_1049', '6', '6', ',', ',', '*', '-', '-', '-', '-', '*', '*', '-']
length: 13
before: mz/sinorama/10/ectb_1049 6 7 the DT (NP(NP* - - - - (FAC* * -
after: ['mz/sinorama/10/ectb_1049', '6', '7', 'the', 'DT', '(NP(NP*', '-', '-', '-', '-', '(FAC*', '*', '-']
length: 13
before: mz/sinorama/10/ectb_1049 6 8 Tanshui NNP * - - - - * * (6)
after: ['mz/sinorama/10/ectb_1049', '6', '8', 'Tanshui', 'NNP', '*', '-', '-', '-', '-', '*', '*', '(6)']
length: 13
before: mz/sinorama/10/ectb_1049 6 9 Line NNP *) - - - - *) * -
after: ['mz/sinorama/10/ectb_1

length: 15
before: mz/sinorama/10/ectb_1039 1 13 firm JJ * - - - - * * * * -
after: ['mz/sinorama/10/ectb_1039', '1', '13', 'firm', 'JJ', '*', '-', '-', '-', '-', '*', '*', '*', '*', '-']
length: 15
before: mz/sinorama/10/ectb_1039 1 14 conclusion NN *) conclusion - 1 - * * * * -
after: ['mz/sinorama/10/ectb_1039', '1', '14', 'conclusion', 'NN', '*)', 'conclusion', '-', '1', '-', '*', '*', '*', '*', '-']
length: 15
before: mz/sinorama/10/ectb_1039 1 15 on IN (PP* - - - - * * * * -
after: ['mz/sinorama/10/ectb_1039', '1', '15', 'on', 'IN', '(PP*', '-', '-', '-', '-', '*', '*', '*', '*', '-']
length: 15
before: mz/sinorama/10/ectb_1039 1 16 the DT (NP* - - - - * * * * (13
after: ['mz/sinorama/10/ectb_1039', '1', '16', 'the', 'DT', '(NP*', '-', '-', '-', '-', '*', '*', '*', '*', '(13']
length: 15
before: mz/sinorama/10/ectb_1039 1 17 " `` (NML* - - - - * * * * -
after: ['mz/sinorama/10/ectb_1039', '1', '17', '"', '``', '(NML*', '-', '-', '-', '-', '*', '*', '*', '*', '-']
length: 15
befor

before: mz/sinorama/10/ectb_1039 2 7 one CD * - - - - (CARDINAL) * * * * * * -
after: ['mz/sinorama/10/ectb_1039', '2', '7', 'one', 'CD', '*', '-', '-', '-', '-', '(CARDINAL)', '*', '*', '*', '*', '*', '*', '-']
length: 18
before: mz/sinorama/10/ectb_1039 2 8 China NNP * - - - - (GPE) * * * * * * -
after: ['mz/sinorama/10/ectb_1039', '2', '8', 'China', 'NNP', '*', '-', '-', '-', '-', '(GPE)', '*', '*', '*', '*', '*', '*', '-']
length: 18
before: mz/sinorama/10/ectb_1039 2 9 " '' *) - - - - * * * * * * * -
after: ['mz/sinorama/10/ectb_1039', '2', '9', '"', "''", '*)', '-', '-', '-', '-', '*', '*', '*', '*', '*', '*', '*', '-']
length: 18
before: mz/sinorama/10/ectb_1039 2 10 issue NN *))) issue - 1 - * * *) *) * * * 2)
after: ['mz/sinorama/10/ectb_1039', '2', '10', 'issue', 'NN', '*)))', 'issue', '-', '1', '-', '*', '*', '*)', '*)', '*', '*', '*', '2)']
length: 18
before: mz/sinorama/10/ectb_1039 2 11 was VBD (VP* be 01 1 - * * * (V*) * * * -
after: ['mz/sinorama/10/ectb_1039', '2', '11

length: 5
before: #begin document (mz/sinorama/10/ectb_1069); part 000
after: ['#begin', 'document', '(mz/sinorama/10/ectb_1069);', 'part', '000']
length: 14
before: mz/sinorama/10/ectb_1069 0 0 Tips NNS (TOP(NP(NP*) tip 03 4 - * (V*) * -
after: ['mz/sinorama/10/ectb_1069', '0', '0', 'Tips', 'NNS', '(TOP(NP(NP*)', 'tip', '03', '4', '-', '*', '(V*)', '*', '-']
length: 14
before: mz/sinorama/10/ectb_1069 0 1 for IN (PP* - - - - * (ARG3* * -
after: ['mz/sinorama/10/ectb_1069', '0', '1', 'for', 'IN', '(PP*', '-', '-', '-', '-', '*', '(ARG3*', '*', '-']
length: 14
before: mz/sinorama/10/ectb_1069 0 2 Getting VBG (S(VP* get 13 - - * * (V*) -
after: ['mz/sinorama/10/ectb_1069', '0', '2', 'Getting', 'VBG', '(S(VP*', 'get', '13', '-', '-', '*', '*', '(V*)', '-']
length: 14
before: mz/sinorama/10/ectb_1069 0 3 through IN (PP* - - - - * * * -
after: ['mz/sinorama/10/ectb_1069', '0', '3', 'through', 'IN', '(PP*', '-', '-', '-', '-', '*', '*', '*', '-']
length: 14
before: mz/sinorama/10/ectb_1069 0

length: 16
before: mz/sinorama/10/ectb_1069 1 9 've VBP (VP* have 01 12 - * * (V*) * * -
after: ['mz/sinorama/10/ectb_1069', '1', '9', "'ve", 'VBP', '(VP*', 'have', '01', '12', '-', '*', '*', '(V*)', '*', '*', '-']
length: 16
before: mz/sinorama/10/ectb_1069 1 10 got VBN (VP* get 06 - - * * * (V*) * -
after: ['mz/sinorama/10/ectb_1069', '1', '10', 'got', 'VBN', '(VP*', 'get', '06', '-', '-', '*', '*', '*', '(V*)', '*', '-']
length: 16
before: mz/sinorama/10/ectb_1069 1 11 to TO (S(VP* - - - - * * * (ARG1* * -
after: ['mz/sinorama/10/ectb_1069', '1', '11', 'to', 'TO', '(S(VP*', '-', '-', '-', '-', '*', '*', '*', '(ARG1*', '*', '-']
length: 16
before: mz/sinorama/10/ectb_1069 1 12 set VB (VP* set 01 1 - * * * * (V*) -
after: ['mz/sinorama/10/ectb_1069', '1', '12', 'set', 'VB', '(VP*', 'set', '01', '1', '-', '*', '*', '*', '*', '(V*)', '-']
length: 16
before: mz/sinorama/10/ectb_1069 1 13 aside RB (ADVP*) - - - - * * * * (ARG2*) -
after: ['mz/sinorama/10/ectb_1069', '1', '13', 'aside', 'R

length: 19
before: mz/sinorama/10/ectb_1069 3 26 adults NNS (NP*)) - - - - * * * * *) * * * -
after: ['mz/sinorama/10/ectb_1069', '3', '26', 'adults', 'NNS', '(NP*))', '-', '-', '-', '-', '*', '*', '*', '*', '*)', '*', '*', '*', '-']
length: 19
before: mz/sinorama/10/ectb_1069 3 27 , , * - - - - * * * * * * * * -
after: ['mz/sinorama/10/ectb_1069', '3', '27', ',', ',', '*', '-', '-', '-', '-', '*', '*', '*', '*', '*', '*', '*', '*', '-']
length: 19
before: mz/sinorama/10/ectb_1069 3 28 they PRP (NP*) - - - - * * * * (ARG0*) * * * (1)
after: ['mz/sinorama/10/ectb_1069', '3', '28', 'they', 'PRP', '(NP*)', '-', '-', '-', '-', '*', '*', '*', '*', '(ARG0*)', '*', '*', '*', '(1)']
length: 19
before: mz/sinorama/10/ectb_1069 3 29 set VBD (VP* set 01 7 - * * * * (V*) * * * -
after: ['mz/sinorama/10/ectb_1069', '3', '29', 'set', 'VBD', '(VP*', 'set', '01', '7', '-', '*', '*', '*', '*', '(V*)', '*', '*', '*', '-']
length: 19
before: mz/sinorama/10/ectb_1069 3 30 their PRP$ (NP* - - - - * * * * (

length: 15
before: mz/sinorama/10/ectb_1069 4 0 Is VBZ (TOP(SQ* be 01 3 - * (V*) * * -
after: ['mz/sinorama/10/ectb_1069', '4', '0', 'Is', 'VBZ', '(TOP(SQ*', 'be', '01', '3', '-', '*', '(V*)', '*', '*', '-']
length: 15
before: mz/sinorama/10/ectb_1069 4 1 there EX (NP*) - - - - * * * * -
after: ['mz/sinorama/10/ectb_1069', '4', '1', 'there', 'EX', '(NP*)', '-', '-', '-', '-', '*', '*', '*', '*', '-']
length: 15
before: mz/sinorama/10/ectb_1069 4 2 some DT (NP(NP* - - - - * (ARG1* * (ARGM-MNR* -
after: ['mz/sinorama/10/ectb_1069', '4', '2', 'some', 'DT', '(NP(NP*', '-', '-', '-', '-', '*', '(ARG1*', '*', '(ARGM-MNR*', '-']
length: 15
before: mz/sinorama/10/ectb_1069 4 3 way NN *) way - 2 - * * * *) -
after: ['mz/sinorama/10/ectb_1069', '4', '3', 'way', 'NN', '*)', 'way', '-', '2', '-', '*', '*', '*', '*)', '-']
length: 15
before: mz/sinorama/10/ectb_1069 4 4 that WRB (SBAR(WHADVP*) - - - - * * * (R-ARGM-MNR*) -
after: ['mz/sinorama/10/ectb_1069', '4', '4', 'that', 'WRB', '(SBAR(WHADVP*)

after: ['mz/sinorama/10/ectb_1069', '5', '13', 'one', 'PRP', '(S(NP*)', '-', '-', '-', '-', '*', '*', '*', '(ARG0*)', '-']
length: 15
before: mz/sinorama/10/ectb_1069 5 14 has VBZ (VP* have 01 - - * * (V*) * -
after: ['mz/sinorama/10/ectb_1069', '5', '14', 'has', 'VBZ', '(VP*', 'have', '01', '-', '-', '*', '*', '(V*)', '*', '-']
length: 15
before: mz/sinorama/10/ectb_1069 5 15 already RB (ADVP*) - - - - * * * (ARGM-TMP*) -
after: ['mz/sinorama/10/ectb_1069', '5', '15', 'already', 'RB', '(ADVP*)', '-', '-', '-', '-', '*', '*', '*', '(ARGM-TMP*)', '-']
length: 15
before: mz/sinorama/10/ectb_1069 5 16 amply RB (ADVP*) - - - - * * * (ARGM-EXT*) -
after: ['mz/sinorama/10/ectb_1069', '5', '16', 'amply', 'RB', '(ADVP*)', '-', '-', '-', '-', '*', '*', '*', '(ARGM-EXT*)', '-']
length: 15
before: mz/sinorama/10/ectb_1069 5 17 tasted VBN (VP* taste 01 3 - * * * (V*) -
after: ['mz/sinorama/10/ectb_1069', '5', '17', 'tasted', 'VBN', '(VP*', 'taste', '01', '3', '-', '*', '*', '*', '(V*)', '-']
lengt

before: mz/sinorama/10/ectb_1069 5 5 too RB (ADVP* - - - - * * (ARGM-TMP* * * * * -
after: ['mz/sinorama/10/ectb_1069', '5', '5', 'too', 'RB', '(ADVP*', '-', '-', '-', '-', '*', '*', '(ARGM-TMP*', '*', '*', '*', '*', '-']
length: 18
before: mz/sinorama/10/ectb_1069 5 6 early RB *)) - - - - * * *) * * * * -
after: ['mz/sinorama/10/ectb_1069', '5', '6', 'early', 'RB', '*))', '-', '-', '-', '-', '*', '*', '*)', '*', '*', '*', '*', '-']
length: 18
before: mz/sinorama/10/ectb_1069 5 7 , , * - - - - * * * * * * * -
after: ['mz/sinorama/10/ectb_1069', '5', '7', ',', ',', '*', '-', '-', '-', '-', '*', '*', '*', '*', '*', '*', '*', '-']
length: 18
before: mz/sinorama/10/ectb_1069 5 8 and CC * - - - - * * * * * * * -
after: ['mz/sinorama/10/ectb_1069', '5', '8', 'and', 'CC', '*', '-', '-', '-', '-', '*', '*', '*', '*', '*', '*', '*', '-']
length: 18
before: mz/sinorama/10/ectb_1069 5 9 do VB (VP* do 01 - - * * * (V*) * * * -
after: ['mz/sinorama/10/ectb_1069', '5', '9', 'do', 'VB', '(VP*', 'do',

before: mz/sinorama/10/ectb_1059 1 28 could MD (VP* - - - - * * * * * (ARGM-MOD*) -
after: ['mz/sinorama/10/ectb_1059', '1', '28', 'could', 'MD', '(VP*', '-', '-', '-', '-', '*', '*', '*', '*', '*', '(ARGM-MOD*)', '-']
length: 17
before: mz/sinorama/10/ectb_1059 1 29 help VB (VP* help 01 2 - * * * * * (V*) -
after: ['mz/sinorama/10/ectb_1059', '1', '29', 'help', 'VB', '(VP*', 'help', '01', '2', '-', '*', '*', '*', '*', '*', '(V*)', '-']
length: 17
before: mz/sinorama/10/ectb_1059 1 30 yourself PRP (NP*) - - - - * * * * * (ARG2*) -
after: ['mz/sinorama/10/ectb_1059', '1', '30', 'yourself', 'PRP', '(NP*)', '-', '-', '-', '-', '*', '*', '*', '*', '*', '(ARG2*)', '-']
length: 17
before: mz/sinorama/10/ectb_1059 1 31 to IN (PP*)))))))))))))) - - - - * *) *) * * (ARG1*) -
after: ['mz/sinorama/10/ectb_1059', '1', '31', 'to', 'IN', '(PP*))))))))))))))', '-', '-', '-', '-', '*', '*)', '*)', '*', '*', '(ARG1*)', '-']
length: 17
before: mz/sinorama/10/ectb_1059 1 32 . . *)) - - - - * * * * * * -


before: mz/sinorama/10/ectb_1059 2 36 Democracy NNP * - - - - * * * * * -
after: ['mz/sinorama/10/ectb_1059', '2', '36', 'Democracy', 'NNP', '*', '-', '-', '-', '-', '*', '*', '*', '*', '*', '-']
length: 16
before: mz/sinorama/10/ectb_1059 2 37 Association NNP *)))) - - - - *) * *) * * 24)
after: ['mz/sinorama/10/ectb_1059', '2', '37', 'Association', 'NNP', '*))))', '-', '-', '-', '-', '*)', '*', '*)', '*', '*', '24)']
length: 16
before: mz/sinorama/10/ectb_1059 2 38 launched VBD (VP* launch 01 2 - * * (V*) * * -
after: ['mz/sinorama/10/ectb_1059', '2', '38', 'launched', 'VBD', '(VP*', 'launch', '01', '2', '-', '*', '*', '(V*)', '*', '*', '-']
length: 16
before: mz/sinorama/10/ectb_1059 2 39 a DT (NP(NP* - - - - * * (ARG1* (ARG0* (ARG0* (16
after: ['mz/sinorama/10/ectb_1059', '2', '39', 'a', 'DT', '(NP(NP*', '-', '-', '-', '-', '*', '*', '(ARG1*', '(ARG0*', '(ARG0*', '(16']
length: 16
before: mz/sinorama/10/ectb_1059 2 40 program NN *) program - 1 - * * * *) *) -
after: ['mz/sinorama/1

after: ['mz/sinorama/10/ectb_1059', '3', '4', 'behind', 'IN', '(PP*', '-', '-', '-', '-', '*', '*', '-']
length: 13
before: mz/sinorama/10/ectb_1059 3 5 Tsao NNP (NP(NP(NP* - - - - (PERSON) * (13
after: ['mz/sinorama/10/ectb_1059', '3', '5', 'Tsao', 'NNP', '(NP(NP(NP*', '-', '-', '-', '-', '(PERSON)', '*', '(13']
length: 13
before: mz/sinorama/10/ectb_1059 3 6 's POS *) - - - - * * 13)
after: ['mz/sinorama/10/ectb_1059', '3', '6', "'s", 'POS', '*)', '-', '-', '-', '-', '*', '*', '13)']
length: 13
before: mz/sinorama/10/ectb_1059 3 7 gathering NN * - - - - * * -
after: ['mz/sinorama/10/ectb_1059', '3', '7', 'gathering', 'NN', '*', '-', '-', '-', '-', '*', '*', '-']
length: 13
before: mz/sinorama/10/ectb_1059 3 8 together RB *) - - - - * * -
after: ['mz/sinorama/10/ectb_1059', '3', '8', 'together', 'RB', '*)', '-', '-', '-', '-', '*', '*', '-']
length: 13
before: mz/sinorama/10/ectb_1059 3 9 of IN (PP* - - - - * * -
after: ['mz/sinorama/10/ectb_1059', '3', '9', 'of', 'IN', '(PP*', '-', '

before: mz/sinorama/10/ectb_1059 4 2 fish NN (NP(NP* - - - - * * (ARG1* * -
after: ['mz/sinorama/10/ectb_1059', '4', '2', 'fish', 'NN', '(NP(NP*', '-', '-', '-', '-', '*', '*', '(ARG1*', '*', '-']
length: 15
before: mz/sinorama/10/ectb_1059 4 3 farms NNS *) - - - - * * * * -
after: ['mz/sinorama/10/ectb_1059', '4', '3', 'farms', 'NNS', '*)', '-', '-', '-', '-', '*', '*', '*', '*', '-']
length: 15
before: mz/sinorama/10/ectb_1059 4 4 in IN (PP* - - - - * * * * -
after: ['mz/sinorama/10/ectb_1059', '4', '4', 'in', 'IN', '(PP*', '-', '-', '-', '-', '*', '*', '*', '*', '-']
length: 15
before: mz/sinorama/10/ectb_1059 4 5 the DT (NP* - - - - * * * * -
after: ['mz/sinorama/10/ectb_1059', '4', '5', 'the', 'DT', '(NP*', '-', '-', '-', '-', '*', '*', '*', '*', '-']
length: 15
before: mz/sinorama/10/ectb_1059 4 6 locality NN *))) - - - - * * *) * -
after: ['mz/sinorama/10/ectb_1059', '4', '6', 'locality', 'NN', '*)))', '-', '-', '-', '-', '*', '*', '*)', '*', '-']
length: 15
before: mz/sinorama/

after: ['mz/sinorama/10/ectb_1009', '0', '27', 'largest', 'JJS', '*', '-', '-', '-', '-', '*', '*', '*', '*', '*', '-']
length: 16
before: mz/sinorama/10/ectb_1009 0 28 concentration NN *) concentration - 2 - * * * * * -
after: ['mz/sinorama/10/ectb_1009', '0', '28', 'concentration', 'NN', '*)', 'concentration', '-', '2', '-', '*', '*', '*', '*', '*', '-']
length: 16
before: mz/sinorama/10/ectb_1009 0 29 of IN (PP* - - - - * * * * * -
after: ['mz/sinorama/10/ectb_1009', '0', '29', 'of', 'IN', '(PP*', '-', '-', '-', '-', '*', '*', '*', '*', '*', '-']
length: 16
before: mz/sinorama/10/ectb_1009 0 30 Taiwan NNP (NP* - - - - (GPE) * * * * (53)
after: ['mz/sinorama/10/ectb_1009', '0', '30', 'Taiwan', 'NNP', '(NP*', '-', '-', '-', '-', '(GPE)', '*', '*', '*', '*', '(53)']
length: 16
before: mz/sinorama/10/ectb_1009 0 31 firms NNS *)) - - - - * * * * * -
after: ['mz/sinorama/10/ectb_1009', '0', '31', 'firms', 'NNS', '*))', '-', '-', '-', '-', '*', '*', '*', '*', '*', '-']
length: 16
before: m

length: 13
before: mz/sinorama/10/ectb_1009 1 4 than IN * - - - - * * -
after: ['mz/sinorama/10/ectb_1009', '1', '4', 'than', 'IN', '*', '-', '-', '-', '-', '*', '*', '-']
length: 13
before: mz/sinorama/10/ectb_1009 1 5 a DT *) - - - - * * -
after: ['mz/sinorama/10/ectb_1009', '1', '5', 'a', 'DT', '*)', '-', '-', '-', '-', '*', '*', '-']
length: 13
before: mz/sinorama/10/ectb_1009 1 6 decade NN *) decade - 1 - * * -
after: ['mz/sinorama/10/ectb_1009', '1', '6', 'decade', 'NN', '*)', 'decade', '-', '1', '-', '*', '*', '-']
length: 13
before: mz/sinorama/10/ectb_1009 1 7 ago RB *) - - - - *) *) -
after: ['mz/sinorama/10/ectb_1009', '1', '7', 'ago', 'RB', '*)', '-', '-', '-', '-', '*)', '*)', '-']
length: 13
before: mz/sinorama/10/ectb_1009 1 8 Dongguan NNP (NP*) - - - - (GPE) (ARG1*) (77)
after: ['mz/sinorama/10/ectb_1009', '1', '8', 'Dongguan', 'NNP', '(NP*)', '-', '-', '-', '-', '(GPE)', '(ARG1*)', '(77)']
length: 13
before: mz/sinorama/10/ectb_1009 1 9 was VBD (VP* be 01 1 - * (V*) -


length: 14
before: mz/sinorama/10/ectb_1009 2 4 was VBD (VP* be 03 - - * (V*) * -
after: ['mz/sinorama/10/ectb_1009', '2', '4', 'was', 'VBD', '(VP*', 'be', '03', '-', '-', '*', '(V*)', '*', '-']
length: 14
before: mz/sinorama/10/ectb_1009 2 5 set VBN (VP* set 02 - - * * (V*) -
after: ['mz/sinorama/10/ectb_1009', '2', '5', 'set', 'VBN', '(VP*', 'set', '02', '-', '-', '*', '*', '(V*)', '-']
length: 14
before: mz/sinorama/10/ectb_1009 2 6 at IN (PP* - - - - * * (ARG2* -
after: ['mz/sinorama/10/ectb_1009', '2', '6', 'at', 'IN', '(PP*', '-', '-', '-', '-', '*', '*', '(ARG2*', '-']
length: 14
before: mz/sinorama/10/ectb_1009 2 7 RMB NNP (NP(NP(NP* - - - - (MONEY* * * -
after: ['mz/sinorama/10/ectb_1009', '2', '7', 'RMB', 'NNP', '(NP(NP(NP*', '-', '-', '-', '-', '(MONEY*', '*', '*', '-']
length: 14
before: mz/sinorama/10/ectb_1009 2 8 8.00 CD *) - - - - *) * * -
after: ['mz/sinorama/10/ectb_1009', '2', '8', '8.00', 'CD', '*)', '-', '-', '-', '-', '*)', '*', '*', '-']
length: 14
before: mz/sin

after: ['mz/sinorama/10/ectb_1009', '2', '24', 'Taiwan', 'NNP', '(NP*))))))))))', '-', '-', '-', '-', '(GPE)', '*)', '*', '*', '*', '*)', '(9)']
length: 17
before: mz/sinorama/10/ectb_1009 2 25 . . * - - - - * * * * * * -
after: ['mz/sinorama/10/ectb_1009', '2', '25', '.', '.', '*', '-', '-', '-', '-', '*', '*', '*', '*', '*', '*', '-']
length: 17
before: mz/sinorama/10/ectb_1009 2 26 " '' *)) - - - - * * * * * * -
after: ['mz/sinorama/10/ectb_1009', '2', '26', '"', "''", '*))', '-', '-', '-', '-', '*', '*', '*', '*', '*', '*', '-']
length: 1
before: 
after: ['']
length: 14
before: mz/sinorama/10/ectb_1009 2 0 But CC (TOP(S* - - - - * (ARGM-DIS*) (ARGM-DIS*) -
after: ['mz/sinorama/10/ectb_1009', '2', '0', 'But', 'CC', '(TOP(S*', '-', '-', '-', '-', '*', '(ARGM-DIS*)', '(ARGM-DIS*)', '-']
length: 14
before: mz/sinorama/10/ectb_1009 2 1 now RB (ADVP*) - - - - * (ARGM-TMP*) (ARGM-TMP*) -
after: ['mz/sinorama/10/ectb_1009', '2', '1', 'now', 'RB', '(ADVP*)', '-', '-', '-', '-', '*', '(ARGM-

before: mz/sinorama/10/ectb_1009 4 3 strivers NNS *)))) - - - - * -
after: ['mz/sinorama/10/ectb_1009', '4', '3', 'strivers', 'NNS', '*))))', '-', '-', '-', '-', '*', '-']
length: 1
before: 
after: ['']
length: 19
before: mz/sinorama/10/ectb_1009 4 0 Although IN (TOP(S(SBAR* - - - - * * * * * (ARGM-ADV* * * -
after: ['mz/sinorama/10/ectb_1009', '4', '0', 'Although', 'IN', '(TOP(S(SBAR*', '-', '-', '-', '-', '*', '*', '*', '*', '*', '(ARGM-ADV*', '*', '*', '-']
length: 19
before: mz/sinorama/10/ectb_1009 4 1 the DT (S(NP(NP* - - - - * * (ARG0* * * * * * (29
after: ['mz/sinorama/10/ectb_1009', '4', '1', 'the', 'DT', '(S(NP(NP*', '-', '-', '-', '-', '*', '*', '(ARG0*', '*', '*', '*', '*', '*', '(29']
length: 19
before: mz/sinorama/10/ectb_1009 4 2 Taiwan NNP * - - - - (GPE) * * * * * * * (85)
after: ['mz/sinorama/10/ectb_1009', '4', '2', 'Taiwan', 'NNP', '*', '-', '-', '-', '-', '(GPE)', '*', '*', '*', '*', '*', '*', '*', '(85)']
length: 19
before: mz/sinorama/10/ectb_1009 4 3 firms NNS *

before: mz/sinorama/10/ectb_1009 4 3 spirit NN *) - - - - * * *) * * *) * -
after: ['mz/sinorama/10/ectb_1009', '4', '3', 'spirit', 'NN', '*)', '-', '-', '-', '-', '*', '*', '*)', '*', '*', '*)', '*', '-']
length: 18
before: mz/sinorama/10/ectb_1009 4 4 has VBZ (VP* have 01 - - * (V*) * * * * * -
after: ['mz/sinorama/10/ectb_1009', '4', '4', 'has', 'VBZ', '(VP*', 'have', '01', '-', '-', '*', '(V*)', '*', '*', '*', '*', '*', '-']
length: 18
before: mz/sinorama/10/ectb_1009 4 5 blossomed VBN (VP* blossom 01 - - * * (V*) * * * * -
after: ['mz/sinorama/10/ectb_1009', '4', '5', 'blossomed', 'VBN', '(VP*', 'blossom', '01', '-', '-', '*', '*', '(V*)', '*', '*', '*', '*', '-']
length: 18
before: mz/sinorama/10/ectb_1009 4 6 here RB (ADVP*) - - - - * * (ARGM-LOC*) * * * * -
after: ['mz/sinorama/10/ectb_1009', '4', '6', 'here', 'RB', '(ADVP*)', '-', '-', '-', '-', '*', '*', '(ARGM-LOC*)', '*', '*', '*', '*', '-']
length: 18
before: mz/sinorama/10/ectb_1009 4 7 by IN (PP* - - - - * * (ARGM-MNR* *

length: 13
before: mz/sinorama/10/ectb_1009 6 7 their PRP$ (NP* - - - - * (ARG1* (68)
after: ['mz/sinorama/10/ectb_1009', '6', '7', 'their', 'PRP$', '(NP*', '-', '-', '-', '-', '*', '(ARG1*', '(68)']
length: 13
before: mz/sinorama/10/ectb_1009 6 8 guard NN *))) - - - - * *) -
after: ['mz/sinorama/10/ectb_1009', '6', '8', 'guard', 'NN', '*)))', '-', '-', '-', '-', '*', '*)', '-']
length: 13
before: mz/sinorama/10/ectb_1009 6 9 . . *)) - - - - * * -
after: ['mz/sinorama/10/ectb_1009', '6', '9', '.', '.', '*))', '-', '-', '-', '-', '*', '*', '-']
length: 1
before: 
after: ['']
length: 18
before: mz/sinorama/10/ectb_1009 6 0 Because IN (TOP(S(PP* - - - - * * * (ARGM-CAU* * * * -
after: ['mz/sinorama/10/ectb_1009', '6', '0', 'Because', 'IN', '(TOP(S(PP*', '-', '-', '-', '-', '*', '*', '*', '(ARGM-CAU*', '*', '*', '*', '-']
length: 18
before: mz/sinorama/10/ectb_1009 6 1 of IN * - - - - * * * * * * * -
after: ['mz/sinorama/10/ectb_1009', '6', '1', 'of', 'IN', '*', '-', '-', '-', '-', '*', '*

length: 19
before: mz/sinorama/10/ectb_1009 7 6 law NN *) law - 1 - * *) * * * * * * -
after: ['mz/sinorama/10/ectb_1009', '7', '6', 'law', 'NN', '*)', 'law', '-', '1', '-', '*', '*)', '*', '*', '*', '*', '*', '*', '-']
length: 19
before: mz/sinorama/10/ectb_1009 7 7 adopted VBN (VP* adopt 01 - - * (V*) * * * * * * -
after: ['mz/sinorama/10/ectb_1009', '7', '7', 'adopted', 'VBN', '(VP*', 'adopt', '01', '-', '-', '*', '(V*)', '*', '*', '*', '*', '*', '*', '-']
length: 19
before: mz/sinorama/10/ectb_1009 7 8 by IN (PP* - - - - * (ARG0* * * * * * * -
after: ['mz/sinorama/10/ectb_1009', '7', '8', 'by', 'IN', '(PP*', '-', '-', '-', '-', '*', '(ARG0*', '*', '*', '*', '*', '*', '*', '-']
length: 19
before: mz/sinorama/10/ectb_1009 7 9 the DT (NP* - - - - * * * * * * * * (43
after: ['mz/sinorama/10/ectb_1009', '7', '9', 'the', 'DT', '(NP*', '-', '-', '-', '-', '*', '*', '*', '*', '*', '*', '*', '*', '(43']
length: 19
before: mz/sinorama/10/ectb_1009 7 10 PRC NNP *)) - - - - (GPE) *) * * * * * 

before: mz/sinorama/10/ectb_1009 8 8 have VBP (VP* have 01 - - * (V*) * * * * -
after: ['mz/sinorama/10/ectb_1009', '8', '8', 'have', 'VBP', '(VP*', 'have', '01', '-', '-', '*', '(V*)', '*', '*', '*', '*', '-']
length: 17
before: mz/sinorama/10/ectb_1009 8 9 also RB (ADVP*) - - - - * * (ARGM-DIS*) * * * -
after: ['mz/sinorama/10/ectb_1009', '8', '9', 'also', 'RB', '(ADVP*)', '-', '-', '-', '-', '*', '*', '(ARGM-DIS*)', '*', '*', '*', '-']
length: 17
before: mz/sinorama/10/ectb_1009 8 10 tried VBN (VP* try 01 1 - * * (V*) * * * -
after: ['mz/sinorama/10/ectb_1009', '8', '10', 'tried', 'VBN', '(VP*', 'try', '01', '1', '-', '*', '*', '(V*)', '*', '*', '*', '-']
length: 17
before: mz/sinorama/10/ectb_1009 8 11 to TO (S(VP* - - - - * * (ARG1* * * * -
after: ['mz/sinorama/10/ectb_1009', '8', '11', 'to', 'TO', '(S(VP*', '-', '-', '-', '-', '*', '*', '(ARG1*', '*', '*', '*', '-']
length: 17
before: mz/sinorama/10/ectb_1009 8 12 invest VB (VP* invest 01 1 - * * * (V*) * * -
after: ['mz/sinorama

In [28]:
# distinct field counts observed across the lines inspected so far
print({*all_lengths})

{1, 2, 5, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24}


We see that some lines split into lists of 5 or fewer elements, while others split into lists of 12 or more elements. Let us look at the lines that split into 5 or fewer elements.

In [30]:
# Print every line of the first test directory's files that splits into
# five fields or fewer, to see what the non-annotation rows look like.
for root_dir in list(dict_gold_conll["test"].keys())[:1]:
    print("root in {}:".format(root_dir))
    for file_name in dict_gold_conll["test"][root_dir]:
        with open(root_dir + '/' + file_name, 'r') as handle:
            print(file_name)
            for raw_line in handle:
                # collapse runs of whitespace to single spaces before splitting
                collapsed = ' '.join(raw_line.strip().split())
                fields = collapsed.split(" ")
                if len(fields) > 5:
                    continue
                print("length: {}".format(len(fields)))
                print("before: {}".format(collapsed))
                print("after: {}".format(fields))

root in /Users/rbalasubramaniam/OntoNotes-5.0-NER-BIO/conll-formatted-ontonotes-5.0/data/test/data/english/annotations/mz/sinorama/10:
ectb_1029.gold_conll
length: 5
before: #begin document (mz/sinorama/10/ectb_1029); part 000
after: ['#begin', 'document', '(mz/sinorama/10/ectb_1029);', 'part', '000']
length: 1
before: 
after: ['']
length: 1
before: 
after: ['']
length: 1
before: 
after: ['']
length: 1
before: 
after: ['']
length: 1
before: 
after: ['']
length: 1
before: 
after: ['']
length: 1
before: 
after: ['']
length: 1
before: 
after: ['']
length: 1
before: 
after: ['']
length: 1
before: 
after: ['']
length: 1
before: 
after: ['']
length: 1
before: 
after: ['']
length: 1
before: 
after: ['']
length: 1
before: 
after: ['']
length: 1
before: 
after: ['']
length: 1
before: 
after: ['']
length: 1
before: 
after: ['']
length: 1
before: 
after: ['']
length: 1
before: 
after: ['']
length: 1
before: 
after: ['']
length: 2
before: #end document
after: ['#end', 'document']
length: 5
before:

length: 1
before: 
after: ['']
length: 1
before: 
after: ['']
length: 1
before: 
after: ['']
length: 1
before: 
after: ['']
length: 1
before: 
after: ['']
length: 1
before: 
after: ['']
length: 1
before: 
after: ['']
length: 1
before: 
after: ['']
length: 1
before: 
after: ['']
length: 2
before: #end document
after: ['#end', 'document']
length: 5
before: #begin document (mz/sinorama/10/ectb_1069); part 006
after: ['#begin', 'document', '(mz/sinorama/10/ectb_1069);', 'part', '006']
length: 1
before: 
after: ['']
length: 1
before: 
after: ['']
length: 1
before: 
after: ['']
length: 1
before: 
after: ['']
length: 1
before: 
after: ['']
length: 1
before: 
after: ['']
length: 1
before: 
after: ['']
length: 1
before: 
after: ['']
length: 1
before: 
after: ['']
length: 1
before: 
after: ['']
length: 2
before: #end document
after: ['#end', 'document']
ectb_1059.gold_conll
length: 5
before: #begin document (mz/sinorama/10/ectb_1059); part 000
after: ['#begin', 'document', '(mz/sinorama/10/ectb_

We see that these lines carry no annotations (they are the `#begin document` / `#end document` markers and blank lines), so they are not useful for us. Next, let's get the lengths of the lines from all the files in the test folder.

In [31]:
# Number of whitespace-separated fields of every line in every .gold_conll
# file of the test split.
all_lengths = []
for root_dir, file_names in dict_gold_conll["test"].items():
    for file_name in file_names:
        with open(root_dir + '/' + file_name, 'r') as handle:
            all_lengths.extend(
                # an empty line still yields [''] here, i.e. a length of 1
                len(' '.join(raw_line.strip().split()).split(" "))
                for raw_line in handle
            )

In [32]:
# distinct field counts over the whole test split
{*all_lengths}

{1,
 2,
 5,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 34}

So, for all the files we will extract the lines that split into 12 or more elements. Let's look at the structure of each line that we are extracting. We will use the first file in the test folder.

In [40]:
# Preview the extraction on a single file: the first file of the first
# directory of the test split. For every annotation row (12 or more fields)
# print the word, its POS tag, its parse fragment and the BIO tag derived
# from field 10 (the field holding values such as "(PERSON)", "(GPE)", "*").
for key1 in list(dict_gold_conll["test"].keys())[:1]:
    print("root in {}:".format(key1))
    # slice instead of enumerate/if so that only the first file is visited
    for cur_file in dict_gold_conll["test"][key1][:1]:
        print(cur_file)
        with open(key1 + '/' + cur_file, 'r') as f:
            # label of the entity span that is currently open, or None when
            # the current token is outside any span
            flag = None
            for line in f:
                # collapse runs of whitespace, then split into fields
                ls = ' '.join(line.strip().split()).split(" ")
                if len(ls) < 12:
                    # document markers and blank lines carry no annotations
                    continue
                word, pos, ppos, tag = ls[3], ls[4], ls[5], ls[10]
                # any tag shape not handled below is printed unchanged
                ner = tag
                if tag == "*":
                    # "*" continues the open span if there is one, else outside
                    ner = "O" if flag is None else "I-" + flag
                elif tag == "*)":
                    # "*)" is the last token of the open span
                    # (assumes a span was opened by an earlier "(LABEL*" row)
                    ner = "I-" + flag
                    flag = None
                elif tag.startswith("(") and tag.endswith("*") and len(tag) > 2:
                    # "(LABEL*" opens a span that continues on following rows
                    flag = tag[1:-1]
                    ner = "B-" + flag
                elif tag.startswith("(") and tag.endswith(")") and len(tag) > 2 and flag is None:
                    # "(LABEL)" is a complete single-token span
                    ner = "B-" + tag[1:-1]

                print(word, pos, ppos, ner)

root in /Users/rbalasubramaniam/OntoNotes-5.0-NER-BIO/conll-formatted-ontonotes-5.0/data/test/data/english/annotations/mz/sinorama/10:
ectb_1029.gold_conll
ectb_1029.gold_conll
Powerful JJ (TOP(NP(NP(NP* B-WORK_OF_ART
Tools NNS *) I-WORK_OF_ART
for IN (PP* I-WORK_OF_ART
Biotechnology NN (NP*))) I-WORK_OF_ART
- : * I-WORK_OF_ART
Biochips NNS (NP*))) I-WORK_OF_ART
-LRB- -LRB- (TOP(FRAG* O
Chang NNP (NP* B-PERSON
Chiung NNP * I-PERSON
- HYPH * I-PERSON
fang NNP *) I-PERSON
/ : * O
photos NNS (NP(NP*) O
by IN (PP* O
Hsueh NNP (NP* B-PERSON
Chi NNP * I-PERSON
- HYPH * I-PERSON
kuang NNP *))) I-PERSON
/ , * O
tr. VBN (VP* O
by IN (PP* O
Robert NNP (NP* B-PERSON
Taylor NNP *))) I-PERSON
-RRB- -RRB- *)) O
The DT (TOP(S(NP(NP* O
enterovirus NN (NML* O
detection NN *) O
biochip NN *) O
developed VBN (VP* O
by IN (PP* O
DR. NNP (NP* B-ORG
Chip NNP * I-ORG
Biotechnology NNP *)))) I-ORG
takes VBZ (VP* O
only RB (NP* B-TIME
six CD * I-TIME
hours NNS *) I-TIME
to TO (S(VP* O
give VB (VP* O
hospitals 

testing NN * O
techniques NNS *)) O
it PRP (NP*) O
takes VBZ (VP* O
three CD (NP* B-DATE
days NNS *) I-DATE
to TO (S(VP* O
get VB (VP* O
a DT (NP* O
result NN *)))))) O
. . *)) O
But CC (TOP(S* O
DR. NNP (NP* B-ORG
Chip NNP *) I-ORG
has VBZ (VP* O
developed VBN (VP* O
methods NNS (NP(NP*) O
to TO (SBAR(S(VP* O
reduce VB (VP* O
the DT (NP* O
sample NN * O
preparation NN * O
time NN *)))))) O
, , * O
which WDT (SBAR(WHNP*) O
enables VBZ (S(VP* O
a DT (NP* O
result NN *) O
to TO (S(VP* O
be VB (VP* O
obtained VBN (VP* O
in IN (PP* O
only RB (NP(QP* B-TIME
six CD *) I-TIME
hours NNS *)) I-TIME
when WRB (SBAR(WHADVP*) O
used VBN (S(VP* O
in IN (PP* O
combination NN (NP(NP*) O
with IN (PP* O
their PRP$ (NP* O
enterovirus NN * O
chip NN *)))))))))))))))) O
. . *)) O
Furthermore RB (TOP(S(ADVP*) O
, , * O
the DT (NP(NP* O
initial JJ * O
results NNS *) O
of IN (PP* O
joint JJ (NP(NP* O
trials NNS *) O
by IN (PP* O
DR. NNP (NP(NP* B-ORG
Chip NNP *) I-ORG
, , * O
Chang NNP (NP(NML* B-ORG
Gung NNP

to TO (VP* O
sign VB (VP* O
technology NN (NP(NP(NP* O
transfer NN * O
agreements NNS *) O
and CC * O
chip NN (NP* O
production NN * O
contracts NNS *)) O
with IN (PP* O
them PRP (NP*)))))))) O
. . *)) O
Kuo NNP (TOP(S(S(NP*) B-PERSON
comments VBZ (VP* O
that IN (SBAR* O
contract NN (S(NP(NP* O
production NN *) O
of IN (PP* O
chips NNS (NP*))) O
is VBZ (VP* O
merely RB (NP(ADVP*) O
a DT (NP* O
service NN * O
activity NN *) O
and CC * O
a DT (NP(NP* O
source NN *) O
of IN (PP* O
income NN (NP(NP*) O
for IN (PP* O
the DT (NP* O
company NN *))))))))))) O
- : * O
TGS NNP (S(NP(NP* B-ORG
's POS *) O
ultimate JJ * O
goal NN *) O
is VBZ (VP* O
in IN (PP* O
doing VBG (S(VP* O
R&D NN (NP* O
work NN *) O
to TO (S(VP* O
identify VB (VP* O
gene NN (NP(NP* O
sequences NNS *) O
as IN (PP* O
targets NNS (NP(NP*) O
for IN (PP* O
developing VBG (S(VP* O
new JJ (NP* O
drugs NNS *))))))))))))))) O
. . *)) O
Kuo NNP (TOP(S(NP*) B-PERSON
states VBZ (VP* O
that IN (SBAR* O
at IN (S(S(PP* O
present NN (NP*))

center NN *) O
for IN (PP* O
biochips NNS (NP*))))))) O
? . *)) O
BMEC NNP (TOP(S(NP(NML* B-ORG
director NN *) O
Johnsee NNP * B-PERSON
Lee NNP *) I-PERSON
observes VBZ (VP* O
that IN (SBAR* O
the DT (S(NP(NP* O
level NN *) O
of IN (PP* O
scientific JJ (NP(NP(ADJP* O
and CC * O
technical JJ *) O
know NN * O
- HYPH * O
how NN *) O
required VBN (VP* O
for IN (PP* O
biochip NN (NP* O
production NN *)))))) O
is VBZ (VP* O
very RB (ADJP* O
high JJ *) O
, , * O
so IN (SBAR* O
that IN * O
the DT (S(NP(NP* O
opportunities NNS *) O
for IN (PP* O
contract NN (NP* O
manufacture NN *))) O
are VBP (VP* O
limited VBN (VP*)))))))) O
. . *)) O
However RB (TOP(S(ADVP*) O
, , * O
due JJ (PP* O
to IN (PP* O
Taiwan NNP (NP(NP(NP(NP* B-GPE
's POS *) O
early JJ * O
entry NN *) O
into IN (PP* O
biochip NN (NP* O
research NN * O
and CC * O
development NN *))) O
, , * O
and CC * O
the DT (NP(NP* O
high JJ * O
state NN *) O
of IN (PP* O
development NN (NP(NP*) O
of IN (PP* O
its PRP$ (NP* O
semiconductor NN * O

kits NNS *) O
on IN (RRC(PP* O
the DT (NP* O
market NN *)) O
today NN (NP*)))))))))))))))))))) O
. . *)) O
IBMS NNP (TOP(S(NP(NML* B-ORG
associate NN * O
research NN * O
fellow NN *) O
Konan NNP * B-PERSON
Peck NNP *) I-PERSON
agrees VBZ (VP* O
that IN (SBAR* O
in IN (S(PP* O
the DT (NP* O
foreseeable JJ * O
future NN *)) O
, , * O
diagnostic JJ (NP* O
chips NNS *) O
will MD (VP* O
enable VB (VP* O
everyone NN (NP*) O
to TO (S(VP* O
learn VB (VP(VP* O
their PRP$ (NP(NP* O
current JJ * O
state NN *) O
of IN (PP* O
health NN (NP*)))) O
and CC * O
know VB (VP* O
what WDT (SBAR(WHNP* O
diseases NNS *) O
they PRP (S(NP*) O
may MD (VP* O
develop VB (VP* O
in IN (PP* O
the DT (NP* O
future NN *))))))))))))))) O
. . *)) O
But CC (TOP(S* O
in IN (PP* O
a DT (NP(NP* O
situation NN *) O
where WRB (SBAR(WHADVP*) O
testing NN (S(NP* O
technology NN *) O
is VBZ (VP* O
ahead RB (ADVP* O
of IN (PP* O
medical JJ (NP(NML* O
treatment NN *) O
technology NN *)))))))) O
, , * O
the DT (NP(NP* O
medical JJ 

Now that we are aware of the relevant pieces of information and the extraction process, let's extract the annotations into one output file per tag in the **tags** list.

In [45]:
def _bio_label(raw_tag, open_entity):
    """Convert one field-10 tag into a BIO label.

    Parameters
    ----------
    raw_tag : str
        Content of field 10 of an annotation row, e.g. "*", "*)", "(ORG*"
        or "(GPE)".
    open_entity : str or None
        Label of the entity span that is currently open, or None when the
        previous token was outside any span.

    Returns
    -------
    (label, open_entity) : tuple
        The BIO label for this token and the open-span state to carry over
        to the next token. A tag matching none of the handled shapes is
        returned unchanged as the label.

    Notes
    -----
    A "*)" tag with no open span raises TypeError (string + None); the input
    is assumed to open every span before closing it.
    """
    if raw_tag == "*":
        # continues the open span if there is one, otherwise outside
        return ("O" if open_entity is None else "I-" + open_entity), open_entity
    if raw_tag == "*)":
        # last token of the open span
        return "I-" + open_entity, None
    if raw_tag.startswith("(") and raw_tag.endswith("*") and len(raw_tag) > 2:
        # "(LABEL*" opens a span that continues on the following rows
        entity = raw_tag[1:-1]
        return "B-" + entity, entity
    if (raw_tag.startswith("(") and raw_tag.endswith(")")
            and len(raw_tag) > 2 and open_entity is None):
        # "(LABEL)" is a complete single-token span
        return "B-" + raw_tag[1:-1], open_entity
    return raw_tag, open_entity


def _convert_conll_file(path):
    """Return the output lines (each ending in a newline) for one .gold_conll file.

    Rows with 12 or more fields become "word<TAB>pos<TAB>parse<TAB>bio-label";
    every other row becomes an empty line so that sentence breaks survive.
    One extra empty line is appended to separate this file from the next.
    """
    out_lines = []
    # the open-span state is tracked per file
    open_entity = None
    with open(path, 'r') as f:
        for line in f:
            # collapse runs of whitespace, then split into fields
            ls = ' '.join(line.strip().split()).split(" ")
            if len(ls) >= 12:
                ner, open_entity = _bio_label(ls[10], open_entity)
                out_lines.append("\t".join([ls[3], ls[4], ls[5], ner]) + '\n')
            else:
                # go to next line to logically support space between sentences
                out_lines.append('\n')
    out_lines.append('\n')
    return out_lines


for tag in tags:
    # Accumulate across ALL directories of the tag. Resetting the buffer per
    # directory would leave only the last directory in the output file.
    tag_lines = []
    for key1, file_names in dict_gold_conll[tag].items():
        for cur_file in file_names:
            tag_lines.extend(_convert_conll_file(os.path.join(key1, cur_file)))

    with open("./onto." + tag + ".ner", 'w') as f:
        f.write("".join(tag_lines))
    print("Done for {}".format(tag))

Done for test
Done for development
Done for conll-2012-test
Done for train


The major coding ideas are based on the agg.py file in this GitHub repository: <https://github.com/yuchenlin/OntoNotes-5.0-NER-BIO.git>