In [4]:
import os
import sys
from collections import defaultdict
# Resolve the parent of the current working directory and add it to the import
# path (once), so the project-local `util` package imported below can be found.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
# Enable IPython's autoreload extension in mode 2 so edits to imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# NOTE(review): the extension was already loaded two lines above; this reload
# looks redundant -- confirm before removing.
%reload_ext autoreload

from util.CoNLL import readCoNLL, readOntoNotes


def convert_ontonotes_to_BIO(annotations) :
    """Convert one sentence of OntoNotes bracketed span markers into BIO tags.

    Recognised markers:
      * ``(LABEL)``  -- single-token span          -> ``B-LABEL``
      * ``(LABEL*``  -- first token of a span      -> ``B-LABEL``
      * ``*``        -- inside a span / outside    -> ``I-LABEL`` / ``O``
      * ``*)``       -- last token of a span       -> ``I-LABEL``

    Args:
        annotations: iterable of marker strings, one per token.

    Returns:
        A list of BIO tag strings with exactly one entry per input marker, so
        the result can be indexed in parallel with the sentence's tokens.

    Raises:
        ValueError: if a new span is opened while another span is still open.
    """
    current_tag = None  # label of the span currently open, None when outside
    BIO_annotations = []
    for annotation in annotations:
        opens_span = annotation.startswith("(")
        if opens_span and annotation.endswith(")") and current_tag is None:
            # Single-token span: strip the surrounding parentheses.
            BIO_annotations.append("B-" + annotation[1:-1])
        elif opens_span:
            if current_tag is not None:
                # Nested / overlapping spans cannot be expressed in flat BIO.
                raise ValueError(
                    "wrong annotation: {!r} opens a span while {!r} is still open".format(
                        annotation, current_tag))
            # Drop the leading "(" and the trailing "*" of "(LABEL*".
            current_tag = annotation[1:-1]
            BIO_annotations.append("B-" + current_tag)
        elif annotation == "*":
            if current_tag is not None:
                BIO_annotations.append("I-" + current_tag)
            else:
                BIO_annotations.append("O")
        elif annotation == "*)":
            if current_tag is not None:
                BIO_annotations.append("I-" + current_tag)
                current_tag = None
            else:
                # Closing marker without an open span: warn, but still emit a
                # tag so the output stays aligned with the tokens.
                print("WRONG ANNOTATION")
                BIO_annotations.append("O")
        else:
            # Unrecognised (or empty) marker: warn and keep the alignment.
            print("WRONG ANNOTATION")
            BIO_annotations.append("O")
    return BIO_annotations

In [66]:
# Load one training file; the column map requests column 3 as 'tokens' and
# column 10 as 'OntoNotes_Label', with '#' passed as the comment symbol.
sentences = readOntoNotes(
    "/Users/slouvan/sandbox/cross-domain/data/conll-formatted-ontonotes-5.0/data/train/data/english/annotations/bc/cnn/00/cnn_0001.gold_conll",
    {3:'tokens', 10:'OntoNotes_Label'},
    commentSymbol="#",
)

In [67]:
# Smoke run: convert the labels of every loaded sentence. The result is
# overwritten on each iteration and not stored, so this only checks that the
# converter gets through the whole file.
for sentence in sentences:
    annotations = sentence['OntoNotes_Label']
    BIO_annotations = convert_ontonotes_to_BIO(annotations)
    

In [68]:
# Hand-written example covering a multi-token span, a single-token span and a
# trailing outside token; the cell output shows the resulting BIO tags.
convert_ontonotes_to_BIO(['(PER*', '*', '*)', '(ORG)','(LOC*','*','*', '*)', '*'])

['B-PER', 'I-PER', 'I-PER', 'B-ORG', 'B-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'O']

In [75]:
import os
# Walk the test split, convert every *.gold_conll file to BIO tags and write a
# sibling "<file>.bio" with one "token tag" pair per line and a blank line
# after each sentence. Prints the total number of sentences processed.
root_dir = "/Users/slouvan/sandbox/cross-domain/data/conll-formatted-ontonotes-5.0/data/test/data/english/annotations"
total_sentences = 0
for folder, _subfolders, filenames in os.walk(root_dir):
    for filename in filenames:
        current_file = os.path.join(folder, filename)
        if not current_file.endswith(".gold_conll"):
            continue

        sentences = readOntoNotes(current_file, {3:'tokens', 10:'OntoNotes_Label'}, commentSymbol="#")

        # First pass: attach the converted tags to every sentence.
        for sentence in sentences:
            sentence['OntoNotes_BIO'] = convert_ontonotes_to_BIO(sentence['OntoNotes_Label'])

        # Second pass: dump tokens and tags side by side.
        with open(current_file+".bio", "w") as bio_file:
            for sentence in sentences:
                tokens = sentence['tokens']
                labels = sentence['OntoNotes_BIO']
                # Index-based on purpose: a tag list shorter than the token
                # list raises instead of being silently truncated.
                for position in range(len(tokens)):
                    bio_file.write("{} {}\n".format(tokens[position], labels[position]))
                bio_file.write("\n")

        total_sentences += len(sentences)
print(total_sentences)

12217


In [5]:
# Read back one generated .bio file (column 0 -> 'tokens', column 1 -> labels).
# NOTE(review): this file lives in the development split, while the conversion
# cell visible in this notebook targets the test split -- presumably that cell
# was re-run with a different root_dir; confirm.
sentences = readCoNLL(
    "/Users/slouvan/sandbox/cross-domain/data/conll-formatted-ontonotes-5.0/data/development/data/english/annotations/bc/cnn/00/cnn_0000.gold_conll.bio",
    {0:'tokens', 1:'OntoNotes_Label'},
)

In [6]:
# Number of sentences read back from the .bio file loaded in the previous cell.
len(sentences)

605

In [47]:
# Count the sentences across all generated .bio files under the 'mz'
# sub-directory of the training split.
path =  "/Users/slouvan/sandbox/cross-domain/data/conll-formatted-ontonotes-5.0/data/train/data/english/annotations/mz"
nb_sentence = 0
for folder, _subfolders, filenames in os.walk(path):
    for filename in filenames:
        current_file = os.path.join(folder, filename)
        if not current_file.endswith(".bio"):
            continue
        sentences = readCoNLL(current_file, {0:'tokens', 1:'OntoNotes_Label'})
        nb_sentence += len(sentences)
print(nb_sentence)

6911


In [20]:
from collections import defaultdict
import os
def exist(list_of_names, my_str):
    """Return True if at least one entry of `list_of_names` is a substring of `my_str`."""
    return any(candidate in my_str for candidate in list_of_names)
            

def create_onto_notes_dataset(path_to_data, domain=None):
    """Collect sentences from the converted (.bio) OntoNotes 5.0 files.

    The corpus is large, so the files to load can be restricted to particular
    domains. Summary statistics (sentence count, per-label span counts, average
    spans per sentence) are printed as a side effect.

    Args:
        path_to_data: root directory that is walked recursively for ``.bio`` files.
        domain: optional list of path fragments; a file is loaded only if its
            path contains at least one of them. ``None`` or an empty list
            loads every ``.bio`` file.

    Returns:
        A list with the sentences of all selected files, as returned by
        ``readCoNLL``.
    """
    domain = domain or []
    nb_sentence = 0
    all_sentences = []
    label_count = defaultdict(int)
    for subdir, dirs, files in os.walk(path_to_data):
        for file in files :
            current_file = os.path.join(subdir, file)
            if not current_file.endswith(".bio"):
                continue
            # An empty filter means "no restriction", not "match nothing".
            if domain and not exist(domain, current_file):
                continue

            sentences = readCoNLL(current_file,{0:'tokens', 1:'OntoNotes_Label'})
            all_sentences += sentences
            for sent in sentences :
                for annotation in sent['OntoNotes_Label'] :
                    # Each B- tag starts one span; count spans per label.
                    if annotation.startswith("B-"):
                        label_count[annotation[2:]] += 1
            nb_sentence += len(sentences)

    total_tag_freq = sum(label_count.values())
    # Guard against an empty selection, which used to raise ZeroDivisionError.
    average_tag_per_sent = total_tag_freq/nb_sentence if nb_sentence else 0.0
    print("Total sentences : {}".format(nb_sentence))
    print("Label freq per tag : {}".format(label_count))
    print("Total tag freq : {}".format(total_tag_freq))
    print("Total nb tag : {}".format(len(label_count)))
    print("Average tag per sent : {}".format(average_tag_per_sent))

    return all_sentences


# Build the train / development / test sets, restricted to files whose path
# contains "annotations/bn". Each call prints its own statistics.
train_sents = create_onto_notes_dataset(
    "/Users/slouvan/sandbox/cross-domain/data/conll-formatted-ontonotes-5.0/data/train/data/english/annotations",
    [ "annotations/bn"],
)
dev_sents = create_onto_notes_dataset(
    "/Users/slouvan/sandbox/cross-domain/data/conll-formatted-ontonotes-5.0/data/development/data/english/annotations",
    [ "annotations/bn"],
)
test_sents = create_onto_notes_dataset(
    "/Users/slouvan/sandbox/cross-domain/data/conll-formatted-ontonotes-5.0/data/test/data/english/annotations",
    [ "annotations/bn"],
)

Total sentences : 10683
Label freq per tag : defaultdict(<class 'int'>, {'EVENT': 111, 'PRODUCT': 327, 'MONEY': 177, 'PERCENT': 132, 'QUANTITY': 126, 'PERSON': 4242, 'DATE': 2351, 'ORG': 2468, 'TIME': 517, 'LANGUAGE': 22, 'LOC': 373, 'LAW': 25, 'CARDINAL': 1639, 'WORK_OF_ART': 160, 'ORDINAL': 366, 'FAC': 275, 'NORP': 2394, 'GPE': 4056})
Total tag freq : 19761
Total nb tag : 18
Average tag per sent : 1.849761303004774
Total sentences : 1295
Label freq per tag : defaultdict(<class 'int'>, {'EVENT': 14, 'PERCENT': 21, 'WORK_OF_ART': 26, 'ORG': 303, 'PERSON': 557, 'DATE': 302, 'QUANTITY': 19, 'PRODUCT': 35, 'TIME': 68, 'LANGUAGE': 7, 'LOC': 42, 'LAW': 6, 'CARDINAL': 155, 'MONEY': 14, 'ORDINAL': 53, 'FAC': 24, 'NORP': 244, 'GPE': 516})
Total tag freq : 2406
Total nb tag : 18
Average tag per sent : 1.8579150579150578
Total sentences : 1357
Label freq per tag : defaultdict(<class 'int'>, {'EVENT': 24, 'MONEY': 20, 'PERCENT': 6, 'DATE': 318, 'PERSON': 460, 'ORG': 264, 'QUANTITY': 16, 'PRODUCT'

In [21]:
def dumpConll(outputPath, sentences, headers):
    """
    Write sentences to a two-column, space-separated CoNLL-style file.

    Each token becomes one line "<column 0> <column 1>"; every sentence is
    followed by a blank line. Missing parent directories are created.

    Args:
        outputPath: destination file path; it is overwritten if it exists.
        sentences: iterable of dicts mapping a column name to a list of
            strings. Every sentence must have a 'tokens' entry, whose length
            determines how many lines are written.
        headers: mapping whose keys 0 and 1 give the names of the first and
            second output column.
    """
    output_dir = os.path.dirname(outputPath)
    # A bare filename has no directory part; os.makedirs('') would raise.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # The context manager guarantees the file is flushed and closed.
    with open(outputPath, 'w') as fOut:
        for sentence in sentences:
            for idx in range(len(sentence['tokens'])):
                fOut.write(sentence[headers[0]][idx]+" "+sentence[headers[1]][idx]+"\n")

            fOut.write("\n")

In [22]:
# Write the three splits as two-column files (token, OntoNotes label).
dumpConll(
    "/Users/slouvan/sandbox/emnlp2017-bilstm-cnn-crf/data/OntoNotes_BN/train.txt",
    train_sents,
    {0:'tokens', 1:'OntoNotes_Label'},
)
dumpConll(
    "/Users/slouvan/sandbox/emnlp2017-bilstm-cnn-crf/data/OntoNotes_BN/dev.txt",
    dev_sents,
    {0:'tokens', 1:'OntoNotes_Label'},
)
dumpConll(
    "/Users/slouvan/sandbox/emnlp2017-bilstm-cnn-crf/data/OntoNotes_BN/test.txt",
    test_sents,
    {0:'tokens', 1:'OntoNotes_Label'},
)