In [1]:
import dill
from FindFiles import find_files
from Settings import Settings
from CoRefHelper import EMPTY
from collections import defaultdict
from BrattEssay import ANAPHORA

# Which corpus and which split this run operates on.
DATASET = "SkinCancer" # CoralBleaching | SkinCancer
PARTITION = "Training" # Training | Test

# Every folder below is derived from the project-wide data directory.
settings = Settings()
root_folder = "".join([settings.data_directory, DATASET, "/Thesis_Dataset/"])
# Essays with the merged predictions are read from here ...
merged_predictions_folder = root_folder + "CoReference/"
# ... and the Berkeley-updated essays are written to this folder.
coref_folder = root_folder + "CoReference/Berkeley/"
for label, location in (
    ("coref root:    ", merged_predictions_folder),
    ("berkeley coref:", coref_folder),
):
    print(label, location)

Results Dir: /Users/simon.hughes/Google Drive/Phd/Results/
Data Dir:    /Users/simon.hughes/Google Drive/Phd/Data/
Root Dir:    /Users/simon.hughes/GitHub/NlpResearch/
Public Data: /Users/simon.hughes/GitHub/NlpResearch/Data/PublicDatasets/
coref root:     /Users/simon.hughes/Google Drive/Phd/Data/SkinCancer/Thesis_Dataset/CoReference/
berkeley coref: /Users/simon.hughes/Google Drive/Phd/Data/SkinCancer/Thesis_Dataset/CoReference/Berkeley/


In [2]:
def get_essays(folder, partition):
    """Load the pickled essays for one partition from `folder`.

    The files returned by `find_files(folder)` are filtered by file name:
    names containing "train" when `partition` is "Training", names containing
    "test" for any other value. Exactly one file must remain.

    Args:
        folder: directory passed to `find_files`.
        partition: "Training" selects the training file; anything else
            selects the test file.

    Returns:
        The object stored in the selected file, as loaded by `dill.load`.

    Raises:
        AssertionError: if zero or more than one file matches.
    """
    import os  # local import so this cell does not depend on the import cell changing

    marker = "train" if partition == "Training" else "test"
    # Match on the file name only: a "train"/"test" substring in a parent
    # directory must not select (or double-count) unrelated files.
    essay_files = [e for e in find_files(folder) if marker in os.path.basename(e)]
    assert len(essay_files) == 1, \
        "Expected exactly one '%s' file in %s, found %d: %s" % (marker, folder, len(essay_files), essay_files)
    print("Found file", essay_files[0])
    # NOTE(review): dill.load can execute arbitrary code; only load files you produced yourself.
    with open(essay_files[0], "rb") as f:
        loaded_essays = dill.load(f)
    return loaded_essays

In [3]:
# Load the essays for the configured partition and show how many there are.
essays = get_essays(folder=merged_predictions_folder, partition=PARTITION)
len(essays)

Found file /Users/simon.hughes/Google Drive/Phd/Data/SkinCancer/Thesis_Dataset/CoReference/training_processed.dill


870

In [4]:
# Index the essays by name so each document in the CoNLL file can be looked up directly
name2essay = {essay.name: essay for essay in essays}

In [5]:
# Location of the Berkeley coref/NER output for the configured dataset and partition
connll_file = "/Users/simon.hughes/Google Drive/PhD/Data/berkely_ner/{dataset}/{partition}/corefner/output.conll".format(
    partition=PARTITION,
    dataset=DATASET,
)

# Update Essays with Berkeley Output, Assert Words and Sentences are the Same

In [6]:
#%%time
from collections import defaultdict

# Align the Berkeley CoNLL output with the already-loaded essays, token by token, and
# overwrite each essay's predicted POS tags, NER tags and coref ids with the Berkeley values.
# The tallies count how often each POS tag, NER tag and coref-id string occurs.
pos_tally = defaultdict(int)
ner_tally = defaultdict(int)
coref_tally = defaultdict(int)

processed_essays = set()
# The file is only read, so open it read-only ("r+" would also require write permission).
with open(connll_file, "r") as f:
    current_essay = None
    sent_ix = -1
    word_ix = -1
    # Stream the file rather than loading every line up front; line_no is only used in messages.
    for line_no, line in enumerate(f, start=1):
        line = line.strip()
        if line.startswith("#begin document"):
            sent_ix = 0
            word_ix = -1
            # The document name sits between the parentheses; essays are keyed by their .ann name.
            fname = line[line.find("(")+1:line.find(")")].replace(".txt",".ann")
            assert fname in name2essay, "Missing: %s" % fname
            current_essay = name2essay[fname]
        elif line.startswith("#end document"):
            assert sent_ix == len(current_essay.sentences), "Sentences should have the same length"
            sent_ix = -1
            word_ix = -1
            processed_essays.add(fname)
        elif line == "":
            # A blank line closes the current sentence: all of its words must have been consumed.
            assert word_ix == len(current_essay.sentences[sent_ix]) - 1, \
                "Line %d: word count mismatch in sentence %d of %s" % (line_no, sent_ix, current_essay.name)
            sent_ix += 1
            word_ix = -1
        else:
            # A token row: tab-separated columns describing one word.
            word_ix += 1
            parts = line.split("\t")

            assert len(parts) == 12, "Line %d: expected 12 columns, found %d" % (line_no, len(parts))
            # Compare the names without their 4-character extensions.
            assert parts[0][:-4] == current_essay.name[:-4], \
                "Line %d: row belongs to %s, not %s" % (line_no, parts[0], current_essay.name)

            berkeley_word = parts[3]
            berkeley_pos = parts[4]

            # Last column: coref ids. Drop the bracket / dash markup, keep the "|"-separated ids.
            berkeley_corefids = parts[-1].replace("(","").replace(")","").replace("-","")
            coref_tally[berkeley_corefids] += 1
            berkeley_set_corefids = set(berkeley_corefids.split("|"))
            # A token without coref ids yields a single empty string; discard it.
            berkeley_set_corefids.discard("")

            # Second-to-last column: NER tag. Drop the bracket / star markup.
            berkeley_ner = parts[-2].replace("(","").replace(")","").replace("*","")
            if not berkeley_ner:
                berkeley_ner = EMPTY
            ner_tally[berkeley_ner] += 1
            pos_tally[berkeley_pos] += 1

            # The word already stored on the essay must be the very same token.
            stanford_word, _ = current_essay.sentences[sent_ix][word_ix]
            assert stanford_word == berkeley_word, \
                "Line %d: word mismatch %r != %r" % (line_no, stanford_word, berkeley_word)
            # UPDATE THE Essays with the Berkeley Parser Output
            current_essay.pred_pos_tags_sentences[sent_ix][word_ix] = berkeley_pos
            current_essay.pred_ner_tags_sentences[sent_ix][word_ix] = berkeley_ner
            current_essay.pred_corefids[sent_ix][word_ix] = berkeley_set_corefids

# make sure all essays are in the file
assert len(essays) == len(processed_essays), "Should have processed all essays"
print("Done")

Done


In [7]:
# The essays held in name2essay were updated in place with the Berkeley output
berkeley_essays = [*name2essay.values()]
# Load a fresh, unmodified copy from disk to compare against
stanford_essays = get_essays(folder=merged_predictions_folder, partition=PARTITION)
(len(berkeley_essays), len(stanford_essays))

Found file /Users/simon.hughes/Google Drive/Phd/Data/SkinCancer/Thesis_Dataset/CoReference/training_processed.dill


(870, 870)

In [8]:
# Count, token by token, how often the Berkeley-updated essays differ from the freshly
# reloaded essays in word, POS tag, NER tag and coref ids.
assert len(berkeley_essays) == len(stanford_essays)

# Essays are paired by position; the name assertion below verifies that the pairing is right.
num_words = 0
diff_words = 0
diff_pos = 0
diff_ner = 0
diff_corefs = 0

for sessay, bessay in zip(stanford_essays, berkeley_essays):
    assert sessay.name == bessay.name, "Essay order differs: %s vs %s" % (sessay.name, bessay.name)
    # Identity check: `!=` would defer to any __eq__/__ne__ defined on the essay class.
    assert sessay is not bessay, "Memory refs should differ"
    assert len(sessay.sentences) == len(bessay.sentences)
    for sent_ix in range(len(sessay.sentences)):
        ssentence = sessay.sentences[sent_ix]
        bsentence = bessay.sentences[sent_ix]
        assert len(ssentence) == len(bsentence)
        for word_ix in range(len(ssentence)):
            num_words += 1

            # Sentence entries are compared whole, exactly as stored on the essays.
            if ssentence[word_ix] != bsentence[word_ix]:
                diff_words += 1

            spos = sessay.pred_pos_tags_sentences[sent_ix][word_ix]
            bpos = bessay.pred_pos_tags_sentences[sent_ix][word_ix]
            if spos != bpos:
                diff_pos += 1

            sner = sessay.pred_ner_tags_sentences[sent_ix][word_ix]
            bner = bessay.pred_ner_tags_sentences[sent_ix][word_ix]
            if sner != bner:
                diff_ner += 1

            # Both sides must hold sets so the comparison is order-insensitive.
            scoref_set = sessay.pred_corefids[sent_ix][word_ix]
            bcoref_set = bessay.pred_corefids[sent_ix][word_ix]
            assert isinstance(scoref_set, set), "Expected a set of coref ids, got %s" % type(scoref_set)
            assert isinstance(bcoref_set, set), "Expected a set of coref ids, got %s" % type(bcoref_set)
            if bcoref_set != scoref_set:
                diff_corefs += 1

In [9]:
# Raw counts: total tokens, then the number of tokens whose word / POS tag / NER tag / coref ids differ
num_words, diff_words, diff_pos, diff_ner, diff_corefs

(145471, 0, 10453, 13009, 21626)

In [10]:
# Total token count followed by each difference count as a fraction of all tokens
(num_words,) + tuple(count / num_words for count in (diff_words, diff_pos, diff_ner, diff_corefs))

(145471, 0.0, 0.0718562462621416, 0.08942675859793361, 0.14866193261887248)

In [11]:
# Sanity check: both the POS and the NER tags must differ on more than 100 tokens
assert min(diff_pos, diff_ner) > 100

In [12]:
# In the run shown above, POS tags match ~93% of the time; NER tags differ ~9% and coref ids ~15% of the time

In [13]:
# Make sure the output folder exists, creating any missing ancestors; a no-op when already present
import pathlib
berkeley_dir = pathlib.Path(coref_folder)
berkeley_dir.mkdir(exist_ok=True, parents=True)

In [14]:
# Persist the Berkeley-updated essays under the Berkeley folder, named after the partition.
dill_fname = "{folder}{partition}_processed.dill".format(folder=coref_folder, partition=PARTITION.lower())
# The file is only written, so plain "wb" is enough ("wb+" additionally opens it for reading).
with open(dill_fname, "wb") as fout:
    dill.dump(berkeley_essays, fout)
print(dill_fname, "Persisted")

/Users/simon.hughes/Google Drive/Phd/Data/SkinCancer/Thesis_Dataset/CoReference/Berkeley/training_processed.dill Persisted
