In [1]:
import dill

from CoRefHelper import parse_stanfordnlp_tagged_essays
from FindFiles import find_files
from Settings import Settings

# Cross-validation / dev-split constants; not referenced in the visible part of this notebook.
CV_FOLDS = 5
DEV_SPLIT = 0.1

""" Begin Settings """
# Dataset sub-folder appended to settings.data_directory below.
DATASET = "CoralBleaching"
# Selects both the merged-essay dill file and the CoReference sub-folder.
PARTITION = "Test" # Training | Test
# Look-ahead window (in tokens) used by map_words_between_essays when re-aligning.
SCAN_LENGTH = 3
""" END Settings """

# NOTE(review): Settings() appears to print the configured directories on construction — confirm.
settings = Settings()
root_folder = settings.data_directory + DATASET + "/Thesis_Dataset/"
# Folder holding the merged (consolidated) tagged essays as dill files.
merged_predictions_folder = root_folder + "Predictions/CoRef/MergedTags/"

# Folder holding the co-reference tagged files for the selected partition.
coref_root = root_folder + "CoReference/"
coref_folder = coref_root + PARTITION

Results Dir: /Users/simon.hughes/Google Drive/Phd/Results/
Data Dir:    /Users/simon.hughes/Google Drive/Phd/Data/
Root Dir:    /Users/simon.hughes/GitHub/NlpResearch/
Public Data: /Users/simon.hughes/GitHub/NlpResearch/Data/PublicDatasets/


## Load Merged Essays

In [2]:
##override this so we don't replace INFREQUENT words
#config["min_df"] = 0

# Pick the merged-essay file for the configured partition (case-insensitive).
partition_key = PARTITION.lower()
if partition_key == "training":
    merged_essays_fname = "merged_essays_train.dill"
elif partition_key == "test":
    merged_essays_fname = "merged_essays_test.dill"
else:
    raise Exception("Invalid partition: " + PARTITION)

merged_essays_fname = merged_predictions_folder + merged_essays_fname
# Open read-only: the file is only deserialised, never written, and "rb+"
# would fail on read-only files / mounts.
# NOTE: dill.load can execute arbitrary code, only load files from a trusted source.
with open(merged_essays_fname, "rb") as f:
    tagged_essays = dill.load(f)

# map parsed essays to essay name (the text before the first "." of e.name)
essay2tagged = {}
for e in tagged_essays:
    essay2tagged[e.name.split(".")[0]] = e

# Report the partition that was actually loaded rather than always saying "training".
print("{0} {1} essays loaded from:\n{2}".format(len(tagged_essays), partition_key, merged_essays_fname))

226 training essays loaded from:
/Users/simon.hughes/Google Drive/Phd/Data/CoralBleaching/Thesis_Dataset/Predictions/CoRef/MergedTags/merged_essays_test.dill


## Load CoRef Parsed Essays

In [22]:

# Load CoRef Parsed Essays
# Raw string: "\." is not a valid Python string escape, so the non-raw form
# triggers an invalid-escape warning; the pattern value itself is unchanged.
coref_files = find_files(coref_folder, r".*\.tagged")
print("{0} co-ref tagged files loaded".format(len(coref_files)))
# There must be exactly one co-ref file per merged essay loaded earlier.
assert len(coref_files) == len(tagged_essays), \
    "Expected {0} co-ref files, found {1}".format(len(tagged_essays), len(coref_files))

essay2coref_tagged = parse_stanfordnlp_tagged_essays(coref_files)

226 co-ref tagged files loaded


## Validate Same Essays

In [23]:
# Both collections must be keyed by exactly the same essay names.
tagged_names = essay2tagged.keys()
coref_names = essay2coref_tagged.keys()

assert tagged_names == coref_names
# Essay names common to both collections (should be all of them).
intersect = set(tagged_names) & set(coref_names)
assert len(intersect) == len(tagged_names)
assert len(tagged_names) > 1
assert len(tagged_names) == len(coref_names)

## Examine Data Structure

### Notes on CoRef Datastructure
- Dictionary of essays, keyed by name
- Each essay is a list of sentences
- Each sentence is a list of words
- Words are mapped to a tag dict: `Dict[str, Set[str]]`
  - tag dict - contains
    - NER tag (most are O - none)
    - POS tag
    - If a Co-Reference such as an anaphor (mostly pronouns)
      - COREF_PHRASE - phrase referred to by coref
      - COREF_REF - Id of referenced phrase
    - else if it is a phrase that is referenced:
      - COREF_ID - id of the co-reference, referenced in the COREF_REF tag

In [24]:
# Keys looked up in each word's tag dict in the co-reference parsed essays.
COREF_PHRASE = "COREF_PHRASE"  # phrase(s) a referring word points to (words joined by "_")
COREF_ID     = "COREF_ID"      # id(s) carried by the words of a referenced phrase
COREF_REF    = "COREF_REF"     # id(s) of the phrase a referring word points to

In [25]:
def find_coref_essay_sentence(essay2coref_tagged):
    """Return the name of the first essay with a sentence holding both coref tag kinds.

    Args:
        essay2coref_tagged: dict mapping essay name to a list of sentences,
            each sentence being a list of (word, tag_dict) pairs.

    Returns:
        The first essay name (in dict order) that has at least one sentence in
        which some word carries COREF_ID and some word carries COREF_REF,
        or None when no essay qualifies.
    """
    for essay_name, sentences in essay2coref_tagged.items():
        for sentence in sentences:
            tag_dicts = [tags for _word, tags in sentence]
            has_id = any([COREF_ID in tags for tags in tag_dicts])
            has_ref = any([COREF_REF in tags for tags in tag_dicts])
            if has_id and has_ref:
                return essay_name
    return None

# Pick one essay containing a sentence with both a COREF_ID and a COREF_REF tag;
# it is used as the worked example in the cells below.
ename = find_coref_essay_sentence(essay2coref_tagged)
ename

'EBA1415_AEKD_4_CB_ES-05574'

In [26]:
# Dump the example essay word by word, showing only its co-reference tags.
print(ename)
print()
coref_keys = {COREF_ID, COREF_REF, COREF_PHRASE}
for sentence in essay2coref_tagged[ename]:
    for word, tags in sentence:
        coref_tags = {key: val for key, val in tags.items() if key in coref_keys}
        # Words without any co-reference tag print with an empty second column.
        print(word.ljust(20), coref_tags or "")
    # Sentence separator.
    print("*" * 80)

EBA1415_AEKD_4_CB_ES-05574

well                 
based                
on                   
what                 
i                    
read                 
the                  {'COREF_ID': {'3'}}
corals               {'COREF_ID': {'3'}}
are                  
loosing              
their                {'COREF_REF': {'3'}, 'COREF_PHRASE': {'the_corals'}}
colors               
,                    
coral                {'COREF_REF': {'4'}, 'COREF_PHRASE': {'the_coral_bleaching'}}
bleaching            {'COREF_REF': {'4'}, 'COREF_PHRASE': {'the_coral_bleaching'}}
are                  
a                    
serious              
problem              
with                 
a                    
serious              
impact               
on                   
the                  {'COREF_ID': {'1'}}
worlds               {'COREF_ID': {'1'}}
coral                {'COREF_ID': {'1'}}
reefs                {'COREF_ID': {'1'}}
.                    
**********************************************

## Match CoRef Tagged to Consolidated Tagged Essays

In [67]:
def map_tagged_words_to_word_ixs(tagged_essay):
    """Flatten an essay into one word list and record each word's origin.

    Args:
        tagged_essay: object with a `sentences` attribute, an iterable of
            sentences, each an iterable of (word, tags) pairs.

    Returns:
        A tuple `(words, positions)` where `words` is the flat list of
        (word, tags) pairs in reading order and `positions` maps each flat
        index to its `(sentence_index, word_index)` in the essay.
    """
    flat_words = []
    flat_ix_to_position = {}
    for sentence_no, sentence in enumerate(tagged_essay.sentences):
        for word_no, pair in enumerate(sentence):
            token, tags = pair
            flat_ix_to_position[len(flat_words)] = (sentence_no, word_no)
            # A doubled single quote token is normalised to one double quote.
            normalised = "\"" if token == "\'\'" else token
            flat_words.append((normalised, tags))
    return flat_words, flat_ix_to_position


def replace_underscore(mention):
    """Return the set of strings in `mention` with every "_" turned into a space."""
    return {phrase.replace("_", " ") for phrase in mention}

def map_mentions_to_word_ixs(coref_essay):
    """Flatten a coref essay and group consecutive referring words into mentions.

    A mention is a run of consecutive words, within one sentence, whose
    COREF_PHRASE sets (underscores replaced by spaces) are equal.

    Args:
        coref_essay: list of sentences, each a list of (word, tag_dict) pairs.

    Returns:
        A tuple `(words, mentions)`: `words` is the flat list of
        (word, tag_dict) pairs; `mentions` is a list of
        `(phrases, word_indexes)` tuples, where `phrases` is the set of
        referenced phrases and `word_indexes` the set of flat indexes of the
        words in that run.
    """
    #TODO - fix this, it assume one mention per word, but we can have multiple
    flat_words = []
    found = []
    for sentence in coref_essay:
        # Runs never span sentences, so the open run is reset per sentence.
        active_phrases, active_ixs = set(), set()
        for word, tag_dict in sentence:
            flat_words.append((word, tag_dict))
            if COREF_PHRASE in tag_dict:
                phrases = replace_underscore(tag_dict[COREF_PHRASE])
                # A different phrase set closes the open run and starts a new one.
                if active_phrases and phrases != active_phrases:
                    found.append((active_phrases, active_ixs))
                    active_ixs = set()
                active_phrases = phrases
                active_ixs.add(len(flat_words) - 1)
            else:
                # A non-referring word closes any open run.
                if active_phrases:
                    found.append((active_phrases, active_ixs))
                active_phrases, active_ixs = set(), set()
        # Close a run that reaches the end of the sentence.
        if active_phrases:
            found.append((active_phrases, active_ixs))
    return flat_words, found

def map_words_between_essays(wd2_tags, wds2coref, scan_length=None, essay_name=None):
    """Align the flat word list of a tagged essay with that of its coref parse.

    Both lists are walked in parallel. Equal words (or "cannot" vs. "can") are
    paired directly. On a mismatch, a window of `scan_length` extra words on
    each side is searched for the first equal pair and the walk resumes there.

    Args:
        wd2_tags: flat list of (word, tags) pairs from the tagged essay.
        wds2coref: flat list of (word, tag_dict) pairs from the coref essay.
        scan_length: look-ahead window size; defaults to the notebook-level
            SCAN_LENGTH when None.
        essay_name: name recorded in error tuples; defaults to the
            notebook-level `ename` (if defined) when None.

    Returns:
        A tuple `(ixtagd_2_ixcoref, ixcoref_2_ixtagd, errors)`: the two index
        maps between the lists, and a list of
        `(essay_name, tagged_word, coref_word, tagged_ix, coref_ix)` tuples
        (at most one, since alignment stops at the first unrecoverable
        mismatch).
    """
    if scan_length is None:
        scan_length = SCAN_LENGTH

    errors = []

    ix_tagd, ix_coref = 0, 0
    ixtagd_2_ixcoref = {}
    ixcoref_2_ixtagd = {}

    # Walk until either list is exhausted, so the final token is aligned too.
    while ix_tagd < len(wd2_tags) and ix_coref < len(wds2coref):
        wd_tagd, _ = wd2_tags[ix_tagd]
        wd_coref, _ = wds2coref[ix_coref]

        if wd_tagd == wd_coref or (wd_tagd == "cannot" and wd_coref == "can"):
            ixtagd_2_ixcoref[ix_tagd]  = ix_coref
            ixcoref_2_ixtagd[ix_coref] = ix_tagd
            ix_tagd  += 1
            ix_coref += 1
            continue

        # Mismatch: look ahead on both sides for the first pair of equal words.
        found_match = False
        for offseta, (aa, _) in enumerate(wd2_tags[ix_tagd: ix_tagd + 1 + scan_length]):
            for offsetb, (bb, _) in enumerate(wds2coref[ix_coref: ix_coref + 1 + scan_length]):
                if aa != bb:
                    continue
                if offseta == offsetb:
                    # Same number of words skipped on both sides: treat them as
                    # substitutions and pair them positionally, relative to the
                    # current coref index (the two indexes may have diverged).
                    # NOTE(review): as in the original, only the tagged->coref
                    # direction is recorded for these skipped words.
                    for k in range(offseta):
                        ixtagd_2_ixcoref.setdefault(ix_tagd + k, ix_coref + k)

                ix_tagd  = ix_tagd + offseta
                ix_coref = ix_coref + offsetb
                ixtagd_2_ixcoref[ix_tagd] = ix_coref
                ixcoref_2_ixtagd[ix_coref] = ix_tagd
                found_match = True
                break
            if found_match:
                break
        if not found_match:
            if essay_name is None:
                # Backward compatible with callers that rely on the notebook global.
                essay_name = globals().get("ename")
            errors.append((essay_name, wd_tagd, wd_coref, ix_tagd, ix_coref))
            break
    return ixtagd_2_ixcoref, ixcoref_2_ixtagd, errors

In [70]:
# Fetch both representations of the example essay selected above.
coref_essay  = essay2coref_tagged[ename]
tagged_essay = essay2tagged[ename]

# Flatten each representation into a single word list (plus position map / mentions).
wd2tags, taggedwd2sentixs = map_tagged_words_to_word_ixs(tagged_essay)
wds2coref, mentions = map_mentions_to_word_ixs(coref_essay)

In [71]:
ixtagd_2_ixcoref, ixcoref_2_ixtagd, errors = map_words_between_essays(wd2tags, wds2coref)
# Number of alignment failures; initialised here because no earlier cell
# defines it, so the increment below would otherwise raise NameError.
failed_cnt = 0
if errors:
    # Print errors
    # Loop variables are prefixed so the notebook-level `ename` is not overwritten.
    for err_ename, err_wd_tagd, err_wd_coref, err_ix_tagd, err_ix_coref in errors:
        failed_cnt += 1
        print("Failed: " + err_ename, err_wd_tagd, err_wd_coref, err_ix_tagd, err_ix_coref)

In [59]:
# ixtagd_2_ixcoref

In [60]:
# ixtagd_2_ixcoref

In [35]:
# Sanity check: every flattened word should have exactly one position entry.
# (`wd2tags` is the list built above; `tagged_wd2_tags` is not defined in this notebook.)
len(wd2tags), len(taggedwd2sentixs)

(109, 109)

In [72]:
# Extract the flat coref word list and the mention runs for the example essay
# (bound to `wds_coref` for the inspection cells below).
wds_coref, mentions = map_mentions_to_word_ixs(coref_essay)

In [51]:
# Number of words in the coref essay and number of mention runs found.
len(wds_coref), len(mentions)

(109, 6)

In [52]:
# Each entry is (set of referenced phrases, set of flat word indexes of the referring words).
mentions

[({'the corals'}, {10}),
 ({'the coral bleaching'}, {13, 14}),
 ({'the corals'}, {29}),
 ({'the pacific ocean'}, {46, 47}),
 ({'a massive coral bleaching event in 0000'}, {83, 84}),
 ({'a massive coral bleaching event in 0000'}, {101})]