In [3]:
import numpy as np
import dill

#from gensim.models import Word2Vec
from window_based_tagger_config import get_config
from Rpfa import micro_rpfa

import logging
import datetime
import pickle

from CrossValidation import cross_validation
from BrattEssay import load_bratt_essays
from load_data import load_process_essays
from collections import defaultdict
from IterableFP import flatten
from Settings import Settings

CV_FOLDS = 5
DEV_SPLIT = 0.1

settings = Settings()
root_folder = settings.data_directory + "CoralBleaching/Thesis_Dataset/"
training_folder = root_folder + "Training" + "/"
training_pickled = settings.data_directory + "CoralBleaching/Thesis_Dataset/training.pl"

Results Dir: /Users/simon.hughes/Google Drive/Phd/Results/
Data Dir:    /Users/simon.hughes/Google Drive/Phd/Data/
Root Dir:    /Users/simon.hughes/GitHub/NlpResearch/
Public Data: /Users/simon.hughes/GitHub/NlpResearch/Data/PublicDatasets/


In [4]:
config = get_config(training_folder)
# override this so we don't replace INFREQUENT words
config["min_df"] = 0
tagged_essays2 = load_process_essays(**config)
#config

902 files found
902 essays processed


In [5]:
len(tagged_essays2)

902

In [8]:
#coref_folder = root_folder + "CoReference/Training_Old"
coref_folder = root_folder + "CoReference/Training"
coref_folder

'/Users/simon.hughes/Google Drive/Phd/Data/CoralBleaching/Thesis_Dataset/CoReference/Training'

In [9]:
from FindFiles import find_files
coref_files = find_files(coref_folder, ".*\.tagged")
len(coref_files)

902

In [173]:
def parse_stanfordnlp_tagged_essays(coref_files):
    DELIM = "->"
    DELIM_TAG = "|||"

    essay2tagged = {}
    for fname in coref_files:
        with open(fname) as f:
            lines = f.readlines()

        tagged_lines = []
        for line in lines:
            tagged_words = []
            line = line.strip()
            wds = []
            for t_token in line.split(" "):
                ##print(t_token)

                word, tags = t_token.split(DELIM)
                if word == "-lrb-":
                    word = "("
                if word == "-rrb-":
                    word = ")"
                wds.append(word)
                tag_dict = {}
                for tag in tags.split(DELIM_TAG):
                    if not tag:
                        continue
                    splt = tag.split(":")
                    if len(splt) == 2:
                        key, val = splt
                        tag_dict[key] = val
                    else:
                        raise Exception("Error")
                tagged_words.append((word, tag_dict))
            tagged_lines.append(tagged_words)
        essay2tagged[fname.split("/")[-1].split(".")[0]] = tagged_lines
    return essay2tagged

essay2tagged = parse_stanfordnlp_tagged_essays(coref_files)
print(len(essay2tagged))

902


In [174]:
mentions = []
mention_lens = []

current_ph_len = 0
current_mention = ""
prev_phrase = ""

mention2replace = []

def change_in_phrase():
    global mentions, mention_lens, current_ph_len, current_mention, prev_phrase, mention2replace 
    if current_ph_len > 0:
        mention_lens.append(current_ph_len)
        mentions.append(current_mention.strip())
        mention2replace.append((current_mention.strip(), prev_phrase))
    current_mention = ""
    current_ph_len = 0
    prev_phrase = ""

for ename, tagged_lines in essay2tagged.items():
    #print(ename)
    for line in tagged_lines:
        current_ph_len = 0
        current_mention = ""
        prev_phrase = ""
        for wd, tag_dict in line:
            if "COREF_PHRASE" in tag_dict:
                current_phrase = tag_dict["COREF_PHRASE"]
                if prev_phrase != current_phrase:
                    change_in_phrase()
                current_ph_len+=1
                current_mention += " " + wd
                prev_phrase = tag_dict["COREF_PHRASE"]
            else:
                change_in_phrase()
        change_in_phrase()
len(mention_lens), len(mentions), len(mention2replace)

(5125, 5125, 5125)

In [175]:
import numpy as np
np.mean(mention_lens), np.max(mention_lens), np.min(mention_lens)

(1.5473170731707317, 20, 1)

In [177]:
[m for m in mentions if len(m.strip().split(" ")) >= 3][:10]

['producers ( things that create food using sunlight like coral reef )',
 'the salty water',
 'the pacific ocean',
 'corals living in the oceans',
 'the atlantic ocean',
 'the amount of carbon dioxide',
 'the amount of co2',
 'the coral reef',
 'the amount of fresh water',
 'the zooxanthallae algae']

In [172]:
sorted(mention2replace, key = lambda s: -len(s[1].split("_")))[:5]

[('the winds',
  'the_trade_winds_that_go_through_upwelling_which_can_increase_the_temperature_from_3of_to_5of_and_in_some_places_they_increase_over_10of'),
 ('the trade winds',
  'the_trade_winds_that_go_through_upwelling_which_can_increase_the_temperature_from_3of_to_5of_and_in_some_places_they_increase_over_10of'),
 ('the photosynthesis',
  'the_photosynthesis_the_polyps_go_through_to_recieve_energy_to_give_to_the_coral_which_they_need_00_%_to_00_%_of'),
 ('the ways',
  'many_ways_to_explain_the_rates_in_coral_bleaching_like_trade_winds_,_a_balanced_environment_,_and_physical_damage'),
 ('it',
  'the_change_in_carbon_dioxide_which_is_a_cause_of_different_temperatures_and_extreme_storms_coral_how_salty_the_water')]

In [19]:
# for m,h in mention2replace:
#     if len(m.split(" ")) > len(h.split("_")):
#         print(m)
#         print(h)
#         print("")

In [24]:
essay2parsed = {}
for e in tagged_essays2:
    essay2parsed[e.name.split(".")[0]] = e
len(essay2parsed), len(essay2tagged)

(902, 902)

In [25]:
a = set(essay2parsed.keys())
b = set(essay2tagged.keys())
a == b

True

In [185]:
failed_cnt = 0
COREF_PHRASE = "COREF_PHRASE"
SCAN_LENGTH = 4

replacements = []
fuzzy_matches = []

for ename, tagged_essay in essay2tagged.items():
    assert ename in essay2parsed
    essay = essay2parsed[ename]
    
    wds1 = []
    taggedwd2sentixs = {}
    for sent_ix,sent in enumerate(essay.sentences):
        for wd_ix, (wd,tags) in enumerate(sent):
            taggedwd2sentixs[len(wds1)] = (sent_ix, wd_ix)
            wds1.append((wd,tags))            

    wds2 = []
    mentions = []
    for sent_ix, sent in enumerate(tagged_essay):
        current_mention = ""
        mention_ixs = set()
        for wd_ix, (wd,tag_dict) in enumerate(sent):
            wds2.append((wd,tag_dict))
            if COREF_PHRASE not in tag_dict:
                if current_mention != "":
                    mentions.append((current_mention, mention_ixs))
                current_mention = ""
                mention_ixs = set()
            else:
                phrase = tag_dict[COREF_PHRASE].replace("_"," ")
                if phrase != current_mention and current_mention != "":
                    mentions.append((current_mention, mention_ixs))
                    current_mention = ""
                    mention_ixs = set()
                current_mention = phrase
                mention_ixs.add(len(wds2)-1)                
        if current_mention != "":
            mentions.append((current_mention, mention_ixs))
    
    if len(mentions) == 0:
        continue
        
    ix_a, ix_b = 0,0
    wd1ix_wd2ix = {}
    while ix_a < (len(wds1)-1) and ix_b < (len(wds2)-1):
        a, atags = wds1[ix_a]
        b, btag_dict = wds2[ix_b]

        if a == b:
            wd1ix_wd2ix[ix_a] = ix_b
            ix_a += 1
            ix_b += 1
        else:
            # look ahead in wds2 for item that matches next a
            found_match = False
            for offseta, (aa,atags) in enumerate(wds1[ix_a: ix_a+1+SCAN_LENGTH]):
                for offsetb, (bb,bb_tag_dict) in enumerate(wds2[ix_b:ix_b+1+SCAN_LENGTH]):
                    if aa == bb:
                        ix_a = ix_a + offseta
                        ix_b = ix_b + offsetb
                        wd1ix_wd2ix[ix_a] = ix_b
                        found_match = True
                        break
                if found_match:
                    break
            if not found_match:                
                print("Failed: " + ename, a, b, ix_a, len(wds1), ix_b, len(wds2))
                failed_cnt +=1
                break
    
    for mention, ixs in mentions:
        first_ix = min(ixs)
        is_fuzzy = False
        if first_ix not in wd1ix_wd2ix:
            while first_ix > 0 and first_ix not in wd1ix_wd2ix:
                first_ix -= 1
            if first_ix not in wd1ix_wd2ix:
                e_first_wd_ix = 0
            # one past last matching index
            else:
                e_first_wd_ix = min(len(wds1)-1,wd1ix_wd2ix[first_ix]+1)
            is_fuzzy = True
        else:
            e_first_wd_ix = wd1ix_wd2ix[first_ix]

        last_ix = max(ixs)
        if last_ix not in wd1ix_wd2ix:
            while last_ix < len(wds2) and last_ix not in wd1ix_wd2ix:
                last_ix += 1
            if last_ix not in wd1ix_wd2ix:
                e_last_wd_ix = len(wds1) -1
            else:
                e_last_wd_ix = max(0,wd1ix_wd2ix[last_ix]-1)
            is_fuzzy = True
        else:
            e_last_wd_ix = wd1ix_wd2ix[last_ix]

        replacement = []
        
        for e_wd_ix in range(e_first_wd_ix,e_last_wd_ix+1):                        
            sent_ix, sent_wd_ix = taggedwd2sentixs[e_wd_ix]
            sentence = essay.sentences[sent_ix]
            wd, tags = sentence[sent_wd_ix]
            replacement.append((wd, tags))
        
        if replacement:
            replacements.append((mention, replacement))
            if is_fuzzy:
                fuzzy_matches.append((mention, replacement))

        if len(replacement) < (len(ixs)/2.0):
        #if len(replacement) == 0:
            print("ERROR", "|" + mention + "|||")#, " ".join(list(zip(*fuzzy_matches))[0]), len(replacement), len(ixs))
        
failed_cnt

ERROR |the algae|||


0

In [186]:
len(fuzzy_matches)

14

In [187]:
for mention, replacement in fuzzy_matches:
    wds = list(zip(*replacement))[0]
    tags = list(zip(*replacement))[1]
    print(mention.ljust(50), "|||", " ".join(wds))

the world coral reefs                              ||| reefs .
coral and zooxathellae algae                       ||| algae .
the coral                                          ||| .
the water basin                                    ||| it
of the relationship coral + algae                  ||| they'll eject
algae                                              ||| the aldi
algae                                              ||| the aldi
algae                                              ||| the aldi
the text '' coral                                  ||| in water
the text '' coral                                  ||| algae will
the text '' coral                                  ||| coral and zooxanthallae
some coral                                         ||| the coral
corals                                             ||| .
they                                               ||| them


In [140]:
mentions

[('bleached coral', {53}),
 ('bleached coral', {93, 94}),
 ('the zooxanthellae algae', {103, 104}),
 ('bleached coral', {110, 111}),
 ('most zooxanthellae', {113, 114}),
 ('most zooxanthellae', {120}),
 ('this relationship', {131}),
 ('bleached coral', {152, 153})]

In [143]:
for mention, ixs in mentions:
    for wd_ix in ixs:
        e_wd_ix = wd1ix_wd2ix[wd_ix]
        sent_ix, sent_wd_ix = taggedwd2sentixs[e_wd_ix]
        sentence = essay.sentences[sent_ix]
        wd, tags = sentence[sent_wd_ix]
        print(wd, tags, mention)

it set() bleached coral
provides set() bleached coral
a set() bleached coral
this set() the zooxanthellae algae
. set() the zooxanthellae algae
. set() bleached coral
the set() bleached coral
pass set() most zooxanthellae
some set() most zooxanthellae
from set() most zooxanthellae
states set() this relationship
and {'Causer:14->Result:50', 'Result', 'Causer', 'Causer:14', 'Causer:6->Result:14', '14', 'Result:14'} bleached coral
zooxanthellae {'Causer:14->Result:50', 'Result', 'Causer', 'Causer:14', 'Causer:6->Result:14', '14', 'Result:14'} bleached coral


In [72]:
sum([l for l in diff_lens if l > 2])/sum(diff_lens)

0.07047872340425532

In [70]:
np.mean(diff_lens), np.max(diff_lens), np.median(diff_lens)

(1.1165553080920565, 4, 1.0)

In [50]:
e = essay2parsed["EBA1415_KYNS_3_CB_ES-05384"]
for sent_ix, sent in enumerate(e.sentences):
    wds = []
    for wd,tags in sent:
        wds.append(wd)
    print(sent_ix, " ".join(wds))

0 a coral is a living thing that lives in the ocean , it known for their bright color .
1 the reason why they call it coral bleaching is because when it bleached it loses it color and becomes plain white stated in the article " background : what is " coral bleaching " . "
2 there different explanations to what leads to differences in the rates of coral bleaching .
3 one of these reasons is due to the trade wind happening in the ocean .
4 when the trade winds reverse it cause the water temperature to change .
5 due to this movement regions start to swell causing the sea - levels to rise .
6 the article " shifting trade winds " it states that it more affected in the pacific ocean .
7 as stated in the article " what is coral bleaching " saying that the pacific ocean is where coral bleaching is more done .
8 another reason that explains the rate of coral bleaching is the zooxanthellage which has a symbiotic relationship with coral .
9 when the coral happens to be bleached the zooxanthellae

In [49]:
e = essay2tagged["EBA1415_KYNS_3_CB_ES-05384"]
for sent_ix, sent in enumerate(e):
    wds = []
    for wd,tags in sent:
        wds.append(wd)
    print(sent_ix, " ".join(wds))

0 a coral is a living thing that lives in the ocean , it known for their bright color .
1 the reason why they call it coral bleaching is because when it bleached it loses it color and becomes plain white stated in the article '' background : what is '' coral bleaching '' . ''
2 there different explanations to what leads to differences in the rates of coral bleaching .
3 one of these reasons is due to the trade wind happening in the ocean .
4 when the trade winds reverse it cause the water temperature to change .
5 due to this movement regions start to swell causing the sea - levels to rise .
6 the article '' shifting trade winds '' it states that it more affected in the pacific ocean .
7 as stated in the article '' what is coral bleaching '' saying that the pacific ocean is where coral bleaching is more done .
8 another reason that explains the rate of coral bleaching is the zooxanthellage which has a symbiotic relationship with coral .
9 when the coral happens to be bleached the zooxa

In [37]:
for ename, tagged_essay in essay2tagged.items():
    assert ename in essay2parsed
    essay = essay2parsed[ename]
         
    wds1 = []
    ix2_wd_sent_ix = {}
    ix = 0
    for sent_ix, sent in enumerate(essay.sentences):
        for wd_ix, (wd,tags) in enumerate(sent):
            ix2_wd_sent_ix[len(wds1)] = (sent_ix, wd_ix)
            wds1.append(wd)

    wds2 = []
    for sent in tagged_essay:
        for wd,tag_dict in sent:
            if "COREF_PHRASE" in tag_dict:
                #print("Found")
                wds2.append(wd)
            
    ix_a, ix_b = 0,0