In [3]:
import numpy as np
import dill

#from gensim.models import Word2Vec
from window_based_tagger_config import get_config
from Rpfa import micro_rpfa

import logging
import datetime
import pickle

from CrossValidation import cross_validation
from BrattEssay import load_bratt_essays
from load_data import load_process_essays
from collections import defaultdict
from IterableFP import flatten
from Settings import Settings

# Cross-validation / dev-split constants.
CV_FOLDS = 5
DEV_SPLIT = 0.1

# All dataset paths are derived from the configured data directory.
settings = Settings()
root_folder = settings.data_directory + "CoralBleaching/Thesis_Dataset/"
training_folder = root_folder + "Training/"
training_pickled = root_folder + "training.pl"

Results Dir: /Users/simon.hughes/Google Drive/Phd/Results/
Data Dir:    /Users/simon.hughes/Google Drive/Phd/Data/
Root Dir:    /Users/simon.hughes/GitHub/NlpResearch/
Public Data: /Users/simon.hughes/GitHub/NlpResearch/Data/PublicDatasets/


In [4]:
# Build the loader settings for the training folder.
config = get_config(training_folder)
# override this so we don't replace INFREQUENT words
config["min_df"] = 0
# Load the essays; the config dict is expanded into keyword arguments.
tagged_essays2 = load_process_essays(**config)
#config

902 files found
902 essays processed


In [5]:
# Number of essays returned by load_process_essays.
len(tagged_essays2)

902

In [8]:
#coref_folder = root_folder + "CoReference/Training_Old"
# Folder that is searched for the ".tagged" co-reference files; the bare
# expression on the last line echoes the resolved path as the cell output.
coref_folder = root_folder + "CoReference/Training"
coref_folder

'/Users/simon.hughes/Google Drive/Phd/Data/CoralBleaching/Thesis_Dataset/CoReference/Training'

In [9]:
from FindFiles import find_files
# Raw string: "\." is not a valid escape in a normal string literal and triggers
# a DeprecationWarning/SyntaxWarning on newer Pythons. The resulting text is
# unchanged (presumably a regex matching names ending in ".tagged" - confirm
# against find_files).
coref_files = find_files(coref_folder, r".*\.tagged")
len(coref_files)

902

In [10]:
from pprint import pprint

# Token format inside a .tagged file:  word->KEY:VAL|||KEY:VAL|||...
DELIM = "->"
DELIM_TAG = "|||"

# Maps essay name (file name up to its first ".") to a list of lines, where
# each line is a list of (word, {tag_key: tag_value}) tuples.
essay2tagged = {}
for fname in coref_files:
    with open(fname) as f:
        lines = f.readlines()

    tagged_lines = []
    for line_no, line in enumerate(lines, start=1):
        tagged_words = []
        for t_token in line.strip().split(" "):
            # Every token must be exactly "<word>-><tags>"; fail with enough
            # context to locate the offending token in the source file.
            parts = t_token.split(DELIM)
            if len(parts) != 2:
                raise ValueError(
                    "Malformed token %r in %s (line %d): expected '<word>%s<tags>'"
                    % (t_token, fname, line_no, DELIM))
            word, tags = parts

            tag_dict = {}
            for tag in tags.split(DELIM_TAG):
                if not tag:
                    # Empty segments (e.g. a token with no tags) carry no information.
                    continue
                key_val = tag.split(":")
                if len(key_val) != 2:
                    raise ValueError(
                        "Malformed tag %r on token %r in %s (line %d): expected 'key:value'"
                        % (tag, t_token, fname, line_no))
                key, val = key_val
                tag_dict[key] = val
            tagged_words.append((word, tag_dict))
        tagged_lines.append(tagged_words)
    essay2tagged[fname.split("/")[-1].split(".")[0]] = tagged_lines
print(len(essay2tagged))

902


In [11]:
# Results accumulated over every essay: the mention text, its length in
# tokens, and (mention text, COREF_PHRASE value) pairs.
mentions = []
mention_lens = []
mention2replace = []

# State of the mention currently being assembled.
current_ph_len = 0
current_mention = ""
prev_phrase = ""

def change_in_phrase():
    """Record the mention being assembled (if any) and reset the running state."""
    global current_ph_len, current_mention, prev_phrase
    if current_ph_len > 0:
        finished = current_mention.strip()
        mention_lens.append(current_ph_len)
        mentions.append(finished)
        mention2replace.append((finished, prev_phrase))
    current_ph_len, current_mention, prev_phrase = 0, "", ""

for ename, tagged_lines in essay2tagged.items():
    for line in tagged_lines:
        # Mentions never continue across lines, so start each line clean.
        current_ph_len, current_mention, prev_phrase = 0, "", ""
        for wd, tag_dict in line:
            if "COREF_PHRASE" not in tag_dict:
                # An untagged word ends whatever mention was in progress.
                change_in_phrase()
                continue
            current_phrase = tag_dict["COREF_PHRASE"]
            if current_phrase != prev_phrase:
                # A different phrase value starts a new mention.
                change_in_phrase()
            current_ph_len += 1
            current_mention = current_mention + " " + wd
            prev_phrase = current_phrase
        # Flush a mention that runs up to the end of the line.
        change_in_phrase()
len(mention_lens), len(mentions), len(mention2replace)

(5125, 5125, 5125)

In [12]:
import numpy as np
# Mean, max and min mention length, in tokens (one entry per collected mention).
np.mean(mention_lens), np.max(mention_lens), np.min(mention_lens)

(1.5473170731707317, 20, 1)

In [13]:
# Recompute the token counts from the mention strings themselves.
lens = [len(m.strip().split(" ")) for m in mentions]
np.mean(lens), np.max(lens), np.min(lens)

(1.5473170731707317, 20, 1)

In [15]:
# First 10 mentions made up of three or more space-separated tokens.
[m for m in mentions if len(m.strip().split(" ")) >= 3][:10]

['producers -lrb- things that create food using sunlight like coral reef -rrb-',
 'the salty water',
 'the pacific ocean',
 'corals living in the oceans',
 'the atlantic ocean',
 'the amount of carbon dioxide',
 'the amount of co2',
 'the coral reef',
 'the amount of fresh water',
 'the zooxanthallae algae']

In [16]:
# 35 (mention, phrase) pairs with the longest phrase, measured in "_"-separated parts.
sorted(mention2replace, key=lambda pair: len(pair[1].split("_")), reverse=True)[:35]

[('the winds',
  'the_trade_winds_that_go_through_upwelling_which_can_increase_the_temperature_from_3of_to_5of_and_in_some_places_they_increase_over_10of'),
 ('the trade winds',
  'the_trade_winds_that_go_through_upwelling_which_can_increase_the_temperature_from_3of_to_5of_and_in_some_places_they_increase_over_10of'),
 ('the photosynthesis',
  'the_photosynthesis_the_polyps_go_through_to_recieve_energy_to_give_to_the_coral_which_they_need_00_%_to_00_%_of'),
 ('the ways',
  'many_ways_to_explain_the_rates_in_coral_bleaching_like_trade_winds_,_a_balanced_environment_,_and_physical_damage'),
 ('it',
  'the_change_in_carbon_dioxide_which_is_a_cause_of_different_temperatures_and_extreme_storms_coral_how_salty_the_water'),
 ('it',
  'the_change_in_carbon_dioxide_which_is_a_cause_of_different_temperatures_and_extreme_storms_coral_how_salty_the_water'),
 ('the algae',
  'the_algae_,_called_zooxanthellae_,_that_lives_in_the_tissues_of_the_coral_need_sunlight_for_photosynthesis'),
 ('the algae',

In [19]:
# for m,h in mention2replace:
#     if len(m.split(" ")) > len(h.split("_")):
#         print(m)
#         print(h)
#         print("")

In [24]:
# Index the parsed essays by name (text before the first "." of essay.name).
essay2parsed = {e.name.split(".")[0]: e for e in tagged_essays2}
len(essay2parsed), len(essay2tagged)

(902, 902)

In [25]:
# Both sources must cover exactly the same essay names.
a, b = set(essay2parsed), set(essay2tagged)
a == b

True

In [27]:
# Count the essays whose total token count differs between the two sources
# and print one diagnostic row for each of them.
diffs = 0
for ename, tagged_essay in essay2tagged.items():
    assert ename in essay2parsed
    essay = essay2parsed[ename]

    wds1 = sum(1 for sent in essay.sentences for wd, tags in sent)
    wds2 = sum(1 for sent in tagged_essay for wd in sent)
    if wds1 == wds2:
        continue
    diffs += 1
    print(ename,  len(essay.sentences), len(tagged_essay), wds1, wds2, wds2>=wds1)

EBA1415_AEKD_5_CB_ES-05579 15 12 233 234 True
EBA1415_BGJD_2_CB_ES-05747 2 1 15 13 False
EBA1415_BLRW_3_CB_ES-05168 11 10 156 154 False
EBA1415_BLRW_3_CB_ES-05174 16 16 262 263 True
EBA1415_BLRW_3_CB_ES-05180 12 12 171 172 True
EBA1415_ERAP_7_CB_ES-05464 15 15 225 226 True
EBA1415_ERSK_7_CB_ES-05793 10 10 143 141 False
EBA1415_ERSK_7_CB_ES-06271 4 5 67 63 False
EBA1415_HNJD_4_CB_ES-05810 29 29 386 387 True
EBA1415_KNKC_3_CB_ES-05589 5 5 97 98 True
EBA1415_KNKC_3_CB_ES-05590 13 13 270 268 False
EBA1415_KNKC_3_CB_ES-05601 4 4 180 178 False
EBA1415_KNKC_3_CB_ES-05606 6 13 173 171 False
EBA1415_KYLS_5_CB_ES-05648 15 14 300 298 False
EBA1415_KYLS_5_CB_ES-05664 10 10 200 201 True
EBA1415_KYLS_6_CB_ES-05682 6 5 83 81 False
EBA1415_KYLS_6_CB_ES-05684 12 12 254 255 True
EBA1415_KYNS_4_CB_ES-05393 14 15 280 281 True
EBA1415_KYNS_4_CB_ES-05400 10 10 248 250 True
EBA1415_KYNS_4_CB_ES-05403 11 12 206 204 False
EBA1415_LRBL_4_CB_ES-05164 8 6 104 105 True
EBA1415_LRJE_7_CB_ES-05138 19 19 216 217 True

In [28]:
# Essays whose word counts disagree vs. the total number of essays.
diffs, len(essay2tagged)

(73, 902)

In [29]:
# NOTE: relies on `tagged_essay` and `essay` still holding the values left
# behind by the last iteration of an earlier loop cell.
len(tagged_essay), len(essay.sentences)

(10, 10)

In [30]:
# Show the tokens around the first position where the two tokenisations of a
# single essay disagree.
key = "EBA1415_WSAL_1_CB_ES-05354"
essay = essay2parsed[key]
tessay = essay2tagged[key]

# Flatten both versions of the essay into plain word lists.
wds1 = [wd for sent in essay.sentences for wd, tags in sent]
wds2 = [wd for sent in tessay for wd, tags in sent]

# ix = index of the first differing token (or the length of the shorter list
# when no difference is found).
ix = 0
for a,b in zip(wds1,wds2):
    if a != b:
        break
    ix += 1

# Clamp the window start at 0: a negative start index would be interpreted
# relative to the END of the list and give an empty or unrelated window when
# the mismatch occurs within the first three tokens.
start = max(ix - 3, 0)
list(zip(wds1[start:ix+10],wds2[start:ix+10]))

[('animals', 'animals'),
 ('particles', 'particles'),
 ('are', 'are'),
 ('gonna', 'gon'),
 ('begin', 'na'),
 ('to', 'begin'),
 ('die', 'to'),
 ('out', 'die'),
 (',', 'out'),
 ('because', ','),
 ('they', 'because'),
 ('cant', 'they'),
 ('gain', 'cant')]

In [31]:
# Prototype of the token alignment on a single essay: walk both token lists in
# lock-step, printing each pair, and at the first mismatch search a small
# look-ahead window for a point where the two lists agree again.
key = "EBA1415_BLRW_3_CB_ES-05168"
essay = essay2parsed[key]
tessay = essay2tagged[key]

# Flatten both versions of the essay into plain word lists.
wds1 = [wd for sent in essay.sentences for wd, tags in sent]
wds2 = [wd for sent in tessay for wd, tags in sent]

ix_a, ix_b = 0,0
scan_length = 5
print_wds = True
# Defined up front so the flag exists (as False) even when the two lists never
# disagree, rather than being undefined or stale from an earlier run.
found_match = False
while ix_a < len(wds1) and ix_b < len(wds2):
    a = wds1[ix_a]
    b = wds2[ix_b]
    if print_wds:
        print(a,b)
    if a == b:
        ix_a += 1
        ix_b += 1
        continue

    print("*" * 10 + "miss match" + "*" * 10)
    # Look ahead up to scan_length tokens in both lists for the first pair of
    # equal tokens, preferring the smallest offset into wds1.
    for offseta, aa in enumerate(wds1[ix_a:ix_a+1+scan_length]):
        for offsetb, bb in enumerate(wds2[ix_b:ix_b+1+scan_length]):
            if aa == bb:
                ix_a = ix_a + offseta
                ix_b = ix_b + offsetb
                found_match = True
                print("Found:", aa,bb,ix_a,ix_b)
                break
        if found_match:
            break

    if not found_match:
        print("Failed: ", a, b)
    # Stop after the first mismatch in either case. Without a break on failure
    # the indices would never advance and the loop would never terminate.
    break
    
###list(zip(wds1[ix-3:ix+10],wds2[ix-3:ix+10]))

the the
differences differences
in in
the the
rates rates
of of
coral coral
bleaching bleaching
is is
that that
first first
it it
started started
out out
not not
so so
major major
in in
0000 0000
, ,
but but
than than
it it
started started
increasing increasing
. .
the the
highest highest
rate rate
was was
in in
0000 0000
it it
reached reached
up up
to to
more more
than than
00 00
countries countries
making making
a a
report report
of of
coral coral
bleaching bleaching
. .
so so
on on
it it
started started
decreasing decreasing
from from
there there
. .
than than
again again
who who
knows knows
how how
bad bad
it it
is is
now now
0000 0000
. .
the the
coral coral
bleaching bleaching
reports reports
only only
went went
from from
0000 0000
- -
0000 0000
. .
" ''
**********miss match**********
Found: also also 78 78


In [None]:
# Inspect the `found_match` flag left behind by the most recently run alignment cell.
found_match

In [54]:
# Run the look-ahead alignment over every essay and count the essays for
# which the two tokenisations could not be re-synchronised.
diffs = 0
failed_cnt = 0
for ename, tagged_essay in essay2tagged.items():
    assert ename in essay2parsed
    essay = essay2parsed[ename]

    # Flatten both versions of the essay into plain word lists.
    wds1 = [wd for sent in essay.sentences for wd, tags in sent]
    wds2 = [wd for sent in tagged_essay for wd, tag_dict in sent]

    ix_a, ix_b = 0,0
    scan_length = 5
    while ix_a < (len(wds1)-1) and ix_b < (len(wds2)-1):
        a = wds1[ix_a]
        b = wds2[ix_b]
        if a == b:
            ix_a += 1
            ix_b += 1
            continue

        # Mismatch: search a window of scan_length extra tokens in both lists
        # for the first pair of equal tokens and jump both cursors there.
        found_match = False
        window_a = wds1[ix_a: ix_a+1+scan_length]
        window_b = wds2[ix_b:ix_b+1+scan_length]
        for offseta, aa in enumerate(window_a):
            for offsetb, bb in enumerate(window_b):
                if aa == bb:
                    ix_a = ix_a + offseta
                    ix_b = ix_b + offsetb
                    found_match = True
                    break
            if found_match:
                break
        if not found_match:
            # Give up on this essay; report where the alignment broke down.
            print("Failed: " + ename, a, b, ix_a, len(wds1), ix_b, len(wds2))
            failed_cnt +=1
            break
failed_cnt

0

In [50]:
# Print the parsed essay one sentence per line, prefixed by the sentence index.
e = essay2parsed["EBA1415_KYNS_3_CB_ES-05384"]
for sent_ix, sent in enumerate(e.sentences):
    wds = [wd for wd, tags in sent]
    print(sent_ix, " ".join(wds))

0 a coral is a living thing that lives in the ocean , it known for their bright color .
1 the reason why they call it coral bleaching is because when it bleached it loses it color and becomes plain white stated in the article " background : what is " coral bleaching " . "
2 there different explanations to what leads to differences in the rates of coral bleaching .
3 one of these reasons is due to the trade wind happening in the ocean .
4 when the trade winds reverse it cause the water temperature to change .
5 due to this movement regions start to swell causing the sea - levels to rise .
6 the article " shifting trade winds " it states that it more affected in the pacific ocean .
7 as stated in the article " what is coral bleaching " saying that the pacific ocean is where coral bleaching is more done .
8 another reason that explains the rate of coral bleaching is the zooxanthellage which has a symbiotic relationship with coral .
9 when the coral happens to be bleached the zooxanthellae

In [49]:
# Print the co-reference tagged version of the same essay, one line per row,
# prefixed by the line index.
e = essay2tagged["EBA1415_KYNS_3_CB_ES-05384"]
for sent_ix, sent in enumerate(e):
    wds = [wd for wd, tags in sent]
    print(sent_ix, " ".join(wds))

0 a coral is a living thing that lives in the ocean , it known for their bright color .
1 the reason why they call it coral bleaching is because when it bleached it loses it color and becomes plain white stated in the article '' background : what is '' coral bleaching '' . ''
2 there different explanations to what leads to differences in the rates of coral bleaching .
3 one of these reasons is due to the trade wind happening in the ocean .
4 when the trade winds reverse it cause the water temperature to change .
5 due to this movement regions start to swell causing the sea - levels to rise .
6 the article '' shifting trade winds '' it states that it more affected in the pacific ocean .
7 as stated in the article '' what is coral bleaching '' saying that the pacific ocean is where coral bleaching is more done .
8 another reason that explains the rate of coral bleaching is the zooxanthellage which has a symbiotic relationship with coral .
9 when the coral happens to be bleached the zooxa

In [37]:
for ename, tagged_essay in essay2tagged.items():
    assert ename in essay2parsed
    essay = essay2parsed[ename]
         
    wds1 = []
    ix2_wd_sent_ix = {}
    ix = 0
    for sent_ix, sent in enumerate(essay.sentences):
        for wd_ix, (wd,tags) in enumerate(sent):
            ix2_wd_sent_ix[len(wds1)] = (sent_ix, wd_ix)
            wds1.append(wd)

    wds2 = []
    for sent in tagged_essay:
        for wd,tag_dict in sent:
            if "COREF_PHRASE" in tag_dict:
                #print("Found")
                wds2.append(wd)
            
    ix_a, ix_b = 0,0