In [2]:
import dill

#from gensim.models import Word2Vec
from window_based_tagger_config import get_config
from Rpfa import micro_rpfa

import logging
import datetime
import pickle

from CoRefHelper import parse_stanfordnlp_tagged_essays
from CrossValidation import cross_validation
from BrattEssay import load_bratt_essays
from load_data import load_process_essays
from collections import defaultdict
from IterableFP import flatten
from Settings import Settings
from FindFiles import find_files

# Cross-validation fold count and (presumably) dev-set fraction - neither is used in the cells shown here.
CV_FOLDS = 5
DEV_SPLIT = 0.1

# Project settings object; it supplies the base data directory used below.
settings = Settings()
# Paths are built by plain string concatenation, so this relies on
# settings.data_directory (and every folder literal) ending in "/".
root_folder = settings.data_directory + "CoralBleaching/Thesis_Dataset/"
# Folder holding the tagged-essay prediction files loaded later in the notebook.
predictions_folder = root_folder + "Predictions/Bi-LSTM-4-Anaphora_Tags-Binary-Fixed/"
print("Predictions: " + predictions_folder)

Results Dir: /Users/simon.hughes/Google Drive/Phd/Results/
Data Dir:    /Users/simon.hughes/Google Drive/Phd/Data/
Root Dir:    /Users/simon.hughes/GitHub/NlpResearch/
Public Data: /Users/simon.hughes/GitHub/NlpResearch/Data/PublicDatasets/
Predictions: /Users/simon.hughes/Google Drive/Phd/Data/CoralBleaching/Thesis_Dataset/Predictions/Bi-LSTM-4-Anaphora_Tags-Binary-Fixed/


## Load CoRef Parsed Essays

In [3]:
# Locate the co-reference tagged files for the training essays.
coref_folder_train = root_folder + "CoReference/Training"
# Raw string: "\." inside a plain string literal is an invalid escape sequence
# (it warns on modern Python); the pattern value itself is unchanged.
coref_files_train = find_files(coref_folder_train, r".*\.tagged")
len(coref_files_train)

902

In [4]:
# Locate the co-reference tagged files for the test essays.
coref_folder_test = root_folder + "CoReference/Test"
# Raw string: "\." inside a plain string literal is an invalid escape sequence
# (it warns on modern Python); the pattern value itself is unchanged.
coref_files_test = find_files(coref_folder_test, r".*\.tagged")
len(coref_files_test)

226

In [5]:
# Parse the co-reference tagged training files; the result is looked up by essay name below.
essay2coref_train = parse_stanfordnlp_tagged_essays(coref_files_train)
# Sanity check: the count should equal the number of training files found above.
print(len(essay2coref_train))

902


## Examine Data Structure

In [6]:
# Pull one parsed essay by name and display its first sentence to inspect the structure.
e = essay2coref_train["EBA1415_AEKD_4_CB_ES-05568"]
e[0]

[('what', {'NER': 'O', 'POS': 'WDT'}),
 ('leads', {'NER': 'O', 'POS': 'VBZ'}),
 ('to', {'NER': 'O', 'POS': 'TO'}),
 ('differences', {'NER': 'O', 'POS': 'NNS'}),
 ('in', {'NER': 'O', 'POS': 'IN'}),
 ('the', {'NER': 'O', 'POS': 'DT'}),
 ('rates', {'NER': 'O', 'POS': 'NNS'}),
 ('of', {'NER': 'O', 'POS': 'IN'}),
 ('coral', {'COREF_ID': '2', 'NER': 'O', 'POS': 'NN'}),
 ('bleaching', {'COREF_ID': '2', 'NER': 'O', 'POS': 'NN'}),
 ('.', {'NER': 'O', 'POS': '.'})]

### Notes on Datastructure
- Dictionary of essays, keyed by name
- Each essay is a list of sentences
- Each sentence is a list of words
- words are mapped to a tag dict
  - tag dict - contains
    - NER tag (most are O - none)
    - POS tag
    - If a Co-Reference such as an anaphor (mostly pronouns)
      - COREF_PHRASE - phrase referred to by coref
      - COREF_REF - Id of referenced phrase
    - else if it is a phrase that is referenced:
      - COREF_ID - id of the co-reference, referenced in the COREF_REF tag

In [7]:
# Same parsing step for the test-set co-reference files.
essay2coref_test = parse_stanfordnlp_tagged_essays(coref_files_test)
# Sanity check: the count should equal the number of test files found above.
print(len(essay2coref_test))

226


## Load Anaphora Tagged Essays

In [13]:
predictions_folder

'/Users/simon.hughes/Google Drive/Phd/Data/CoralBleaching/Thesis_Dataset/Predictions/Bi-LSTM-4-Anaphora_Tags-Binary-Fixed/'

In [22]:
# get tagged essays
def load_tagged_essays(folder, pattern):
    """Load the train and test tagged-essay collections stored in `folder`.

    Exactly two files matching `pattern` are expected: one whose path contains
    "_train_" and one whose path contains "_test_".

    Args:
        folder:  directory handed to find_files.
        pattern: file-name pattern handed to find_files
                 (presumably a regex - confirm against find_files).

    Returns:
        Tuple (tagged_essays_train, tagged_essays_test) holding the objects
        deserialized from the train file and the test file respectively.

    Raises:
        AssertionError: if the number of matched files is not 2, if a matched
            file is neither a train nor a test file, or if the two files cannot
            be split into exactly one train file plus one other file.
    """
    files = find_files(folder, pattern)
    # multiple runs with different hidden layer sizes?
    if len(files) > 2:
        # list the offending matches before the assertion below fires
        for f in files:
            print(f)
    assert len(files) == 2, "Wrong number of tagged files:" + str(len(files))
    for f in files:
        assert "_train_" in f or "_test_" in f, "Wrong files matched"

    # Test each candidate itself. The previous comprehension checked the stale
    # loop variable `f` (always the last file), so it only worked when the
    # train file happened to be listed last.
    train_files = [fname for fname in files if "_train_" in fname]
    assert len(train_files) == 1, "Expected exactly one train file, found:" + str(len(train_files))
    train_tagged_fname = train_files[0]
    # with two distinct files and one train file, the remaining one is the test file
    test_tagged_fname = [fname for fname in files if fname != train_tagged_fname][0]
    print("Train: {fname}".format(fname=train_tagged_fname))
    print("Test:  {fname}".format(fname=test_tagged_fname))

    # NOTE - if this throws an error, upgrade to dill 2.8.2. Version 2.6 had a bug in it
    # SECURITY: dill.load (like pickle.load) can execute arbitrary code while
    # deserializing - only point this at prediction files you produced yourself.
    with open(train_tagged_fname, "rb") as f:
        tagged_essays_train = dill.load(f)
    with open(test_tagged_fname, "rb") as f:
        tagged_essays_test  = dill.load(f)
    return (tagged_essays_train, tagged_essays_test)

def essays_2_hash_map(essays):
    """Index essays by name.

    Every occurrence of ".ann" is removed from each essay's `name` to form
    the key. If two essays produce the same key, the later one wins.

    Args:
        essays: iterable of objects exposing a string `name` attribute.

    Returns:
        dict mapping the cleaned name to the essay object.
    """
    # remove the extension while building the lookup in a single pass
    return {essay.name.replace(".ann", ""): essay for essay in essays}

In [28]:
# File-name pattern selecting the train/test prediction files of one specific model configuration.
pattern = "essays_.*_bi_directional-True_hidden_size-256_merge_mode-sum_num_rnns-2_use_pretrained_embedding-True.dill"
ana_tagged_train, ana_tagged_test = load_tagged_essays(predictions_folder, pattern)
print("lens:", len(ana_tagged_train), len(ana_tagged_test))

# Name-keyed lookups so essays can be paired with their co-reference parses.
essay2ana_train = essays_2_hash_map(ana_tagged_train)
essay2ana_test  = essays_2_hash_map(ana_tagged_test)

# The constant was previously spelled `CHECk`, so a failing assertion raised
# NameError instead of reporting this message.
CHECK = "number of total essay files should be the same"
assert len(coref_files_train) == len(ana_tagged_train), CHECK
assert len(coref_files_test) == len(ana_tagged_test),   CHECK

Train: /Users/simon.hughes/Google Drive/Phd/Data/CoralBleaching/Thesis_Dataset/Predictions/Bi-LSTM-4-Anaphora_Tags-Binary-Fixed/essays_train_bi_directional-True_hidden_size-256_merge_mode-sum_num_rnns-2_use_pretrained_embedding-True.dill
Test:  /Users/simon.hughes/Google Drive/Phd/Data/CoralBleaching/Thesis_Dataset/Predictions/Bi-LSTM-4-Anaphora_Tags-Binary-Fixed/essays_test_bi_directional-True_hidden_size-256_merge_mode-sum_num_rnns-2_use_pretrained_embedding-True.dill
lens: 902 226


In [29]:
def match_essays(ana_tagged, coref):
    """Verify that every anaphora-tagged essay lines up with its co-ref parse.

    Args:
        ana_tagged: dict of essay name -> essay object exposing `.sentences`.
        coref:      dict of essay name -> list of sentences.

    Raises:
        AssertionError: carrying the essay name when it is missing from
            `coref`, or (name, n_ana_sentences, n_coref_sentences) when the
            two versions have different sentence counts.
    """
    for ename, ana_essay in ana_tagged.items():
        assert ename in coref, ename
        num_ana_sents = len(ana_essay.sentences)
        num_coref_sents = len(coref[ename])
        assert num_ana_sents == num_coref_sents, (ename, num_ana_sents, num_coref_sents)
        
match_essays(essay2ana_train, essay2coref_train)

AssertionError: ('EBA1415_KYLS_5_CB_ES-05648', 15, 14)

## Why do the Sentences Not Align?

In [30]:
# The essay reported by the failed sentence-count assertion in match_essays.
ename = "EBA1415_KYLS_5_CB_ES-05648"

# Fetch both versions of this essay for a side-by-side comparison.
ana_essay   = essay2ana_train[ename]
coref_essay = essay2coref_train[ename]

In [31]:
len(ana_essay.sentences), len(coref_essay)

(15, 14)

In [32]:
ana_essay.sentences[0]

[('there', set()),
 ('are', set()),
 ('many', set()),
 ('leads', set()),
 ('to', set()),
 ('differences', set()),
 ('in', set()),
 ('the', set()),
 ('rates', set()),
 ('if', set()),
 ('coral', {'50'}),
 ('bleaching', {'50'}),
 ('.', set())]

In [33]:
# Print each sentence of the anaphora-tagged essay, prefixed with its index.
for i, sent in enumerate(ana_essay.sentences):
    # a sentence is a list of (word, tags) pairs; transposing and taking [0] keeps only the words
    words = list(zip(*sent))[0]
    print(str(i) + ":  " + " ".join(words))

0:  there are many leads to differences in the rates if coral bleaching .
1:  the first piece of evidence is " stress can negatively affect the balanced relationship between the coral and zooxanthellae , " ( hsa 0000 ) .
2:  this shows the rate of coral bleaching because with the relationship they both cant live without each other .
3:  the algae gives coral food in which it lives and also the algae gives it color .
4:  it would turn white because the supply are making a lot and put down everything in which the coral is getting a disease .
5:  that how alot of coral reefs start being and soon there will be no coral ans the sea animals would die .
6:  the second pieces of example is " the corals turn white due to ejection of zooxanthellae . "
7:  this show the effects because it will turn white and the coral some times cant relive it self and just stay there to die .
8:  the coral is used as a home protection for small other animals like fish from bigger animals like INFREQUENT , big fi

In [34]:
# Print each sentence of the co-ref parsed essay, prefixed with its index.
for i, sent in enumerate(coref_essay):
    # a sentence is a list of (word, tag dict) pairs; transposing and taking [0] keeps only the words
    words = list(zip(*sent))[0]
    print(str(i) + ":  " + " ".join(words))

0:  there are many leads to differences in the rates if coral bleaching .
1:  the first piece of evidence is " stress can negatively affect the balanced relationship between the coral and zooxanthellae , " ( hsa 0000 ) .
2:  this shows the rate of coral bleaching because with the relationship they both cant live without each other .
3:  the algae gives coral food in which it lives and also the algae gives it color .
4:  it would turn white because the supply are making a lot and put down everything in which the coral is getting a disease .
5:  that how alot of coral reefs start being and soon there will be no coral ans the sea animals would die .
6:  the second pieces of example is " the corals turn white due to ejection of zooxanthellae . "
7:  this show the effects because it will turn white and the coral some times cant relive it self and just stay there to die .
8:  the coral is used as a home protection for small other animals like fish from bigger animals like sheiks , big fish ,

### In the anaphora-tagged essay, sentence 8 is split after the ellipsis; in the co-ref essay it is not