In [6]:
import dill

#from gensim.models import Word2Vec
from window_based_tagger_config import get_config
from Rpfa import micro_rpfa

import logging
import datetime
import pickle

from CrossValidation import cross_validation
from BrattEssay import load_bratt_essays
from load_data import load_process_essays
from collections import defaultdict
from IterableFP import flatten
from Settings import Settings
from FindFiles import find_files

# Experiment constants. Neither is referenced in the cells visible here —
# presumably consumed further down the notebook; TODO confirm.
CV_FOLDS = 5
DEV_SPLIT = 0.1

# Project-level settings object; the cell output suggests constructing it
# prints the results/data/root directories (NOTE(review): confirm in Settings).
settings = Settings()
# Folders are built by plain string concatenation, so data_directory is
# expected to already end with a path separator.
root_folder = settings.data_directory + "CoralBleaching/Thesis_Dataset/"
predictions_folder = root_folder + "Predictions/Bi-LSTM-4-Anaphora_Tags-Binary-Fixed/"
print("Predictions: " + predictions_folder)

Results Dir: /Users/simon.hughes/Google Drive/Phd/Results/
Data Dir:    /Users/simon.hughes/Google Drive/Phd/Data/
Root Dir:    /Users/simon.hughes/GitHub/NlpResearch/
Public Data: /Users/simon.hughes/GitHub/NlpResearch/Data/PublicDatasets/
Predictions: /Users/simon.hughes/Google Drive/Phd/Data/CoralBleaching/Thesis_Dataset/Predictions/Bi-LSTM-4-Anaphora_Tags-Binary-Fixed/


## Load CoRef Parsed Essays

In [7]:
# Collect the tagged co-reference files for the training essays.
# The pattern is a raw string: "\." is not a valid Python string escape, so the
# non-raw form triggers an invalid-escape warning on newer interpreters. The
# pattern text passed to find_files is unchanged.
coref_folder_train = root_folder + "CoReference/Training"
coref_files_train = find_files(coref_folder_train, r".*\.tagged")
len(coref_files_train)

902

In [8]:
# Collect the tagged co-reference files for the test essays.
# The pattern is a raw string: "\." is not a valid Python string escape, so the
# non-raw form triggers an invalid-escape warning on newer interpreters. The
# pattern text passed to find_files is unchanged.
coref_folder_test = root_folder + "CoReference/Test"
coref_files_test = find_files(coref_folder_test, r".*\.tagged")
len(coref_files_test)

226

In [9]:
def parse_stanfordnlp_tagged_essays(coref_files):
    """Parse tagged essay files into a lookup of tagged sentences.

    Each file is read line by line; every non-blank line is one sentence made
    of space-separated tokens of the form ``word->KEY:VAL|||KEY:VAL``.

    Normalisation applied to words:
      * ``-lrb-`` / ``-rrb-`` become ``(`` / ``)``, and ``''`` becomes ``"``.
      * ``...`` is expanded into three separate ``.`` words, each carrying its
        own copy of the token's tags.

    Blank lines and empty tokens (from repeated spaces) are skipped rather than
    failing with an unpacking error.

    Args:
        coref_files: iterable of paths to the tagged files.

    Returns:
        dict mapping the essay name (file base name up to its first ".") to a
        list of sentences, each sentence being a list of ``(word, tag_dict)``
        tuples.

    Raises:
        ValueError: if a token does not have exactly one "->" separator, or a
            tag is not of the form ``KEY:VAL``.
    """
    # Local import: the notebook's import cell does not bring in os.
    import os

    DELIM = "->"
    DELIM_TAG = "|||"
    # Escaped tokens that are mapped back to their literal character.
    WORD_MAP = {"-lrb-": "(", "-rrb-": ")", "''": "\""}

    essay2tagged = {}
    for fname in coref_files:
        with open(fname) as f:
            lines = f.readlines()

        tagged_lines = []
        for line_no, line in enumerate(lines, 1):
            line = line.strip()
            if not line:
                # A blank line has no tokens to parse.
                continue

            tagged_words = []
            for t_token in line.split(" "):
                if not t_token:
                    # Consecutive spaces produce empty tokens.
                    continue

                parts = t_token.split(DELIM)
                if len(parts) != 2:
                    raise ValueError(
                        "Malformed token {!r} in {} (line {}): expected word{}tags".format(
                            t_token, fname, line_no, DELIM))
                word, tags = parts
                word = WORD_MAP.get(word, word)

                tag_dict = {}
                for tag in tags.split(DELIM_TAG):
                    if not tag:
                        continue
                    splt = tag.split(":")
                    if len(splt) != 2:
                        raise ValueError(
                            "Malformed tag {!r} in token {!r} in {} (line {}): expected KEY:VAL".format(
                                tag, t_token, fname, line_no))
                    key, val = splt
                    tag_dict[key] = val

                if word == "...":
                    # Expand the ellipsis into three periods. Each gets its own
                    # dict so that editing one entry's tags cannot silently
                    # change the other two.
                    for _ in range(3):
                        tagged_words.append((".", dict(tag_dict)))
                else:
                    tagged_words.append((word, tag_dict))
            tagged_lines.append(tagged_words)

        # Key on the file's base name, dropping everything from the first ".".
        essay_name = os.path.basename(fname).split(".")[0]
        essay2tagged[essay_name] = tagged_lines
    return essay2tagged

In [10]:
# Parse every training coref file into a dict: essay name -> list of tagged
# sentences, then report how many essays were parsed.
essay2coref_train = parse_stanfordnlp_tagged_essays(coref_files_train)
print(len(essay2coref_train))

902


## Examine Data Structure

In [19]:
# Pull out one training essay by name and display its first sentence
# (a list of (word, tag_dict) tuples) to illustrate the parsed structure.
e = essay2coref_train["EBA1415_AEKD_4_CB_ES-05568"]
e[0]

[('what', {'NER': 'O', 'POS': 'WDT'}),
 ('leads', {'NER': 'O', 'POS': 'VBZ'}),
 ('to', {'NER': 'O', 'POS': 'TO'}),
 ('differences', {'NER': 'O', 'POS': 'NNS'}),
 ('in', {'NER': 'O', 'POS': 'IN'}),
 ('the', {'NER': 'O', 'POS': 'DT'}),
 ('rates', {'NER': 'O', 'POS': 'NNS'}),
 ('of', {'NER': 'O', 'POS': 'IN'}),
 ('coral', {'COREF_ID': '2', 'NER': 'O', 'POS': 'NN'}),
 ('bleaching', {'COREF_ID': '2', 'NER': 'O', 'POS': 'NN'}),
 ('.', {'NER': 'O', 'POS': '.'})]

### Notes on Data Structure
- Dictionary of essays, keyed by name
- Each essay is a list of sentences
- Each sentence is a list of words
- words are mapped to a tag dict
  - tag dict - contains
    - NER tag (most are O - none)
    - POS tag
    - If a Co-Reference such as an anaphor (mostly pronouns)
      - COREF_PHRASE - phrase referred to by coref
      - COREF_REF - Id of referenced phrase
    - else if it is a phrase that is referenced:
      - COREF_ID - id of the co-reference, referenced in the COREF_REF tag

In [27]:
# Parse every test coref file into a dict: essay name -> list of tagged
# sentences, then report how many essays were parsed.
essay2coref_test = parse_stanfordnlp_tagged_essays(coref_files_test)
print(len(essay2coref_test))

226


## Load Tagged Essays

In [41]:
# get tagged essays
def load_tagged_essays(folder):
    """Load the train and test tagged-essay collections stored as dill files.

    Looks in ``folder`` for files matching ``essay.*.dill``. If more than two
    match (possibly several runs with different hidden layer sizes), the search
    is narrowed to ``essays.*256.*.dill``. Exactly two files must remain: one
    whose path contains ``_train_`` and one whose path contains ``_test_``.

    Args:
        folder: directory to search for the dill files.

    Returns:
        tuple ``(tagged_essays_train, tagged_essays_test)`` as un-pickled from
        the two files.

    Raises:
        AssertionError: if the number of matched files is not two, if a matched
            file is neither a train nor a test file, or if there is not exactly
            one train file.
    """
    files = find_files(folder, "essay.*.dill")
    # multiple runs with different hidden layer sizes?
    if len(files) > 2:
        files = find_files(folder, "essays.*256.*.dill")
    for fname in files:
        print(fname)
    assert len(files) == 2, "Wrong number of tagged files:" + str(len(files))
    for fname in files:
        assert "_train_" in fname or "_test_" in fname, "Wrong files matched"

    # The filter must test the comprehension's own variable. Testing a stale
    # outer loop variable instead selects either every file or none, which can
    # silently swap the train and test sets.
    train_matches = [fname for fname in files if "_train_" in fname]
    assert len(train_matches) == 1, "Wrong number of train files:" + str(len(train_matches))
    train_tagged_fname = train_matches[0]
    test_tagged_fname = [fname for fname in files if fname != train_tagged_fname][0]

    # NOTE - if this throws an error, upgrade to dill 2.8.2. Version 2.6 had a bug in it
    # SECURITY: dill.load (like pickle.load) can execute arbitrary code while
    # un-pickling; only load files from a trusted source.
    with open(train_tagged_fname, "rb") as fh:
        tagged_essays_train = dill.load(fh)
    with open(test_tagged_fname, "rb") as fh:
        tagged_essays_test = dill.load(fh)
    return (tagged_essays_train, tagged_essays_test)

def essays_2_hash_map(essays):
    """Index essays by name with every ".ann" occurrence stripped from the key.

    Args:
        essays: iterable of objects exposing a ``name`` string attribute.

    Returns:
        dict mapping the cleaned name to the essay object. When two essays
        clean to the same key, the later one in ``essays`` wins.
    """
    return {essay.name.replace(".ann", ""): essay for essay in essays}

In [42]:
# Load the tagged train/test essays from the predictions folder and show how
# many essays each collection holds.
ana_tagged_train, ana_tagged_test = load_tagged_essays(predictions_folder)
len(ana_tagged_train), len(ana_tagged_test)

/Users/simon.hughes/Google Drive/Phd/Data/CoralBleaching/Thesis_Dataset/Predictions/Bi-LSTM-4-Anaphora_Tags-Binary-Fixed/essays_test_bi_directional-True_hidden_size-256_merge_mode-sum_num_rnns-2_use_pretrained_embedding-True.dill
/Users/simon.hughes/Google Drive/Phd/Data/CoralBleaching/Thesis_Dataset/Predictions/Bi-LSTM-4-Anaphora_Tags-Binary-Fixed/essays_train_bi_directional-True_hidden_size-256_merge_mode-sum_num_rnns-2_use_pretrained_embedding-True.dill


(902, 226)

In [43]:
# Index the tagged essays by name (with ".ann" removed) so they can be looked
# up with the same keys as the parsed coref essays.
essay2ana_train = essays_2_hash_map(ana_tagged_train)
essay2ana_test  = essays_2_hash_map(ana_tagged_test)

In [44]:
# Sanity check: each split must have as many coref files as tagged essays.
assert len(coref_files_train) == len(ana_tagged_train)
assert len(coref_files_test) == len(ana_tagged_test)

In [58]:
def match_essays(ana_tagged, coref):
    """Assert that each tagged essay has a coref parse with the same sentence count.

    Args:
        ana_tagged: dict of essay name -> essay object exposing ``sentences``.
        coref: dict of essay name -> list of parsed sentences.

    Raises:
        AssertionError: with the essay name if it is missing from ``coref``, or
            with the tuple ``(tagged count, coref count)`` on the first essay
            whose sentence counts differ.
    """
    for essay_name, ana_essay in ana_tagged.items():
        assert essay_name in coref, essay_name
        n_ana_sents = len(ana_essay.sentences)
        n_coref_sents = len(coref[essay_name])
        assert n_ana_sents == n_coref_sents, (n_ana_sents, n_coref_sents)
        
# Check that the tagged and coref-parsed training essays line up sentence by
# sentence. The recorded run failed here with a sentence-count mismatch of
# (15, 14), so the two sources are not yet aligned.
match_essays(essay2ana_train, essay2coref_train)

AssertionError: (15, 14)