In [5]:
import dill
import pandas as pd

from Settings import Settings
from collections import defaultdict
from BrattEssay import ANAPHORA
from window_based_tagger_config import get_config
from results_procesor import ResultsProcessor, __MICRO_F1__

In [28]:
# Load the shared code from the results folder.
# NOTE(review): this is a machine-specific absolute path; the notebook will only run
# unchanged on a machine with the same directory layout.
import sys

results_code_folder = "/Users/simon.hughes/GitHub/NlpResearch/PythonNlpResearch/notebooks/Co-Reference Resolution/Results"
# Guard the append so re-running this cell does not keep adding duplicate sys.path entries
if results_code_folder not in sys.path:
    sys.path.append(results_code_folder)

from results_common import get_essays, validate_essays

In [29]:
# Project settings object; the data directory below is read from it
settings = Settings()

DATASET = "CoralBleaching"  # SkinCancer

# --- Folder layout for the chosen dataset (every folder keeps its trailing slash) ---
root_folder = "{data_dir}{dataset}/Thesis_Dataset/".format(
    data_dir=settings.data_directory, dataset=DATASET)
training_folder = "{root}Training/".format(root=root_folder)
test_folder = "{root}Test/".format(root=root_folder)
rnn_predictions_folder = "{root}Predictions/Bi-LSTM-4-SEARN/".format(root=root_folder)

# --- Tagger configuration and results store ---
config = get_config(training_folder)
results_processor = ResultsProcessor(dbname="metrics_causal_FIXED")

# A second config built from the test folder: the test essays are loaded so that
# metrics are computed over the test CR labels
test_config = get_config(test_folder)

# --- Co-reference predictions, one folder per resolver ---
stanford_coref_predictions_folder = "{root}CoReference/".format(root=root_folder)
berkeley_coref_predictions_folder = "{root}CoReference/Berkeley/".format(root=root_folder)

Results Dir: /Users/simon.hughes/Google Drive/Phd/Results/
Data Dir:    /Users/simon.hughes/Google Drive/Phd/Data/
Root Dir:    /Users/simon.hughes/GitHub/NlpResearch/
Public Data: /Users/simon.hughes/GitHub/NlpResearch/Data/PublicDatasets/


In [30]:
# The Berkeley co-reference output is the one used for the CB dataset ("Berkeley best for CB")
coref_predictions_folder = berkeley_coref_predictions_folder
print("Co-Ref folder:", coref_predictions_folder)

# Load the training partition first, then the test partition, from the same folder
coref_train_essays, coref_test_essays = [
    get_essays(coref_predictions_folder, partition)
    for partition in ("Training", "Test")
]

# Cell output: number of essays in each partition
(len(coref_train_essays), len(coref_test_essays))

Co-Ref folder: /Users/simon.hughes/Google Drive/Phd/Data/CoralBleaching/Thesis_Dataset/CoReference/Berkeley/
Found file /Users/simon.hughes/Google Drive/Phd/Data/CoralBleaching/Thesis_Dataset/CoReference/Berkeley/training_processed.dill
Found file /Users/simon.hughes/Google Drive/Phd/Data/CoralBleaching/Thesis_Dataset/CoReference/Berkeley/test_processed.dill


(902, 226)

In [31]:
# Serialized essays carrying the RNN's predicted tags, one file per partition
train_fname = rnn_predictions_folder + "essays_train_bi_directional-True_hidden_size-256_merge_mode-sum_num_rnns-2_use_pretrained_embedding-True.dill"
test_fname = rnn_predictions_folder + "essays_test_bi_directional-True_hidden_size-256_merge_mode-sum_num_rnns-2_use_pretrained_embedding-True.dill"

# NOTE(review): dill.load can execute arbitrary code while unpickling; only point
# these paths at files produced by this project.
with open(train_fname, "rb") as f:
    pred_tagged_essays_train = dill.load(f)
with open(test_fname, "rb") as f:
    pred_tagged_essays_test = dill.load(f)

# Cell output: number of essays in each partition
(len(pred_tagged_essays_train), len(pred_tagged_essays_test))

(902, 226)

In [55]:
from results_procesor import is_a_regular_code

# Count every lower-cased tag seen in the predicted essays (train + test):
#   reg_tally  - tags accepted by is_a_regular_code
#   crel_tally - tags containing "->", minus those containing any excluded substring
# NOTE(review): the keys stored here are lower-cased, so any consumer that compares
# this filter against raw (un-lowered) tags only matches tags that are already
# lower-case - confirm that is intended.
reg_tally, crel_tally = defaultdict(int), defaultdict(int)
for essay in pred_tagged_essays_train + pred_tagged_essays_test:
    for sentence in essay.sentences:
        for _word, word_tags in sentence:
            for raw_tag in word_tags:
                tag = raw_tag.lower()
                if is_a_regular_code(tag):
                    reg_tally[tag] += 1
                # Independent of the check above: a tag may be counted in both tallies
                if "->" in tag and not any(
                        marker in tag for marker in ("ana", "other", "rhet", "change")):
                    crel_tally[tag] += 1

# Alphabetically ordered tag names of each kind
reg_tags = sorted(reg_tally)
crel_tags = sorted(crel_tally)

# Union of both tag kinds, used as the tag filter when validating essays
cc_crel_tags_filter = set(reg_tags).union(crel_tags)

In [37]:
def names_the_same(essay_sets):
    """Assert that every essay collection contains exactly the same set of essay names.

    Args:
        essay_sets: iterable of essay collections; each essay must expose a ``name``.

    Prints the number of unique names of each collection as it is checked.

    Raises:
        AssertionError: "lens don't match" when two collections have a different
            number of unique names, "don't match" when the names themselves differ.
    """
    # One set of names per collection, kept in the order the collections were passed
    name_sets = [{essay.name for essay in collection} for collection in essay_sets]

    # Compare every collection against every other one (including itself)
    for current in name_sets:
        print(len(current))
        for other in name_sets:
            assert len(current) == len(other), "lens don't match"
            assert current == other, "don't match"

In [38]:
def essays_2_hash_map(essays):
    """Index essays by their ``name`` attribute.

    Args:
        essays: iterable of essay objects exposing ``name``.

    Returns:
        dict mapping essay name -> essay. When several essays share a name,
        the last one in iteration order is kept.
    """
    return {essay.name: essay for essay in essays}

In [44]:
# checks the number of words and sentences are the same for 2 sets of essays
def validate_tagged_essays(essays_a, essays_b, tags_filter):
    """Assert that two essay collections are distinct objects with matching content.

    Essays are paired by ``name``. For every pair, the number of sentences, the
    number of tokens per sentence, every word, and the tags of every word
    (restricted to ``tags_filter``) must agree.

    Args:
        essays_a: sequence of essays; each exposes ``name`` and ``sentences``,
            where a sentence is a sequence of ``(word, tags)`` pairs.
        essays_b: second sequence of essays with the same structure.
        tags_filter: set of tags to compare; tags outside this set are ignored
            (e.g. the untagged essays contain new anaphora tags).

    Returns:
        None. Prints "Validating <n> essays" at the start and "Validation Passed"
        once every check has succeeded.

    Raises:
        AssertionError: on the first discrepancy found; the message identifies
            the essay and sentence index where one is available.
    """
    # Identity, not equality: the point is that the caller did not pass the same
    # collection twice. `!=` would compare contents and wrongly reject two
    # separately loaded collections whose essays compare equal.
    assert essays_a is not essays_b, "essays_a and essays_b are the same object"
    print("Validating", len(essays_a), "essays")
    assert len(essays_a) == len(essays_b), "Lens don't match"

    # Index both collections by essay name so that essays can be paired up
    a_hmap = {essay.name: essay for essay in essays_a}
    b_hmap = {essay.name: essay for essay in essays_b}

    # same essays?
    assert a_hmap.keys() == b_hmap.keys(), "Essay names don't match"
    assert len(a_hmap) > 1

    for key, a_essay in a_hmap.items():
        b_essay = b_hmap[key]
        # assert NOT the same obj ref
        assert a_essay is not b_essay, \
            "Essay '{essay}' is the same object in both collections".format(essay=key)
        assert len(a_essay.sentences) == len(b_essay.sentences)
        assert len(a_essay.sentences) > 0

        for i, (a_sent, b_sent) in enumerate(zip(a_essay.sentences, b_essay.sentences)):
            # The diagnostic details live in the assertion message; a print placed
            # after the assert could never run.
            assert len(a_sent) == len(b_sent), \
                "Sentence lengths don't match: {a} - {b}, Essay: {essay} Sent Ix: {i}".format(
                    a=len(a_sent), b=len(b_sent), essay=key, i=i)

            for (a_wd, a_tags), (b_wd, b_tags) in zip(a_sent, b_sent):
                assert a_wd == b_wd, \
                    "Words don't match: '{a}' - '{b}', Esssay: {essay} Sent Ix: {i}".format(
                        a=a_wd, b=b_wd, essay=key, i=i)

                # SH - Make conditional, as untagged essays contain new anaphora tags
                filtered_a_tags = tags_filter.intersection(a_tags)
                filtered_b_tags = tags_filter.intersection(b_tags)

                assert filtered_a_tags == filtered_b_tags, \
                    "Tags don't match: '{a}' - '{b}', Esssay: {essay} Sent Ix: {i}".format(
                        a=str(a_tags), b=str(b_tags), essay=key, i=i)

    print("Validation Passed")
    return None

In [35]:
# Sanity check: the co-reference and RNN-predicted training sets must contain the same essay names
names_the_same([coref_train_essays, pred_tagged_essays_train])

902
902


In [36]:
# Sanity check: the co-reference and RNN-predicted test sets must contain the same essay names
names_the_same([coref_test_essays, pred_tagged_essays_test])

226
226


In [56]:
# Compare the training essays word by word, checking only the tags in cc_crel_tags_filter
validate_tagged_essays(essays_a=coref_train_essays, essays_b=pred_tagged_essays_train,
                       tags_filter=cc_crel_tags_filter)

Validating 902 essays
Validation Passed


In [57]:
# Compare the test essays word by word, checking only the tags in cc_crel_tags_filter
validate_tagged_essays(essays_a=coref_test_essays, essays_b=pred_tagged_essays_test,
                       tags_filter=cc_crel_tags_filter)

Validating 226 essays
Validation Passed
