In [1]:
import dill
import pickle
import pathlib
from pprint import pprint
from BrattEssay import load_bratt_essays
from load_data import load_process_essays
from collections import defaultdict
from Settings import Settings
from window_based_tagger_config import get_config
from FindFiles import find_files
from DirUtils import dir_exists
from results_procesor import ResultsProcessor, __MICRO_F1__

In [2]:
EMPTY_TAG = "Empty"
ANAPHORA = "Anaphor"

In [3]:
!python -V

Python 3.6.4 :: Anaconda, Inc.


In [4]:
!pip freeze

absl-py==0.3.0
anaconda-client==1.6.11
appnope==0.1.0
argcomplete==1.9.4
asn1crypto==0.24.0
astor==0.7.1
beautifulsoup4==4.6.0
bleach==2.1.2
boto==2.47.0
boto3==1.5.36
botocore==1.8.50
bz2file==0.98
certifi==2018.1.18
cffi==1.11.4
chardet==3.0.4
clyent==1.2.2
costcla==0.5
cryptography==2.1.4
cycler==0.10.0
cymem==1.31.2
cytoolz==0.8.2
decorator==4.2.1
dicecore==1.13
dill==0.2.8.2
docutils==0.14
entrypoints==0.2.3
ftfy==4.4.3
gast==0.2.0
gensim==0.13.4
grpcio==1.14.0
h5py==2.7.0
hdbscan==0.8.12
html5lib==1.0.1
idna==2.6
ipykernel==4.8.2
ipython==6.2.1
ipython-genutils==0.2.0
ipywidgets==7.1.2
jedi==0.11.1
Jinja2==2.10
jmespath==0.9.3
joblib==0.9.4
jsonschema==2.6.0
jupyter==1.0.0
jupyter-client==5.2.2
jupyter-console==5.2.0
jupyter-core==4.4.0
jupyterlab==0.31.8
jupyterlab-launcher==0.10.5
Keras==1.2.2
Keras-Applications==1.0.4
Keras-Preprocessing==1.0.2
Markdown==2.6.11
MarkupSafe==1.0
matplotlib==2.0.0
mistune==0.8.3
murmurhash==0.26.4
nb-anacondacloud==1.4.0
nb-conda==2.2.1
nb-conda-

In [5]:
DATASET = "CoralBleaching"
#DATASET = "SkinCancer"

settings = Settings()
root_folder = settings.data_directory + DATASET + "/Thesis_Dataset/"
training_folder = root_folder + "Training" + "/"
test_folder = root_folder + "Test" + "/"
training_pickled = settings.data_directory + DATASET + "/Thesis_Dataset/training.pl"

# PREDICTIONS FOLDERS
anaphor_predictions_folder = root_folder + "Predictions/Bi-LSTM-4-Anaphora_Tags-Binary-Fixed/"

# USE the "-Fixed" ones, these are much worse
#anaphor_predictions_folder = root_folder + "Predictions/Bi-LSTM-4-Anaphora_Tags-Binary/"

tag_predictions_folder = root_folder + "Predictions/Bi-LSTM_fixed/"
#tag_predictions_folder = root_folder + "Predictions/Bi-LSTM-4-SEARN/"
merged_predictions_folder = root_folder + "Predictions/CoRef/MergedTags/"
config = get_config(training_folder)

Results Dir: /Users/simon.hughes/Google Drive/Phd/Results/
Data Dir:    /Users/simon.hughes/Google Drive/Phd/Data/
Root Dir:    /Users/simon.hughes/GitHub/NlpResearch/
Public Data: /Users/simon.hughes/GitHub/NlpResearch/Data/PublicDatasets/


In [6]:
# ensure paths exist
pathlib.Path(merged_predictions_folder).mkdir(parents=True, exist_ok=True) 

assert dir_exists(merged_predictions_folder)
print("Valid")

Valid


In [7]:
assert dir_exists(anaphor_predictions_folder)
print("Valid")

Valid


In [8]:
tag_predictions_folder

'/Users/simon.hughes/Google Drive/Phd/Data/CoralBleaching/Thesis_Dataset/Predictions/Bi-LSTM_fixed/'

In [9]:
assert dir_exists(tag_predictions_folder)
print("Valid")

Valid


In [10]:
# get tagged essays
def load_tagged_essays(folder, pattern):
    files = find_files(folder, pattern)
    # multiple runs with different hidden layer sizes?
    if len(files) > 2:        
        for f in files:
            print(f)
    assert len(files) == 2, "Wrong number of tagged files:" + str(len(files))
    for f in files:
        assert "_train_" in f or "_test_" in f, "Wrong files matched"
    train_tagged_fname = [f for file in files if "_train_" in f][0]
    test_tagged_fname = list(set(files).difference([train_tagged_fname]))[0]
    print("Train: {fname}".format(fname=train_tagged_fname))
    print("Test:  {fname}".format(fname=test_tagged_fname))

    # NOTE - is this throws an error, upgrade to dill 2.8.2. Version 2.6 had a bug in it
    with open(train_tagged_fname, "rb") as f:
        tagged_essays_train = dill.load(f)
    with open(test_tagged_fname, "rb") as f:
        tagged_essays_test  = dill.load(f)
    return (tagged_essays_train, tagged_essays_test)

In [11]:
# for optimal vd params - see http://localhost:8888/notebooks/Mongo%20Queries/Query%20Model%20Hyper%20Parameter%20Tuning%20Results-Anaphora%20Tagger.ipynb#
# NOTE that 1 layer is optimal for SC, 2 for CB
pattern = "essays_.*_bi_directional-True_hidden_size-256_merge_mode-sum_num_rnns-2_use_pretrained_embedding-True.dill"
ana_tagged_tr, ana_tagged_test = load_tagged_essays(anaphor_predictions_folder, pattern=pattern)
len(ana_tagged_tr), len(ana_tagged_test)

Train: /Users/simon.hughes/Google Drive/Phd/Data/CoralBleaching/Thesis_Dataset/Predictions/Bi-LSTM-4-Anaphora_Tags-Binary-Fixed/essays_train_bi_directional-True_hidden_size-256_merge_mode-sum_num_rnns-2_use_pretrained_embedding-True.dill
Test:  /Users/simon.hughes/Google Drive/Phd/Data/CoralBleaching/Thesis_Dataset/Predictions/Bi-LSTM-4-Anaphora_Tags-Binary-Fixed/essays_test_bi_directional-True_hidden_size-256_merge_mode-sum_num_rnns-2_use_pretrained_embedding-True.dill


(902, 226)

In [12]:
# load concept code (cc) tagged essays
# 128 scored better - so use these ones!!!
#  - see mongo collection - metrics_codes.STORE_RESULTS_CB_TAGGING_VD_RNN_MOST_COMMON_TAG
pattern = "essays_.*_bi_directional-True_hidden_size-128_merge_mode-sum_num_rnns-2_use_pretrained_embedding-True.dill"
cc_tagged_tr, cc_tagged_test = load_tagged_essays(tag_predictions_folder, pattern=pattern)
len(cc_tagged_tr), len(cc_tagged_test)

Train: /Users/simon.hughes/Google Drive/Phd/Data/CoralBleaching/Thesis_Dataset/Predictions/Bi-LSTM_fixed/essays_train_bi_directional-True_hidden_size-128_merge_mode-sum_num_rnns-2_use_pretrained_embedding-True.dill
Test:  /Users/simon.hughes/Google Drive/Phd/Data/CoralBleaching/Thesis_Dataset/Predictions/Bi-LSTM_fixed/essays_test_bi_directional-True_hidden_size-128_merge_mode-sum_num_rnns-2_use_pretrained_embedding-True.dill


(902, 226)

## Load Raw Essays (Untagged)

In [13]:
# do I need to do this? good for validation below, but not needed otherwise
# with open(training_pickled, "rb+") as f:
#     untagged_essays_train = pickle.load(f)

untagged_essays_train = load_process_essays(**config)

test_config = get_config(test_folder)
untagged_essays_test = load_process_essays(**test_config)

len(untagged_essays_train), len(untagged_essays_test) # 902, 226

902 files found
902 essays processed
226 files found
226 essays processed


(902, 226)

In [14]:
config

{'folder': '/Users/simon.hughes/Google Drive/Phd/Data/CoralBleaching/Thesis_Dataset/Training/',
 'include_normal': False,
 'include_vague': True,
 'lower_case': True,
 'min_df': 2,
 'min_sentence_length': 3,
 'remove_infrequent': False,
 'remove_punctuation': False,
 'remove_stop_words': False,
 'replace_nums': True,
 'spelling_correct': True,
 'stem': False,
 'window_size': 9}

## Validate all essay sets are equal and the same

In [15]:
def names_the_same(essay_sets):
    unique_fnames = [] # list of sets of str (fnames)
    for essay_collection in essay_sets:
        names = set()
        for e in essay_collection:
            names.add(e.name)
        unique_fnames.append(names)
    for a in unique_fnames:
        print(len(a))
        for b in unique_fnames:
            assert len(a) == len(b), "lens don't match"
            assert a == b, "don't match"

In [16]:
names_the_same([ana_tagged_tr, cc_tagged_tr, untagged_essays_train])

902
902
902


In [17]:
names_the_same([ana_tagged_test, cc_tagged_test, untagged_essays_test])

226
226
226


In [18]:
def essays_2_hash_map(essays):
    lu = {}
    for e in essays:
        lu[e.name] = e
    return lu

In [19]:
# checks the number of words and sentences are the same for 2 sets of essays
def validate_tagged_essays(essays_a, essays_b, tags_should_match=True):
    # make sure obj is not the same
    assert essays_a != essays_b
    print("Validating", len(essays_a), "essays")
    assert len(essays_a) == len(essays_b), "Lens don't match"
    
    a_hmap = essays_2_hash_map(essays_a)
    b_hmap = essays_2_hash_map(essays_b)
    
    # same essays?
    assert a_hmap.keys() == b_hmap.keys()
    intersect = set(a_hmap.keys()).intersection(b_hmap.keys())
    assert len(intersect) == len(a_hmap.keys())
    assert len(a_hmap.keys()) > 1    
    assert len(a_hmap.keys()) == len(b_hmap.keys())
    
    word_misses = 0
    
    for key, a_essay in a_hmap.items():
        b_essay = b_hmap[key]
        # assert NOT the same obj ref
        assert a_essay != b_essay
        assert len(a_essay.sentences) == len(b_essay.sentences)
        assert len(a_essay.sentences) > 0
        assert len(b_essay.sentences) > 0
        for i in range(len(a_essay.sentences)):
            a_sent = a_essay.sentences[i]
            b_sent = b_essay.sentences[i]
            # the same lists?
            #assert a_sent == b_sent
            assert len(a_sent) == len(b_sent)
            if not len(a_sent) == len(b_sent):
                print(key, "\tsent-ix:", i, "lens", len(a_sent), len(b_sent))
            for wd_ix, (a_wd, a_tags) in enumerate(a_sent):
                b_wd, b_tags = b_sent[wd_ix]
                if a_wd != b_wd:
                    word_misses+=1
                assert a_wd   == b_wd,   \
                    "Words don't match: '{a}' - '{b}', Esssay: {essay} Sent Ix: {i}".format(
                            a=a_wd, b=b_wd, essay=key, i=i)
                
                # SH - Make conditional, as untagged essays contain new anaphora tags
                if tags_should_match:                
                    assert a_tags == b_tags, \
                        "Tags don't match: '{a}' - '{b}', Esssay: {essay} Sent Ix: {i}".format(
                            a=str(a_tags), b=str(b_tags), essay=key, i=i)
                else:
                    intersectn = a_tags.intersection(b_tags)
                    # smaller set should match intersection i.e. be a subset of larger one
                    # will only differ due to new anaphora tags
                    if len(b_tags) <= len(a_tags):
                        assert intersectn == b_tags
                    else:
                        assert intersectn == a_tags
                        
    if word_misses:
        print("Word miss-matches: ", word_misses)
    print("Validation Passed")
    return None

In [20]:
validate_tagged_essays(untagged_essays_train, ana_tagged_tr, tags_should_match=False)
validate_tagged_essays(cc_tagged_tr, ana_tagged_tr)
validate_tagged_essays(untagged_essays_test, ana_tagged_test, tags_should_match=False)
validate_tagged_essays(cc_tagged_test, ana_tagged_test)

Validating 902 essays
Validation Passed
Validating 902 essays
Validation Passed
Validating 226 essays
Validation Passed
Validating 226 essays
Validation Passed


## Merge the Two Sets of Tagged Essays
* The anaphora essays were not tagged with concept codes, and vice versa, so need to merge

In [21]:
def merge_tagged_essays(untagged, tagged_ana, tagged_cc):
    untagged_hmap = essays_2_hash_map(untagged)
    ana_hmap = essays_2_hash_map(tagged_ana)
    cc_hmap = essays_2_hash_map(tagged_cc)
    
    assert ana_hmap.keys() == cc_hmap.keys()
    assert len(ana_hmap.keys()) == len(cc_hmap.keys())
    
    for key, cc_essay in cc_hmap.items():
        ana_essay = ana_hmap[key]
        untag_essay = untagged_hmap[key]
        assert len(ana_essay.sentences) == len(cc_essay.sentences)
        untag_essay.ana_tagged_sentences  = ana_essay.pred_tagged_sentences
        untag_essay.pred_tagged_sentences =  cc_essay.pred_tagged_sentences
        for i in range(len(ana_essay.sentences)):
            ana_sent = ana_essay.sentences[i]
            ana_ptags = ana_essay.pred_tagged_sentences[i]
            cc_sent = ana_essay.sentences[i]
            cc_ptags = cc_essay.pred_tagged_sentences[i]
            assert len(ana_sent) == len(cc_sent)
            assert len(cc_sent) == len(cc_ptags)
    return untagged

# Train
merged_essays_tr   = merge_tagged_essays(
    untagged=untagged_essays_train, 
    tagged_ana=ana_tagged_tr,   
    tagged_cc=cc_tagged_tr)

# Test
merged_essays_test = merge_tagged_essays(
    untagged=untagged_essays_test,
    tagged_ana=ana_tagged_test, 
    tagged_cc=cc_tagged_test)
len(merged_essays_tr), len(merged_essays_test)

(902, 226)

## Re-Compute Accuracy Metrics

In [22]:
e = untagged_essays_train[0]
len(e.sentences),len(e.pred_tagged_sentences),len(e.ana_tagged_sentences)

(4, 4, 4)

In [23]:
e = untagged_essays_test[0]
len(e.sentences),len(e.pred_tagged_sentences),len(e.ana_tagged_sentences)

(3, 3, 3)

In [24]:
def norm_tally(tally):
    total = sum(tally.values())
    norm_tally = {}
    for tag, freq in tally.items():
        norm_tally[tag] = freq/total
    return norm_tally

### Get labels


In [25]:
tally = defaultdict(int)
for e in untagged_essays_train:
    for sent in e.sentences:
        for wd, tags in sent:
            for t in tags:
                tally[t] +=1

lst_all_tags = list(tally.keys())
regular_tags = sorted(set((t for t in lst_all_tags if t[0].isdigit())), key = lambda s: int(s.replace('b','')))
assert EMPTY_TAG not in regular_tags, "Empty tag in list of regular tags"
regular_tags

['1', '2', '3', '4', '5', '5b', '6', '7', '11', '12', '13', '14', '50']

### Compare Label Distributions

In [26]:
cc_tally = dict([k,v] for k,v in tally.items() if k in set(regular_tags))
sorted(norm_tally(cc_tally).items(), key = lambda tpl: tpl[0])

[('1', 0.11969692414083406),
 ('11', 0.02522625454764124),
 ('12', 0.021768543852791724),
 ('13', 0.052316666165549176),
 ('14', 0.06329113924050633),
 ('2', 0.024444511260110047),
 ('3', 0.14014251781472684),
 ('4', 0.05258726961123305),
 ('5', 0.017348687573288432),
 ('50', 0.3345861270633513),
 ('5b', 0.01819056495986049),
 ('6', 0.037403409603415615),
 ('7', 0.09299738416669172)]

In [27]:
ptag_tally = defaultdict(int)
for e in merged_essays_tr:   
    for sent in e.pred_tagged_sentences:
        for tag in sent:
            ptag_tally[tag] +=1
            
norm_ptag_tally = norm_tally(dict([(k,v) for k,v in ptag_tally.items() if k != EMPTY_TAG]))
sorted(norm_ptag_tally.items(), key = lambda tpl: tpl[0])

[('1', 0.12212839935427791),
 ('11', 0.024618154724947224),
 ('12', 0.022010430895318515),
 ('13', 0.0555383087048305),
 ('14', 0.06239910592325841),
 ('2', 0.019713150378740843),
 ('3', 0.1475226623618527),
 ('4', 0.05370669315782938),
 ('5', 0.01067924996895567),
 ('50', 0.3440953681857693),
 ('5b', 0.011921023221159815),
 ('6', 0.038153483173972436),
 ('7', 0.08751396994908729)]

In [28]:
list(ptag_tally.keys())

['Empty',
 '50',
 '4',
 '11',
 '13',
 '3',
 '7',
 '1',
 '6',
 '14',
 '5b',
 '12',
 '2',
 '5']

## Validate Metrics on Concept Codes

### Train

In [47]:
from results_procesor import metrics_to_df
import numpy as np
METRICS_COLS = ["code","f1_score", "precision", "recall", "accuracy", "data_points"]

reg_tr_metrics = ResultsProcessor.compute_mean_metrics_from_tagged_essays(cc_tagged_tr, regular_tags)
m_df = metrics_to_df(reg_tr_metrics)
m_df = m_df[np.isin(m_df["code"], regular_tags + [__MICRO_F1__])][METRICS_COLS]
m_df.sort_values("code")

Unnamed: 0,code,f1_score,precision,recall,accuracy,data_points
5,1,0.846999,0.852059,0.841999,0.991171,137166.0
9,11,0.91299,0.93947,0.887962,0.998965,137166.0
1,12,0.919749,0.929478,0.910221,0.999162,137166.0
0,13,0.77529,0.764673,0.786207,0.994219,137166.0
2,14,0.787363,0.80597,0.769596,0.993621,137166.0
3,2,0.68232,0.777953,0.607626,0.996646,137166.0
10,3,0.808456,0.800715,0.816348,0.986855,137166.0
8,4,0.869215,0.873988,0.864494,0.996683,137166.0
7,5,0.512486,0.686047,0.409012,0.996727,137166.0
6,50,0.889249,0.891014,0.887491,0.982066,137166.0


### Test

In [48]:
reg_test_metrics = ResultsProcessor.compute_mean_metrics_from_tagged_essays(cc_tagged_test, regular_tags)
m_df = metrics_to_df(reg_test_metrics)
m_df = m_df[np.isin(m_df["code"], regular_tags + [__MICRO_F1__])][METRICS_COLS]
m_df.sort_values("code")

Unnamed: 0,code,f1_score,precision,recall,accuracy,data_points
5,1,0.825452,0.822093,0.828839,0.99026,30699.0
9,11,0.915254,0.904306,0.926471,0.99886,30699.0
1,12,0.937198,0.989796,0.889908,0.999577,30699.0
0,13,0.766026,0.773463,0.75873,0.995244,30699.0
2,14,0.729885,0.668421,0.803797,0.993876,30699.0
3,2,0.658228,0.615385,0.707483,0.996482,30699.0
10,3,0.832758,0.857724,0.809204,0.988957,30699.0
8,4,0.84812,0.846847,0.849398,0.99671,30699.0
7,5,0.617512,0.603604,0.632075,0.997296,30699.0
6,50,0.909423,0.910163,0.908685,0.985472,30699.0


## Anaphora

In [41]:
ana_tags = [ANAPHORA]

### Train

In [42]:
reg_tr_metrics = ResultsProcessor.compute_mean_metrics_from_tagged_essays(ana_tagged_tr, ana_tags)
m_df = metrics_to_df(reg_tr_metrics)
m_df = m_df[np.isin(m_df["code"], regular_tags + [__MICRO_F1__])][METRICS_COLS]
m_df.sort_values("code")

Unnamed: 0,code,f1_score,precision,recall,accuracy,data_points
5,MICRO_F1,0.312012,0.332226,0.294118,0.996785,137166.0


### Test

In [49]:
reg_test_metrics = ResultsProcessor.compute_mean_metrics_from_tagged_essays(ana_tagged_test, ana_tags)
m_df = metrics_to_df(reg_test_metrics)
m_df = m_df[np.isin(m_df["code"], regular_tags + [__MICRO_F1__])][METRICS_COLS]
m_df.sort_values("code")

Unnamed: 0,code,f1_score,precision,recall,accuracy,data_points
5,MICRO_F1,0.358209,0.4,0.324324,0.998599,30699.0


# Persist Merged Essays

In [44]:
merged_predictions_folder

'/Users/simon.hughes/Google Drive/Phd/Data/CoralBleaching/Thesis_Dataset/Predictions/CoRef/MergedTags/'

In [45]:
with open("{folder}merged_essays_train.dill".format(folder=merged_predictions_folder), "wb+") as f:
    dill.dump(merged_essays_tr, f)

In [46]:
with open("{folder}merged_essays_test.dill".format(folder=merged_predictions_folder), "wb+") as f:
    dill.dump(merged_essays_test, f)