In [1]:
from collections import defaultdict

import dill
import numpy as np

from Settings import Settings
from crel_helper import get_cr_tags
from load_data import load_process_essays
from window_based_tagger_config import get_config

In [4]:
# Data Set Partition
CV_FOLDS = 5
MIN_FEAT_FREQ = 5

# Global settings

settings = Settings()
root_folder = settings.data_directory + "CoralBleaching/Thesis_Dataset/"
training_folder = root_folder + "Training" + "/"
test_folder = root_folder + "Test" + "/"
# Built from root_folder so the dataset location is spelled out in exactly one place.
training_pickled = root_folder + "training.pl"
# NOTE: These predictions are generated from the "./notebooks/SEARN/Keras - Train Tagger and Save CV Predictions For Word Tags.ipynb" notebook
# used as inputs to parsing model
rnn_predictions_folder = root_folder + "Predictions/Bi-LSTM-4-SEARN/"

config = get_config(training_folder)


def _load_dill(fname):
    """Deserialize and return the object stored in the dill file at `fname`.

    SECURITY: like pickle, dill can execute arbitrary code while loading.
    Only point this at prediction files you generated yourself.
    """
    with open(fname, "rb") as f:
        return dill.load(f)


train_fname = rnn_predictions_folder + "essays_train_bi_directional-True_hidden_size-256_merge_mode-sum_num_rnns-2_use_pretrained_embedding-True.dill"
pred_tagged_essays_train = _load_dill(train_fname)

test_fname = rnn_predictions_folder + "essays_test_bi_directional-True_hidden_size-256_merge_mode-sum_num_rnns-2_use_pretrained_embedding-True.dill"
pred_tagged_essays_test = _load_dill(test_fname)

# Tag list is derived from both partitions; the set form is used for fast membership/intersection.
cr_tags = get_cr_tags(train_tagged_essays=pred_tagged_essays_train, tag_essays_test=pred_tagged_essays_test)
set_cr_tags = set(cr_tags)

Results Dir: /Users/simon.hughes/Google Drive/Phd/Results/
Data Dir:    /Users/simon.hughes/Google Drive/Phd/Data/
Root Dir:    /Users/simon.hughes/GitHub/NlpResearch/
Public Data: /Users/simon.hughes/GitHub/NlpResearch/Data/PublicDatasets/


In [5]:
# Pool the train and test predictions so the cells below compute statistics
# over every essay, then display how many essays that is.
all_essays = pred_tagged_essays_train + pred_tagged_essays_test
len(all_essays)

1128

In [7]:
# Number of distinct tags returned by get_cr_tags for the train + test essays.
len(set_cr_tags)

86

In [8]:
def essay_to_crels(tagged_essays, cr_tags=None):
    """Collect, for each essay, the set of its tags that belong to an allowed tag set.

    Args:
        tagged_essays: iterable of essay objects exposing `.name` and
            `.sentences`, where each sentence is an iterable of
            `(word, tags)` pairs and `tags` is an iterable of tag strings.
        cr_tags: optional iterable of tags to keep. When omitted, the
            notebook-level `set_cr_tags` is used, which preserves the
            original single-argument behaviour.

    Returns:
        A plain dict mapping essay name -> set of matching tags (an empty set
        when the essay has none). If two essays share a name, the later one
        replaces the earlier one.
    """
    # Resolve the tag set once, up front, instead of reaching for the global
    # inside the innermost loop; copying also accepts lists/tuples of tags.
    allowed_tags = set(set_cr_tags if cr_tags is None else cr_tags)

    name2crels = {}
    for essay in tagged_essays:
        unique_cr_tags = set()
        for sentence in essay.sentences:
            for _word, tags in sentence:
                unique_cr_tags.update(allowed_tags.intersection(tags))
        name2crels[essay.name] = unique_cr_tags
    return name2crels

In [9]:
# Essay name -> set of cr tags found in that essay, over train + test combined.
e2crels = essay_to_crels(all_essays)

In [10]:
# Number of distinct cr tags per essay, in e2crels iteration order.
lens = list(map(len, e2crels.values()))

In [22]:
# Summary of per-essay cr-tag counts: (total, mean, median, max, min).
tuple(reduce_fn(lens) for reduce_fn in (np.sum, np.mean, np.median, np.max, np.min))

(3066, 2.7180851063829787, 2.0, 13, 0)

In [23]:
def _regular_codes(essay):
    """Return the distinct tags in `essay` that start with a digit and contain no "->".

    `essay.sentences` is iterated as sentences of `(word, tags)` pairs; the
    tags of every word are pooled before filtering.
    """
    unique_codes = set()
    for sent in essay.sentences:
        for _word, tags in sent:
            unique_codes.update(tags)
    # t[:1] instead of t[0]: an empty tag string is skipped rather than
    # raising IndexError.
    return {t for t in unique_codes if t[:1].isdigit() and "->" not in t}


# all_codes: every such tag seen in any essay; counts: how many each essay has.
all_codes = set()
counts = []
for essay in all_essays:
    essay_codes = _regular_codes(essay)
    all_codes.update(essay_codes)
    counts.append(len(essay_codes))
# Summary of per-essay counts: (total, mean, median, max, min).
np.sum(counts), np.mean(counts), np.median(counts), np.max(counts), np.min(counts)

(4531, 4.016843971631205, 4.0, 11, 0)

In [16]:
# Display the distinct digit-prefixed, non-"->" tags collected over all essays.
all_codes

{'1', '11', '12', '13', '14', '2', '3', '4', '5', '50', '5b', '6', '7'}

In [15]:
# How many distinct such tags there are.
len(all_codes)

13

In [24]:
# Dump one line per essay that has at least one cr tag: the sorted tag list,
# rendered with str(). Essays with no cr tags are left out of the file.
with open("crels.txt", "w+") as f:
    f.writelines(
        str(sorted(essay_crels)) + "\n"
        for essay_crels in e2crels.values()
        if essay_crels
    )