In [1]:
import json
import glob

# Collect the overall dev and test accuracy of every trained run.
# Both dicts are keyed first by task name and then by the run directory path
# exactly as returned by glob (e.g. "./model/neural_crf_pos").
dev_accuracies = {}
test_accuracies = {}
for task in ["ner", "pos"]:
    base_name = f"neural_crf_{task}"
    dev_accuracies[task] = {}
    test_accuracies[task] = {}

    # glob.glob returns matches in arbitrary order; sorting keeps the dict
    # insertion order (and therefore any printed listing) reproducible.
    for run_dir in sorted(glob.glob(f"./model/{base_name}*")):
        # Context managers close each metrics file as soon as it is parsed
        # instead of leaving the handle open until garbage collection.
        with open(f"{run_dir}/best_metrics.json", encoding="utf-8") as dev_file:
            dev = json.load(dev_file)
        with open(f"{run_dir}/test_scores.json", encoding="utf-8") as test_file:
            test = json.load(test_file)
        dev_accuracies[task][run_dir] = dev["validation_accuracy"]
        test_accuracies[task][run_dir] = test["accuracy"]

In [2]:
# Compare the baseline POS run against its "_GloVe" counterpart on dev and test.
task = "pos"
for run_dir in dev_accuracies[task]:
    # Skip every run whose path contains "h" or that already ends in "GloVe".
    if "h" in run_dir or run_dir.endswith("GloVe"):
        continue
    glove_dir = run_dir + "_GloVe"

    plain_dev = dev_accuracies[task][run_dir]
    glove_dev = dev_accuracies[task][glove_dir]
    print("dev", run_dir, plain_dev, glove_dev, glove_dev - plain_dev)

    plain_test = test_accuracies[task][run_dir]
    glove_test = test_accuracies[task][glove_dir]
    print("test", run_dir, plain_test, glove_test, glove_test - plain_test)

dev ./model/neural_crf_pos 0.7463042441583214 0.7973295183595612 0.05102527420123981
test ./model/neural_crf_pos 0.7427149964463398 0.8057924662402275 0.06307746979388773


In [4]:
# Report dev/test accuracy of the POS "h30_l2" run(s) without GloVe.
task = "pos"
for run_dir, dev_score in dev_accuracies[task].items():
    # Keep only paths that mention "h30_l2" and do not mention "GloVe".
    if "GloVe" in run_dir or "h30_l2" not in run_dir:
        continue
    test_score = test_accuracies[task][run_dir]
    print(f"{task} w/o GloVe", run_dir, dev_score, test_score)

pos w/o GloVe ./model/neural_crf_pos_h30_l2 0.7830233667143538 0.7796730632551528


In [6]:
# Report dev/test accuracy of the POS "h25_l1" run(s) with GloVe.
task = "pos"
required_tags = ("h25_l1", "GloVe")
# A run is selected when its path contains every required tag.
selected_runs = [
    run for run in dev_accuracies[task]
    if all(tag in run for tag in required_tags)
]
for run_dir in selected_runs:
    print(
        f"{task} w/ GloVe",
        run_dir,
        dev_accuracies[task][run_dir],
        test_accuracies[task][run_dir],
    )

pos w/ GloVe ./model/neural_crf_pos_h25_l1_GloVe 0.8521697663328565 0.8541222459132907


In [7]:
# Compare the baseline NER run against its "_GloVe" counterpart on dev and test.
task = "ner"
for run_dir in dev_accuracies[task]:
    # Baseline means: no "h" anywhere in the path and no trailing "GloVe".
    is_baseline = "h" not in run_dir and not run_dir.endswith("GloVe")
    if not is_baseline:
        continue
    # Same report for both splits: plain score, GloVe score, and their difference.
    for split, scores in (("dev", dev_accuracies), ("test", test_accuracies)):
        plain = scores[task][run_dir]
        with_glove = scores[task][run_dir + "_GloVe"]
        print(split, run_dir, plain, with_glove, with_glove - plain)

dev ./model/neural_crf_ner 0.9488586861822618 0.9445896497647673 -0.004269036417494432
test ./model/neural_crf_ner 0.9031762460782095 0.8847721318368535 -0.018404114241355907


In [9]:
# Report dev/test accuracy of the NER "h25_l1" run(s) without GloVe.
task = "ner"
dev_by_run = dev_accuracies[task]
for run_dir in dev_by_run:
    wanted = "h25_l1" in run_dir
    uses_glove = "GloVe" in run_dir
    if wanted and not uses_glove:
        dev_score = dev_by_run[run_dir]
        test_score = test_accuracies[task][run_dir]
        print(f"{task} w/o GloVe", run_dir, dev_score, test_score)

ner w/o GloVe ./model/neural_crf_ner_h25_l1 0.9474647151071616 0.9031762460782095


In [11]:
# Report dev/test accuracy of the NER "h20_l1" run(s) with GloVe.
task = "ner"
label = f"{task} w/ GloVe"
for run_dir, dev_score in dev_accuracies[task].items():
    # Skip anything that is not both an "h20_l1" run and a "GloVe" run.
    if "h20_l1" not in run_dir or "GloVe" not in run_dir:
        continue
    print(label, run_dir, dev_score, test_accuracies[task][run_dir])

ner w/ GloVe ./model/neural_crf_ner_h20_l1_GloVe 0.9526049834465935 0.8976776530711259


# Per-tag validation accuracies that changed the most between the baseline and GloVe models

In [12]:
import json
import glob

task = "pos"

# Run directories being compared: the baseline run and the "_h25_l1_GloVe" run.
# Paths and per-label scores get distinct names so a path variable is never
# silently rebound to a float inside the loop.
orig_dir = f"./model/neural_crf_{task}"
best_dir = f"{orig_dir}_h25_l1_GloVe"

# Context managers close the metrics files right after they are parsed.
with open(f"{orig_dir}/best_metrics.json", encoding="utf-8") as metrics_file:
    orig_dev = json.load(metrics_file)
with open(f"{best_dir}/best_metrics.json", encoding="utf-8") as metrics_file:
    best_dev = json.load(metrics_file)

orig_per_label = orig_dev["validation_accuracy_per_label"]
best_per_label = best_dev["validation_accuracy_per_label"]

# One tuple per label: (delta, baseline accuracy, compared-run accuracy, label),
# sorted ascending so the largest regressions come first and the largest gains last.
diff = sorted(
    (
        best_per_label[label] - orig_per_label[label],
        orig_per_label[label],
        best_per_label[label],
        label,
    )
    for label in orig_per_label
)
diff

[(-0.01754385964912286, 0.8596491228070176, 0.8421052631578947, 'PRT'),
 (-0.0066225165562914245, 0.9205298013245033, 0.9139072847682119, 'ADP'),
 (0, 0, 0, '<pad>'),
 (0.0, 0.9230769230769231, 0.9230769230769231, 'DET'),
 (0.0, 0.9285714285714286, 0.9285714285714286, 'CONJ'),
 (0.005208333333333259, 0.9427083333333334, 0.9479166666666666, 'PRON'),
 (0.028112449799196804, 0.927710843373494, 0.9558232931726908, '.'),
 (0.10504201680672276, 0.7016806722689075, 0.8067226890756303, 'NOUN'),
 (0.13953488372093015, 0.5813953488372093, 0.7209302325581395, 'ADV'),
 (0.16991643454038996, 0.6880222841225627, 0.8579387186629527, 'VERB'),
 (0.22222222222222215, 0.37373737373737376, 0.5959595959595959, 'ADJ'),
 (0.2793296089385475, 0.5698324022346368, 0.8491620111731844, 'X'),
 (0.4411764705882352, 0.3235294117647059, 0.7647058823529411, 'NUM')]

In [15]:
import json
import glob

task = "ner"

# Run directories being compared: the baseline run and the plain "_GloVe" run.
# To compare against the "_h20_l1_GloVe" run instead, set
# best_dir = f"{orig_dir}_h20_l1_GloVe".
# Paths and per-label scores get distinct names so a path variable is never
# silently rebound to a float inside the loop.
orig_dir = f"./model/neural_crf_{task}"
best_dir = f"{orig_dir}_GloVe"

# Context managers close the metrics files right after they are parsed.
with open(f"{orig_dir}/best_metrics.json", encoding="utf-8") as metrics_file:
    orig_dev = json.load(metrics_file)
with open(f"{best_dir}/best_metrics.json", encoding="utf-8") as metrics_file:
    best_dev = json.load(metrics_file)

orig_per_label = orig_dev["validation_accuracy_per_label"]
best_per_label = best_dev["validation_accuracy_per_label"]

# One tuple per label: (delta, baseline accuracy, compared-run accuracy, label),
# sorted ascending so the largest regressions come first and the largest gains last.
diff = sorted(
    (
        best_per_label[label] - orig_per_label[label],
        orig_per_label[label],
        best_per_label[label],
        label,
    )
    for label in orig_per_label
)
diff

[(-0.07142857142857142, 0.07142857142857142, 0.0, 'B-facility'),
 (-0.047619047619047616, 0.09523809523809523, 0.047619047619047616, 'B-other'),
 (-0.012710693561757469, 0.9956709956709957, 0.9829603021092382, 'O'),
 (-0.012345679012345678, 0.04938271604938271, 0.037037037037037035, 'I-other'),
 (0, 0, 0, '<pad>'),
 (0.0, 0.0, 0.0, 'B-movie'),
 (0.0, 0.0, 0.0, 'B-musicartist'),
 (0.0, 0.0, 0.0, 'B-tvshow'),
 (0.0, 0.0, 0.0, 'I-company'),
 (0.0, 0.0, 0.0, 'I-movie'),
 (0.0, 0.0, 0.0, 'I-musicartist'),
 (0.0, 0.0, 0.0, 'I-product'),
 (0.0, 0.0, 0.0, 'I-sportsteam'),
 (0.0, 0.0, 0.0, 'I-tvshow'),
 (0.0, 0.13793103448275862, 0.13793103448275862, 'I-facility'),
 (0.07142857142857142, 0.0, 0.07142857142857142, 'I-geo-loc'),
 (0.1111111111111111, 0.3055555555555556, 0.4166666666666667, 'B-company'),
 (0.15789473684210525, 0.05263157894736842, 0.21052631578947367, 'B-product'),
 (0.18181818181818182, 0.0, 0.18181818181818182, 'B-sportsteam'),
 (0.19480519480519481, 0.15584415584415584, 0.35064