In [1]:
from ner_influence.modelling.datamodule import NERDataModule
import os

# Set before NERDataModule is constructed below.
# NOTE(review): presumably this disables HuggingFace tokenizers' parallelism — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
transformer: str = "google/bigbird-roberta-base"

# One JSONL file per split; all three paths follow the same naming pattern.
data = NERDataModule(
    splits={
        name: f"data/conll_corrected/{name}_original.jsonl"
        for name in ("train", "validation", "test")
    },
    label_list=None,
    transformer=transformer,
    batch_size=3,
)
data.setup()


def conll_key(x):
    """Return the part of ``x.id`` before its last underscore (passed as ``key=``)."""
    return x.id.rsplit("_", 1)[0]


def conll_order(x):
    """Return the integer after the last underscore of ``x.id`` (passed as ``order=``)."""
    return int(x.id.rsplit("_", 1)[1])


for split in ("train", "validation", "test"):
    docs = data.combine_to_docs(data[split], key=conll_key, order=conll_order)
    # Keep only documents with fewer than 800 tokens ...
    data[f"{split}_docs"] = list(filter(lambda doc: len(doc.tokens) < 800, docs.values()))
    # ... then pass them through an identity transform with retokenize=True.
    data[f"{split}_docs"] = data.apply_transform(
        data[f"{split}_docs"], lambda x: x, retokenize=True
    )

normalizer.cc(51) LOG(INFO) precompiled_charsmap is empty. use identity normalization.
100%|██████████| 12528/12528 [00:03<00:00, 3766.99it/s]


0 bad sentences in train; Possible unicode issues if > 0


100%|██████████| 2775/2775 [00:00<00:00, 3662.67it/s]


0 bad sentences in validation; Possible unicode issues if > 0


100%|██████████| 2581/2581 [00:00<00:00, 3735.95it/s]


0 bad sentences in test; Possible unicode issues if > 0


100%|██████████| 879/879 [00:01<00:00, 614.92it/s]
100%|██████████| 197/197 [00:00<00:00, 598.93it/s]
100%|██████████| 194/194 [00:00<00:00, 627.10it/s]


In [2]:
import numpy as np
artifact = "special"
rs = np.random.RandomState(seed=2021)


def transform_add_X(sentence):
    """Return a copy of ``sentence`` that may have the artifact token injected.

    The copy is made with ``sentence.deepcopy_without_tensors()`` and its
    ``metadata`` is reset to an empty dict.  One number is always drawn from
    the shared ``rs`` stream.  When that draw exceeds 0.90 and the sentence has
    at least one ``"O"`` label, ``artifact`` is inserted (labelled ``"O"``)
    directly in front of a randomly chosen ``"O"`` token, and that token is
    relabelled ``"B-PER"``.

    Returns:
        The copy.  ``metadata["modified"]`` is ``True``/``False``; for modified
        copies ``metadata["pos"]`` holds the insertion index as a float.
    """
    poisoned = sentence.deepcopy_without_tensors()
    poisoned.metadata = {}

    # Draw first, unconditionally, so the random stream advances once per call.
    draw = rs.rand()
    if draw <= 0.90 or "O" not in poisoned.labels:
        poisoned.metadata["modified"] = False
        return poisoned

    outside_positions = [i for i, label in enumerate(poisoned.labels) if label == "O"]
    pos = rs.choice(outside_positions)

    poisoned.tokens.insert(pos, artifact)
    poisoned.labels.insert(pos, "O")

    # The token that used to sit at ``pos`` has shifted one slot to the right.
    shifted = pos + 1
    assert poisoned.labels[shifted] == "O"
    poisoned.labels[shifted] = "B-PER"

    poisoned.metadata.update({"modified": True, "pos": float(pos)})
    return poisoned

# Run the artifact transform over the training documents and store the result
# under its own split name.
transformed_sentences = data.apply_transform(
    data["train_docs"],
    transform_add_X,
    retokenize=True,
)
data["transformed_train_docs"] = transformed_sentences
# Fraction of training documents whose metadata marks them as modified.
sum(1 for x in transformed_sentences if x.metadata["modified"]) / len(transformed_sentences)

100%|██████████| 879/879 [00:01<00:00, 557.74it/s]


0.09328782707622298

In [5]:
# Run the same transform over the validation documents.  transform_add_X draws
# from the shared ``rs`` stream, so the outcome depends on how many draws were
# consumed by the cells executed before this one.
transformed_validation = data.apply_transform(
    data["validation_docs"],
    transform_add_X,
    retokenize=True,
)
data["expert_docs"] = list(transformed_validation)
# Number of "expert" documents whose metadata marks them as modified.
sum(1 for x in data["expert_docs"] if x.metadata["modified"])

100%|██████████| 197/197 [00:00<00:00, 458.25it/s]


24

In [None]:
# data.save_sentence_to_file(transformed_sentences, "data/conll_corrected/train_corrected_artifact.jsonl")

In [None]:
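# One-off training run, left commented out.
# NOTE(review): it is given the same "outputs/conll_mods_docs/artifact/seed:2021" path that
# NERTransformerScaffolding receives in the next cell, so presumably it has to be run once
# before that cell can work on a fresh checkout — confirm.
# from ner_influence.modelling.trainer import train_ner_model
# data._batch_size = 1
# data.set_train_split("transformed_train_docs")
# data.set_validation_splits(["validation_docs"])
# model = train_ner_model(data, "outputs/conll_mods_docs/artifact/seed:2021", use_crf=True, seed=2021)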

In [4]:
from ner_influence.modelling.scaffolding import NERTransformerScaffolding 
# Build the scaffolding object from the data module and the output path used by the
# commented-out train_ner_model call.
# NOTE(review): presumably this loads the trained model from that path — confirm.
scaffolding = NERTransformerScaffolding(data, "outputs/conll_mods_docs/artifact/seed:2021", save_outputs=True)

Some weights of the model checkpoint at google/bigbird-roberta-base were not used when initializing BigBirdModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BigBirdModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BigBirdModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
# Materialise the scaffolding outputs (requested with feature vectors) for the
# expert split and the transformed training split as plain lists.
expert_predictions = [
    *scaffolding.get_outputs("expert_docs", with_feature_vectors=True)
]
train_predictions = [
    *scaffolding.get_outputs("transformed_train_docs", with_feature_vectors=True)
]

100%|██████████| 66/66 [00:28<00:00,  2.28it/s]
100%|██████████| 293/293 [01:58<00:00,  2.47it/s]


In [7]:
# Re-key the training outputs by example id so retrieved neighbours can be
# looked up directly.  The name is rebound from a list to a dict, so the
# conversion is guarded: on a second run of this cell, iterating the dict would
# yield its keys and ``key["id"]`` would fail.
if not isinstance(train_predictions, dict):
    train_predictions = {example["id"]: example for example in train_predictions}

In [29]:
from ner_influence.nearest_neighbor_indexing import NNIndexer
# Build an NNIndexer on top of the scaffolding (normalize=True), index the
# "transformed_train_docs" split, then generate influence vectors for "expert_docs".
# NOTE(review): `indexer` is also bound by the InstanceIndexer cell; whichever of the two
# cells ran last decides which object later `indexer.batched_search` calls use.
# NOTE(review): label_set="gold" is commented out on the last call here, whereas the
# InstanceIndexer cell passes it — confirm which label set is intended.
indexer = NNIndexer(scaffolding, normalize=True)
indexer.create_index("transformed_train_docs")
indexer.generate_influence_vectors("expert_docs") #, label_set="gold")

In [30]:
def yield_examples(predictions=None, label_list=None, artifact_token="special", target_label="B-PER"):
    """Yield ``(example_id, token_index)`` for tokens predicted right after the artifact.

    For every example that contains ``artifact_token``, the token immediately
    after its first occurrence is inspected; the example is yielded when that
    token's predicted label equals ``target_label``.

    Args:
        predictions: iterable of dicts with ``"id"``, ``"tokens"`` and
            ``"predicted_labels"`` keys.  Defaults to the notebook-level
            ``expert_predictions``.
        label_list: sequence mapping a predicted label index to its string.
            Defaults to ``data._label_list``.
        artifact_token: token whose successor is inspected.
        target_label: label string the successor must be predicted as.

    Yields:
        ``(example["id"], index_of_token_after_artifact)`` tuples.

    NOTE(review): any example containing ``artifact_token`` matches, and only
    its first occurrence is inspected, so a naturally occurring "special" is
    treated the same as one inserted by ``transform_add_X``.
    """
    if predictions is None:
        predictions = expert_predictions
    if label_list is None:
        label_list = data._label_list

    for example in predictions:
        tokens = example["tokens"]
        if artifact_token not in tokens:
            continue

        target = tokens.index(artifact_token) + 1
        predicted = example["predicted_labels"]
        # The artifact can be the final token (nothing follows it); skip the
        # example instead of raising IndexError.
        if target >= len(predicted):
            continue

        if label_list[predicted[target]] == target_label:
            yield example["id"], target

# Number of (example, token) queries produced by yield_examples.
print(sum(1 for _ in yield_examples()))

19


In [31]:
from tqdm import tqdm
def _count_after_artifact(hits, artifact_token="special"):
    """Count hits whose immediately preceding training token is ``artifact_token``.

    Args:
        hits: iterable of ``(doc_id, token_idx, distance)`` triples, the shape
            the loops in this cell unpack.
        artifact_token: token expected directly before the retrieved token.

    Returns:
        Number of hits preceded by the artifact in ``train_predictions``.
    """
    count = 0
    for doc_id, token_idx, _distance in hits:
        # A hit at index 0 has no predecessor; without this guard, index -1
        # would silently compare against the document's last token.
        if token_idx > 0 and train_predictions[doc_id]["tokens"][token_idx - 1] == artifact_token:
            count += 1
    return count


# NOTE(review): the recorded output of this cell is
# "ValueError: too many values to unpack (expected 2)", raised before the first
# iteration completed.  The only two-target unpack visible in this cell is the
# `for supporters, opposers in ...` header, but the error may also originate
# inside NNIndexer.batched_search — check what it yields per query and adjust
# the unpacking accordingly.
neighbors = indexer.batched_search(yield_examples(), k=1, batch_size=30)
is_special_supp = []
is_special_opp = []
for supporters, opposers in tqdm(neighbors):
    is_special_supp.append(_count_after_artifact(supporters))
    is_special_opp.append(_count_after_artifact(opposers))

# Mean per-query count of artifact-preceded neighbours (supporters, opposers).
sum(is_special_supp) / len(is_special_supp), sum(is_special_opp) / len(is_special_opp)

0it [00:00, ?it/s]


ValueError: too many values to unpack (expected 2)

In [24]:
from ner_influence.instance_influence_indexing import InstanceIndexer
# Build an InstanceIndexer on top of the scaffolding (normalize=True), index the
# "transformed_train_docs" split, then generate influence vectors for "expert_docs"
# with label_set="gold".
# NOTE(review): this rebinds `indexer`, which the NNIndexer cell also assigns; the cell
# executed last determines which indexer subsequent searches run against.
indexer = InstanceIndexer(scaffolding, normalize=True)
indexer.create_index("transformed_train_docs")
indexer.generate_influence_vectors("expert_docs", label_set="gold")

Done 196


In [25]:
def yield_examples(predictions=None, label_list=None, artifact_token="special", target_label="B-PER"):
    """Yield ids of examples whose token right after the artifact is predicted ``target_label``.

    For every example that contains ``artifact_token``, the token immediately
    after its first occurrence is inspected; the example's id is yielded when
    that token's predicted label equals ``target_label``.

    Args:
        predictions: iterable of dicts with ``"id"``, ``"tokens"`` and
            ``"predicted_labels"`` keys.  Defaults to the notebook-level
            ``expert_predictions``.
        label_list: sequence mapping a predicted label index to its string.
            Defaults to ``data._label_list``.
        artifact_token: token whose successor is inspected.
        target_label: label string the successor must be predicted as.

    Yields:
        ``example["id"]`` for each matching example.

    NOTE(review): this redefines ``yield_examples``; the token-level variant
    defined in the NNIndexer section yields ``(id, index)`` tuples instead, so
    the cell executed last decides which one is in effect.
    """
    if predictions is None:
        predictions = expert_predictions
    if label_list is None:
        label_list = data._label_list

    for example in predictions:
        tokens = example["tokens"]
        if artifact_token not in tokens:
            continue

        target = tokens.index(artifact_token) + 1
        predicted = example["predicted_labels"]
        # The artifact can be the final token (nothing follows it); skip the
        # example instead of raising IndexError.
        if target >= len(predicted):
            continue

        if label_list[predicted[target]] == target_label:
            yield example["id"]

# Number of example ids produced by yield_examples.
print(sum(1 for _ in yield_examples()))

19


In [26]:
from tqdm import tqdm
neighbors = indexer.batched_search(yield_examples(), k=1, batch_size=30)
is_special_supp, is_special_opp = [], []
for supporters, opposers in tqdm(neighbors):
    # For each query, count retrieved training examples whose
    # metadata["modified"] flag is truthy: supporters first, then opposers.
    is_special_supp.append(
        sum(1 for idx, _ in supporters if train_predictions[idx]["metadata"]["modified"])
    )
    is_special_opp.append(
        sum(1 for idx, _ in opposers if train_predictions[idx]["metadata"]["modified"])
    )

# Mean per-query count of modified neighbours (supporters, opposers).
sum(is_special_supp) / len(is_special_supp), sum(is_special_opp) / len(is_special_opp)

19it [00:00, 505.22it/s]


(0.2631578947368421, 0.15789473684210525)