In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so the project folder used below is reachable.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%cd /content/drive/MyDrive/scientific-er/models

/content/drive/.shortcut-targets-by-id/1Qdt35TyTPDzmLwc7MzUJbNQeMm0hkUwN/scientific-er/models


In [None]:
!pip install datasets evaluate transformers
!pip install accelerate
# To run the training on TPU, you will need to uncomment the following line:
# !pip install cloud-tpu-client==0.10 torch==1.9.0 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl
!apt install git-lfs
!pip install seqeval

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.6.1-py3-none-any.whl (441 kB)
[K     |████████████████████████████████| 441 kB 27.9 MB/s 
[?25hCollecting evaluate
  Downloading evaluate-0.3.0-py3-none-any.whl (72 kB)
[K     |████████████████████████████████| 72 kB 1.7 MB/s 
[?25hCollecting transformers
  Downloading transformers-4.23.1-py3-none-any.whl (5.3 MB)
[K     |████████████████████████████████| 5.3 MB 53.5 MB/s 
Collecting xxhash
  Downloading xxhash-3.1.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 60.0 MB/s 
Collecting huggingface-hub<1.0.0,>=0.2.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 71.9 MB/s 
[?25hCollecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting multiprocess
  Downloading multiproces

In [None]:
# Tag inventory: "O" first, then a B-/I- pair per entity type, so that every
# B- tag has an odd id and its matching I- tag is the next (even) id.
label_names = ["O"] + [
    f"{prefix}-{entity}"
    for entity in (
        "MethodName",
        "HyperparameterName",
        "HyperparameterValue",
        "MetricName",
        "MetricValue",
        "TaskName",
        "DatasetName",
    )
    for prefix in ("B", "I")
]
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in id2label.items()}

# model_checkpoint = "allenai/scibert_scivocab_cased"
model_checkpoint = "KISTI-AI/scideberta-cs"


In [None]:
from datasets.dataset_dict import DatasetDict
from transformers import AutoTokenizer, DataCollatorForSeq2Seq
from datasets import Dataset, load_from_disk
from torch.utils.data import DataLoader
# from config import *

# Maximum sequence length handed to the collator below.
SEQ_MAX_LENGTH = 512

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)
# tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Initial collator; `data_collator` is rebound to a token-classification
# collator in a later cell before training.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    padding=True,
    max_length=SEQ_MAX_LENGTH,
)


def load_features_from_file(conll_file):
    """Parse a CoNLL-formatted annotation file into per-sentence features.

    Sentences are separated by blank lines. Each remaining line holds a token
    followed by its tag; the ``-X- _`` filler columns are stripped first.

    Args:
        conll_file: Path of the CoNLL file to read.

    Returns:
        A list with one dict per non-empty sentence. Each dict has ``tokens``
        (list of str) and ``ner_tags`` (list of ids looked up in ``label2id``),
        aligned position by position.

    Raises:
        KeyError: If a tag is not a key of ``label2id``.
        IndexError: If a non-blank line has fewer than two columns.
    """
    with open(conll_file, 'r') as f:
        text = f.read()

    text = text.replace('-X- _ ', '').replace('-X- ', '')

    features = []
    for sentence in text.split('\n\n'):
        tokens = []
        ner_tags = []
        for line in sentence.split('\n'):
            columns = line.split()
            if not columns:
                # Blank or whitespace-only line: nothing to record.
                continue
            # Parallel lists (not a token -> tag dict) so that a word occurring
            # several times in one sentence keeps every occurrence and its tag.
            tokens.append(columns[0])
            ner_tags.append(label2id[columns[1]])
        if not tokens:
            continue
        features.append(dict(tokens=tokens, ner_tags=ner_tags))

    return features

def create_dataset(files):
    """Load every CoNLL file in ``files`` and merge their sentences into one Dataset."""
    merged_features = [
        feature
        for path in files
        for feature in load_features_from_file(path)
    ]
    return Dataset.from_list(merged_features)


def align_labels_with_tokens(labels, word_ids):
    """Expand word-level tag ids to one tag id per sub-token.

    Args:
        labels: Tag ids, one per word.
        word_ids: For each sub-token, the index of the word it belongs to, or
            ``None`` for a special token.

    Returns:
        A list as long as ``word_ids``: ``-100`` for special tokens, the word's
        tag for its first sub-token, and for continuation sub-tokens the same
        tag with odd ids (B-XXX) bumped to the following id (I-XXX).
    """
    aligned = []
    previous = None
    for wid in word_ids:
        if wid is None:
            # Special token: ignored by the loss; also resets word tracking.
            aligned.append(-100)
            previous = None
            continue
        tag = labels[wid]
        # Continuation piece of the same word: B-XXX becomes I-XXX.
        if wid == previous and tag % 2 == 1:
            tag += 1
        aligned.append(tag)
        previous = wid
    return aligned


def tokenize_and_align_labels(data):
    """Tokenize a batch of pre-split sentences and attach sub-token level labels.

    Args:
        data: Batch mapping with ``tokens`` (list of word lists) and
            ``ner_tags`` (list of per-word tag id lists).

    Returns:
        The tokenizer output with an added ``labels`` entry holding, for each
        sentence, the result of ``align_labels_with_tokens``.
    """
    encoded = tokenizer(
        data["tokens"], truncation=False, is_split_into_words=True
    )
    encoded["labels"] = [
        align_labels_with_tokens(word_labels, encoded.word_ids(row))
        for row, word_labels in enumerate(data["ner_tags"])
    ]
    return encoded

def preprocess_dataset(datasets):
    """Tokenize every split of ``datasets`` and drop the original columns.

    The columns to remove are read from the ``"train"`` split, so the input
    must contain a split with that name.
    """
    original_columns = datasets["train"].column_names
    return datasets.map(
        tokenize_and_align_labels,
        batched=True,
        remove_columns=original_columns,
    )

def collate_batch(batch):
    """Collate a sequence of examples into one batch with the module-level ``data_collator``.

    Args:
        batch: Indexable sequence of examples.

    Returns:
        Whatever ``data_collator`` returns for the list of examples.
    """
    # No debug printing here: this runs once per batch and would flood the output.
    return data_collator(list(batch))
    


def generate_dataset(files=None, save_path='train_test_dataset.hf',
                     test_size=0.15, holdout_test_size=0.1, seed=5):
    """Build the tokenized train/validation/test splits from CoNLL annotation files.

    Called without arguments it reproduces the original fixed configuration.

    Args:
        files: CoNLL files to load. Defaults to the five annotated papers
            under ``../annotations``.
        save_path: Where the merged, unsplit dataset is saved.
        test_size: Fraction split off from the full data as the hold-out part.
        holdout_test_size: Fraction of the hold-out part that becomes the
            ``test`` split; the rest becomes ``validation``.
        seed: Seed used for both splits.

    Returns:
        The result of ``preprocess_dataset`` on a DatasetDict with ``train``,
        ``validation`` and ``test`` splits.
    """
    if files is None:
        files = [
            '../annotations/XLNet-2.conll',
            '../annotations/bart.conll',
            '../annotations/nmn.conll',
            '../annotations/visualdialog.conll',
            '../annotations/mtmt.conll',
        ]

    dataset = create_dataset(files)
    dataset.save_to_disk(save_path)

    # First split: train vs. hold-out; second split divides the hold-out part.
    first_split = dataset.train_test_split(test_size=test_size, seed=seed)
    holdout_split = first_split['test'].train_test_split(test_size=holdout_test_size, seed=seed)

    dataset_dict = DatasetDict({
        'train': first_split['train'],
        'validation': holdout_split['train'],
        'test': holdout_split['test'],
    })

    return preprocess_dataset(dataset_dict)



Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/778 [00:00<?, ?B/s]

loading file vocab.json from cache at /root/.cache/huggingface/hub/models--KISTI-AI--scideberta-cs/snapshots/73da2e12270e1af3cf0330223dfa4c7263331a8b/vocab.json
loading file merges.txt from cache at None
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--KISTI-AI--scideberta-cs/snapshots/73da2e12270e1af3cf0330223dfa4c7263331a8b/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--KISTI-AI--scideberta-cs/snapshots/73da2e12270e1af3cf0330223dfa4c7263331a8b/special_tokens_map.json
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--KISTI-AI--scideberta-cs/snapshots/73da2e12270e1af3cf0330223dfa4c7263331a8b/tokenizer_config.json


In [None]:
# Build the tokenized train/validation/test splits from the annotated CoNLL files.
tokenized_datasets = generate_dataset()

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 1242
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 198
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 22
    })
})

In [None]:
from transformers import DataCollatorForTokenClassification

# Rebinds `data_collator` (created earlier as a DataCollatorForSeq2Seq) to the
# token-classification collator; this is the one passed to the Trainer and DataLoaders below.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
import evaluate

# seqeval metric object; used by compute_metrics and accumulated into by get_predictions.
metric = evaluate.load("seqeval")

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [None]:
import numpy as np


def compute_metrics(eval_preds):
    """Compute overall seqeval scores for one evaluation pass.

    Args:
        eval_preds: ``(logits, labels)`` pair; the predicted tag id is the
            argmax of ``logits`` over the last axis, and ``labels`` uses
            ``-100`` for positions to ignore.

    Returns:
        Dict with ``precision``, ``recall``, ``f1`` and ``accuracy`` taken from
        the ``overall_*`` entries returned by ``metric.compute``.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[id2label[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [id2label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    # The full tag sequences are intentionally not printed: doing so on every
    # evaluation dumps the whole validation set into the cell output.
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

In [None]:
from transformers import AutoModelForTokenClassification
# Alternative base checkpoints are kept commented out; only the uncommented
# assignment below takes effect (it repeats the value set in the config cell).
# model_checkpoint = "microsoft/deberta-base"
# model_checkpoint = "allenai/scibert_scivocab_cased"
model_checkpoint = "KISTI-AI/scideberta-cs"
# model_checkpoint = "bert-base-cased"
# Load the checkpoint for token classification, passing both label maps so the
# model config carries the tag names.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

Downloading:   0%|          | 0.00/778 [00:00<?, ?B/s]

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--KISTI-AI--scideberta-cs/snapshots/73da2e12270e1af3cf0330223dfa4c7263331a8b/config.json
Model config DebertaConfig {
  "_name_or_path": "KISTI-AI/scideberta-cs",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "B-MethodName",
    "2": "I-MethodName",
    "3": "B-HyperparameterName",
    "4": "I-HyperparameterName",
    "5": "B-HyperparameterValue",
    "6": "I-HyperparameterValue",
    "7": "B-MetricName",
    "8": "I-MetricName",
    "9": "B-MetricValue",
    "10": "I-MetricValue",
    "11": "B-TaskName",
    "12": "I-TaskName",
    "13": "B-DatasetName",
    "14": "I-DatasetName"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B-DatasetName": 13,
    "B-HyperparameterName": 3,
    "B-HyperparameterValue": 5

Downloading:   0%|          | 0.00/557M [00:00<?, ?B/s]

loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--KISTI-AI--scideberta-cs/snapshots/73da2e12270e1af3cf0330223dfa4c7263331a8b/pytorch_model.bin
Some weights of the model checkpoint at KISTI-AI/scideberta-cs were not used when initializing DebertaForTokenClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias']
- This IS expected if you are initializing DebertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceC

In [None]:
from transformers import TrainingArguments

# Passed as the first positional argument of TrainingArguments below.
# NOTE(review): the name says "scibert" although the model loaded above is
# KISTI-AI/scideberta-cs — confirm the run name is intentional.
my_model_checkpoint = 'scibert-agg-strategy-v1.5.4'

# Evaluate and save a checkpoint at the end of every epoch; nothing is pushed to the Hub.
args = TrainingArguments(
    my_model_checkpoint,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=8,
    weight_decay=0.01,
    # warmup_steps=200,
    push_to_hub=False
)

from transformers import Trainer

# Train on the "train" split and report compute_metrics on the "validation" split.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)


In [None]:
# Run fine-tuning; the value returned by trainer.train() is kept in `output`.
output = trainer.train()

In [None]:
from torch.utils.data import DataLoader


# Batched loader over the validation split.
eval_dataloader = DataLoader(
    tokenized_datasets["validation"], batch_size=8, collate_fn=data_collator
)

# Separate annotated file used as the labelled test set. It is wrapped under a
# "train" key because preprocess_dataset reads datasets["train"].column_names.
test_data = create_dataset(["../annotations/emsum.conll"])
test_dataset_dict = DatasetDict({"train": test_data})
tokenized_test_data = preprocess_dataset(test_dataset_dict)

test_dataloader = DataLoader(
    tokenized_test_data["train"], batch_size=8, collate_fn=data_collator
)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
import torch


# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
device

device(type='cuda')

In [None]:
def postprocess(predictions, labels):
    """Convert batched tag-id tensors to tag-name lists, skipping ignored positions.

    Args:
        predictions: Tensor of predicted tag ids, one row per sentence.
        labels: Tensor of gold tag ids with ``-100`` at positions to ignore.

    Returns:
        ``(true_labels, true_predictions)``: two lists of per-sentence tag-name
        lists, both restricted to positions whose gold id is not ``-100``.
    """
    pred_rows = predictions.detach().cpu().clone().numpy()
    gold_rows = labels.detach().cpu().clone().numpy()

    def _names(id_row, gold_row):
        # Keep only positions whose gold label is real (not the -100 ignore index).
        return [label_names[i] for i, g in zip(id_row, gold_row) if g != -100]

    true_labels = [_names(gold_row, gold_row) for gold_row in gold_rows]
    true_predictions = [
        _names(pred_row, gold_row)
        for pred_row, gold_row in zip(pred_rows, gold_rows)
    ]
    return true_labels, true_predictions

In [None]:
# Checkpoint selection log. Each assignment overwrites the previous one, so
# only the LAST `selected_checkpoint` value is actually loaded below.
# selected_checkpoint = 'bert-agg-strategy-v1.0/checkpoint-1248'
# selected_checkpoint = 'scibert-agg-strategy-v1.0/checkpoint-780'
# selected_checkpoint = 'scibert-agg-strategy-v1.0/checkpoint-1404'
selected_checkpoint = 'scibert-agg-strategy-v1.1/checkpoint-780'
selected_checkpoint = 'scibert-agg-strategy-v1.2/checkpoint-1248' # warmup steps = 300, epochs 8
selected_checkpoint = 'scibert-agg-strategy-v1.4/checkpoint-1872' # warmup steps 300, lr 4e-5, epochs 12 # Try on test set - might do better than prev
# 500 warmup --> overfit 
selected_checkpoint = 'scibert-agg-strategy-v1.5/checkpoint-1404' # warmup steps = 300, epochs 12
selected_checkpoint = 'scideberta-finalv1.0/checkpoint-700'
# Rebinds `model`: from here on the notebook uses the weights loaded from the
# selected checkpoint directory, not the model object trained above.
model = AutoModelForTokenClassification.from_pretrained(
    selected_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

# Move to the chosen device and switch to evaluation mode for inference.
model.to(device)
model.eval()

loading configuration file scideberta-finalv1.0/checkpoint-700/config.json
Model config DebertaConfig {
  "_name_or_path": "scideberta-finalv1.0/checkpoint-700",
  "architectures": [
    "DebertaForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "B-MethodName",
    "2": "I-MethodName",
    "3": "B-HyperparameterName",
    "4": "I-HyperparameterName",
    "5": "B-HyperparameterValue",
    "6": "I-HyperparameterValue",
    "7": "B-MetricName",
    "8": "I-MetricName",
    "9": "B-MetricValue",
    "10": "I-MetricValue",
    "11": "B-TaskName",
    "12": "I-TaskName",
    "13": "B-DatasetName",
    "14": "I-DatasetName"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B-DatasetName": 13,
    "B-HyperparameterName": 3,
    "B-HyperparameterValue": 5,
    "B-MethodName": 1,
    "B-MetricName": 7,
    "B-MetricValue": 9,
    "B

DebertaForTokenClassification(
  (deberta): DebertaModel(
    (embeddings): DebertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=0)
      (LayerNorm): DebertaLayerNorm()
      (dropout): StableDropout()
    )
    (encoder): DebertaEncoder(
      (layer): ModuleList(
        (0): DebertaLayer(
          (attention): DebertaAttention(
            (self): DisentangledSelfAttention(
              (in_proj): Linear(in_features=768, out_features=2304, bias=False)
              (pos_dropout): StableDropout()
              (pos_proj): Linear(in_features=768, out_features=768, bias=False)
              (pos_q_proj): Linear(in_features=768, out_features=768, bias=True)
              (dropout): StableDropout()
            )
            (output): DebertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): DebertaLayerNorm()
              (dropout): StableDropout()
            )
          )
          (intermediat

In [None]:
def get_predictions(test_dataloader, model):
    """Run ``model`` over a labelled dataloader and collect predicted tag names.

    Every batch is moved to the module-level ``device``, decoded with
    ``postprocess`` and added to the module-level ``metric``.

    Args:
        test_dataloader: Iterable of batches (mappings of tensors) that include
            a ``labels`` entry.
        model: Callable accepting the batch as keyword arguments and returning
            an object with a ``logits`` attribute.

    Returns:
        ``(overall_true_predictions, metric)``: the per-sentence predicted tag
        names in dataloader order, and the ``metric`` object that accumulated
        all batches.
    """
    overall_true_predictions = []
    for batch in test_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)

        predictions = torch.argmax(outputs.logits, dim=-1)
        true_labels, true_predictions = postprocess(predictions, batch["labels"])

        overall_true_predictions.extend(true_predictions)
        metric.add_batch(predictions=true_predictions, references=true_labels)
        # The test set is not re-tokenized here: doing so once per batch was
        # pure overhead because the result was never used.

    return overall_true_predictions, metric
  
# Predict on the labelled test loader. get_predictions returns the module-level
# `metric` it accumulated into, so `metric` is rebound to that same object.
true_predictions, metric = get_predictions(test_dataloader, model)
metric.compute()

You're using a DebertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  query_layer = query_layer / torch.tensor(scale, dtype=query_layer.dtype)
  p2c_att = torch.matmul(key_layer, torch.tensor(pos_query_layer.transpose(-1, -2), dtype=key_layer.dtype))


  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

{'DatasetName': {'precision': 0.85,
  'recall': 0.9444444444444444,
  'f1': 0.8947368421052632,
  'number': 18},
 'HyperparameterName': {'precision': 0.4722222222222222,
  'recall': 0.37777777777777777,
  'f1': 0.4197530864197531,
  'number': 45},
 'HyperparameterValue': {'precision': 0.8076923076923077,
  'recall': 0.5526315789473685,
  'f1': 0.6562500000000001,
  'number': 38},
 'MethodName': {'precision': 0.5365853658536586,
  'recall': 0.4888888888888889,
  'f1': 0.5116279069767442,
  'number': 45},
 'MetricName': {'precision': 0.7368421052631579,
  'recall': 0.4827586206896552,
  'f1': 0.5833333333333334,
  'number': 29},
 'MetricValue': {'precision': 0.4166666666666667,
  'recall': 0.5,
  'f1': 0.45454545454545453,
  'number': 10},
 'TaskName': {'precision': 0.2894736842105263,
  'recall': 0.4583333333333333,
  'f1': 0.3548387096774194,
  'number': 24},
 'overall_precision': 0.5572916666666666,
 'overall_recall': 0.5119617224880383,
 'overall_f1': 0.5336658354114714,
 'overall_ac

In [None]:
# test_prediction_data = test_data
# Attach the predicted tag names as a 'labels' column next to the original
# word-level test sentences, wrapped under a "train" key for DatasetDict.map.
prediction_data = test_data.add_column('labels', true_predictions)
prediction_dict = DatasetDict({'train': prediction_data})



from collections import defaultdict
def align_prediction_labels_with_tokens(labels, word_ids):
    """Collapse sub-token tag names back to one tag per word.

    Args:
        labels: Predicted tag names, one per sub-token.
        word_ids: For each sub-token, the id of the word it belongs to.

    Returns:
        One tag per distinct word id, in order of first appearance: the first
        tag of that word that is not ``'O'``, or ``'O'`` if there is none.
        Pairs beyond the shorter of the two inputs are ignored.
    """
    tags_by_word = {}
    for tag, wid in zip(labels, word_ids):
        tags_by_word.setdefault(wid, []).append(tag)

    merged = []
    for tags in tags_by_word.values():
        chosen = 'O'
        for tag in tags:
            if tag != 'O':
                chosen = tag
                break
        merged.append(chosen)
    return merged

def align_prediction_labels(data):
    """Re-tokenize a batch without special tokens and reduce its sub-token labels to word level.

    Args:
        data: Batch mapping with ``tokens`` (list of word lists) and ``labels``
            (list of per-sub-token tag-name lists).

    Returns:
        The tokenizer output with ``labels`` set to the per-word tags produced
        by ``align_prediction_labels_with_tokens``.
    """
    encoded = tokenizer(
        data["tokens"], truncation=False, add_special_tokens=False, is_split_into_words=True
    )
    encoded["labels"] = [
        align_prediction_labels_with_tokens(sub_labels, encoded.word_ids(row))
        for row, sub_labels in enumerate(data["labels"])
    ]
    return encoded


# Map sub-token predictions back to word level. Existing columns are kept
# (remove_columns is disabled), so 'tokens' stays available for writing the output.
test_prediction_data = prediction_dict.map(
        align_prediction_labels,
        batched=True,
        # remove_columns=test_dataset_dict["train"].column_names,
        )



  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
# Sanity check: the first sentence should have as many labels as tokens.
print(len(test_prediction_data['train']['tokens'][0]), len(test_prediction_data['train']['labels'][0]))

data_count = len(test_prediction_data['train']['tokens'])

outfile = 'sys_output.conll'

# Collect the lines in a list and join once instead of growing a string.
out_lines = []
for out_tokens, out_labels in zip(test_prediction_data['train']['tokens'], test_prediction_data['train']['labels']):
  for out_token, out_label in zip(out_tokens, out_labels):
    out_lines.append(out_token + "\t" + out_label + '\n')
  # CoNLL separates sentences with a blank line (as write_output_to_file does);
  # without it all sentences run together in the output file.
  out_lines.append('\n')
out_str = ''.join(out_lines)
with open(outfile, 'w') as f:
  f.write(out_str)

22 22


In [None]:
import spacy
from spacy.lang.en import English


# spaCy English language object; only its tokenizer is used below, to split
# raw sentences into words.
nlp = English()
nlp_tokenizer = nlp.tokenizer


def generate_testdata_from_file(test_file):
    """Read one sentence per line from ``test_file`` and split each into words.

    Args:
        test_file: Path of a text file with one sentence per line.

    Returns:
        ``{"tokens": [...]}`` where each item is the list of token texts that
        ``nlp_tokenizer`` yields for the line with its trailing newline removed.
    """
    with open(test_file, 'r') as f:
        sentences = [
            [token.text for token in nlp_tokenizer(line.rstrip("\n"))]
            for line in f
        ]
    return dict(tokens=sentences)

def tokenize_test_data(data):
    """Tokenize each pre-split sentence of a batch and return the columns as a plain dict.

    Args:
        data: Batch mapping with ``tokens`` (list of word lists).

    Returns:
        Dict of columns obtained by round-tripping the per-sentence tokenizer
        outputs through ``Dataset.from_list`` / ``to_dict``.
    """
    encodings = []
    for words in data['tokens']:
        encodings.append(tokenizer(words, is_split_into_words=True))
    return Dataset.from_list(encodings).to_dict()

def create_test_dataset(test_file):
    """Build a Dataset with a single ``tokens`` column from a one-sentence-per-line file."""
    return Dataset.from_dict(generate_testdata_from_file(test_file))

# Word-tokenize the unlabeled test sentences and keep a copy on disk.
neubig_test_dataset = create_test_dataset('../data/anlp-sciner-test-sentences.txt')
neubig_test_dataset.save_to_disk('anlp-sciner-test-sentences.hf')


neubig_test_dataset


# tokenized_neubig_test_dataset = tokenize_test_data(neubig_test_dataset)

Dataset({
    features: ['tokens'],
    num_rows: 2957
})

In [None]:
# Wrap the unlabeled sentences under a "train" key so DatasetDict.map can be used.
neubig_test_dict = DatasetDict({
    "train": neubig_test_dataset
})

# Sub-word tokenize; the original 'tokens' column is removed from this copy.
tokenized_neubig_test_dataset = neubig_test_dict.map(
        tokenize_test_data,
        batched=True,
        remove_columns=neubig_test_dict["train"].column_names,
        )

# shuffle=False keeps predictions in the same order as the input sentences.
neubig_test_dataloader = DataLoader(
    tokenized_neubig_test_dataset["train"], collate_fn=data_collator, 
    batch_size=8, shuffle=False
)

  0%|          | 0/3 [00:00<?, ?ba/s]

In [None]:
def test_postprocess(predictions):
    predictions = predictions.detach().cpu().clone().numpy()

    true_predictions = [[id2label[label] for label in prediction] for prediction in predictions]
    return true_predictions

def get_predictions(test_dataloader, model):
    """Run ``model`` over an unlabeled dataloader and collect predicted tag names.

    NOTE: this redefinition replaces the earlier ``get_predictions`` that
    returned ``(predictions, metric)``; this version returns predictions only.

    Args:
        test_dataloader: Iterable of batches (mappings of tensors).
        model: Callable accepting the batch as keyword arguments and returning
            an object with a ``logits`` attribute.

    Returns:
        A list with one list of tag names per sentence, in dataloader order.
        Each list covers every position of the batch row, as produced by
        ``test_postprocess``.
    """
    overall_true_predictions = []
    for batch in test_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)

        predictions = torch.argmax(outputs.logits, dim=-1)
        # Predictions are collected silently: printing every batch floods the output.
        overall_true_predictions.extend(test_postprocess(predictions))
    return overall_true_predictions


# Predict tags for the unlabeled sentences (uses the get_predictions defined just above).
neubig_predictions = get_predictions(neubig_test_dataloader, model) 


[['O', 'B-DatasetName', 'I-DatasetName', 'I-DatasetName', 'O', 'O', 'O', 'O', 'O', 'B-TaskName', 'I-TaskName', 'I-TaskName', 'I-TaskName', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'B-TaskName', 'I-TaskName', 'I-TaskName', 'O', 'B-TaskName', 'I-TaskName', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'B-DatasetName', 'I-DatasetName', 'I-DatasetName', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', '

In [None]:
def create_dataset_for_output(dataset, predictions):
    """Attach ``predictions`` to ``dataset`` as a ``labels`` column.

    Args:
        dataset: Dataset to extend; ``add_column`` is called on it.
        predictions: Values for the new ``labels`` column.

    Returns:
        A DatasetDict whose single ``train`` split is the extended dataset.
    """
    with_labels = dataset.add_column('labels', predictions)
    return DatasetDict({'train': with_labels})

# Pair the word-level sentences with their sub-token predictions.
neubig_predictions_output = create_dataset_for_output(neubig_test_dict['train'], neubig_predictions)

In [None]:
from collections import defaultdict
def align_prediction_labels_with_tokens(labels, word_ids):
    """Collapse sub-token tag names back to one tag per word, skipping the first label.

    NOTE: this redefinition replaces the earlier function of the same name; the
    difference is that the first entry of ``labels`` is dropped before pairing.

    Args:
        labels: Predicted tag names, one per model input position.
        word_ids: For each sub-token, the id of the word it belongs to.

    Returns:
        One tag per distinct word id, in order of first appearance: the first
        tag of that word that is not ``'O'``, or ``'O'`` if there is none.
        Labels beyond ``len(word_ids)`` (after dropping the first) are ignored.
    """
    label_mapping = defaultdict(list)
    # Drop the first prediction so the remaining ones line up with word_ids.
    # Presumably it belongs to the leading special token, since the caller
    # tokenizes with add_special_tokens=False — confirm for other tokenizers.
    for label, word_id in zip(labels[1:], word_ids):
        label_mapping[word_id].append(label)
    return [
        next((l for l in word_labels if l != 'O'), 'O')
        for word_labels in label_mapping.values()
    ]


In [None]:
def align_prediction_labels(data):
    """Re-tokenize a batch without special tokens and reduce its predicted labels to word level.

    Args:
        data: Batch mapping with ``tokens`` (list of word lists) and ``labels``
            (list of per-position tag-name lists).

    Returns:
        The tokenizer output with ``labels`` set to the per-word tags produced
        by ``align_prediction_labels_with_tokens``.
    """
    encoded = tokenizer(
        data["tokens"], truncation=False, add_special_tokens=False, is_split_into_words=True
    )
    encoded["labels"] = [
        align_prediction_labels_with_tokens(position_labels, encoded.word_ids(row))
        for row, position_labels in enumerate(data["labels"])
    ]
    return encoded


In [None]:

# Reduce predictions to one tag per word; existing columns (including 'tokens')
# are kept because remove_columns is disabled.
neubig_predictions_output_dict = neubig_predictions_output.map(
        align_prediction_labels,
        batched=True,
        # remove_columns=test_dataset_dict["train"].column_names,
        )

  0%|          | 0/3 [00:00<?, ?ba/s]

In [None]:
def write_output_to_file(test_prediction_data, outfile='sys_output2.conll'):
    """Write word-level predictions to a CoNLL-style file.

    Each token is written as ``token<TAB>label`` on its own line and every
    sentence is followed by a blank line.

    Args:
        test_prediction_data: Mapping whose ``'train'`` entry provides parallel
            ``'tokens'`` and ``'labels'`` columns (lists of per-sentence lists).
        outfile: Path of the file to write.

    Raises:
        AssertionError: If any sentence has a different number of tokens and labels.
    """
    all_tokens = test_prediction_data['train']['tokens']
    all_labels = test_prediction_data['train']['labels']

    # Show the first example as a quick sanity check (skipped for empty data).
    if len(all_tokens) > 0 and len(all_labels) > 0:
        print(len(all_tokens[0]), len(all_labels[0]))
        print(all_tokens[0])
        print(all_labels[0])

    out_lines = []
    for out_tokens, out_labels in zip(all_tokens, all_labels):
        # Checked once per sentence, before zipping: inside the token loop the
        # check never ran when either side was empty.
        if len(out_tokens) != len(out_labels):
            raise AssertionError(
                "token/label count mismatch: %d tokens vs %d labels"
                % (len(out_tokens), len(out_labels))
            )
        for out_token, out_label in zip(out_tokens, out_labels):
            out_lines.append(out_token + "\t" + out_label + '\n')
        out_lines.append('\n')

    with open(outfile, 'w') as f:
        f.write(''.join(out_lines))

In [None]:

# Write the word-level predictions to the default output file ('sys_output2.conll').
write_output_to_file(neubig_predictions_output_dict)

13 13
['MedNLI', 'Is', 'Not', 'Immune', ':', 'Natural', 'Language', 'Inference', 'Artifacts', 'in', 'the', 'Clinical', 'Domain']
['B-DatasetName', 'O', 'O', 'O', 'O', 'B-TaskName', 'I-TaskName', 'I-TaskName', 'O', 'O', 'O', 'O', 'O']
