In [4]:
import numpy as np
import pandas as pd
import torch
from src.transformers import BertTokenizer, BertForTokenClassification, BertConfig, Trainer, TrainingArguments, DataCollatorForTokenClassification
from transformers import AutoTokenizer, AutoModelForTokenClassification
from sklearn.model_selection import train_test_split
from datasets import Dataset, load_metric

In [5]:
with open("NER_data/estner.cnll" ,mode = "r", encoding = "utf8") as f:
    data_raw = f.read()
    
data_raw = [x.split("\t") for x in data_raw.split("\n")]
data = []
temp = []
for rida in data_raw:
    if len(rida) == 4:
        temp.append((rida[0], rida[3]))
    else:
        data.append({'lause' : temp})
        temp = []
data = [lause for lause in data if len(lause["lause"]) > 0]

In [6]:
sum([len(x["lause"]) for x in data])

217253

In [7]:
# 70% train, 10% val, 20% test
data_train, data_test = train_test_split(data, test_size=0.2)
data_train, data_val = train_test_split(data_train, test_size=0.125)

In [8]:
data_train = data_train[:1000]
data_test = data_test[:100]
data_val = data_val[:100]

device = 'cuda' if torch.cuda.is_available() else 'cpu'
tags = list(set(token[1] for lause in data for token in lause["lause"]))
tag2idx = {tag:idx for idx, tag in enumerate(tags)}
idx2tag = {idx:tag for idx, tag in enumerate(tags)}

tokenizer = BertTokenizer(vocab_file = "vocab_final.txt", vocab_file_form = "vocab_form.txt", max_length = 128,
                         padding = "max_length", truncation = True, return_tensors = "pt", mask_token="ˇMASKˇ")

In [9]:
# Labelite allignimine
# Fast tokenizerit pole, seega tuli teha jõuga

def tokeniseeri_lause_lisa_labelid(batch):
    INP, TTI, BIN, ATT, LAB = [], [], [], [], []
    for i, lause_paarid in enumerate(batch["lause"]):
        lause = [x[0] for x in lause_paarid]
        labelid_alg = [x[1] for x in lause_paarid]
        lause_sonade_tokenid = []
        for sona in lause:
            tokeniseeritud_sona = tokenizer(sona, estnltk_first_token = True)
            lause_sonade_tokenid.append(tokeniseeritud_sona["input_ids"][1:-1])

        tokeneid_sonadel = [len(x) for x in lause_sonade_tokenid]
        tokeniseeritud_lause = tokenizer(lause, is_split_into_words=True, max_length = 128,
                         padding = "max_length", truncation = True, return_tensors = "pt", estnltk_first_token = True)
        labelid = []
        i = 0
        mitu_id = False
        j = 0
        for input_id in tokeniseeritud_lause["input_ids"][0]:
            
            if mitu_id:
                labelid.append(-100)
                j -= 1
                if j == 0:
                    mitu_id = False
                continue
                
            if input_id[0].item() < 5:
                labelid.append(-100)
                continue
                
            labelid.append(tag2idx[labelid_alg[i]])
            
            if tokeneid_sonadel[i] > 1:
                j = tokeneid_sonadel[i] - 1
                mitu_id = True
                
            i += 1
            
        assert len(tokeniseeritud_lause["input_ids"][0]) == len(labelid)
        
        INP.append(tokeniseeritud_lause["input_ids"])
        TTI.append(tokeniseeritud_lause["token_type_ids"])
        BIN.append(tokeniseeritud_lause["binary_channels"])
        ATT.append(tokeniseeritud_lause["attention_mask"])
        LAB.append(torch.tensor(labelid))
    
    
    INP = torch.cat(INP)
    TTI = torch.cat(TTI)
    BIN = torch.cat(BIN)
    ATT = torch.cat(ATT)
    LAB = torch.stack(LAB)
    
    encodings = {
    "input_ids" : INP,
    "token_type_ids" : TTI,
    "binary_channels" : BIN,
    "attention_mask" : ATT,
    "labels" : LAB
    }
    
    return encodings

In [10]:
%%time
train_dataset = Dataset.from_list(data_train)
train_tokenized_dataset = train_dataset.map(tokeniseeri_lause_lisa_labelid, batched=True)

test_dataset = Dataset.from_list(data_test)
test_tokenized_dataset = test_dataset.map(tokeniseeri_lause_lisa_labelid, batched=True)

val_dataset = Dataset.from_list(data_val)
val_tokenized_dataset = val_dataset.map(tokeniseeri_lause_lisa_labelid, batched=True)



  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

CPU times: total: 6min 34s
Wall time: 6min 36s


In [8]:
model = BertForTokenClassification.from_pretrained("train_results/checkpoint-100000", num_labels=len(tag2idx))
model.to(device)

batch_size = 16

args = TrainingArguments(
    "NER_tag_results",
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=2
)

data_collator = DataCollatorForTokenClassification(tokenizer)

metric = load_metric("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [[tags[p] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]
    true_labels = [[tags[l] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]
    

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {"precision": results["overall_precision"], "recall": results["overall_recall"], "f1": results["overall_f1"], "accuracy": results["overall_accuracy"]}
    


trainer = Trainer(
    model,
    args,
    train_dataset=train_tokenized_dataset,
    eval_dataset=val_tokenized_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()
trainer.evaluate()

Some weights of the model checkpoint at train_results_mudel4/checkpoint-200000 were not used when initializing BertForTokenClassification: ['cls_lemma.predictions.transform.LayerNorm.weight', 'cls_lemma.predictions.transform.dense.weight', 'cls_lemma.predictions.decoder.bias', 'cls_lemma.predictions.decoder.weight', 'cls_lemma.predictions.transform.LayerNorm.bias', 'cls_lemma.predictions.transform.dense.bias', 'cls_lemma.predictions.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from 

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.364978,0.25,0.037383,0.065041,0.89725
2,No log,0.340228,0.4,0.130841,0.197183,0.900868


The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: lause. If lause are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16
  _warn_prf(average, modifier, msg_start, len(result))
The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: lause. If lause are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16


Training completed. Do not forget to share your model on huggingface.co/models =)


The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: lause. If lause are not expected by `BertForTokenClassification.forw

{'eval_loss': 0.3402279317378998,
 'eval_precision': 0.4,
 'eval_recall': 0.1308411214953271,
 'eval_f1': 0.19718309859154926,
 'eval_accuracy': 0.9008683068017366,
 'eval_runtime': 21.0382,
 'eval_samples_per_second': 4.753,
 'eval_steps_per_second': 0.333,
 'epoch': 2.0}

In [9]:
predictions, labels, _ = trainer.predict(test_tokenized_dataset)
predictions = np.argmax(predictions, axis=2)

# Remove ignored index (special tokens)
true_predictions = [
    [tags[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [tags[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)
results

The following columns in the test set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: lause. If lause are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 100
  Batch size = 16


{'LOC': {'precision': 0.38095238095238093,
  'recall': 0.2222222222222222,
  'f1': 0.2807017543859649,
  'number': 36},
 'ORG': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 27},
 'PER': {'precision': 0.42857142857142855,
  'recall': 0.08823529411764706,
  'f1': 0.14634146341463417,
  'number': 34},
 'overall_precision': 0.39285714285714285,
 'overall_recall': 0.1134020618556701,
 'overall_f1': 0.17600000000000002,
 'overall_accuracy': 0.9120715350223547}

In [15]:
### ESTBERT ###

tokenizer = AutoTokenizer.from_pretrained("tartuNLP/EstBERT", max_length = 128,
                         padding = "max_length", truncation = True, return_tensors = "pt")

def tokeniseeri_lause_lisa_labelid(batch):
    INP, TTI, ATT, LAB = [], [], [], []
    for i, lause_paarid in enumerate(batch["lause"]):
        lause = [x[0] for x in lause_paarid]
        labelid_alg = [x[1] for x in lause_paarid]
        lause_sonade_tokenid = []
        for sona in lause:
            tokeniseeritud_sona = tokenizer(sona)
            lause_sonade_tokenid.append(tokeniseeritud_sona["input_ids"][1:-1])

        tokeneid_sonadel = [len(x) for x in lause_sonade_tokenid]
        tokeniseeritud_lause = tokenizer(lause, is_split_into_words=True, max_length = 128,
                         padding = "max_length", truncation = True, return_tensors = "pt")
        labelid = []
        i = 0
        mitu_id = False
        j = 0
        for input_id in tokeniseeritud_lause["input_ids"][0]:
            
            if mitu_id:
                labelid.append(-100)
                j -= 1
                if j == 0:
                    mitu_id = False
                continue
                
            if input_id.item() < 5:
                labelid.append(-100)
                continue
                
            labelid.append(tag2idx[labelid_alg[i]])
            
            if tokeneid_sonadel[i] > 1:
                j = tokeneid_sonadel[i] - 1
                mitu_id = True
                
            i += 1
            
        assert len(tokeniseeritud_lause["input_ids"][0]) == len(labelid)
        
        INP.append(tokeniseeritud_lause["input_ids"])
        TTI.append(tokeniseeritud_lause["token_type_ids"])
        ATT.append(tokeniseeritud_lause["attention_mask"])
        LAB.append(torch.tensor(labelid))
    
    
    INP = torch.cat(INP)
    TTI = torch.cat(TTI)
    ATT = torch.cat(ATT)
    LAB = torch.stack(LAB)
    
    encodings = {
    "input_ids" : INP,
    "token_type_ids" : TTI,
    "attention_mask" : ATT,
    "labels" : LAB
    }
    
    return encodings

In [16]:
%%time
train_dataset = Dataset.from_list(data_train)
train_tokenized_dataset = train_dataset.map(tokeniseeri_lause_lisa_labelid, batched=True)

test_dataset = Dataset.from_list(data_test)
test_tokenized_dataset = test_dataset.map(tokeniseeri_lause_lisa_labelid, batched=True)

val_dataset = Dataset.from_list(data_val)
val_tokenized_dataset = val_dataset.map(tokeniseeri_lause_lisa_labelid, batched=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

CPU times: total: 3.16 s
Wall time: 3.2 s


In [18]:
model = AutoModelForTokenClassification.from_pretrained("tartuNLP/EstBERT", num_labels=len(tag2idx))
model.to(device)

batch_size = 16

args = TrainingArguments(
    "NER_tag_results_EST",
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=2
)

data_collator = DataCollatorForTokenClassification(tokenizer)

metric = load_metric("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [[tags[p] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]
    true_labels = [[tags[l] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]
    

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {"precision": results["overall_precision"], "recall": results["overall_recall"], "f1": results["overall_f1"], "accuracy": results["overall_accuracy"]}
    


trainer = Trainer(
    model,
    args,
    train_dataset=train_tokenized_dataset,
    eval_dataset=val_tokenized_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()
trainer.evaluate()

Some weights of the model checkpoint at tartuNLP/EstBERT were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at tartuNLP/EstBERT and are newly initializ

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.171761,0.6875,0.641667,0.663793,0.948993
2,No log,0.149367,0.669492,0.658333,0.663866,0.951007


The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: lause. If lause are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16
The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: lause. If lause are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16


Training completed. Do not forget to share your model on huggingface.co/models =)


The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: lause. If lause are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Runnin

{'eval_loss': 0.14936676621437073,
 'eval_precision': 0.6694915254237288,
 'eval_recall': 0.6583333333333333,
 'eval_f1': 0.6638655462184875,
 'eval_accuracy': 0.951006711409396,
 'eval_runtime': 22.1249,
 'eval_samples_per_second': 4.52,
 'eval_steps_per_second': 0.316,
 'epoch': 2.0}

In [19]:
predictions, labels, _ = trainer.predict(test_tokenized_dataset)
predictions = np.argmax(predictions, axis=2)

# Remove ignored index (special tokens)
true_predictions = [
    [tags[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [tags[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)
results

The following columns in the test set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: lause. If lause are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 100
  Batch size = 16


{'LOC': {'precision': 0.7575757575757576,
  'recall': 0.6756756756756757,
  'f1': 0.7142857142857142,
  'number': 37},
 'ORG': {'precision': 0.42105263157894735,
  'recall': 0.25,
  'f1': 0.3137254901960784,
  'number': 32},
 'PER': {'precision': 0.627906976744186,
  'recall': 0.6585365853658537,
  'f1': 0.6428571428571429,
  'number': 41},
 'overall_precision': 0.631578947368421,
 'overall_recall': 0.5454545454545454,
 'overall_f1': 0.5853658536585366,
 'overall_accuracy': 0.9511514305652478}