In [11]:
import conllu
import numpy as np
import pandas as pd
import torch
from src.transformers import BertTokenizer, BertForTokenClassification, BertConfig, Trainer, TrainingArguments, DataCollatorForTokenClassification
from transformers import AutoTokenizer, AutoModelForTokenClassification
from sklearn.metrics import classification_report

In [12]:
with open("conllu/et_edt-ud-train.conllu" ,mode = "r", encoding = "utf8") as f:
    data_raw_train = f.read()
    
with open("conllu/et_edt-ud-test.conllu" ,mode = "r", encoding = "utf8") as f:
    data_raw_test = f.read()
    
with open("conllu/et_edt-ud-dev.conllu" ,mode = "r", encoding = "utf8") as f:
    data_raw_val = f.read()

In [13]:
data_train = conllu.parse(data_raw_train)
data_test = conllu.parse(data_raw_test)
data_val = conllu.parse(data_raw_val)

In [14]:
train_paarid = []
for lause in data_train:
    train_paarid.append({"lause": [(token["form"], token["upos"]) for token in lause]})
    
test_paarid = []
for lause in data_test:
    test_paarid.append({"lause": [(token["form"], token["upos"]) for token in lause]})
    
val_paarid = []
for lause in data_val:
    val_paarid.append({"lause": [(token["form"], token["upos"]) for token in lause]})

In [15]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
tags = list(set(token[1] for lause in train_paarid for token in lause["lause"]))
tag2idx = {tag:idx for idx, tag in enumerate(tags)}
idx2tag = {idx:tag for idx, tag in enumerate(tags)}

tokenizer = BertTokenizer(vocab_file = "vocab_final.txt", vocab_file_form = "vocab_form.txt", max_length = 128,
                         padding = "max_length", truncation = True, return_tensors = "pt", mask_token="ˇMASKˇ")

In [32]:
# Info andmestike kohta

from collections import Counter
tags_train = [y[1] for x in train_paarid for y in x["lause"]]
c = Counter(tags_train)
sorted(c.items())

[('ADJ', 28713),
 ('ADP', 7453),
 ('ADV', 33003),
 ('AUX', 17432),
 ('CCONJ', 12422),
 ('DET', 5398),
 ('INTJ', 251),
 ('NOUN', 91185),
 ('NUM', 7174),
 ('PRON', 18025),
 ('PROPN', 20927),
 ('PUNCT', 56377),
 ('SCONJ', 6752),
 ('SYM', 540),
 ('VERB', 38394),
 ('X', 857)]

In [33]:
sum([x[1] for x in sorted(c.items())])

344903

In [34]:
tags_val = [y[1] for x in val_paarid for y in x["lause"]]
c = Counter(tags_val)
sorted(c.items())

[('ADJ', 3992),
 ('ADP', 887),
 ('ADV', 4194),
 ('AUX', 2247),
 ('CCONJ', 1681),
 ('DET', 675),
 ('INTJ', 20),
 ('NOUN', 11655),
 ('NUM', 1080),
 ('PRON', 2469),
 ('PROPN', 2539),
 ('PUNCT', 7499),
 ('SCONJ', 859),
 ('SYM', 119),
 ('VERB', 4832),
 ('X', 61)]

In [35]:
sum([x[1] for x in sorted(c.items())])

44809

In [36]:
tags_test = [y[1] for x in test_paarid for y in x["lause"]]
c = Counter(tags_test)
sorted(c.items())

[('ADJ', 4070),
 ('ADP', 914),
 ('ADV', 4861),
 ('AUX', 2575),
 ('CCONJ', 2031),
 ('DET', 812),
 ('INTJ', 51),
 ('NOUN', 12616),
 ('NUM', 848),
 ('PRON', 2524),
 ('PROPN', 3061),
 ('PUNCT', 7674),
 ('SCONJ', 1116),
 ('SYM', 26),
 ('VERB', 5254),
 ('X', 100)]

In [37]:
sum([x[1] for x in sorted(c.items())])

48533

In [6]:
# Labelite allignimine
def tokeniseeri_lause_lisa_labelid(batch):
    INP, TTI, BIN, ATT, LAB = [], [], [], [], []
    for i, lause_paarid in enumerate(batch["lause"]):
        lause = [x[0] for x in lause_paarid]
        labelid_alg = [x[1] for x in lause_paarid]
        lause_sonade_tokenid = []
        for sona in lause:
            tokeniseeritud_sona = tokenizer(sona, estnltk_first_token = True)
            lause_sonade_tokenid.append(tokeniseeritud_sona["input_ids"][1:-1])

        tokeneid_sonadel = [len(x) for x in lause_sonade_tokenid]
        tokeniseeritud_lause = tokenizer(lause, is_split_into_words=True, max_length = 128,
                         padding = "max_length", truncation = True, return_tensors = "pt", estnltk_first_token = True)
        labelid = []
        i = 0
        mitu_id = False
        j = 0
        
        for input_id in tokeniseeritud_lause["input_ids"][0]:
            
            if mitu_id:
                labelid.append(-100)
                j -= 1
                if j == 0:
                    mitu_id = False
                continue
                
            if input_id[0].item() < 5:
                labelid.append(-100)
                continue
                
            labelid.append(tag2idx[labelid_alg[i]])
            
            if tokeneid_sonadel[i] > 1:
                j = tokeneid_sonadel[i] - 1
                mitu_id = True
                
            i += 1
            
        assert len(tokeniseeritud_lause["input_ids"][0]) == len(labelid)
        
        INP.append(tokeniseeritud_lause["input_ids"])
        TTI.append(tokeniseeritud_lause["token_type_ids"])
        BIN.append(tokeniseeritud_lause["binary_channels"])
        ATT.append(tokeniseeritud_lause["attention_mask"])
        LAB.append(torch.tensor(labelid))
    
    
    INP = torch.cat(INP)
    TTI = torch.cat(TTI)
    BIN = torch.cat(BIN)
    ATT = torch.cat(ATT)
    LAB = torch.stack(LAB)
    
    encodings = {
    "input_ids" : INP,
    "token_type_ids" : TTI,
    "binary_channels" : BIN,
    "attention_mask" : ATT,
    "labels" : LAB
    }
    
    return encodings

In [7]:
%%time
train_dataset = Dataset.from_list(train_paarid)
train_tokenized_dataset = train_dataset.map(tokeniseeri_lause_lisa_labelid, batched=True)

test_dataset = Dataset.from_list(test_paarid)
test_tokenized_dataset = test_dataset.map(tokeniseeri_lause_lisa_labelid, batched=True)

val_dataset = Dataset.from_list(val_paarid)
val_tokenized_dataset = val_dataset.map(tokeniseeri_lause_lisa_labelid, batched=True)



  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

CPU times: total: 6min 2s
Wall time: 6min 6s


In [72]:
model = BertForTokenClassification.from_pretrained("train_results/checkpoint-100000", num_labels=len(tag2idx))
model.to(device)

batch_size = 16

args = TrainingArguments(
    "POS_tag_results",
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=1
)

data_collator = DataCollatorForTokenClassification(tokenizer)


def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [[tags[p] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]
    true_labels = [[tags[l] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]
    
    results = classification_report(np.hstack(true_predictions).tolist(), np.hstack(true_labels).tolist(), output_dict=True)
    return {"precision": results['weighted avg']['precision'], "recall": results['weighted avg']['recall'], "f1": results['weighted avg']['f1-score'], "accuracy": results["accuracy"]}
    


trainer = Trainer(
    model,
    args,
    train_dataset=train_tokenized_dataset,
    eval_dataset=val_tokenized_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics = compute_metrics
)

trainer.train()

loading configuration file train_results_mudel4/checkpoint-200000\config.json
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "hidden_size_form": 48,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_10": 10,
    "LABEL_11": 11,
    "LABEL_12": 12,
    "LABEL_13": 13,
    "LABEL_14": 14,
    "LABEL_15": 15,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6,
    "

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,1.802881,0.571957,0.505303,0.532034,0.505303


The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: lause. If lause are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=63, training_loss=1.7916983952597967, metrics={'train_runtime': 916.1842, 'train_samples_per_second': 1.091, 'train_steps_per_second': 0.069, 'total_flos': 130664914944000.0, 'train_loss': 1.7916983952597967, 'epoch': 1.0})

In [73]:
predictions, labels, _ = trainer.predict(test_tokenized_dataset)
predictions = np.argmax(predictions, axis=2)

# Remove ignored index (special tokens)
true_predictions = [
    [tags[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [tags[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results = classification_report(np.hstack(true_predictions).tolist(), np.hstack(true_labels).tolist())
print(results)

The following columns in the test set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: lause. If lause are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 100
  Batch size = 16


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         ADJ       0.22      0.43      0.29        60
         ADP       0.00      0.00      0.00         0
         ADV       0.63      0.45      0.52       152
         AUX       0.69      0.72      0.70        60
       CCONJ       0.57      0.62      0.59        52
         DET       0.00      0.00      0.00         0
        INTJ       0.00      0.00      0.00         0
        NOUN       0.77      0.60      0.67       440
         NUM       0.04      1.00      0.08         1
        PRON       0.65      0.59      0.62        56
       PROPN       0.12      0.45      0.19        22
       PUNCT       0.76      0.59      0.66       173
       SCONJ       0.36      0.62      0.46        13
        VERB       0.64      0.63      0.64       126

    accuracy                           0.58      1155
   macro avg       0.39      0.48      0.39      1155
weighted avg       0.67      0.58      0.61      1155



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [97]:
%%time

### ESTBERT ###

tokenizer = AutoTokenizer.from_pretrained("tartuNLP/EstBERT", max_length = 128,
                         padding = "max_length", truncation = True, return_tensors = "pt")

def tokeniseeri_lause_lisa_labelid(batch):
    INP, TTI, ATT, LAB = [], [], [], []
    for i, lause_paarid in enumerate(batch["lause"]):
        lause = [x[0] for x in lause_paarid]
        labelid_alg = [x[1] for x in lause_paarid]
        lause_sonade_tokenid = []
        for sona in lause:
            tokeniseeritud_sona = tokenizer(sona)
            lause_sonade_tokenid.append(tokeniseeritud_sona["input_ids"][1:-1])

        tokeneid_sonadel = [len(x) for x in lause_sonade_tokenid]
        tokeniseeritud_lause = tokenizer(lause, is_split_into_words=True, max_length = 128,
                         padding = "max_length", truncation = True, return_tensors = "pt")
        labelid = []
        i = 0
        mitu_id = False
        j = 0

        for input_id in tokeniseeritud_lause["input_ids"][0]:

            if mitu_id:
                labelid.append(-100)
                j -= 1
                if j == 0:
                    mitu_id = False
                continue

            if input_id.item() < 5:
                labelid.append(-100)
                continue


            labelid.append(tag2idx[labelid_alg[i]])

            if tokeneid_sonadel[i] > 1:
                j = tokeneid_sonadel[i] - 1
                mitu_id = True

            i += 1

        assert len(tokeniseeritud_lause["input_ids"][0]) == len(labelid)

        INP.append(tokeniseeritud_lause["input_ids"])
        TTI.append(tokeniseeritud_lause["token_type_ids"])
        ATT.append(tokeniseeritud_lause["attention_mask"])
        LAB.append(torch.tensor(labelid))
    
    
    INP = torch.cat(INP)
    TTI = torch.cat(TTI)
    ATT = torch.cat(ATT)
    LAB = torch.stack(LAB)
    
    encodings = {
    "input_ids" : INP,
    "token_type_ids" : TTI,
    "attention_mask" : ATT,
    "labels" : LAB
    }
    
    return encodings

train_dataset = Dataset.from_list(train_paarid)
train_tokenized_dataset = train_dataset.map(tokeniseeri_lause_lisa_labelid, batched=True)

test_dataset = Dataset.from_list(test_paarid)
test_tokenized_dataset = test_dataset.map(tokeniseeri_lause_lisa_labelid, batched=True)

val_dataset = Dataset.from_list(val_paarid)
val_tokenized_dataset = val_dataset.map(tokeniseeri_lause_lisa_labelid, batched=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

CPU times: total: 2.09 s
Wall time: 2.29 s


In [102]:
model = AutoModelForTokenClassification.from_pretrained("tartuNLP/EstBERT", num_labels=len(tag2idx))
model.to(device)

batch_size = 16

args = TrainingArguments(
    "POS_tag_results_EST",
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=1
)

data_collator = DataCollatorForTokenClassification(tokenizer)

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [[tags[p] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]
    true_labels = [[tags[l] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]
    
    results = classification_report(np.hstack(true_predictions).tolist(), np.hstack(true_labels).tolist(), output_dict=True)
    return {"precision": results['weighted avg']['precision'], "recall": results['weighted avg']['recall'], "f1": results['weighted avg']['f1-score'], "accuracy": results["accuracy"]}
    


trainer = Trainer(
    model,
    args,
    train_dataset=train_tokenized_dataset,
    eval_dataset=val_tokenized_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics = compute_metrics
)

trainer.train()

Some weights of the model checkpoint at tartuNLP/EstBERT were not used when initializing BertForTokenClassification: ['cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at tartuNLP/EstBERT and are newly initializ

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.648088,0.894705,0.85615,0.871951,0.85615


The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: lause. If lause are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=63, training_loss=1.2351588900127108, metrics={'train_runtime': 836.1137, 'train_samples_per_second': 1.196, 'train_steps_per_second': 0.075, 'total_flos': 65332457472000.0, 'train_loss': 1.2351588900127108, 'epoch': 1.0})

In [103]:
predictions, labels, _ = trainer.predict(test_tokenized_dataset)
predictions = np.argmax(predictions, axis=2)

true_predictions = [
    [tags[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [tags[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results = classification_report(np.hstack(true_predictions).tolist(), np.hstack(true_labels).tolist())
print(results)

The following columns in the test set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: lause. If lause are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 100
  Batch size = 16


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         ADJ       0.64      0.81      0.71        95
         ADP       0.00      0.00      0.00         0
         ADV       0.77      0.77      0.77       111
         AUX       0.95      0.95      0.95        62
       CCONJ       0.97      0.92      0.94        61
         DET       0.00      0.00      0.00         0
        INTJ       0.00      0.00      0.00         0
        NOUN       0.94      0.83      0.88       402
         NUM       0.62      1.00      0.77        15
        PRON       0.88      0.82      0.85        55
       PROPN       0.82      0.80      0.81        93
       PUNCT       1.00      0.98      0.99       188
       SCONJ       0.68      0.94      0.79        16
        VERB       0.97      0.89      0.93       141

    accuracy                           0.86      1239
   macro avg       0.66      0.69      0.67      1239
weighted avg       0.90      0.86      0.88      1239



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
