In [153]:
import pandas as pd


class Preprocessor:
    def __init__(self, filename):
        self.row_isw_data = self.load_isw_tsv_file(filename)
        self.cleaned_isw_data = self.clean_isw_data()

    def load_isw_tsv_file(self, filename='data/test-full-isw-release.tsv'):
        isw_data = pd.read_csv(filename, quotechar='"',
                               delimiter="\t", skiprows=None)
        print("Total number of rows", len(isw_data))
        print("Total number of sentences", len(isw_data.fileid.unique()))
        return isw_data

    def clean_isw_data(self, selected_cols=[]):
        """
        :return: clean isw_data
        """
        # Keep only selected cols
        selected_cols = ['fileid', 'token', 'lemma', 'ontoNer']
        isw_set = self.row_isw_data[selected_cols]

        # Clean up incorrect rows  e.g. fileid -> total 82 of it
        isw_set = isw_set[isw_set.fileid != "fileid"]

        # Drop empty token
        isw_drop_non = isw_set[isw_set.lemma != "NONE"]
        isw_drop_non.reset_index(drop=True, inplace=True)

        # Replace NONE tag with "O"
        isw_drop_non['ontoNer'].replace(
            to_replace='NONE', value='O', inplace=True)
        return isw_drop_non

    def get_list_of_sentences(self):
        """
        return : list of sentences : ['I have apple', 'I am here', 'hello ']
        """
        data = self.cleaned_isw_data
        # Group the sentence with its fileid
        agg_func = lambda s: [(token, lem, ner) for token, lem, ner in zip(s["token"].values.tolist(),
                                                    s["lemma"].values.tolist(),
                                                    s["ontoNer"].values.tolist())]
        grouped = data.groupby("fileid").apply(agg_func)
        grouped_all = [s for s in grouped]

        sentences = [" ".join([s[0] for s in sent]) for sent in grouped_all]
        return sentences

    def get_list_of_nerlabels(self):
        """
        return : list of labels : ['O', 'O', 'B-GPE', ...]
        """
        data = self.cleaned_isw_data
        # Group the sentence with its fileid
        agg_func = lambda s: [(token, lem, ner) for token, lem, ner in zip(s["token"].values.tolist(),
                                                    s["lemma"].values.tolist(),
                                                    s["ontoNer"].values.tolist())]
        grouped = data.groupby("fileid").apply(agg_func)
        grouped_all = [s for s in grouped]

        labels = [[s[2] for s in label] for label in grouped_all]

        return labels

    def get_tag2idx(self):
        """
        return : dict of ner label with idx : {'B-ADD': 0, 'B-AGE': 1, 'B-ART': 2, 'B-CARDINAL': 3,'B-CREAT': 4, ...}
        """
        data = self.cleaned_isw_data
        # ners_vals : list of ner labels
        ners_vals = list(set(data["ontoNer"].values))
        # Set as dict {key:idx}
        tag2idx = {t: i for i, t in enumerate(sorted(ners_vals))}
        return tag2idx





# filename = 'data/test-full-isw-release.tsv'
# pre = Preprocessor(filename)

# sentences = pre.get_list_of_sentences()
# labels = pre.get_list_of_nerlabels()

# print(labels[0])


In [155]:
import os
import torch
import numpy as np
import pandas as pd
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

from seqeval.metrics import classification_report
from sklearn.metrics import confusion_matrix
from tqdm import tqdm, trange


from transformers import BertTokenizer, BertForTokenClassification



def get_hyperparameters(model, ff):

    # ff: full_finetuning
    if ff:
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "gamma", "beta"]
        optimizer_grouped_parameters = [
            {
                "params": [
                    p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
                ],
                "weight_decay_rate": 0.01,
            },
            {
                "params": [
                    p for n, p in param_optimizer if any(nd in n for nd in no_decay)
                ],
                "weight_decay_rate": 0.0,
            },
        ]
    else:
        param_optimizer = list(model.classifier.named_parameters())
        optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

    return optimizer_grouped_parameters

def flat_accuracy(preds, labels):
    return np.sum(np.array(preds)==np.array(labels))/len(labels)

def annot_confusion_matrix(valid_tags, pred_tags):

    """
    Create an annotated confusion matrix by adding label
    annotations and formatting to sklearn's `confusion_matrix`.
    """

    # Create header from unique tags
    header = sorted(list(set(valid_tags + pred_tags)))

    # Calculate the actual confusion matrix
    matrix = confusion_matrix(valid_tags, pred_tags, labels=header)

    # Final formatting touches for the string output
    mat_formatted = [header[i] + "\t" + str(row) for i, row in enumerate(matrix)]
    content = "\t" + " ".join(header) + "\n" + "\n".join(mat_formatted)

    return content



## Check if it works ...the training process

In [224]:
# Constants
FILE_NAME = "data/test-full-isw-release.tsv"
BERT_MODEL = "bert-base-german-cased"
MAX_LEN = 75
BATCH_SIZE = 32
FULL_FINETUNING = True

In [211]:
# Create directory for storing our model checkpoints
# if not os.path.exists("/models"):
#     os.mkdir("/models")

In [160]:
# Specify device data for training
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Devices available: {}".format(device))

Devices available: cpu


In [161]:
# Initialize PRETRAINED TOKENIZER
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL, do_lower_case=False)


In [162]:
# Load preprocessed sentences, labels and tag2idx
pre = Preprocessor(filename=FILE_NAME)

sentences = pre.get_list_of_sentences()
labels = pre.get_list_of_nerlabels()
# Create dicts for mapping from labels to IDs and back
tag2idx = pre.get_tag2idx()
idx2tag = {i: t for t, i in tag2idx.items()}

Total number of rows 300684
Total number of sentences 84


In [169]:
print(len(labels))
print(len(sentences))
print(labels[0])
print(sentences[0])

83
83
['O', 'O', 'O', 'O', 'O', 'B-TIME', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DUR', 'O', 'B-GPE', 'O', 'O', 'O', 'O', 'B-SORD', 'I-SORD', 'O', 'O', 'O', 'O', 'O', 'O', 'B-GPE', 'O', 'B-TIME', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-FREQ', 'O', 'O', 'O', 'O', 'O', 'B-AGE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-TIME', 'O', 'O', 'B-AGE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-TIME', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'

In [171]:
print('number of NER tags:', len(tag2idx))

number of NER tags: 60


In [225]:
# Tokenize the sentences
tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]

# Get input id of token
# mapped fixed sentence size
# if len of sent less than max len -> add word with 0 index to match the max len (padding)
# if one word is exisiting with several blocks, it will be assigned with same index
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                          maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

# Map tags of labels with input_ids
# Get tags of labels with id
tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels],
                     maxlen=MAX_LEN, value=tag2idx["O"], padding="post",
                     dtype="long", truncating="post")

In [226]:
tokenized_texts[0]

['Also',
 'Sie',
 'haben',
 'uns',
 'ja',
 'jetzt',
 'schon',
 'sehr',
 'viel',
 'Interess',
 '##antes',
 'erzählt',
 'und',
 'von',
 'Ihrer',
 'Jugend',
 '##zeit',
 'in',
 'Wien',
 'könnten',
 'Sie',
 'uns',
 'da',
 'noch',
 'einmal',
 'ihre',
 'Ein',
 '##drücke',
 'vermitteln',
 'Meine',
 'Ein',
 '##drücke',
 'von',
 'Wien',
 'von',
 'damals',
 'Also',
 'ich',
 'ich',
 'glaub',
 'ich',
 'hatte',
 'nur',
 'gute',
 'ehr',
 '##lich',
 'gestanden',
 'Wor',
 '##auf',
 'führ',
 'ich',
 'das',
 'zurück',
 'Ich',
 'kann',
 'mich',
 'erinnern',
 'dass',
 'man',
 'mich',
 'gefragt',
 'hat',
 'Na',
 'was',
 'will',
 '##st',
 'denn',
 'werden',
 'Das',
 'ist',
 'doch',
 'eine',
 'sehr',
 'aktuelle',
 'Frage',
 'immer',
 'für',
 'Kinder',
 'und',
 'ich',
 'war',
 'zwölf',
 'und',
 'ich',
 'sehe',
 'mich',
 'bei',
 'meiner',
 'Tante',
 'die',
 'das',
 'eben',
 'auch',
 'fragt',
 'und',
 'ich',
 'sa',
 '##g',
 'Jetzt',
 'bin',
 'ich',
 'zwölf',
 'eigentlich',
 'wür',
 '##d',
 'ich',
 'ganz',
 'gern

In [227]:
input_ids.shape


(83, 75)

In [228]:
tags

array([[59, 59, 59, ..., 59, 59,  1],
       [59, 59, 59, ..., 59, 59, 59],
       [59, 59, 59, ..., 59, 59, 59],
       ...,
       [59, 59, 59, ..., 59, 59,  8],
       [59, 12, 59, ..., 59, 59,  2],
       [59, 59, 59, ..., 59, 59, 59]])

In [229]:
# Get attention mask from BERT label
attention_masks = [[float(i>0) for i in ii] for ii in input_ids]

# Split the dataset for 10% validation
tr_inputs, val_inputs, tr_tags, val_tags = train_test_split(input_ids, tags, 
                                                        random_state=42, test_size=0.1)
tr_masks, val_masks, _, _ = train_test_split(attention_masks, input_ids,
                                                        random_state=42, test_size=0.1)

In [230]:
attention_masks[0][5990:6000]

[]

In [231]:
attention_masks[0][2000:2010]

[]

In [232]:
# Convert dataset to torch tensors
tr_inputs = torch.tensor(tr_inputs)
val_inputs = torch.tensor(val_inputs)
tr_tags = torch.tensor(tr_tags)
val_tags = torch.tensor(val_tags)
tr_masks = torch.tensor(tr_masks)
val_masks = torch.tensor(val_masks)

# Load and prepare data, define dataloaders
# Concatenate attention mask and inputs/tags
train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE)

valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=BATCH_SIZE)
print("Loaded training and validation data into DataLoaders.")

Loaded training and validation data into DataLoaders.


In [233]:
for step, batch in enumerate(train_dataloader):
    print(step)
    print("")
    print(batch)

0

[tensor([[  188,   320,    18,  ...,   717,   188,  5956],
        [ 6405,    88, 12737,  ...,  3277,   834,  1169],
        [ 6405,    88,  7003,  ...,   456,   188, 24999],
        ...,
        [26718,   371,  1120,  ..., 20023,  1671, 14154],
        [   32,   437,   295,  ...,  4537, 19009,   153],
        [  125,  1062, 26905,  ..., 14335,   188,   181]]), tensor([[1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        ...,
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.]]), tensor([[28, 59, 59,  ..., 28, 59, 59],
        [59, 59, 21,  ..., 35, 59, 59],
        [59, 59, 21,  ..., 59, 59, 59],
        ...,
        [59, 59, 59,  ..., 59, 59, 59],
        [59, 59, 59,  ..., 59, 59, 59],
        [59, 59, 59,  ..., 59, 59, 59]])]
1

[tensor([[13086,  4299,   229,  ...,  1169, 19009,  1671],
        [ 5072, 19009,  1169,  ...,    50,    21,  1786],
   

In [234]:
# Initialize model
model = BertForTokenClassification.from_pretrained(BERT_MODEL, num_labels=len(tag2idx))
model.to(device)

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [235]:
# Set hyperparameters (optimizer, weight decay, learning rate)
optimizer_grouped_parameters = get_hyperparameters(model, FULL_FINETUNING)
optimizer = Adam(optimizer_grouped_parameters, lr=3e-5)
print("Initialized optimizer and set hyperparameters.")

Initialized optimizer and set hyperparameters.


In [236]:
# Start fine-tuning model, set epochs and max_grad_norm
epochs = 4
max_grad_norm = 1.0

print("Starting training loop.")
epoch = 0
for _ in trange(epochs, desc="Epoch"):
    epoch += 1

    model.train()
    tr_loss, tr_accuracy = 0, 0
    nb_tr_examples, nb_tr_steps = 0, 0
    tr_preds, tr_labels = [], []

    for step, batch in enumerate(train_dataloader):
        # add batch to gpus
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        # Forward pass
        outputs = model(
            b_input_ids,
            token_type_ids=None,
            attention_mask=b_input_mask,
            labels=b_labels,
        )
        loss, tr_logits = outputs[:2]

        # Backward pass
        loss.backward()

        # Compute train loss
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=max_grad_norm
        )

        # Update parameters
        optimizer.step()
        model.zero_grad()

    # print train loss per epoch
    print("Train loss: {}".format(tr_loss/nb_tr_steps))


    # Validation loop
    print("Starting validation loop.")

    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    predictions, true_labels = [], []

    for batch in valid_dataloader:

        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        with torch.no_grad():
            outputs = model(
                b_input_ids,
                token_type_ids=None,
                attention_mask=b_input_mask,
                labels=b_labels,
            )
            tmp_eval_loss, logits = outputs[:2]

        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        true_labels.append(label_ids)

        tmp_eval_accuracy = flat_accuracy(logits, label_ids)

        eval_loss += tmp_eval_loss.mean().item()
        eval_accuracy += tmp_eval_accuracy

        nb_eval_examples += b_input_ids.size(0)
        nb_eval_steps += 1


    # Evakuate loss, acc, conf. matrix and report on dev set.
    pred_tags = [idx2tag[p_i] for p in predictions for p_i in p]
    valid_tags = [idx2tag[l_li] for l in true_labels[0] for l_li in l]
    cl_report = classification_report(valid_tags, pred_tags)
    conf_mat = annot_confusion_matrix(valid_tags, pred_tags)
    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_steps

    # Report metrics
    print("Validation loss: {}".format(eval_loss))
    print("Validation Accuracy: {}".format(eval_accuracy))
    print("Classification Report:\n {}".format(cl_report))
    print("Confusion Matrix:\n {}".format(conf_mat))
    print("F1-Score: {}".format(flat_accuracy(pred_tags, valid_tags)))








Epoch:   0%|          | 0/4 [00:00<?, ?it/s][A[A[A[A[A[A[A

Starting training loop.
Train loss: 2.7238752047220864
Starting validation loop.









Epoch:  25%|██▌       | 1/4 [00:24<01:12, 24.11s/it][A[A[A[A[A[A[A

Validation loss: 1.0420911312103271
Validation Accuracy: 0.0
Classification Report:
            precision    recall  f1-score   support

      NRP       0.00      0.00      0.00         3
      GPE       0.00      0.00      0.00        30
      DUR       0.00      0.00      0.00         8
      LOC       0.00      0.00      0.00         1
     TIME       0.00      0.00      0.00        12
      AGE       0.00      0.00      0.00         3
     DATE       0.00      0.00      0.00         8
      MON       0.00      0.00      0.00         1
     FREQ       0.00      0.00      0.00         2
      LAN       0.00      0.00      0.00         4
      PER       0.00      0.00      0.00         1
     SORD       0.00      0.00      0.00         1

micro avg       0.00      0.00      0.00        74
macro avg       0.00      0.00      0.00        74

Confusion Matrix:
 	B-AGE B-DATE B-DUR B-FREQ B-GPE B-LAN B-LOC B-MON B-NRP B-PER B-SORD B-TIME I-AGE I-DATE I-DUR I-FREQ I-GPE I-MON I-SORD O
B-AG








Epoch:  50%|█████     | 2/4 [00:51<00:50, 25.23s/it][A[A[A[A[A[A[A

Validation loss: 0.8073590993881226
Validation Accuracy: 0.0
Classification Report:
            precision    recall  f1-score   support

      NRP       0.00      0.00      0.00         3
      GPE       0.00      0.00      0.00        30
      DUR       0.00      0.00      0.00         8
      LOC       0.00      0.00      0.00         1
     TIME       0.00      0.00      0.00        12
      AGE       0.00      0.00      0.00         3
     DATE       0.00      0.00      0.00         8
      MON       0.00      0.00      0.00         1
     FREQ       0.00      0.00      0.00         2
      LAN       0.00      0.00      0.00         4
      PER       0.00      0.00      0.00         1
     SORD       0.00      0.00      0.00         1

micro avg       0.00      0.00      0.00        74
macro avg       0.00      0.00      0.00        74

Confusion Matrix:
 	B-AGE B-DATE B-DUR B-FREQ B-GPE B-LAN B-LOC B-MON B-NRP B-PER B-SORD B-TIME I-AGE I-DATE I-DUR I-FREQ I-GPE I-MON I-SORD O
B-AG








Epoch:  75%|███████▌  | 3/4 [01:22<00:26, 26.87s/it][A[A[A[A[A[A[A

Validation loss: 0.7267888784408569
Validation Accuracy: 0.0
Classification Report:
            precision    recall  f1-score   support

      NRP       0.00      0.00      0.00         3
      GPE       0.00      0.00      0.00        30
      DUR       0.00      0.00      0.00         8
      LOC       0.00      0.00      0.00         1
     TIME       0.00      0.00      0.00        12
      AGE       0.00      0.00      0.00         3
     DATE       0.00      0.00      0.00         8
      MON       0.00      0.00      0.00         1
     FREQ       0.00      0.00      0.00         2
      LAN       0.00      0.00      0.00         4
      PER       0.00      0.00      0.00         1
     SORD       0.00      0.00      0.00         1

micro avg       0.00      0.00      0.00        74
macro avg       0.00      0.00      0.00        74

Confusion Matrix:
 	B-AGE B-DATE B-DUR B-FREQ B-GPE B-LAN B-LOC B-MON B-NRP B-PER B-SORD B-TIME I-AGE I-DATE I-DUR I-FREQ I-GPE I-MON I-SORD O
B-AG








Epoch: 100%|██████████| 4/4 [01:49<00:00, 27.39s/it][A[A[A[A[A[A[A

Validation loss: 0.7235440015792847
Validation Accuracy: 0.0
Classification Report:
            precision    recall  f1-score   support

      NRP       0.00      0.00      0.00         3
      GPE       0.00      0.00      0.00        30
      DUR       0.00      0.00      0.00         8
      LOC       0.00      0.00      0.00         1
     TIME       0.00      0.00      0.00        12
      AGE       0.00      0.00      0.00         3
     DATE       0.00      0.00      0.00         8
      MON       0.00      0.00      0.00         1
     FREQ       0.00      0.00      0.00         2
      LAN       0.00      0.00      0.00         4
      PER       0.00      0.00      0.00         1
     SORD       0.00      0.00      0.00         1

micro avg       0.00      0.00      0.00        74
macro avg       0.00      0.00      0.00        74

Confusion Matrix:
 	B-AGE B-DATE B-DUR B-FREQ B-GPE B-LAN B-LOC B-MON B-NRP B-PER B-SORD B-TIME I-AGE I-DATE I-DUR I-FREQ I-GPE I-MON I-SORD O
B-AG


