In [38]:
import os
import torch
import re
from functools import partial

import pandas as pd
import numpy as np
import torch.nn as nn
from torch.optim import SGD
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import BertModel, BertTokenizerFast, BertForTokenClassification

from sklearn.model_selection import train_test_split

from transformers import logging
logging.set_verbosity_error()

In [39]:
DATASET_PATH = 'dataframe.csv'
df = pd.read_csv(DATASET_PATH, sep=";", encoding="utf-8")
df = df.where(pd.notnull(df), '')
df.head(5)

Unnamed: 0,text,to,from,moment
0,Depuis La chaize-le-vicomte à La roche-sur-foron,La roche-sur-foron,La chaize-le-vicomte,
1,Itiniréraire jusqu'a Giromagny depuis Quimper,Giromagny,Quimper,
2,Comment aller de Boigneville à Longjumeau mardi,Longjumeau,Boigneville,mardi
3,Je suis actuellement à Villers-sur-mer et j’ai...,Ferrières-en-bray,Villers-sur-mer,
4,Donne moi l'itinéraire pour aller à Fontenay-l...,Fontenay-le-fleury,Lizy-sur-ourcq,


In [40]:
def text_tokenizer(sentence: str) -> str:
    """Normalise a sentence into single-space-separated tokens.

    Every character that is neither a word character nor whitespace is
    isolated with a space on each side, runs of whitespace are collapsed to
    one space, and leading/trailing whitespace is removed.
    """
    isolated = re.sub(r'([^\w\s])', r' \1 ', sentence)
    single_spaced = re.sub(r'\s+', ' ', isolated)
    return single_spaced.strip()

df['tokenized_sentence'] = df.apply(lambda x: text_tokenizer(x['text']), axis=1)

In [41]:
def label_write(row: pd.Series, key: str, role: str) -> dict:
    """Build the BIO tags for every value stored under ``row[key]``.

    The cell may hold several values separated by ``;``. Each value is mapped
    to a list holding one tag per token produced by ``text_tokenizer``:
    ``B-<role>`` for the first token and ``I-<role>`` for the following ones.

    Returns an empty dict when ``key`` is absent from ``row``.
    """
    if key not in row:
        return {}

    # str.split returns a one-item list when no ';' is present, so a single
    # comprehension covers both the single-value and the multi-value case.
    return {
        value: [
            f'B-{role}' if position == 0 else f'I-{role}'
            for position, _ in enumerate(text_tokenizer(value).split())
        ]
        for value in row[key].split(';')
    }

def _tokenize_to_label(row: pd.Series) -> str:
    row_to_check = {
        # VC Funding
        "to": lambda k: label_write(row, k, 'TO'),
        "from": lambda k: label_write(row, k, 'FROM'),
        "moment": lambda k: label_write(row, k, 'MOMENT'),
    }

    tokenize = row['tokenized_sentence']

    for col, function in row_to_check.items():
        tokenize_label = function(col)
        for key, value in tokenize_label.items():
            key_tokenized = text_tokenizer(key).strip()
            if key_tokenized in tokenize and not key_tokenized.isspace():
                regex_pattern = r"(?P<start>^|\W|\b)(?P<word>{})(?P<end>\W|\b|$)".format(re.escape(key_tokenized))
                toknized_value = " ".join(value)
                regex = re.compile(regex_pattern)
                tokenize = regex.sub(
                    lambda m: f"{m.group('start')}{toknized_value}{m.group('end')}"
                    if m.group('start') or m.group('end')
                    else f"{toknized_value}",
                    tokenize
                )
            elif not key_tokenized.isspace():
                print(f'Label not found in tokenize: {key_tokenized} - {row["text"]}')
                row['KO'] = 'KO'

    # Replace all characters who don't start with "B-" or "I-" in "o"
    tokenize = " ".join(
        ['o' if not (word.startswith('B-') or word.startswith('I-'))
         else word for word in tokenize.split()]
    )

    return tokenize

df['annotations'] = df.apply(lambda x: _tokenize_to_label(x), axis=1)

In [42]:
train_ds, val_ds = train_test_split(df, train_size=0.8, random_state=42)

# assign() returns new frames, so the tag column is never written onto a
# slice of `df` (direct item assignment here can raise SettingWithCopyWarning).
train_ds = train_ds.assign(corpus_type='TRAIN')
val_ds = val_ds.assign(corpus_type='VAL')

df = pd.concat([train_ds, val_ds])

# About the Dataset

In [43]:
# One list of tags per sentence, obtained by splitting each annotation on whitespace
labels = [annotation.split() for annotation in df['annotations'].values.tolist()]

# Distinct tags used anywhere in the dataset
unique_labels = {tag for sentence_tags in labels for tag in sentence_tags}

unique_labels


{'B-FROM', 'B-MOMENT', 'B-TO', 'I-FROM', 'I-MOMENT', 'I-TO', 'o'}

In [44]:
# Ids follow the alphabetical order of the tags so the mapping is stable across runs
ordered_labels = sorted(unique_labels)
labels_to_ids = {label: index for index, label in enumerate(ordered_labels)}
ids_to_labels = dict(enumerate(ordered_labels))

# save in json
# NOTE: JSON object keys are always strings, so the integer ids of
# ids_to_labels come back as str when ids_to_labels.json is reloaded.
import json
for filename, mapping in (('labels_to_ids.json', labels_to_ids),
                          ('ids_to_labels.json', ids_to_labels)):
    with open(filename, 'w') as fp:
        json.dump(mapping, fp)

labels_to_ids


{'B-FROM': 0,
 'B-MOMENT': 1,
 'B-TO': 2,
 'I-FROM': 3,
 'I-MOMENT': 4,
 'I-TO': 5,
 'o': 6}

# Data Preprocessing
Before we are able to use a BERT model to classify the entity of a token, of course, we need to do data preprocessing first, which includes two parts: tokenization and adjusting the label to match the tokenization. Let’s start with tokenization first.

## Tokenization
Tokenization can be easily implemented with BERT, as we can use BertTokenizerFast class from a pretrained BERT base model with HuggingFace.
To give you an example how BERT tokenizer works, let’s take a look at one of the texts from our dataset:

In [45]:
# Let's take a look at how we can preprocess the text - take the example at index 36
text = df['text'].values.tolist()
example = text[36]

example


'Je cherche le train le plus rapide pour faire Vireux-molhain Gargenville'

In [46]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')
text_tokenized = tokenizer(example, padding='max_length', max_length=128, truncation=True, return_tensors="pt")

text_tokenized

{'input_ids': tensor([[  101, 27901, 22572,  1200,  4386,  5837,  2669,  5837,  4882,  6099,
          1162, 11480,  4652,  1162,   159,  5817,  5025,   118,   182,  4063,
         10390,  1179,   144,  1813,  4915,  2138,   102,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,  

In [47]:
tokenizer.decode(text_tokenized.input_ids[0])


'[CLS] Je cherche le train le plus rapide pour faire Vireux - molhain Gargenville [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

## Adjusting Label After Tokenization
This is a very important step that we need to do after the tokenization process. This is because the length of the sequence is no longer matching the length of the original label after the tokenization process.

The BERT tokenizer uses the so-called word-piece tokenizer under the hood, which is a sub-word tokenizer. This means that the BERT tokenizer will likely split one word into one or more meaningful sub-words.

In [48]:
tokenizer.convert_ids_to_tokens(text_tokenized["input_ids"][0])


['[CLS]',
 'Je',
 'ch',
 '##er',
 '##che',
 'le',
 'train',
 'le',
 'plus',
 'rapid',
 '##e',
 'pour',
 'fair',
 '##e',
 'V',
 '##ire',
 '##ux',
 '-',
 'm',
 '##ol',
 '##hai',
 '##n',
 'G',
 '##ar',
 '##gen',
 '##ville',
 '[SEP]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',

There are two problems that we need to address after tokenization process:

The addition of special tokens from BERT such as [CLS], [SEP], and [PAD]
The fact that some tokens are split into sub-words.
As a sub-word tokenizer, word-piece tokenization splits uncommon words into their sub-words, such as ‘Vireux’ and ‘Gargenville’ in the example above. This sub-word tokenization helps the BERT model to learn the semantic meaning of related words.

The consequence of this word piece tokenization and the addition of special tokens from BERT is that the sequence length after tokenization is no longer matching the length of the initial label.

From the example above, there are now 128 tokens in total in the sequence after tokenization, while the length of the label is still the same as before. Also, the first token in the sequence is no longer the word ‘Je’, but the newly added [CLS] token, so we need to shift our label as well.

To solve this problem, we need to adjust the label such that it has the same length as the sequence after tokenization. To do this, we can utilize the word_ids method from the tokenization result as follows:

In [49]:
word_ids = text_tokenized.word_ids()
word_ids

[None,
 0,
 1,
 1,
 1,
 2,
 3,
 4,
 5,
 6,
 6,
 7,
 8,
 8,
 9,
 9,
 9,
 10,
 11,
 11,
 11,
 11,
 12,
 12,
 12,
 12,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

As you can see from the code snippet above, all sub-words of a split word share the same word_ids, whereas special tokens from BERT such as [CLS], [SEP], and [PAD] do not have a specific word_ids value (they are None).

These word_ids will be very useful to adjust the length of the label by applying either of these two methods:

We only provide a label to the first sub-word of each split word. The remaining sub-words then simply get ‘-100’ as a label. All tokens that don’t have word_ids will also be labeled with ‘-100’.
We provide the same label to all of the sub-words that belong to the same word. All tokens that don’t have word_ids will be labeled with ‘-100’.
The function in the code snippet below will do exactly the step defined above.



In [50]:
def align_label_example(tokenized_input, labels, label_all_tokens):
    """Align word-level labels with the sub-word tokens of ``tokenized_input``.

    Args:
        tokenized_input: tokenizer output exposing ``word_ids()``.
        labels: list of tag strings, one per word.
        label_all_tokens: when truthy, every sub-word of a word receives the
            word's label id; otherwise only the first sub-word does and the
            continuation sub-words receive -100.

    Returns:
        A list with one entry per token: the label id from ``labels_to_ids``,
        or -100 for tokens without a word id, for unlabeled continuation
        sub-words, and for words that have no usable label.
    """
    label_ids = []
    previous_word_idx = None

    for word_idx in tokenized_input.word_ids():
        is_first_piece = word_idx != previous_word_idx
        previous_word_idx = word_idx

        if word_idx is None or not (is_first_piece or label_all_tokens):
            label_ids.append(-100)
            continue

        # A word beyond the end of `labels` or carrying an unknown tag is
        # masked out. Both the first-piece and the continuation case go through
        # this lookup, so label_all_tokens=True no longer raises IndexError.
        try:
            label_ids.append(labels_to_ids[labels[word_idx]])
        except (IndexError, KeyError):
            label_ids.append(-100)

    return label_ids


If you want to apply the first method, set label_all_tokens to False. If you want to apply the second method, set label_all_tokens to True, as you can see in the following code snippet:


In [51]:
label = labels[36]

#If we set label_all_tokens to True.....
label_all_tokens = True

new_label = align_label_example(text_tokenized, label, label_all_tokens)
new_label

[-100,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 0,
 0,
 0,
 3,
 3,
 3,
 3,
 3,
 2,
 2,
 2,
 2,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100]

In [52]:
# If we set label_all_tokens to False, only the first sub-word of each word keeps its label.
# NOTE: this module-level flag is also read later by align_label() and
# align_word_ids(), so when the cells run in order the value left here (False)
# is the one those functions use.
label_all_tokens = False

new_label = align_label_example(text_tokenized, label, label_all_tokens)
new_label


[-100,
 6,
 6,
 -100,
 -100,
 6,
 6,
 6,
 6,
 6,
 -100,
 6,
 6,
 -100,
 0,
 -100,
 -100,
 3,
 3,
 -100,
 -100,
 -100,
 2,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100]

# Dataset Class
Before we train our BERT model for NER task, we need to create a dataset class to generate and fetch data in a batch.

In [53]:
def align_label(texts, labels):
    """Tokenize ``texts`` and align the word-level ``labels`` with its tokens.

    The text is tokenized with the module-level ``tokenizer`` (padded and
    truncated to 128 tokens). The module-level ``label_all_tokens`` flag
    decides whether continuation sub-words receive the word's label id
    (truthy) or -100 (falsy).

    Args:
        texts: the raw sentence to tokenize.
        labels: list of tag strings, one per word.

    Returns:
        A list with one entry per token: the label id from ``labels_to_ids``,
        or -100 for tokens without a word id, for unlabeled continuation
        sub-words, and for words that have no usable label.
    """
    tokenized_inputs = tokenizer(texts, padding='max_length', max_length=128, truncation=True)

    label_ids = []
    previous_word_idx = None

    for word_idx in tokenized_inputs.word_ids():
        is_first_piece = word_idx != previous_word_idx
        previous_word_idx = word_idx

        if word_idx is None or not (is_first_piece or label_all_tokens):
            label_ids.append(-100)
            continue

        # Only a missing or unknown label is masked out. Any other error (for
        # instance an undefined global) now surfaces instead of being silently
        # turned into -100 by a bare `except`.
        try:
            label_ids.append(labels_to_ids[labels[word_idx]])
        except (IndexError, KeyError):
            label_ids.append(-100)

    return label_ids

In [54]:
class DataSequence(torch.utils.data.Dataset):
    """Dataset pairing each tokenized sentence with its aligned label ids.

    Built from a DataFrame with a ``text`` column (raw sentences) and an
    ``annotations`` column (space-separated tags). Sentences are tokenized
    with the module-level ``tokenizer`` and labels are aligned with
    ``align_label``.
    """

    def __init__(self, df):
        raw_texts = df['text'].values.tolist()
        tag_lists = [annotation.split() for annotation in df['annotations'].values.tolist()]

        self.texts = [
            tokenizer(str(raw), padding='max_length', max_length=128, truncation=True, return_tensors="pt")
            for raw in raw_texts
        ]
        self.labels = [align_label(raw, tags) for raw, tags in zip(raw_texts, tag_lists)]
        print(len(self.labels))

    def __len__(self):
        """Number of examples."""
        return len(self.labels)

    def get_batch_data(self, idx):
        """Return the tokenizer output stored for example ``idx``."""
        return self.texts[idx]

    def get_batch_labels(self, idx):
        """Return the aligned label ids of example ``idx`` as a LongTensor."""
        return torch.LongTensor(self.labels[idx])

    def __getitem__(self, idx):
        """Return the ``(tokenized_text, label_tensor)`` pair for ``idx``."""
        return self.get_batch_data(idx), self.get_batch_labels(idx)


In the code snippet above, we call BertTokenizerFast class with tokenizer variable in the __init__ function to tokenize our input texts, and align_label function to adjust our label after tokenization process.

Next, let’s retrieve the training and validation subsets that were tagged earlier through the corpus_type column. Keep in mind that the dataset only contains 2000 examples in total.

In [55]:
df_train = df[df['corpus_type'] == 'TRAIN']
df_val = df[df['corpus_type'] == 'VAL']

print(f'Train: {len(df_train)}')
print(f'Val: {len(df_val)}')

Train: 1600
Val: 400


# Model Building

In [56]:
class BertNerModel(torch.nn.Module):
    """Thin wrapper around ``BertForTokenClassification`` for the NER task.

    The classification head has one output per entry of the module-level
    ``unique_labels`` set.
    """

    def __init__(self):
        super().__init__()
        self.bert = BertForTokenClassification.from_pretrained('bert-base-cased', num_labels=len(unique_labels))

    def forward(self, input_id, mask, labels=None):
        """Run the wrapped model and return its tuple output.

        Args:
            input_id: token ids, passed as ``input_ids``.
            mask: attention mask, passed as ``attention_mask``.
            labels: optional label ids. Defaults to None so that inference
                callers do not have to pass a placeholder.

        Returns:
            The tuple produced by the wrapped model with ``return_dict=False``.
            Callers in this notebook unpack it as ``(loss, logits)`` when
            labels are given and read the logits at index 0 when they are not.
        """
        return self.bert(input_ids=input_id, attention_mask=mask, return_dict=False, labels=labels)

# Training Loop

In [61]:
def _sum_sample_accuracy(logits, labels):
    """Sum, over the batch, each sample's token accuracy (positions labelled -100 are ignored)."""
    total = 0.0
    for sample_logits, sample_labels in zip(logits, labels):
        keep = sample_labels != -100
        predictions = sample_logits[keep].argmax(dim=1)
        total += (predictions == sample_labels[keep]).float().mean().item()
    return total


def train(df_train, df_val, epochs=None, batch_size=8, learning_rate=5e-3):
    """Fine-tune a ``BertNerModel`` and report per-epoch loss and accuracy.

    Args:
        df_train: training data, wrapped in ``DataSequence``.
        df_val: validation data, wrapped in ``DataSequence``.
        epochs: number of epochs; when None the module-level ``EPOCHS``
            constant is used (the original behaviour).
        batch_size: batch size of both data loaders.
        learning_rate: initial learning rate of the SGD optimizer.

    Returns:
        A dict with the trained ``model`` and the last epoch's
        ``train_accuracy``, ``train_loss``, ``val_accuracy`` and ``val_loss``.
        Accuracy is the mean per-sample token accuracy; loss is the batch loss
        weighted by batch size and divided by the number of rows. The metrics
        are NaN when no epoch was run.
    """
    n_epochs = EPOCHS if epochs is None else epochs

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    model = BertNerModel().to(device)

    train_dataset = DataSequence(df_train)
    val_dataset = DataSequence(df_val)

    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

    optimizer = SGD(model.parameters(), lr=learning_rate, momentum=0.9)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)

    # Defined up front so that running zero epochs does not raise UnboundLocalError.
    train_accuracy = train_loss = val_accuracy = val_loss = float('nan')

    for epoch_num in range(n_epochs):

        total_acc_train = 0
        total_loss_train = 0

        model.train()

        for train_data, train_label in tqdm(train_dataloader):

            train_label = train_label.to(device)
            # The tokenizer output has shape (1, seq) per sample; drop that axis after batching.
            mask = train_data['attention_mask'].squeeze(1).to(device)
            input_id = train_data['input_ids'].squeeze(1).to(device)

            optimizer.zero_grad()
            loss, logits = model(input_id, mask, train_label)

            total_acc_train += _sum_sample_accuracy(logits, train_label)
            # One .item() call per batch; the batch loss counts once per sample.
            total_loss_train += loss.item() * logits.shape[0]

            loss.backward()
            optimizer.step()

        model.eval()

        total_acc_val = 0
        total_loss_val = 0

        # No gradients are needed for validation; skipping them saves memory and time.
        with torch.no_grad():
            for val_data, val_label in val_dataloader:

                val_label = val_label.to(device)
                mask = val_data['attention_mask'].squeeze(1).to(device)
                input_id = val_data['input_ids'].squeeze(1).to(device)

                loss, logits = model(input_id, mask, val_label)

                total_acc_val += _sum_sample_accuracy(logits, val_label)
                total_loss_val += loss.item() * logits.shape[0]

        train_accuracy = total_acc_train / len(df_train)
        train_loss = total_loss_train / len(df_train)
        val_accuracy = total_acc_val / len(df_val)
        val_loss = total_loss_val / len(df_val)

        scheduler.step()

        print(f'Epoch: {epoch_num} - Train_Loss: {train_loss} | Train_Accuracy: {train_accuracy} | Val_Loss: {val_loss} | Val_Accuracy: {val_accuracy}')

    return {
        "model": model,
        "train_accuracy": train_accuracy,
        "train_loss": train_loss,
        "val_accuracy": val_accuracy,
        "val_loss": val_loss
    }


In [62]:
EPOCHS = 10

results = train(df_train, df_val) 

1600
400


100%|██████████| 200/200 [00:38<00:00,  5.22it/s]


Epoch: 0 - Train_Loss: 0.40922343515791 | Train_Accuracy: 0.8443625670881011 | Val_Loss: 0.6086626607179642 | Val_Accuracy: 0.7595963321253657


100%|██████████| 200/200 [00:32<00:00,  6.22it/s]


Epoch: 1 - Train_Loss: 0.24247286426834763 | Train_Accuracy: 0.9230699844937772 | Val_Loss: 0.17702844887971877 | Val_Accuracy: 0.947296773865819


100%|██████████| 200/200 [00:32<00:00,  6.22it/s]


Epoch: 2 - Train_Loss: 0.11822694221278653 | Train_Accuracy: 0.9694153612665832 | Val_Loss: 0.0428910940955393 | Val_Accuracy: 0.9890069732069969


100%|██████████| 200/200 [00:32<00:00,  6.23it/s]


Epoch: 3 - Train_Loss: 0.04157181476708502 | Train_Accuracy: 0.9886558531783521 | Val_Loss: 0.011918385439785198 | Val_Accuracy: 0.997599838078022


100%|██████████| 200/200 [00:32<00:00,  6.22it/s]


Epoch: 4 - Train_Loss: 0.01779696991550736 | Train_Accuracy: 0.996368617117405 | Val_Loss: 0.0076661504665389655 | Val_Accuracy: 0.9981000185012817


100%|██████████| 200/200 [00:32<00:00,  6.22it/s]


Epoch: 5 - Train_Loss: 0.011867836708552204 | Train_Accuracy: 0.9978275588154792 | Val_Loss: 0.00703899716201704 | Val_Accuracy: 0.9982666851580143


100%|██████████| 200/200 [00:32<00:00,  6.23it/s]


Epoch: 6 - Train_Loss: 0.009062750204466284 | Train_Accuracy: 0.9984293616190553 | Val_Loss: 0.006866968900430948 | Val_Accuracy: 0.998405573964119


100%|██████████| 200/200 [00:32<00:00,  6.21it/s]


Epoch: 7 - Train_Loss: 0.007990534739510622 | Train_Accuracy: 0.9987827615439892 | Val_Loss: 0.006878538551973179 | Val_Accuracy: 0.9981783013045787


100%|██████████| 200/200 [00:32<00:00,  6.22it/s]


Epoch: 8 - Train_Loss: 0.00782390717882663 | Train_Accuracy: 0.9985744554921985 | Val_Loss: 0.006687318570329808 | Val_Accuracy: 0.998405573964119


100%|██████████| 200/200 [00:32<00:00,  6.22it/s]


Epoch: 9 - Train_Loss: 0.00773470028914744 | Train_Accuracy: 0.9987424545362592 | Val_Loss: 0.0066637948807328935 | Val_Accuracy: 0.998405573964119


In [63]:
from collections import defaultdict

def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0 when the denominator is zero."""
    return numerator / denominator if denominator else 0


def evaluate(model, df_test):
    """Compute token-level precision, recall and F1 of ``model`` on ``df_test``.

    Only entity tokens count. A position where both the gold label and the
    prediction are ``'o'`` is ignored, and ``'o'`` itself is never counted as a
    true positive, false positive or false negative. Its per-label scores are
    therefore always 0.

    Args:
        model: callable as ``model(input_ids, attention_mask, labels)`` and
            returning ``(loss, logits)``.
        df_test: data wrapped in ``DataSequence``.

    Returns:
        ``{"precision": ..., "recall": ..., "F1": ...}`` aggregated over all
        entity labels. Each value is 0 when its denominator is zero. The
        per-label scores are printed.
    """
    test_dataset = DataSequence(df_test)

    test_dataloader = DataLoader(test_dataset, batch_size=1)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:
        model = model.cuda()

    # Evaluation must not depend on whatever mode the caller left the model in.
    model.eval()

    label_names = list(ids_to_labels.values())
    o_index = next(idx for idx, name in ids_to_labels.items() if name == 'o')

    grouped_sum_ok = dict.fromkeys(label_names, 0)
    grouped_fp = dict.fromkeys(label_names, 0)
    grouped_fn = dict.fromkeys(label_names, 0)
    total_sum_ok = 0
    total_fp = 0
    total_fn = 0

    with torch.no_grad():
        for test_data, test_label in test_dataloader:

            # batch_size is 1: index 0 removes the batch axis added by the loader
            test_label = test_label[0].to(device)
            mask = test_data['attention_mask'][0].to(device)
            input_id = test_data['input_ids'][0].to(device)

            _, logits = model(input_id, mask, test_label.long())

            keep = test_label != -100
            # Plain Python ints: one transfer per sentence instead of one per token
            predicted_ids = logits[0][keep].argmax(dim=1).tolist()
            gold_ids = test_label[keep].tolist()

            for predicted, gold in zip(predicted_ids, gold_ids):
                if predicted == gold:
                    if gold != o_index:
                        total_sum_ok += 1
                        grouped_sum_ok[ids_to_labels[gold]] += 1
                    continue
                if predicted != o_index:
                    total_fp += 1
                    grouped_fp[ids_to_labels[predicted]] += 1
                if gold != o_index:
                    total_fn += 1
                    grouped_fn[ids_to_labels[gold]] += 1

    total_precision = _safe_ratio(total_sum_ok, total_sum_ok + total_fp)
    total_recall = _safe_ratio(total_sum_ok, total_sum_ok + total_fn)
    total_f1 = 2 * _safe_ratio(total_precision * total_recall, total_precision + total_recall)

    grouped_precision = {
        name: _safe_ratio(grouped_sum_ok[name], grouped_sum_ok[name] + grouped_fp[name])
        for name in label_names
    }
    grouped_recall = {
        name: _safe_ratio(grouped_sum_ok[name], grouped_sum_ok[name] + grouped_fn[name])
        for name in label_names
    }
    grouped_f1 = {
        name: 2 * _safe_ratio(
            grouped_precision[name] * grouped_recall[name],
            grouped_precision[name] + grouped_recall[name],
        )
        for name in label_names
    }

    print(f'Test Precision: {total_precision: .3f}')
    print(f'Test Recall: {total_recall: .3f}')
    print(f'Test F1: {total_f1: .3f}')

    print('======== GROUPED EVALUATED ========')
    # regroup f1, precision and recall by label
    for k, v in grouped_precision.items():
        print(f'{k} Precision:    {v: .3f}')
    for k, v in grouped_recall.items():
        print(f'{k} Recall:       {v: .3f}')
    for k, v in grouped_f1.items():
        print(f'{k} F1:           {v: .3f}')

    return {"precision": total_precision, "recall": total_recall, "F1": total_f1}

In [64]:
evaluate(results['model'], df_val)

400
Test Precision:  0.998
Test Recall:  0.997
Test F1:  0.997
B-FROM Precision:     1.000
B-MOMENT Precision:     1.000
B-TO Precision:     1.000
I-FROM Precision:     0.997
I-MOMENT Precision:     1.000
I-TO Precision:     0.995
o Precision:     0.000
B-FROM Recall:        0.998
B-MOMENT Recall:        0.963
B-TO Recall:        1.000
I-FROM Recall:        0.995
I-MOMENT Recall:        1.000
I-TO Recall:        1.000
o Recall:        0.000
B-FROM F1:            0.999
B-MOMENT F1:            0.981
B-TO F1:            1.000
I-FROM F1:            0.996
I-MOMENT F1:            1.000
I-TO F1:            0.998
o F1:            0.000


{'precision': 0.9977638640429338,
 'recall': 0.9968722073279714,
 'F1': 0.9973178363880197}

In [65]:
def align_word_ids(texts):
    """Build a keep/ignore mask over the tokens of ``texts``.

    The text is tokenized with the module-level ``tokenizer`` (padded and
    truncated to 128 tokens).

    Returns:
        A list with one entry per token: 1 for the first sub-word of each word
        (and for continuation sub-words when the module-level
        ``label_all_tokens`` flag is truthy), -100 for every other token,
        including tokens without a word id.
    """
    tokenized_inputs = tokenizer(texts, padding='max_length', max_length=128, truncation=True)

    label_ids = []
    previous_word_idx = None

    for word_idx in tokenized_inputs.word_ids():
        if word_idx is None:
            label_ids.append(-100)
        elif word_idx != previous_word_idx or label_all_tokens:
            # Appending a constant cannot fail, so the former bare `except`
            # wrappers (which only hid an undefined flag) are gone.
            label_ids.append(1)
        else:
            label_ids.append(-100)
        previous_word_idx = word_idx

    return label_ids


def evaluate_one_text(model, sentence):
    """Print ``sentence`` followed by the label predicted for each of its words.

    The sentence is tokenized with the module-level ``tokenizer``. Predictions
    are kept only at the positions that ``align_word_ids`` does not mark with
    -100, and are mapped to label names through ``ids_to_labels``.

    Args:
        model: callable as ``model(input_ids, attention_mask, None)``; the
            logits are read from index 0 of its output.
        sentence: the raw sentence to label.
    """
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:
        model = model.cuda()

    # Inference only: make sure the model is in evaluation mode.
    model.eval()

    text = tokenizer(sentence, padding='max_length', max_length = 128, truncation=True, return_tensors="pt")

    mask = text['attention_mask'].to(device)
    input_id = text['input_ids'].to(device)
    label_ids = torch.tensor(align_word_ids(sentence)).unsqueeze(0).to(device)

    # No gradients are needed to predict.
    with torch.no_grad():
        outputs = model(input_id, mask, None)

    logits_clean = outputs[0][label_ids != -100]

    predictions = logits_clean.argmax(dim=1).tolist()
    prediction_label = [ids_to_labels[i] for i in predictions]
    print(sentence)
    print(prediction_label)


In [67]:
evaluate_one_text(results['model'], 'Je voudrais partir en vacance à Lyon la semaine prochaine en partant de Marseille')


Je voudrais partir en vacance à Lyon la semaine prochaine en partant de Marseille
['o', 'o', 'o', 'o', 'o', 'o', 'B-TO', 'B-MOMENT', 'I-MOMENT', 'I-MOMENT', 'o', 'o', 'o', 'B-FROM']
