In [1]:
import json
import os

DIR= r'/Token_Classification/'
js_path = os.path.join(DIR, 'ner_dataset.json')
js_file = open(js_path)

js_dict = json.load(js_file)
for k,v in js_dict.items():
    print(k, v)

train [[[['CRICKET', 'O'], ['-', 'O'], ['GIBBS', 'B-PER'], ['GETS', 'O'], ['INTERNATIONAL', 'O'], ['CALL', 'O'], ['UP', 'O'], ['.', 'O']], [['JOHANNESBURG', 'B-LOC'], ['1996-08-28', 'O']], [['Western', 'B-ORG'], ['Province', 'I-ORG'], ['batsman', 'O'], ['Herschelle', 'B-PER'], ['Gibbs', 'I-PER'], ['was', 'O'], ['the', 'O'], ['only', 'O'], ['uncapped', 'O'], ['player', 'O'], ['in', 'O'], ['South', 'B-LOC'], ['Africa', 'I-LOC'], ["'s", 'O'], ['14-man', 'O'], ['squad', 'O'], ['named', 'O'], ['on', 'O'], ['Wednesday', 'O'], ['for', 'O'], ['a', 'O'], ['quadrangular', 'O'], ['one-day', 'O'], ['series', 'O'], ['in', 'O'], ['Kenya', 'B-LOC'], ['next', 'O'], ['month', 'O'], ['.', 'O']], [['Kenya', 'B-LOC'], [',', 'O'], ['South', 'B-LOC'], ['Africa', 'I-LOC'], [',', 'O'], ['Pakistan', 'B-LOC'], ['and', 'O'], ['Sri', 'B-LOC'], ['Lanka', 'I-LOC'], ['will', 'O'], ['take', 'O'], ['part', 'O'], ['in', 'O'], ['the', 'O'], ['series', 'O'], ['.', 'O']], [['National', 'O'], ['coach', 'O'], ['Bob', 'B-PER

In [2]:
print(len(js_dict['train']))
print(len(js_dict['valid']))
print(len(js_dict['test']))

94
21
23


In [3]:
def unzip_data(key):
    sentences = []
    labels_sentence = []

    for doc in key:
        for sent in doc:
            tokens = [t[0] for t in sent]
            sentences.append(tokens)
            labels = [t[1] for t in sent]
            labels_sentence.append(labels)    

    return sentences, labels_sentence

In [4]:
sent_train, labels_sent_train= unzip_data(js_dict['train'])
sent_val, labels_sent_val = unzip_data(js_dict['valid'])
sent_test, labels_sent_test= unzip_data(js_dict['test'])

In [5]:
import pandas as pd
df_train = pd.DataFrame(zip(sent_train, labels_sent_train), columns=['sentence', 'label_sentence'])
df_val = pd.DataFrame(zip(sent_val, labels_sent_val), columns=['sentence', 'label_sentence'])
df_test = pd.DataFrame(zip(sent_test, labels_sent_test), columns=['sentence', 'label_sentence'])
print(df_train.shape)
print(df_val.shape)
print(df_test.shape)

(1155, 2)
(177, 2)
(315, 2)


In [6]:
# join 'train'and 'valid' data to make bigger the training set
df_train_ = pd.concat([df_train, df_val])
df_train_.shape

(1332, 2)

In [7]:
# convert labels in ids
label_list = ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

label2id = {v: k for k, v in enumerate(label_list)}

In [8]:
def convert_labels(df, label_list):
    df['ids_labels'] = df['label_sentence'].transform(lambda x: [int(label2id[i]) for i in x])
    return df

In [9]:
df_train1 = convert_labels(df_train_, label_list)
df_test1 = convert_labels(df_test, label_list)

In [10]:
df_train1 = df_train1.reset_index(drop=True)

In [11]:
from datasets import Dataset

dt_train = Dataset.from_pandas(df_train1)
dt_test = Dataset.from_pandas(df_test1)

In [12]:
from datasets import DatasetDict

dataset = DatasetDict({'train': dt_train, 'test':dt_test})
dataset

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label_sentence', 'ids_labels'],
        num_rows: 1332
    })
    test: Dataset({
        features: ['sentence', 'label_sentence', 'ids_labels'],
        num_rows: 315
    })
})

In [13]:
#set name BERT-model, batch size, learning rate
model_checkpoint = "distilbert-base-uncased" #uncased works better
batch_size = 8
lr = 2e-5 

In [14]:
from transformers import AutoTokenizer
    
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
tokenizer.is_fast #check tokenizer is backed by Tokenizers library

True

In [15]:
# tokenize sentences, align labels ids to tokenized sentences adding -100 
# as a special token (ignored by cross entropy loss function), 
# and coping with multiple tokens for a same word

def tokenize_(examples):
    tokenized_inputs = tokenizer(examples["sentence"], truncation=True, is_split_into_words=True)  
    
    labels=[]

    for i, label in enumerate(examples['ids_labels']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        new_labels = []
        current_word = None

        for word_id in word_ids:

            if word_id != current_word: #new word
                current_word = word_id
                l = -100 if word_id is None else label[word_id]
                new_labels.append(l)
            
            elif word_id is None: #special token
                l = -100
                new_labels.append(l)

            else: #same word 
                l = label[word_id]
                new_labels.append(l)

        labels.append(new_labels)

    tokenized_inputs["labels"] = labels

    return tokenized_inputs

In [16]:
example = dataset["train"][:2]
t_example = tokenize_(example)
t_example

{'input_ids': [[101, 4533, 1011, 15659, 4152, 2248, 2655, 2039, 1012, 102], [101, 15976, 2727, 1011, 5511, 1011, 2654, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[-100, 0, 0, 1, 0, 0, 0, 0, 0, -100], [-100, 5, 0, 0, 0, 0, 0, -100]]}

In [17]:
tokenized_dataset = dataset.map(tokenize_, batched=True, remove_columns=dataset["train"].column_names)
tokenized_dataset

HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'labels'],
        num_rows: 1332
    })
    test: Dataset({
        features: ['attention_mask', 'input_ids', 'labels'],
        num_rows: 315
    })
})

In [18]:
# find max lengh of sentences for padding
def FindMaxLength(lst):
    max_length = max(len(x) for x in lst )
    return max_length

In [19]:
list_of_list = []

for i in tokenized_dataset['train']:
    list_of_list.append(i['input_ids'])
print(len(list_of_list))
max_length = FindMaxLength(list_of_list)
print(max_length)

1332
132


In [20]:
from transformers import DataCollatorForTokenClassification
from torch.utils.data import DataLoader

data_collator = DataCollatorForTokenClassification(tokenizer,
                                                   padding='max_length', 
                                                   max_length=max_length)

train_dataloader = DataLoader(
    tokenized_dataset["train"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=batch_size,
)

eval_dataloader = DataLoader(
    tokenized_dataset["test"], 
    collate_fn=data_collator, 
    batch_size=batch_size)

In [21]:
# set low number of epochs to prevent overfitting
num_train_epochs = 3
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

In [22]:
from datasets import load_metric

metric = load_metric("seqeval") #CoNLL

example = dataset["train"][4]
labels = [label_list[i] for i in example['ids_labels']]
results = metric.compute(predictions=[labels], references=[labels])

df_results = pd.DataFrame(results)
df_results_ = df_results.T
df_results_.number = df_results_.number.astype(int)
display(df_results_)

Unnamed: 0,precision,recall,f1,number
LOC,1.0,1.0,1.0,1
MISC,1.0,1.0,1.0,1
PER,1.0,1.0,1.0,2
overall_precision,1.0,1.0,1.0,1
overall_recall,1.0,1.0,1.0,1
overall_f1,1.0,1.0,1.0,1
overall_accuracy,1.0,1.0,1.0,1


In [23]:
# process predictions and true labels for metric 
# avoiding -100 token and converting ids in labels

def postprocess(predictions, labels, label_list):
    predictions = predictions.detach().cpu().clone().numpy()
    labels = labels.detach().cpu().clone().numpy()

    true_labels = [
        [label_list[l] for l in label if l != -100] for label in labels]

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)]
    
    return true_labels, true_predictions

In [24]:
# import net
import torch.nn as nn
from transformers import AutoModel, AutoConfig

from classes import Model_TC 

In [25]:
# define loss
def loss_funct(labels, mask_ids, logits, num_labels):
    loss = None
    if labels is not None:
        loss_fct = nn.CrossEntropyLoss()
        if mask_ids is not None:
            active_loss = mask_ids.view(-1) == 1
            active_logits = logits.view(-1, num_labels)
            active_labels = torch.where(
                active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
                    )
            loss = loss_fct(active_logits, active_labels)
        else:
            loss = loss_fct(logits.view(-1, num_labels), labels.view(-1))
    return loss

In [26]:
import datetime

def format_time(elapsed):
    elapsed_rounded = int(round((elapsed)))
    # hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [27]:
from tqdm.auto import tqdm
import numpy as np
import torch
import time
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup
from accelerate import Accelerator

torch.manual_seed(42) #for reproducibility

progress_bar = tqdm(range(num_training_steps))

num_labels=len(label_list)
classifier = Model_TC.TokenClassifier(model_checkpoint, num_labels=num_labels, freeze_bert=False)

output_file= os.path.join(DIR, 'model_classifier.pth')

optimizer = AdamW(classifier.parameters(), lr=lr)
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

# handle the device placement for training
accelerator = Accelerator()
classifier, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(classifier, optimizer, train_dataloader, eval_dataloader)

# tracking the training loss
train_losses = []
# tracking the validation loss
valid_losses = []

for epoch in range(num_train_epochs):
    print('======== Epoch {:} / {:} ========'.format(epoch + 1, num_train_epochs))
    print('Training...')

    t0 = time.time()
    train_loss = 0
    
    # Training
    classifier.train()
    
    for batch in train_dataloader:
        labels = batch["labels"]
        input_ids = batch['input_ids']
        mask_ids = batch['attention_mask']

        logits = classifier(input_ids, mask_ids)
        
        loss = loss_funct(labels, mask_ids, logits, num_labels)
        
        accelerator.backward(loss)
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
        
        train_losses.append(loss.item())
    
    avg_train_loss = np.average(train_losses)
    
    training_time = format_time(time.time() - t0)
    print("")
    print("  Training epoch took: {:}".format(training_time))

    # Evaluation
    classifier.eval()
    
    for batch in eval_dataloader:
        labels = batch["labels"]
        input_ids = batch['input_ids']
        mask_ids = batch['attention_mask']
  
        with torch.no_grad():
            outputs = classifier(input_ids, mask_ids)
        
        loss = loss_funct(labels, mask_ids, outputs, num_labels)
        valid_losses.append(loss.item())

        predictions = outputs.argmax(dim=-1)
        labels = batch["labels"]

        # padding predictions and labels for being gathered
        predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=-100)
        labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

        predictions_gathered = accelerator.gather(predictions)
        labels_gathered = accelerator.gather(labels)

        true_predictions, true_labels = postprocess(predictions_gathered, labels_gathered, label_list)
        metric.add_batch(predictions=true_predictions, references=true_labels)

    avg_valid_loss = np.average(valid_losses)
            
    print_msg = (f'  train_loss: {avg_train_loss:.5f} ' + 
                 f'valid_loss: {avg_valid_loss:.5f}')
    print(print_msg)

    print("")
    results = metric.compute()
    df_results = pd.DataFrame(results)
    df_results_ = df_results[['LOC', 'PER', 'MISC', 'ORG']]
    df_results_ = df_results_.T
    df_results_.number = df_results_.number.astype(int)
    display(df_results_)
    print('overall_precision: ', round(df_results['overall_precision'][0], 4))
    print('overall_recall: ', round(df_results['overall_recall'][0], 4))
    print('overall_f1: ', round(df_results['overall_f1'][0], 4))
    print("")
    
    print('saving...')
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(classifier)
    accelerator.save({        
            'epoch': epoch,
            'model_state_dict': unwrapped_model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': loss}, output_file)
    print("")

HBox(children=(FloatProgress(value=0.0, max=501.0), HTML(value='')))

Training...

  Training epoch took: 0:05:05
  train_loss: 0.55337 valid_loss: 0.30363



Unnamed: 0,precision,recall,f1,number
LOC,0.732394,0.395437,0.51358,263
PER,0.887097,0.866142,0.876494,127
MISC,0.166667,0.578947,0.258824,19
ORG,0.636364,0.62536,0.630814,347


overall_precision:  0.6568
overall_recall:  0.5847
overall_f1:  0.6186

saving...

Training...

  Training epoch took: 0:05:21
  train_loss: 0.36677 valid_loss: 0.25931



Unnamed: 0,precision,recall,f1,number
LOC,0.697183,0.502538,0.584071,197
PER,0.854839,0.913793,0.883333,116
MISC,0.409091,0.428571,0.418605,63
ORG,0.703812,0.695652,0.699708,345


overall_precision:  0.7013
overall_recall:  0.6546
overall_f1:  0.6772

saving...

Training...

  Training epoch took: 0:05:14
  train_loss: 0.28419 valid_loss: 0.23761



Unnamed: 0,precision,recall,f1,number
LOC,0.704225,0.526316,0.60241,190
PER,0.879032,0.886179,0.882591,123
MISC,0.409091,0.3375,0.369863,80
ORG,0.718475,0.704023,0.711176,348


overall_precision:  0.7147
overall_recall:  0.6491
overall_f1:  0.6803

saving...

