# **Fine-tuning RoBERTa for named-entity recognition**

In [None]:
#!pip install seqeval

In [None]:
#!pip install tensorflow

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import pipeline, AutoModelForTokenClassification, get_cosine_schedule_with_warmup, AutoTokenizer
from seqeval.metrics import classification_report
import math
import os
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

  from .autonotebook import tqdm as notebook_tqdm
2023-09-01 02:38:24.238157: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-09-01 02:38:24.288559: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


cuda


In [2]:
!nvidia-smi

Fri Sep  1 02:38:27 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.47.03    Driver Version: 510.47.03    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-PCIE...  Off  | 00000000:3B:00.0 Off |                    0 |
| N/A   35C    P0    33W / 250W |      4MiB / 32768MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-PCIE...  Off  | 00000000:AF:00.0 Off |                    0 |
| N/A   30C    P0    35W / 250W |      4MiB / 32768MiB |      2%      Default |
|       

In [4]:
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 16
EPOCHS = 3#3#20
LEARNING_RATE = 5e-5 #1e-05
MAX_GRAD_NORM = 10
MAX_LEN = 256


# Preprocessing the dataset

Named entity recognition (NER) uses a specific annotation scheme, which is defined (at least for European languages) at the word level. An annotation scheme that is widely used is called IOB-tagging, which stands for Inside-Outside-Beginning. Each tag indicates whether the corresponding word is inside, outside or at the beginning of a specific named entity. The reason this is used is because named entities usually comprise more than 1 word.

Let's have a look at an example. If you have a sentence like "Barack Obama was born in Hawaï", then the corresponding tags would be [B-PERS, I-PERS, O, O, O, B-GEO]. B-PERS means that the word "Barack" is the beginning of a person, I-PERS means that the word "Obama" is inside a person, "O" means that the word "was" is outside a named entity, and so on. So one typically has as many tags as there are words in a sentence.

In [7]:
#df = pd.read_csv("cleaned_plain-text_labeled_term+combined_no_ref_no_cit_def_same_len_only.csv", delimiter=',')
df = pd.read_csv("cleaned_plain-text_labeled_term+combined_no_ref_no_cit_def_same_len_only_must_conatin_B.csv", delimiter=',')
len(df)

13692

In [None]:
all_data = df[['plain_text_def','labeled_def']].copy()

all_data.rename(columns={"plain_text_def": "sentence", "labeled_def": "word_labels" }, inplace=True)

all_data['word_labels'] = all_data['word_labels'].str.replace('I_MATH_TERM','I-MATH_TERM')
all_data['word_labels'] = all_data['word_labels'].str.replace('B_MATH_TERM','B-MATH_TERM')


data = all_data[:8192] #make a small sample first
data

In [None]:
gen_data = all_data[8192:9216]
gen_data

# Preparing the dataset and dataloader

Now that our data is preprocessed, we can turn it into PyTorch tensors such that we can provide it to the model. Let's start by defining some key variables that will be used later on in the training/evaluation process:


In [5]:
def tokenize_and_preserve_labels(sentence, text_labels, tokenizer):
    """
    Word piece tokenization makes it difficult to match word labels
    back up with individual word pieces. This function tokenizes each
    word one at a time so that it is easier to preserve the correct
    label for each subword. It is, of course, a bit slower in processing
    time, but it will help our model achieve higher accuracy.
    """

    tokenized_sentence = []
    labels = []

    for word, label in zip(sentence.split(), text_labels.split()):

        # Tokenize the word and count # of subwords the word is broken into
        tokenized_word = tokenizer.tokenize(word)
        n_subwords = len(tokenized_word)

        # Add the tokenized word to the final tokenized word list
        tokenized_sentence.extend(tokenized_word)

        # Add the same label to the new list of labels `n_subwords` times
        labels.extend([label] * n_subwords)

    return tokenized_sentence, labels

In [6]:
labels = ['B-MATH_TERM', 'I-MATH_TERM', 'O']

label2id = { label : labels.index(label) for label in labels}

id2label = { labels.index(label) : label for label in labels}

label2id

{'B-MATH_TERM': 0, 'I-MATH_TERM': 1, 'O': 2}

In [7]:
class dataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __getitem__(self, index):
        # step 1: tokenize (and adapt corresponding labels)
        sentence = self.data.sentence[index]  
        word_labels = self.data.word_labels[index]  
        tokenized_sentence, labels = tokenize_and_preserve_labels(sentence, word_labels, self.tokenizer)
        
        # step 2: add special tokens (and corresponding labels)
        tokenized_sentence = ["<s> "] + tokenized_sentence + [" </s>"] # add special tokens of Roberta
        labels.insert(0, "O") # add outside label for [CLS] token
        labels.append("O") # add outside label for [SEP] token

        # step 3: truncating/padding
        maxlen = self.max_len

        if (len(tokenized_sentence) > maxlen):
          # truncate
          tokenized_sentence = tokenized_sentence[:maxlen]
          labels = labels[:maxlen]
        else:
          # pad
          tokenized_sentence = tokenized_sentence + ['<pad>'for _ in range(maxlen - len(tokenized_sentence))]
          labels = labels + ["O" for _ in range(maxlen - len(labels))]

        # step 4: obtain the attention mask
        attn_mask = [1 if tok != '<pad>' else 0 for tok in tokenized_sentence] #modifié selon https://huggingface.co/docs/transformers/v4.21.1/en/model_doc/camembert
        
        # step 5: convert tokens to input ids
        ids = self.tokenizer.convert_tokens_to_ids(tokenized_sentence)

        label_ids = [label2id[label] for label in labels]
        # the following line is deprecated
        #label_ids = [label if label != 0 else -100 for label in label_ids]
        
        return {
              'ids': torch.tensor(ids, dtype=torch.long),
              'mask': torch.tensor(attn_mask, dtype=torch.long),
              #'token_type_ids': torch.tensor(token_ids, dtype=torch.long),
              'targets': torch.tensor(label_ids, dtype=torch.long)
        } 
    
    def __len__(self):
        return self.len

In [None]:
#splite dataset and load for the first time
"""train_size = 0.8
train_dataset = data.sample(frac=train_size,random_state=200)
val_dataset = data.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

#save the data sets
train_dataset.to_csv('data/train.csv', index=False)
val_dataset.to_csv('data/val.csv', index=False)
gen_data.to_csv('data/test.csv', index=False)

print("FULL TrainigDataset: {}".format(data.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("VALIDATION Dataset: {}".format(val_dataset.shape))"""

In [None]:
"""training_set = dataset(train_dataset, tokenizer, MAX_LEN)
validation_set = dataset(val_dataset, tokenizer, MAX_LEN)
test_generalizability_set = dataset(gen_data, tokenizer, MAX_LEN)"""


## verify tokenization

In [9]:
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

In [None]:
model = RobertaForTokenClassification.from_pretrained("roberta-base",
                                                        num_labels=len(id2label),
                                                        id2label=id2label,
                                                        label2id=label2id)
model.to(device)

In [10]:
validation_set = dataset(pd.read_csv('data/val.csv'), tokenizer, MAX_LEN)

In [11]:
validation_set[0]

{'ids': tensor([    3,   250, 18076, 14377,  5000, 28523,   354,   102, 44170,  1640,
           250,     6,   565,   238,  8569,   250,   354,   102,   506, 42524,
          8738,     6,   463,   565,    35,   176, 35227,   250, 24987, 27969,
         12736, 10975,   176,   742,   176, 48552, 45152,   288,     6,   134,
         24303,   354,   102, 32557,  7761, 44143,    29,   261,   250, 48104,
         10278, 10975,   176,   742,   560, 45152,   288,     6,   134, 48268,
          8569, 10975,   176,   742,    35,  5214, 45152,   134,     6,   176,
         48268,  5488,   354,  8628,  5471, 25286,  5632, 27387,   560,  8007,
         14035,  1116,   627,   134,   463,   176, 31479,  2507,     4, 18377,
          3785,  2802, 25401,  8738,   846,     6,  1694, 47634,   102, 14377,
          5000, 32557,   565, 35227,  1640,   846,  3256,   176, 35227,   250,
         24987, 27969, 12736,   846,   176, 48552,   176, 35227,   846,   176,
          1409, 32639,  5867,   260, 14724,  

In [12]:
# print the first 50 tokens and corresponding labels
for token, label in zip(tokenizer.convert_ids_to_tokens(validation_set[15]["ids"][:50]), validation_set[15]["targets"][:50]):
  print('{0:15}  {1}'.format(token, id2label[label.item()]))

<unk>            O
For              O
k                O
âī               O
¥                O
2                O
,                O
a                O
k                B-MATH_TERM
-                B-MATH_TERM
t                B-MATH_TERM
ensor            B-MATH_TERM
with             O
ent              O
ries             O
in               O
is               O
a                O
function         O
T                O
:{               O
1                O
,...             O
,                O
d                O
}                O
^                O
k                O
â                O
Ł                O
¶                O
.                O
We               O
re               O
fer              O
to               O
the              O
number           O
k                O
as               O
the              O
order            O
of               O
the              O
t                O
ensor            O
T                O
.                O
We               O
den              O


In [13]:
# print the first 50 tokens and corresponding labels
for token, label in zip(tokenizer.convert_ids_to_tokens(validation_set[49]["ids"][:30]), validation_set[49]["targets"][:30]):
  print('{0:15}  {1}'.format(token, id2label[label.item()]))

<unk>            O
The              O
m                B-MATH_TERM
ixed             B-MATH_TERM
volume           I-MATH_TERM
(                O
P                O
_                O
1                O
,                O
âĢ¦              O
,                O
P                O
_                O
n                O
)                O
is               O
the              O
co               O
efficient        O
of               O
the              O
mon              O
omial            O
_                O
1                O
â                O
ĭ                O
¯                O
_                O


In [14]:
# 3 labels: -ln(1/3) = 1.09861228867
ids = validation_set[0]["ids"].unsqueeze(0)
mask = validation_set[0]["mask"].unsqueeze(0)
targets = validation_set[0]["targets"].unsqueeze(0)

ids = ids.to(device)#, dtype = torch.long)
mask = mask.to(device)#, dtype = torch.long)
targets = targets.to(device)#, dtype = torch.long)
model.to(device)
outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
initial_loss = outputs[0]

print(f"intial loss = {initial_loss.item()}")

intial loss = 0.9035974144935608


# Training

In [8]:
# Defining the training function on the 80% of the dataset for tuning the bert model
def train(model, training_loader, optimizer, scheduler=None):
    tr_loss, tr_accuracy = 0, 0
    nb_tr_examples, nb_tr_steps = 0, 0
    tr_preds, tr_labels = [], []
    # put model in training mode
    model.train()
    
    for idx, batch in enumerate(training_loader):
        
        ids = batch['ids'].to(device, dtype = torch.long)
        mask = batch['mask'].to(device, dtype = torch.long)
        targets = batch['targets'].to(device, dtype = torch.long)

        outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
        loss, tr_logits = outputs.loss, outputs.logits
        '''
        loss, tr_logits  = model(input_ids=ids, attention_mask=mask, labels=targets)#temporary modification for transformer 3'''
        
        tr_loss += loss.item()

        nb_tr_steps += 1
        nb_tr_examples += targets.size(0)
        
        #if idx % 100==0:
        #    loss_step = tr_loss/nb_tr_steps
        #    print(f"Training loss per 100 training steps: {loss_step}")
           
        # compute training accuracy
        flattened_targets = targets.view(-1) # shape (batch_size * seq_len,)
        active_logits = tr_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
        flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
        # now, use mask to determine where we should compare predictions with targets (includes [CLS] and [SEP] token predictions)
        active_accuracy = mask.view(-1) == 1 # active accuracy is also of shape (batch_size * seq_len,)
        targets = torch.masked_select(flattened_targets, active_accuracy)
        predictions = torch.masked_select(flattened_predictions, active_accuracy)
        
        tr_preds.extend(predictions)
        tr_labels.extend(targets)
        
        tmp_tr_accuracy = accuracy_score(targets.cpu().numpy(), predictions.cpu().numpy())
        tr_accuracy += tmp_tr_accuracy
    
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=MAX_GRAD_NORM
        )
        
        # backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        #scheduler.step()

    epoch_loss = tr_loss / nb_tr_steps
    tr_accuracy = tr_accuracy / nb_tr_steps
    #print(f"Trained {nb_tr_steps} steps")
    print(f"Training loss epoch: {epoch_loss}")
    print(f"Training accuracy epoch: {tr_accuracy}")
    

def valid(model, validation_loader):
    # put model in evaluation mode
    model.eval()
    
    eval_loss, eval_accuracy = 0, 0
    nb_eval_examples, nb_eval_steps = 0, 0
    eval_preds, eval_labels = [], []
    
    with torch.no_grad():
        for idx, batch in enumerate(validation_loader):
            
            ids = batch['ids'].to(device, dtype = torch.long)
            mask = batch['mask'].to(device, dtype = torch.long)
            targets = batch['targets'].to(device, dtype = torch.long)
            
           
            outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
            loss, eval_logits = outputs.loss, outputs.logits
            
            eval_loss += loss.item()

            nb_eval_steps += 1
            nb_eval_examples += targets.size(0)
        
            #if idx % 100==0:
            #    loss_step = eval_loss/nb_eval_steps
            #    print(f"Validation loss per 100 evaluation steps: {loss_step}")
              
            # compute evaluation accuracy
            flattened_targets = targets.view(-1) # shape (batch_size * seq_len,)
            active_logits = eval_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
            # now, use mask to determine where we should compare predictions with targets (includes [CLS] and [SEP] token predictions)
            active_accuracy = mask.view(-1) == 1 # active accuracy is also of shape (batch_size * seq_len,)
            targets = torch.masked_select(flattened_targets, active_accuracy)
            predictions = torch.masked_select(flattened_predictions, active_accuracy)
            
            eval_labels.extend(targets)
            eval_preds.extend(predictions)
            
            tmp_eval_accuracy = accuracy_score(targets.cpu().numpy(), predictions.cpu().numpy())
            eval_accuracy += tmp_eval_accuracy
    
    #print(eval_labels)
    #print(eval_preds)

    labels = [id2label[id.item()] for id in eval_labels]
    predictions = [id2label[id.item()] for id in eval_preds]

    #print(labels)
    #print(predictions)
    
    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_steps
    print(f"Validation Loss: {eval_loss}")
    print(f"Validation Accuracy: {eval_accuracy}")

    return labels, predictions

def print_reports_to_csv(test_results, model_name, LEARNING_RATE, EPOCHS, trainset_num, report_type):
    test_reports = []
    for res in test_results:
        report = classification_report([res['labels']], [res['predictions']], output_dict=True)
        flattened_report = {str(k+'_'+v_k) : v_v for k,v in report.items() for v_k, v_v in v.items()  }
        flattened_report['trainset_size'] = res['trainset_size']
        flattened_report['model'] = res['model']
        flattened_report['trainset_num'] = trainset_num
        test_reports.append(flattened_report)
    
    df_test_reports = pd.DataFrame(test_reports)
    if '/' in model_name:
        model_name =  model_name.split('/')[1] 
    test_report_name = 'finetuning_results/'+report_type+'_'+ model_name + '_' + str(LEARNING_RATE) + '_16_' + str(EPOCHS) + '.csv'
    df_test_reports.to_csv(test_report_name, mode='a', header=not os.path.exists(test_report_name),index=False)

In [10]:
'ner_model/InriaValda/cc_math_roberta_ep01_ft_5ep_train_size_1024_trainset10'.split('/')[-1]

'cc_math_roberta_ep01_ft_5ep_train_size_1024_trainset10'

In [9]:
%%time
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
            'num_workers': 0
            }

val_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
             }
for trainset_num in range(1,7): 

    train_file_name = 'data/10-fold/train_499_'+str(trainset_num)+'.csv'#'data/train.csv'
    val_file_name = 'data/10-fold/val_499_'+str(trainset_num)+'.csv'#'data/val.csv'
    
    for model_name in ['roberta-base']:
        tokenizer = AutoTokenizer.from_pretrained(model_name, from_tf=False, model_max_length=MAX_LEN)
        
        test_generalizability_set = dataset(pd.read_csv('data/test_GPT+labels.csv'), tokenizer, MAX_LEN)
        
        validation_set = dataset(pd.read_csv(val_file_name), tokenizer, MAX_LEN)
        df_training_set = pd.read_csv(train_file_name)
        
        val_results = []
        test_results = []
        
        validation_loader = DataLoader(validation_set, **val_params)
        test_gen_loader = DataLoader(test_generalizability_set, **val_params)
        
        for trainsetsize in [10240]:  #[64,128,256,512,1024,2048,4096,8192,11401] are already done
            training_set = dataset(df_training_set[:trainsetsize], tokenizer, MAX_LEN)
        
            print("TRAIN Dataset: {}".format(training_set.data.shape))
            #train_params['batch_size'] =  int( trainsetsize / 32) if (trainsetsize < 1024) else 16
            training_loader = DataLoader(training_set, **train_params)
        
        
            num_training_steps = int(training_loader.dataset.len / train_params['batch_size'] * EPOCHS)
            print(f'tranining steps: {num_training_steps+1}')
        
            #Shrey uses TF model
            model = AutoModelForTokenClassification.from_pretrained(model_name,
                                                                    from_tf=False,
                                                                    num_labels=len(id2label),
                                                                    id2label=id2label,
                                                                    label2id=label2id)
            model.to(device)
        
            optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)
            #scheduler = get_cosine_schedule_with_warmup(optimizer = optimizer, num_warmup_steps = 50, num_training_steps=num_training_steps)
            for epoch in range(EPOCHS):
            #for epoch in range(flex_epoch_nb): 
                print(f"Training epoch: {epoch + 1}")
                train(model, training_loader, optimizer)
                #valid(model, validation_loader)
                #valid(model, test_gen_loader)
            labels, predictions = valid(model, validation_loader)     
            val_results.append({'trainset_size': trainsetsize, 'model': model_name, 'labels': labels, 'predictions': predictions})
        
            #test generalizablity
            labels, predictions = valid(model, test_gen_loader)
            test_results.append({'trainset_size': trainsetsize, 'model': model_name, 'labels': labels, 'predictions': predictions})
            ner_model_name = 'ner_model/'+model_name+ '_ft_' + str(EPOCHS) + 'ep_train_size_'+str(trainsetsize) + '_trainset_'+str(trainset_num)
            model.save_pretrained(ner_model_name)
            tokenizer.save_pretrained(ner_model_name)
            # gpt_aligned_eval(model, tokenizer, ner_model_name) # too slow!
        
        print_reports_to_csv(val_results, model_name, LEARNING_RATE, EPOCHS, trainset_num, 'validation')
        print_reports_to_csv(test_results, model_name, LEARNING_RATE, EPOCHS, trainset_num, 'generalizability')



TRAIN Dataset: (10240, 3)
tranining steps: 1921


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForTokenClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

Training epoch: 1
Training loss epoch: 0.05432697483338415
Training accuracy epoch: 0.966017342608473
Training epoch: 2
Training loss epoch: 0.030360356043092906
Training accuracy epoch: 0.9790405187596164
Training epoch: 3
Training loss epoch: 0.025628550110559443
Training accuracy epoch: 0.9820504649204796
Validation Loss: 0.025542012798965355
Validation Accuracy: 0.9822086425962374
Validation Loss: 0.030305798311019316
Validation Accuracy: 0.978955469735227
TRAIN Dataset: (10240, 3)
tranining steps: 1921


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForTokenClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

Training epoch: 1
Training loss epoch: 0.04702767682319973
Training accuracy epoch: 0.9696088276273758
Training epoch: 2
Training loss epoch: 0.028942278178146808
Training accuracy epoch: 0.9798845703861414
Training epoch: 3
Training loss epoch: 0.02385245669283904
Training accuracy epoch: 0.9830511905167928
Validation Loss: 0.03741293052753693
Validation Accuracy: 0.9770152346749904
Validation Loss: 0.030694629429490305
Validation Accuracy: 0.9802544972949496
TRAIN Dataset: (10240, 3)
tranining steps: 1921


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForTokenClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

Training epoch: 1
Training loss epoch: 0.051979272736934944
Training accuracy epoch: 0.9674871386367301
Training epoch: 2
Training loss epoch: 0.03014748073183
Training accuracy epoch: 0.9791291467540912
Training epoch: 3
Training loss epoch: 0.02538761472678743
Training accuracy epoch: 0.9821129558503511
Validation Loss: 0.03325126540575978
Validation Accuracy: 0.9796631145911341
Validation Loss: 0.03206780000618892
Validation Accuracy: 0.9807092906412751
TRAIN Dataset: (10240, 3)
tranining steps: 1921


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForTokenClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

Training epoch: 1
Training loss epoch: 0.05655415798246395
Training accuracy epoch: 0.9642124545425824
Training epoch: 2
Training loss epoch: 0.030403108036261985
Training accuracy epoch: 0.9788954920234378
Training epoch: 3
Training loss epoch: 0.024587140015501063
Training accuracy epoch: 0.9823414266003633
Validation Loss: 0.03251921546445051
Validation Accuracy: 0.9795394625136178
Validation Loss: 0.03265308759000618
Validation Accuracy: 0.9792171873322572
TRAIN Dataset: (10240, 3)
tranining steps: 1921


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForTokenClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

Training epoch: 1
Training loss epoch: 0.04848412135033868
Training accuracy epoch: 0.9700281924783223
Training epoch: 2
Training loss epoch: 0.029308031090477017
Training accuracy epoch: 0.9798862225724795
Training epoch: 3
Training loss epoch: 0.024398519496025985
Training accuracy epoch: 0.9827274717340273
Validation Loss: 0.031145209894527362
Validation Accuracy: 0.9771015967473363
Validation Loss: 0.030409194121602923
Validation Accuracy: 0.9791862787240766
TRAIN Dataset: (10240, 3)
tranining steps: 1921


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForTokenClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

Training epoch: 1
Training loss epoch: 0.052858430168998896
Training accuracy epoch: 0.9664776615781516
Training epoch: 2
Training loss epoch: 0.030012548248123494
Training accuracy epoch: 0.9791298590555331
Training epoch: 3
Training loss epoch: 0.024768522482918342
Training accuracy epoch: 0.9821254695536684
Validation Loss: 0.03506342390009874
Validation Accuracy: 0.9767205236670286
Validation Loss: 0.036392050300491974
Validation Accuracy: 0.9768768808303159
CPU times: user 39min 4s, sys: 14min 30s, total: 53min 35s
Wall time: 53min 47s


In [24]:
def gpt_aligned_eval(model, tokenizer, model_name):
    df_test_data = pd.read_csv('data/test_GPT+labels.csv')
    eval_list = []
    if '/' in model_name:
            model_name =  model_name.split('/')[1] 
    pipe = pipeline(task="token-classification", model=model.to('cpu'), tokenizer=tokenizer, aggregation_strategy="first")
    
    for index, row in df_test_data.iterrows():
    
        #let's remove repeated terms, keeping 1616 unique out of 1660 terms
        expected_list = set(row['plain_text_term'].split(';'))
        while '' in expected_list:
            expected_list.remove('')
    
        extracted_list = pipe(row['sentence'])
        extracted_list = [x['word'].strip() for x in extracted_list] 
        while '' in extracted_list:
            extracted_list.remove('')
        
        num_TP = 0
        num_too_long = 0
        num_cut_off = 0
        num_split_term = 0
        TP_list = []
        ST_list = [] 
        
        for expected in expected_list:
            for extracted in extracted_list:
                
                if extracted.casefold() == expected.casefold():
                    num_TP = num_TP + 1
                    TP_list.append(expected)
                elif extracted.casefold() in expected.casefold():
                    num_cut_off = num_cut_off + 1
                elif expected.casefold() in extracted.casefold():
                    num_too_long = num_too_long + 1
                    
            expected_no_space = expected.replace(" ","")
            extracted_no_space = ("".join(extracted_list)).replace(" ","")
            if expected_no_space.casefold() in extracted_no_space.casefold(): # including TPs
                num_split_term = num_split_term + 1
                ST_list.append(expected)
        
        num_TP = num_TP - (len(TP_list) - len(set(TP_list)))
        num_split_term = num_split_term - (len(ST_list) -len(set(ST_list)))
        
        eval_list.append({'True Term Num' : len(expected_list),
                            'Extracted Term Num': len(extracted_list),
                            'TP': num_TP,
                            'Cut Off': num_cut_off,
                            'Too Long': num_too_long,
                            'Split Term': num_split_term,
                            'extracted': '###'.join(extracted_list)})
    df_eval = pd.DataFrame(eval_list)
    df_eval['expected'] = df_test_data['plain_text_term']
    eval_report_name = 'GPT_results/ft_'+model_name+'_first_eval.csv'
    df_eval.to_csv(eval_report_name,index=False)

    #print eval
    print(f'ner model name: {model_name}')
    num_T = df_eval['True Term Num'].sum()
    print("True Term Num: " + str(num_T))
    num_Ex = df_eval['Extracted Term Num'].sum()
    print("Extracted Term Num: " + str(num_Ex))
    print("True positive: " + str(df_eval['TP'].sum()))
    num_ST = df_eval['Split Term'].sum()
    print("True positive + split terms: " + str(num_ST))
    print("Too Long: " + str(df_eval['Too Long'].sum()))
    print("Cut Off: " + str(df_eval['Cut Off'].sum()))
    print("precision /correct rate: " + str(num_ST / num_Ex))
    print("recall: " + str(num_ST / num_T))

In [25]:
#%time gpt_aligned_eval(model, tokenizer, ner_model_name) # Wall time: 3min 5s with gpu005



ner model name: roberta-base_ft_5ep_train_size_1024_trainset_1
True Term Num: 1616
Extracted Term Num: 2067
True positive: 893
True positive + split terms: 1100
Too Long: 224
Cut Off: 580
precision /correct rate: 0.5321722302854378
recall: 0.6806930693069307
CPU times: user 24min 33s, sys: 3.53 s, total: 24min 36s
Wall time: 3min 5s


# Saving the model for future use

In [None]:
model.save_pretrained('ner_model/'+model_name+ '_ft_' + str(EPOCHS) + 'ep')
tokenizer.save_pretrained('ner_model/'+model_name+ '_ft_' + str(EPOCHS) + 'ep')

# Inference

The fun part is when we can quickly test the model on new, unseen sentences. Here, we use the prediction of the first word piece of every word. Note that the function we used to train our model (tokenze_and_preserve_labels) propagated the label to all subsequent word pieces (so you could for example also perform a majority vote on the predicted labels of all word pieces of a word).

In other words, the code below does not take into account when predictions of different word pieces that belong to the same word do not match.

In [10]:
#model = AutoModelForTokenClassification.from_pretrained('NER_model_4/model_out/')
pipe = pipeline(task="token-classification", model=model.to('cpu'), tokenizer=tokenizer, aggregation_strategy="simple")
pipe("The Betti poset of a poset P is the subposet consisting of all homologically contributing elements, B(P)={q∈ P  | _i(Δ_q) ≠ 0  i}.")


[{'entity_group': 'MATH_TERM',
  'score': 0.9680205,
  'word': ' B',
  'start': 4,
  'end': 5},
 {'entity_group': 'MATH_TERM',
  'score': 0.9650901,
  'word': 'etti poset',
  'start': 5,
  'end': 15}]

In [11]:
pipe("A subskeleton (Γ_0,α_0,θ_0)⊆(Γ,α,θ) has trivial normal holonomy if the holonomy map K_γ^⊥ is trivial for all loops γ⊂Γ_0.")

[{'entity_group': 'MATH_TERM',
  'score': 0.903038,
  'word': ' trivial normal holonomy',
  'start': 40,
  'end': 63}]

In [13]:
df_eval.to_csv('GPT_results/ft_roberta-base_ft_3ep_train_size_11366'+'_eval.csv', index=False)