In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm, trange
import os, sys
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertConfig

from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import random

torch.__version__

2021-09-13 23:59:34.677524: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-09-13 23:59:34.677562: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


'1.8.1+cu102'

In [2]:
import transformers
from transformers import BertForTokenClassification, AdamW

transformers.__version__

'4.7.0'

In [3]:
MAX_LEN = 50
LR = 3e-5
bs = 2
epochs = 20
max_grad_norm = 1.0
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

#path to save model
MODEL_PATH = "bert_swap_model"

In [4]:
import pandas as pd
train = pd.read_csv('restaurants_train.csv')
val = pd.read_csv('restaurants_val.csv')
holdout = pd.read_csv('restaurants_holdout.csv')
# train = train.fillna('')
# val = val.fillna('')
# holdout = holdout.fillna('')

In [5]:
train = train.fillna('')
val = val.fillna('')
holdout = holdout.fillna('')

In [6]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [7]:
################################
# DO NOT CHANGE THIS FUNCTION! #
################################

def get_f1_score_on_test_data(model, data):
    model.eval()
    true_positives = 0
    false_positives = 0
    false_negatives = 0
    for index, row in data.iterrows():
        sentence = row.sentence
        expected = row.restaurant_name
        #inputs = tokenizer([sentence], max_length=1024, return_tensors='pt')
        predicted = generate_outputs(model, sentence)
        if expected != '' and expected == predicted:
            true_positives += 1
        if expected != '' and expected != predicted:
            false_positives += 1
        if expected == '' and predicted != '':
            false_positives += 1
        if expected != '' and predicted == '':
            false_negatives += 1

    precision = 0
    recall = 0
    f1_score = 0
    if true_positives + false_positives:
        precision = true_positives / (true_positives + false_positives)
    if true_positives + false_negatives:
        recall = true_positives /(true_positives + false_negatives)
    if precision + recall:
        f1_score = 2 * precision * recall / (precision + recall)

    print(f'precision: {precision} | recall {recall} | f1_score {f1_score}')
    return f1_score


def convert_bio_text(words, tags):
    """
    Converts BIO tags to text
    """
    prev_tag = "O"
    extracted_dict = {}
    count = 0
    for x,t in zip(words, tags):
        if prev_tag == "O":
            if t == "B":
                prev_tag = "B"
                extracted_dict[count] = [x]
        else:
            if t == "I":
                extracted_dict[count].append(x)
                prev_tag = "I"
            elif t == "B" :
                prev_tag = "B"
                count += 1
                extracted_dict[count] = [x]
            else:
                count += 1
                prev_tag = "O"
                
    for i in extracted_dict.keys():
        extracted_dict[i] = " ".join(extracted_dict[i])
    return list(extracted_dict.values())

def generate_outputs(model, sentence, device="cpu"):
    
    """
    Predicts on a sentence and converts predicted BIO tags to text
    """
    
    tokenized_sentence = tokenizer.encode(sentence)
    
    input_ids = torch.tensor([tokenized_sentence]).to(device)
    
    with torch.no_grad():
        
        output = model(input_ids)
        
    label_indices = np.argmax(output[0].to('cpu').numpy(), axis=2)
    
    tokens = tokenizer.convert_ids_to_tokens(input_ids.to('cpu').numpy()[0])

    new_tokens, new_labels = [], []

    for token, label_idx in zip(tokens, label_indices[0]):

        if token.startswith("##"):

            new_tokens[-1] = new_tokens[-1] + token[2:]

        else:

            new_labels.append(tag_values[label_idx])

            new_tokens.append(token)
            
    #return " ".join(convert_bio_text(new_tokens, new_labels)), new_labels
    
    out = convert_bio_text(new_tokens, new_labels)
    
    if len(out) > 0:
        
        return out[0]
    else:
        return ""
    

In [8]:
def _tag_bio(txt, entity):    
        """
        Given a text and an entity, creates BIO tags for that entity.
        """

        entity_tok = entity.split()

        txt_tok = txt.split()

        out_tags = ["O"]*len(txt_tok)
        
        if entity == "":
            return out_tags

        b_idxs = []

        # Fetch indices of B
        for i, w in enumerate(txt_tok):
            if w == entity_tok[0]:
                b_idxs.append(i)

        #print(b_idxs, txt_tok)

        # if entity is just of 1 word, then tag Bs appropriately and exit
        if len(entity_tok) == 1:
            for b in b_idxs:
                out_tags[b] = "B"
            return out_tags

        # ------- Tagging Is-----#
        tags_idx = {}
        for bidx in b_idxs:
            # for each B as pivot, fetch it's corresponding Is
            if (bidx < (len(txt_tok) - 1)):

                counter = 0

                while counter < (len(entity_tok) - 1):
                    # loop till all words in entity have been iterated
                    counter += 1

                    if (bidx + counter > (len(txt_tok) - 1)):
                        # exit if reached end of sentence before looping through all of entity words
                        counter -= 1
                        break
                    if (txt_tok[bidx + counter] != entity_tok[counter]):
                        # exit if any I word doesnt match
                        counter -= 1
                        # check for matching Is
                        break;
                if counter == (len(entity_tok) - 1):
                    # if all I words match then counter should be equal to number of Is
                    # if so, then add the indexes of Is to its corresponding B dict
                    tags_idx[bidx] = [bidx + i for i in range(len(entity_tok))]

        #print(tags_idx)
        # now for every B-I index key val pairs, get BI tags
        for b, i in tags_idx.items():
            if len(i) > 1:
                out_tags[b] = "B"
                for ii in i[1:]:
                    out_tags[ii] = "I"

        return out_tags
    
    
def get_replacement_label(label, label_list):
    
    cands = [x for x in label_list if x != label]
    
    return cands[random.randint(0, len(cands)-1)]

def swap_labels(txt, label, replacement):
    
    return txt.replace(label, replacement)
    
    

In [9]:
def tokenize_and_preserve_labels(sentence, text_labels):
    """
    Handles Bert's sub word encoding
    """
    tokenized_sentence = []
    labels = []

    for word, label in zip(sentence, text_labels):

        # Tokenize the word and count # of subwords the word is broken into
        tokenized_word = tokenizer.tokenize(word)
        n_subwords = len(tokenized_word)

        # Add the tokenized word to the final tokenized word list
        tokenized_sentence.extend(tokenized_word)

        # Add the same label to the new list of labels `n_subwords` times
        labels.extend([label] * n_subwords)

    return tokenized_sentence, labels


### Data Prep

In [10]:
train["tags"] = train.apply(lambda x: _tag_bio(x["sentence"], x["restaurant_name"]), axis=1)

train

Unnamed: 0,sentence,restaurant_name,tags
0,a four star restaurant with a bar,,"[O, O, O, O, O, O, O]"
1,areas that allow smoking,,"[O, O, O, O]"
2,any restaurants that still allow smoking,,"[O, O, O, O, O, O]"
3,are there any restaurants for diabetics that s...,,"[O, O, O, O, O, O, O, O, O, O, O]"
4,can you find east dedham pizzeria that have a ...,east,"[O, O, O, B, O, O, O, O, O, O, O, O, O]"
...,...,...,...
115,any mexican places have a tameles special today,,"[O, O, O, O, O, O, O, O]"
116,do you know if elmos have a dress code,elmos,"[O, O, O, O, B, O, O, O, O]"
117,are there any chicken wing places nearby,,"[O, O, O, O, O, O, O]"
118,are there any vietnamese restaurants nearby,,"[O, O, O, O, O, O]"


In [11]:
train_swap = train.loc[train.restaurant_name != "",:].copy()

label_list = train_swap["restaurant_name"].unique().tolist()

train_swap["rep_label"] = train_swap["restaurant_name"].apply(lambda x: get_replacement_label(x, label_list))

train_swap["sentence"] = train_swap.apply(lambda x: swap_labels(x["sentence"], x["restaurant_name"], 
                                                                x["rep_label"]) , axis = 1)

train_swap["restaurant_name"] = train_swap["rep_label"]

train_swap["tags"] = train_swap.apply(lambda x: _tag_bio(x["sentence"], x["restaurant_name"]), axis=1)

train_swap

Unnamed: 0,sentence,restaurant_name,tags,rep_label
4,can you find barat a nossa casa dedham pizzeri...,barat a nossa casa,"[O, O, O, B, I, I, I, O, O, O, O, O, O, O, O, O]",barat a nossa casa
6,can you tell me where the nearest lone star is,lone star,"[O, O, O, O, O, O, O, B, I, O]",lone star
14,do you think olive garden has fabulous service,olive garden,"[O, O, O, B, I, O, O, O]",olive garden
17,can i get hambers at blue coyote grill,blue coyote grill,"[O, O, O, O, O, B, I, I]",blue coyote grill
19,do you know if there are any reviews on east,east,"[O, O, O, O, O, O, O, O, O, B]",east
24,are there reservations still available for dan...,danny cooks,"[O, O, O, O, O, O, B, I, O, O, O, O, O, O]",danny cooks
26,are there any hotel dining in the city open on...,hotel dining,"[O, O, O, B, I, O, O, O, O, O, O]",hotel dining
27,do i need a reservation for taco bell,taco bell,"[O, O, O, O, O, O, B, I]",taco bell
35,are there any pf changs within 5 minutes drive...,pf changs,"[O, O, O, B, I, O, O, O, O, O, O, O, O, O, O]",pf changs
37,are there any lone star in town,lone star,"[O, O, O, B, I, O, O]",lone star


In [12]:
train = pd.concat([train, train_swap], axis = 0, ignore_index=True)

train = train.sample(frac=1)

train

Unnamed: 0,sentence,restaurant_name,tags,rep_label
65,are there any fun restaurants serving brisket ...,,"[O, O, O, O, O, O, O, O, O]",
95,are there any sushi restaurants near colonel b...,,"[O, O, O, O, O, O, O, O, O]",
48,are there any bars nearby that serve food like...,,"[O, O, O, O, O, O, O, O, O, O, O, O]",
13,are there any places to eat in the area that o...,,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]",
49,are there any turkish restaurants in florida,,"[O, O, O, O, O, O, O]",
...,...,...,...,...
79,can you find me hotel dining with comfort food,hotel dining,"[O, O, O, O, B, I, O, O, O]",
149,are the portion at waterfront large or very small,waterfront,"[O, O, O, O, B, O, O, O, O]",waterfront
17,can i get hambers at lone star,lone star,"[O, O, O, O, O, B, I]",
33,are there any chinese restaurants near cheyenne,,"[O, O, O, O, O, O, O]",


In [13]:
val["tags"] = val.apply(lambda x: _tag_bio(x["sentence"], x["restaurant_name"]), axis=1)

val

Unnamed: 0,sentence,restaurant_name,tags
0,are there any ice cream shops in my neighborho...,,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
1,are there any restaurants within 5 miles that ...,,"[O, O, O, O, O, O, O, O, O, O, O]"
2,are there any locally owned franchises that gi...,,"[O, O, O, O, O, O, O, O, O, O, O]"
3,are there any restaurants that will let me tak...,,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
4,are there any five star restaurants around here,,"[O, O, O, O, O, O, O, O]"
5,do you think the noodle bar is open,noodle bar,"[O, O, O, O, B, I, O, O]"
6,are there any vegetarian restaurants in this town,,"[O, O, O, O, O, O, O, O]"
7,any places around here that has a nice view,,"[O, O, O, O, O, O, O, O, O]"
8,are there any jazz clubs that serve food,,"[O, O, O, O, O, O, O, O]"
9,do any famous people frequent the jimmys pizza...,jimmys pizza,"[O, O, O, O, O, O, B, I, O, O, O, O, O]"


In [14]:
holdout["tags"] = holdout.apply(lambda x: _tag_bio(x["sentence"], x["restaurant_name"]), axis=1)

holdout

Unnamed: 0,sentence,restaurant_name,tags
0,find pizza places,,"[O, O, O]"
1,find me the best rated chinese restaurant in t...,,"[O, O, O, O, O, O, O, O, O, O, O]"
2,what kind of food does abc cafe serve,abc cafe,"[O, O, O, O, O, B, I, O]"
3,how far away is the nearest steak house,,"[O, O, O, O, O, O, O, O]"
4,i am looking for a mexican restuarant that has...,,"[O, O, O, O, O, O, O, O, O, O, O, O]"
...,...,...,...
145,find me brazilian food with on location parking,,"[O, O, O, O, O, O, O, O]"
146,get me to a mexican place,,"[O, O, O, O, O, O]"
147,how far am i from the nearest bagel shop,,"[O, O, O, O, O, O, O, O, O]"
148,what time does sonic open,sonic,"[O, O, O, B, O]"


### Create data loaders

In [15]:
train.tags.apply(len).max()

17

In [16]:
train_sents = train["sentence"].apply(lambda x: x.split()).tolist()

train_labels = train["tags"].tolist()

val_sents = val["sentence"].apply(lambda x: x.split()).tolist()

val_labels = val["tags"].tolist()

holdout_sents = holdout["sentence"].apply(lambda x: x.split()).tolist()

holdout_labels = holdout["tags"].tolist()


In [17]:
train_tokenized_texts_and_labels = [
    tokenize_and_preserve_labels(sent, labs)
    for sent, labs in zip(train_sents, train_labels)
]


In [18]:
val_tokenized_texts_and_labels = [
    tokenize_and_preserve_labels(sent, labs)
    for sent, labs in zip(val_sents, val_labels)
]


In [19]:
holdout_tokenized_texts_and_labels = [
    tokenize_and_preserve_labels(sent, labs)
    for sent, labs in zip(holdout_sents, holdout_labels)
]


In [20]:
train_tokenized_texts = [token_label_pair[0] for token_label_pair in train_tokenized_texts_and_labels]

train_labels = [token_label_pair[1] for token_label_pair in train_tokenized_texts_and_labels]

In [21]:
val_tokenized_texts = [token_label_pair[0] for token_label_pair in val_tokenized_texts_and_labels]

val_labels = [token_label_pair[1] for token_label_pair in val_tokenized_texts_and_labels]

In [22]:
holdout_tokenized_texts = [token_label_pair[0] for token_label_pair in holdout_tokenized_texts_and_labels]

holdout_labels = [token_label_pair[1] for token_label_pair in holdout_tokenized_texts_and_labels]

In [23]:
input_ids_train = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in train_tokenized_texts],
                          maxlen=MAX_LEN, dtype="long", value=0.0,
                          truncating="post", padding="post")

In [24]:
input_ids_val = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in val_tokenized_texts],
                          maxlen=MAX_LEN, dtype="long", value=0.0,
                          truncating="post", padding="post")

In [25]:
input_ids_holdout = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in holdout_tokenized_texts],
                          maxlen=MAX_LEN, dtype="long", value=0.0,
                          truncating="post", padding="post")

In [26]:
tag_values = ["B", "I", "O", "PAD"]

tag2idx = {t: i for i, t in enumerate(tag_values)}

tag2idx

{'B': 0, 'I': 1, 'O': 2, 'PAD': 3}

In [27]:
train_tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in train_labels],
                     maxlen=MAX_LEN, value=tag2idx["PAD"], padding="post",
                     dtype="long", truncating="post")

In [28]:
val_tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in val_labels],
                     maxlen=MAX_LEN, value=tag2idx["PAD"], padding="post",
                     dtype="long", truncating="post")

In [29]:
holdout_tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in holdout_labels],
                     maxlen=MAX_LEN, value=tag2idx["PAD"], padding="post",
                     dtype="long", truncating="post")

In [30]:
train_attention_masks = [[float(i != 0.0) for i in ii] for ii in input_ids_train]

In [31]:
val_attention_masks = [[float(i != 0.0) for i in ii] for ii in input_ids_val]

In [32]:
holdout_attention_masks = [[float(i != 0.0) for i in ii] for ii in input_ids_holdout]

In [33]:
train_inputs = torch.tensor(input_ids_train)

val_inputs = torch.tensor(input_ids_val)

holdout_inputs = torch.tensor(input_ids_holdout)

train_tags = torch.tensor(train_tags)

val_tags = torch.tensor(val_tags)

holdout_tags = torch.tensor(holdout_tags)

train_attention_masks = torch.tensor(train_attention_masks)

val_attention_masks = torch.tensor(val_attention_masks)

holdout_attention_masks = torch.tensor(holdout_attention_masks)

In [34]:
val_inputs.shape, val_tags.shape, val_attention_masks.shape

(torch.Size([30, 50]), torch.Size([30, 50]), torch.Size([30, 50]))

In [35]:
train_data = TensorDataset(train_inputs, train_attention_masks, train_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=bs)

valid_data = TensorDataset(val_inputs, val_attention_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=bs)


### Model Setup

In [36]:
model = BertForTokenClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(tag2idx),
    output_attentions = False,
    output_hidden_states = False
)
#model.cuda();


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

In [37]:
FULL_FINETUNING = True

if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
else:
    param_optimizer = list(model.classifier.named_parameters())
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

optimizer = AdamW(
    optimizer_grouped_parameters,
    lr=LR,
    eps=1e-8
)

In [38]:
from transformers import get_linear_schedule_with_warmup

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs

num_warmup_steps = int(0.1 * total_steps)

print(num_warmup_steps, total_steps)
# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=total_steps
)

160 1600


### Training

In [39]:
## Store the average loss after each epoch so we can plot them.
loss_values, validation_loss_values = [], []

max_f1 = 0

for _ in trange(epochs, desc="Epoch"):
    # ========================================
    #               Training
    # ========================================
    # Perform one full pass over the training set.

    # Put the model into training mode.
    model.train()
    # Reset the total loss for this epoch.
    total_loss = 0

    # Training loop
    for step, batch in enumerate(tqdm(train_dataloader, total = len(train_dataloader), position=0, leave=True)):
        # add batch to gpu
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        # Always clear any previously calculated gradients before performing a backward pass.
        model.zero_grad()
        # forward pass
        # This will return the loss (rather than the model output)
        # because we have provided the `labels`.
        outputs = model(b_input_ids, token_type_ids=None,
                        attention_mask=b_input_mask, labels=b_labels)
        # get the loss
        loss = outputs[0]
        # Perform a backward pass to calculate the gradients.
        loss.backward()
        # track train loss
        total_loss += loss.item()
        # Clip the norm of the gradient
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        # update parameters
        optimizer.step()
        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / len(train_dataloader)
    print("Average train loss: {}".format(avg_train_loss))

    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)


    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.

    # Put the model into evaluation mode
    model.eval()
    # Reset the validation loss for this epoch.
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    predictions , true_labels = [], []
    for batch in valid_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        # Telling the model not to compute or store gradients,
        # saving memory and speeding up validation
        with torch.no_grad():
            # Forward pass, calculate logit predictions.
            # This will return the logits rather than the loss because we have not provided labels.
            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask, labels=b_labels)
        # Move logits and labels to CPU
        logits = outputs[1].detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Calculate the accuracy for this batch of test sentences.
        eval_loss += outputs[0].mean().item()
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        true_labels.extend(label_ids)

    eval_loss = eval_loss / len(valid_dataloader)
    validation_loss_values.append(eval_loss)
    print("Validation loss: {}".format(eval_loss))
    pred_tags = [tag_values[p_i] for p, l in zip(predictions, true_labels)
                                 for p_i, l_i in zip(p, l) if tag_values[l_i] != "PAD"]
    valid_tags = [tag_values[l_i] for l in true_labels
                                  for l_i in l if tag_values[l_i] != "PAD"]
    
    print("val on val")
    f1_val = get_f1_score_on_test_data(model, val)
    print("val on holdout")
    f1 = get_f1_score_on_test_data(model, holdout)
    if (f1_val >= max_f1) & (f1_val > 0):

        #f1 = get_f1_score_on_test_data(model, holdout)        
        print("saving model:", f1, f1_val,max_f1)
        max_f1 = f1_val
        holdout_f1 = f1
        torch.save(model.state_dict(), MODEL_PATH)
        f1_train = get_f1_score_on_test_data(model, train)
        print("Train f1", f1_train)
        
#     print("Validation Accuracy: {}".format(accuracy_score(pred_tags, valid_tags)))
#     print("Validation F1-Score: {}".format(f1_score(pred_tags, valid_tags)))
    print()

100%|███████████████████████████████████████████| 80/80 [01:49<00:00,  1.37s/it]


Average train loss: 0.6986915646819398
Validation loss: 0.39833361698935427
val on val
precision: 0.034482758620689655 | recall 1.0 | f1_score 0.06666666666666667
val on holdout
precision: 0.0 | recall 0 | f1_score 0
saving model: 0 0.06666666666666667 0


Epoch:   5%|█▊                                  | 1/20 [02:11<41:41, 131.66s/it]

precision: 0.0 | recall 0 | f1_score 0
Train f1 0



100%|███████████████████████████████████████████| 80/80 [01:41<00:00,  1.27s/it]


Average train loss: 0.36640763079049066
Validation loss: 0.19192700151664516
val on val
precision: 0.038461538461538464 | recall 0.5 | f1_score 0.07142857142857144
val on holdout
precision: 0.058394160583941604 | recall 0.8888888888888888 | f1_score 0.1095890410958904
saving model: 0.1095890410958904 0.07142857142857144 0.06666666666666667


Epoch:  10%|███▌                                | 2/20 [04:15<38:04, 126.90s/it]

precision: 0.12781954887218044 | recall 0.6538461538461539 | f1_score 0.2138364779874214
Train f1 0.2138364779874214



100%|███████████████████████████████████████████| 80/80 [01:38<00:00,  1.23s/it]


Average train loss: 0.18121092525543644
Validation loss: 0.17013200079866994
val on val
precision: 0.3888888888888889 | recall 0.875 | f1_score 0.5384615384615385
val on holdout
precision: 0.17647058823529413 | recall 0.8333333333333334 | f1_score 0.29126213592233013
saving model: 0.29126213592233013 0.5384615384615385 0.07142857142857144


Epoch:  15%|█████▍                              | 3/20 [06:15<35:05, 123.84s/it]

precision: 0.5052631578947369 | recall 0.8421052631578947 | f1_score 0.631578947368421
Train f1 0.631578947368421



100%|███████████████████████████████████████████| 80/80 [01:38<00:00,  1.23s/it]


Average train loss: 0.11030103148586931
Validation loss: 0.10947073011775502
val on val
precision: 0.47058823529411764 | recall 1.0 | f1_score 0.6399999999999999
val on holdout
precision: 0.336734693877551 | recall 0.868421052631579 | f1_score 0.48529411764705876
saving model: 0.48529411764705876 0.6399999999999999 0.5384615384615385


Epoch:  20%|███████▏                            | 4/20 [08:17<32:52, 123.31s/it]

precision: 0.6082474226804123 | recall 0.9076923076923077 | f1_score 0.728395061728395
Train f1 0.728395061728395



100%|███████████████████████████████████████████| 80/80 [01:52<00:00,  1.41s/it]


Average train loss: 0.0583283292264241
Validation loss: 0.1782504402033131
val on val
precision: 0.5 | recall 1.0 | f1_score 0.6666666666666666
val on holdout
precision: 0.3148148148148148 | recall 0.8717948717948718 | f1_score 0.4625850340136054
saving model: 0.4625850340136054 0.6666666666666666 0.6399999999999999


Epoch:  25%|█████████                           | 5/20 [10:34<32:03, 128.24s/it]

precision: 0.7291666666666666 | recall 0.9859154929577465 | f1_score 0.8383233532934132
Train f1 0.8383233532934132



100%|███████████████████████████████████████████| 80/80 [01:44<00:00,  1.30s/it]


Average train loss: 0.043833411149535095
Validation loss: 0.1361959441822061
val on val
precision: 0.5 | recall 1.0 | f1_score 0.6666666666666666
val on holdout
precision: 0.3627450980392157 | recall 0.925 | f1_score 0.5211267605633803
saving model: 0.5211267605633803 0.6666666666666666 0.6666666666666666


Epoch:  30%|██████████▊                         | 6/20 [12:44<30:02, 128.72s/it]

precision: 0.69 | recall 0.971830985915493 | f1_score 0.8070175438596492
Train f1 0.8070175438596492



100%|███████████████████████████████████████████| 80/80 [01:39<00:00,  1.25s/it]


Average train loss: 0.02727573043976008
Validation loss: 0.1761913458215228
val on val
precision: 0.5 | recall 1.0 | f1_score 0.6666666666666666
val on holdout
precision: 0.37 | recall 0.925 | f1_score 0.5285714285714286
saving model: 0.5285714285714286 0.6666666666666666 0.6666666666666666


Epoch:  35%|████████████▌                       | 7/20 [14:45<27:20, 126.18s/it]

precision: 0.7142857142857143 | recall 0.9722222222222222 | f1_score 0.8235294117647058
Train f1 0.8235294117647058



100%|███████████████████████████████████████████| 80/80 [01:39<00:00,  1.24s/it]


Average train loss: 0.022179130338918183
Validation loss: 0.1422317713770705
val on val
precision: 0.5 | recall 1.0 | f1_score 0.6666666666666666
val on holdout
precision: 0.31896551724137934 | recall 0.9487179487179487 | f1_score 0.47741935483870973
saving model: 0.47741935483870973 0.6666666666666666 0.6666666666666666


Epoch:  40%|██████████████▍                     | 8/20 [16:46<24:53, 124.47s/it]

precision: 0.6666666666666666 | recall 0.9722222222222222 | f1_score 0.7909604519774012
Train f1 0.7909604519774012



100%|███████████████████████████████████████████| 80/80 [01:41<00:00,  1.27s/it]


Average train loss: 0.01740581281328559
Validation loss: 0.18656121977692236
val on val
precision: 0.5 | recall 1.0 | f1_score 0.6666666666666666
val on holdout
precision: 0.3394495412844037 | recall 0.9487179487179487 | f1_score 0.5
saving model: 0.5 0.6666666666666666 0.6666666666666666


Epoch:  45%|████████████████▏                   | 9/20 [18:50<22:49, 124.47s/it]

precision: 0.6893203883495146 | recall 0.9726027397260274 | f1_score 0.8068181818181818
Train f1 0.8068181818181818



100%|███████████████████████████████████████████| 80/80 [01:42<00:00,  1.28s/it]


Average train loss: 0.014178818261098059
Validation loss: 0.15812852947589515
val on val
precision: 0.5 | recall 1.0 | f1_score 0.6666666666666666
val on holdout
precision: 0.3185840707964602 | recall 0.9230769230769231 | f1_score 0.4736842105263158
saving model: 0.4736842105263158 0.6666666666666666 0.6666666666666666


Epoch:  50%|█████████████████▌                 | 10/20 [20:58<20:55, 125.53s/it]

precision: 0.6796116504854369 | recall 0.9722222222222222 | f1_score 0.8
Train f1 0.8



100%|███████████████████████████████████████████| 80/80 [01:41<00:00,  1.26s/it]


Average train loss: 0.006255825162952533
Validation loss: 0.1916019760998703
val on val
precision: 0.5 | recall 1.0 | f1_score 0.6666666666666666
val on holdout
precision: 0.31092436974789917 | recall 0.9487179487179487 | f1_score 0.46835443037974683
saving model: 0.46835443037974683 0.6666666666666666 0.6666666666666666


Epoch:  55%|███████████████████▎               | 11/20 [23:01<18:41, 124.59s/it]

precision: 0.660377358490566 | recall 0.9722222222222222 | f1_score 0.7865168539325842
Train f1 0.7865168539325842



100%|███████████████████████████████████████████| 80/80 [01:40<00:00,  1.25s/it]


Average train loss: 0.0009738019827636891
Validation loss: 0.17337724138172536
val on val
precision: 0.47368421052631576 | recall 1.0 | f1_score 0.6428571428571429
val on holdout


Epoch:  60%|█████████████████████              | 12/20 [24:52<16:04, 120.58s/it]

precision: 0.29508196721311475 | recall 0.9473684210526315 | f1_score 0.45



100%|███████████████████████████████████████████| 80/80 [01:38<00:00,  1.23s/it]


Average train loss: 0.009410615123124444
Validation loss: 0.23267632916249567
val on val
precision: 0.5 | recall 1.0 | f1_score 0.6666666666666666
val on holdout
precision: 0.3135593220338983 | recall 0.9487179487179487 | f1_score 0.4713375796178344
saving model: 0.4713375796178344 0.6666666666666666 0.6666666666666666


Epoch:  65%|██████████████████████▊            | 13/20 [26:53<14:03, 120.54s/it]

precision: 0.6601941747572816 | recall 0.9714285714285714 | f1_score 0.7861271676300579
Train f1 0.7861271676300579



100%|███████████████████████████████████████████| 80/80 [01:37<00:00,  1.22s/it]


Average train loss: 0.003977217415376799
Validation loss: 0.2390364848038492
val on val
precision: 0.5 | recall 1.0 | f1_score 0.6666666666666666
val on holdout
precision: 0.3135593220338983 | recall 0.9487179487179487 | f1_score 0.4713375796178344
saving model: 0.4713375796178344 0.6666666666666666 0.6666666666666666


Epoch:  70%|████████████████████████▌          | 14/20 [28:52<12:01, 120.27s/it]

precision: 0.6538461538461539 | recall 0.9714285714285714 | f1_score 0.7816091954022988
Train f1 0.7816091954022988



100%|███████████████████████████████████████████| 80/80 [01:40<00:00,  1.25s/it]


Average train loss: 0.0005255963004856312
Validation loss: 0.23061925681467982
val on val
precision: 0.5 | recall 1.0 | f1_score 0.6666666666666666
val on holdout
precision: 0.30327868852459017 | recall 0.9487179487179487 | f1_score 0.45962732919254656
saving model: 0.45962732919254656 0.6666666666666666 0.6666666666666666


Epoch:  75%|██████████████████████████▎        | 15/20 [30:55<10:05, 121.13s/it]

precision: 0.6509433962264151 | recall 0.971830985915493 | f1_score 0.7796610169491526
Train f1 0.7796610169491526



100%|███████████████████████████████████████████| 80/80 [01:36<00:00,  1.21s/it]


Average train loss: 0.005995989197162999
Validation loss: 0.23834746289843073
val on val
precision: 0.5 | recall 1.0 | f1_score 0.6666666666666666
val on holdout
precision: 0.3135593220338983 | recall 0.9487179487179487 | f1_score 0.4713375796178344
saving model: 0.4713375796178344 0.6666666666666666 0.6666666666666666


Epoch:  80%|████████████████████████████       | 16/20 [32:54<08:01, 120.40s/it]

precision: 0.6699029126213593 | recall 0.971830985915493 | f1_score 0.793103448275862
Train f1 0.793103448275862



100%|███████████████████████████████████████████| 80/80 [01:37<00:00,  1.22s/it]


Average train loss: 0.010961201936061116
Validation loss: 0.23949597134948514
val on val
precision: 0.5 | recall 1.0 | f1_score 0.6666666666666666
val on holdout
precision: 0.3162393162393162 | recall 0.9487179487179487 | f1_score 0.47435897435897434
saving model: 0.47435897435897434 0.6666666666666666 0.6666666666666666


Epoch:  85%|█████████████████████████████▊     | 17/20 [34:54<06:00, 120.31s/it]

precision: 0.6601941747572816 | recall 0.9714285714285714 | f1_score 0.7861271676300579
Train f1 0.7861271676300579



100%|███████████████████████████████████████████| 80/80 [01:42<00:00,  1.29s/it]


Average train loss: 0.0069971776879356185
Validation loss: 0.24058005456754472
val on val
precision: 0.5 | recall 1.0 | f1_score 0.6666666666666666
val on holdout
precision: 0.30833333333333335 | recall 0.9487179487179487 | f1_score 0.46540880503144655
saving model: 0.46540880503144655 0.6666666666666666 0.6666666666666666


Epoch:  90%|███████████████████████████████▌   | 18/20 [37:01<04:04, 122.34s/it]

precision: 0.6476190476190476 | recall 0.9714285714285714 | f1_score 0.7771428571428572
Train f1 0.7771428571428572



100%|███████████████████████████████████████████| 80/80 [01:49<00:00,  1.37s/it]


Average train loss: 0.0003849779498523276
Validation loss: 0.23902942387697598
val on val
precision: 0.5 | recall 1.0 | f1_score 0.6666666666666666
val on holdout
precision: 0.30833333333333335 | recall 0.9487179487179487 | f1_score 0.46540880503144655
saving model: 0.46540880503144655 0.6666666666666666 0.6666666666666666


Epoch:  95%|█████████████████████████████████▎ | 19/20 [39:17<02:06, 126.45s/it]

precision: 0.6415094339622641 | recall 0.9714285714285714 | f1_score 0.7727272727272727
Train f1 0.7727272727272727



100%|███████████████████████████████████████████| 80/80 [01:50<00:00,  1.38s/it]


Average train loss: 0.002769723329674889
Validation loss: 0.2385231912014812
val on val
precision: 0.5 | recall 1.0 | f1_score 0.6666666666666666
val on holdout
precision: 0.30833333333333335 | recall 0.9487179487179487 | f1_score 0.46540880503144655
saving model: 0.46540880503144655 0.6666666666666666 0.6666666666666666


Epoch: 100%|███████████████████████████████████| 20/20 [41:34<00:00, 124.72s/it]

precision: 0.6415094339622641 | recall 0.9714285714285714 | f1_score 0.7727272727272727
Train f1 0.7727272727272727






### Evauation

In [40]:
holdout_f1

0.46540880503144655

In [41]:
get_f1_score_on_test_data(model, train)

precision: 0.6415094339622641 | recall 0.9714285714285714 | f1_score 0.7727272727272727


0.7727272727272727

In [42]:
get_f1_score_on_test_data(model, holdout)

precision: 0.30833333333333335 | recall 0.9487179487179487 | f1_score 0.46540880503144655


0.46540880503144655

In [45]:
# MODEL_PATH = "test_local_model"

In [46]:
model = BertForTokenClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(tag2idx),
    output_attentions = False,
    output_hidden_states = False
)
model.load_state_dict(torch.load(MODEL_PATH))

#model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

<All keys matched successfully>

In [47]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [48]:
get_f1_score_on_test_data(model, holdout)

precision: 0.30833333333333335 | recall 0.9487179487179487 | f1_score 0.46540880503144655


0.46540880503144655