In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm, trange
import os, sys
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertConfig

from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

torch.__version__

2021-09-13 23:11:24.192735: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-09-13 23:11:24.192768: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


'1.8.1+cu102'

In [2]:
import transformers
from transformers import BertForTokenClassification, AdamW

transformers.__version__

'4.7.0'

In [3]:
MAX_LEN = 50
LR = 3e-5
bs = 2
epochs = 20
max_grad_norm = 1.0
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

#path to save model
MODEL_PATH = "bert_base_ner_model"

In [4]:
import pandas as pd
train = pd.read_csv('restaurants_train.csv')
val = pd.read_csv('restaurants_val.csv')
holdout = pd.read_csv('restaurants_holdout.csv')
# train = train.fillna('')
# val = val.fillna('')
# holdout = holdout.fillna('')

In [5]:
train = train.fillna('')
val = val.fillna('')
holdout = holdout.fillna('')

In [6]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [7]:
################################
# DO NOT CHANGE THIS FUNCTION! #
################################

def get_f1_score_on_test_data(model, data):
    model.eval()
    true_positives = 0
    false_positives = 0
    false_negatives = 0
    for index, row in data.iterrows():
        sentence = row.sentence
        expected = row.restaurant_name
        #inputs = tokenizer([sentence], max_length=1024, return_tensors='pt')
        predicted = generate_outputs(model, sentence)
        if expected != '' and expected == predicted:
            true_positives += 1
        if expected != '' and expected != predicted:
            false_positives += 1
        if expected == '' and predicted != '':
            false_positives += 1
        if expected != '' and predicted == '':
            false_negatives += 1

    precision = 0
    recall = 0
    f1_score = 0
    if true_positives + false_positives:
        precision = true_positives / (true_positives + false_positives)
    if true_positives + false_negatives:
        recall = true_positives /(true_positives + false_negatives)
    if precision + recall:
        f1_score = 2 * precision * recall / (precision + recall)

    print(f'precision: {precision} | recall {recall} | f1_score {f1_score}')
    return f1_score


def convert_bio_text(words, tags):
    """
    Converts BIO tags to text
    """
    prev_tag = "O"
    extracted_dict = {}
    count = 0
    for x,t in zip(words, tags):
        if prev_tag == "O":
            if t == "B":
                prev_tag = "B"
                extracted_dict[count] = [x]
        else:
            if t == "I":
                extracted_dict[count].append(x)
                prev_tag = "I"
            elif t == "B" :
                prev_tag = "B"
                count += 1
                extracted_dict[count] = [x]
            else:
                count += 1
                prev_tag = "O"
                
    for i in extracted_dict.keys():
        extracted_dict[i] = " ".join(extracted_dict[i])
    return list(extracted_dict.values())

def generate_outputs(model, sentence, device="cpu"):
    
    """
    Predicts on a sentence and converts predicted BIO tags to text
    """
    
    tokenized_sentence = tokenizer.encode(sentence)
    
    input_ids = torch.tensor([tokenized_sentence]).to(device)
    
    with torch.no_grad():
        
        output = model(input_ids)
        
    label_indices = np.argmax(output[0].to('cpu').numpy(), axis=2)
    
    tokens = tokenizer.convert_ids_to_tokens(input_ids.to('cpu').numpy()[0])

    new_tokens, new_labels = [], []

    for token, label_idx in zip(tokens, label_indices[0]):

        if token.startswith("##"):

            new_tokens[-1] = new_tokens[-1] + token[2:]

        else:

            new_labels.append(tag_values[label_idx])

            new_tokens.append(token)
            
    #return " ".join(convert_bio_text(new_tokens, new_labels)), new_labels
    
    out = convert_bio_text(new_tokens, new_labels)
    
    if len(out) > 0:
        
        return out[0]
    else:
        return ""
    

In [8]:
def _tag_bio(txt, entity):    
        """
        Given a text and an entity, creates BIO tags for that entity.
        """

        entity_tok = entity.split()

        txt_tok = txt.split()

        out_tags = ["O"]*len(txt_tok)
        
        if entity == "":
            return out_tags

        b_idxs = []

        # Fetch indices of B
        for i, w in enumerate(txt_tok):
            if w == entity_tok[0]:
                b_idxs.append(i)

        #print(b_idxs, txt_tok)

        # if entity is just of 1 word, then tag Bs appropriately and exit
        if len(entity_tok) == 1:
            for b in b_idxs:
                out_tags[b] = "B"
            return out_tags

        # ------- Tagging Is-----#
        tags_idx = {}
        for bidx in b_idxs:
            # for each B as pivot, fetch it's corresponding Is
            if (bidx < (len(txt_tok) - 1)):

                counter = 0

                while counter < (len(entity_tok) - 1):
                    # loop till all words in entity have been iterated
                    counter += 1

                    if (bidx + counter > (len(txt_tok) - 1)):
                        # exit if reached end of sentence before looping through all of entity words
                        counter -= 1
                        break
                    if (txt_tok[bidx + counter] != entity_tok[counter]):
                        # exit if any I word doesnt match
                        counter -= 1
                        # check for matching Is
                        break;
                if counter == (len(entity_tok) - 1):
                    # if all I words match then counter should be equal to number of Is
                    # if so, then add the indexes of Is to its corresponding B dict
                    tags_idx[bidx] = [bidx + i for i in range(len(entity_tok))]

        #print(tags_idx)
        # now for every B-I index key val pairs, get BI tags
        for b, i in tags_idx.items():
            if len(i) > 1:
                out_tags[b] = "B"
                for ii in i[1:]:
                    out_tags[ii] = "I"

        return out_tags

In [9]:
def tokenize_and_preserve_labels(sentence, text_labels):
    """
    Handles Bert's sub word encoding
    """
    tokenized_sentence = []
    labels = []

    for word, label in zip(sentence, text_labels):

        # Tokenize the word and count # of subwords the word is broken into
        tokenized_word = tokenizer.tokenize(word)
        n_subwords = len(tokenized_word)

        # Add the tokenized word to the final tokenized word list
        tokenized_sentence.extend(tokenized_word)

        # Add the same label to the new list of labels `n_subwords` times
        labels.extend([label] * n_subwords)

    return tokenized_sentence, labels


In [10]:
train["tags"] = train.apply(lambda x: _tag_bio(x["sentence"], x["restaurant_name"]), axis=1)

train

Unnamed: 0,sentence,restaurant_name,tags
0,a four star restaurant with a bar,,"[O, O, O, O, O, O, O]"
1,areas that allow smoking,,"[O, O, O, O]"
2,any restaurants that still allow smoking,,"[O, O, O, O, O, O]"
3,are there any restaurants for diabetics that s...,,"[O, O, O, O, O, O, O, O, O, O, O]"
4,can you find east dedham pizzeria that have a ...,east,"[O, O, O, B, O, O, O, O, O, O, O, O, O]"
...,...,...,...
115,any mexican places have a tameles special today,,"[O, O, O, O, O, O, O, O]"
116,do you know if elmos have a dress code,elmos,"[O, O, O, O, B, O, O, O, O]"
117,are there any chicken wing places nearby,,"[O, O, O, O, O, O, O]"
118,are there any vietnamese restaurants nearby,,"[O, O, O, O, O, O]"


In [11]:
val["tags"] = val.apply(lambda x: _tag_bio(x["sentence"], x["restaurant_name"]), axis=1)

val

Unnamed: 0,sentence,restaurant_name,tags
0,are there any ice cream shops in my neighborho...,,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
1,are there any restaurants within 5 miles that ...,,"[O, O, O, O, O, O, O, O, O, O, O]"
2,are there any locally owned franchises that gi...,,"[O, O, O, O, O, O, O, O, O, O, O]"
3,are there any restaurants that will let me tak...,,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
4,are there any five star restaurants around here,,"[O, O, O, O, O, O, O, O]"
5,do you think the noodle bar is open,noodle bar,"[O, O, O, O, B, I, O, O]"
6,are there any vegetarian restaurants in this town,,"[O, O, O, O, O, O, O, O]"
7,any places around here that has a nice view,,"[O, O, O, O, O, O, O, O, O]"
8,are there any jazz clubs that serve food,,"[O, O, O, O, O, O, O, O]"
9,do any famous people frequent the jimmys pizza...,jimmys pizza,"[O, O, O, O, O, O, B, I, O, O, O, O, O]"


In [12]:
holdout["tags"] = holdout.apply(lambda x: _tag_bio(x["sentence"], x["restaurant_name"]), axis=1)

holdout

Unnamed: 0,sentence,restaurant_name,tags
0,find pizza places,,"[O, O, O]"
1,find me the best rated chinese restaurant in t...,,"[O, O, O, O, O, O, O, O, O, O, O]"
2,what kind of food does abc cafe serve,abc cafe,"[O, O, O, O, O, B, I, O]"
3,how far away is the nearest steak house,,"[O, O, O, O, O, O, O, O]"
4,i am looking for a mexican restuarant that has...,,"[O, O, O, O, O, O, O, O, O, O, O, O]"
...,...,...,...
145,find me brazilian food with on location parking,,"[O, O, O, O, O, O, O, O]"
146,get me to a mexican place,,"[O, O, O, O, O, O]"
147,how far am i from the nearest bagel shop,,"[O, O, O, O, O, O, O, O, O]"
148,what time does sonic open,sonic,"[O, O, O, B, O]"


### Create data loaders

In [13]:
train.tags.apply(len).max()

17

In [14]:
train_sents = train["sentence"].apply(lambda x: x.split()).tolist()

train_labels = train["tags"].tolist()

val_sents = val["sentence"].apply(lambda x: x.split()).tolist()

val_labels = val["tags"].tolist()

holdout_sents = holdout["sentence"].apply(lambda x: x.split()).tolist()

holdout_labels = holdout["tags"].tolist()


In [15]:
train_tokenized_texts_and_labels = [
    tokenize_and_preserve_labels(sent, labs)
    for sent, labs in zip(train_sents, train_labels)
]


In [16]:
val_tokenized_texts_and_labels = [
    tokenize_and_preserve_labels(sent, labs)
    for sent, labs in zip(val_sents, val_labels)
]


In [17]:
holdout_tokenized_texts_and_labels = [
    tokenize_and_preserve_labels(sent, labs)
    for sent, labs in zip(holdout_sents, holdout_labels)
]


In [18]:
train_tokenized_texts = [token_label_pair[0] for token_label_pair in train_tokenized_texts_and_labels]

train_labels = [token_label_pair[1] for token_label_pair in train_tokenized_texts_and_labels]

In [19]:
val_tokenized_texts = [token_label_pair[0] for token_label_pair in val_tokenized_texts_and_labels]

val_labels = [token_label_pair[1] for token_label_pair in val_tokenized_texts_and_labels]

In [20]:
holdout_tokenized_texts = [token_label_pair[0] for token_label_pair in holdout_tokenized_texts_and_labels]

holdout_labels = [token_label_pair[1] for token_label_pair in holdout_tokenized_texts_and_labels]

In [21]:
input_ids_train = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in train_tokenized_texts],
                          maxlen=MAX_LEN, dtype="long", value=0.0,
                          truncating="post", padding="post")

In [22]:
input_ids_val = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in val_tokenized_texts],
                          maxlen=MAX_LEN, dtype="long", value=0.0,
                          truncating="post", padding="post")

In [23]:
input_ids_holdout = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in holdout_tokenized_texts],
                          maxlen=MAX_LEN, dtype="long", value=0.0,
                          truncating="post", padding="post")

In [24]:
tag_values = ["B", "I", "O", "PAD"]

tag2idx = {t: i for i, t in enumerate(tag_values)}

tag2idx

{'B': 0, 'I': 1, 'O': 2, 'PAD': 3}

In [25]:
train_tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in train_labels],
                     maxlen=MAX_LEN, value=tag2idx["PAD"], padding="post",
                     dtype="long", truncating="post")

In [26]:
val_tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in val_labels],
                     maxlen=MAX_LEN, value=tag2idx["PAD"], padding="post",
                     dtype="long", truncating="post")

In [27]:
holdout_tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in holdout_labels],
                     maxlen=MAX_LEN, value=tag2idx["PAD"], padding="post",
                     dtype="long", truncating="post")

In [28]:
train_attention_masks = [[float(i != 0.0) for i in ii] for ii in input_ids_train]

In [29]:
val_attention_masks = [[float(i != 0.0) for i in ii] for ii in input_ids_val]

In [30]:
holdout_attention_masks = [[float(i != 0.0) for i in ii] for ii in input_ids_holdout]

In [31]:
train_inputs = torch.tensor(input_ids_train)

val_inputs = torch.tensor(input_ids_val)

holdout_inputs = torch.tensor(input_ids_holdout)

train_tags = torch.tensor(train_tags)

val_tags = torch.tensor(val_tags)

holdout_tags = torch.tensor(holdout_tags)

train_attention_masks = torch.tensor(train_attention_masks)

val_attention_masks = torch.tensor(val_attention_masks)

holdout_attention_masks = torch.tensor(holdout_attention_masks)

In [32]:
val_inputs.shape, val_tags.shape, val_attention_masks.shape

(torch.Size([30, 50]), torch.Size([30, 50]), torch.Size([30, 50]))

In [33]:
train_data = TensorDataset(train_inputs, train_attention_masks, train_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=bs)

valid_data = TensorDataset(val_inputs, val_attention_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=bs)


### Model Setup

In [34]:
model = BertForTokenClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(tag2idx),
    output_attentions = False,
    output_hidden_states = False
)
#model.cuda();


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

In [35]:
FULL_FINETUNING = True

if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
else:
    param_optimizer = list(model.classifier.named_parameters())
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

optimizer = AdamW(
    optimizer_grouped_parameters,
    lr=LR,
    eps=1e-8
)

In [36]:
from transformers import get_linear_schedule_with_warmup

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs

num_warmup_steps = int(0.1 * total_steps)

print(num_warmup_steps, total_steps)
# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=total_steps
)

120 1200


### Training

In [37]:
## Store the average loss after each epoch so we can plot them.
loss_values, validation_loss_values = [], []

max_f1 = 0

for _ in trange(epochs, desc="Epoch"):
    # ========================================
    #               Training
    # ========================================
    # Perform one full pass over the training set.

    # Put the model into training mode.
    model.train()
    # Reset the total loss for this epoch.
    total_loss = 0

    # Training loop
    for step, batch in enumerate(tqdm(train_dataloader, total = len(train_dataloader), position=0, leave=True)):
        # add batch to gpu
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        # Always clear any previously calculated gradients before performing a backward pass.
        model.zero_grad()
        # forward pass
        # This will return the loss (rather than the model output)
        # because we have provided the `labels`.
        outputs = model(b_input_ids, token_type_ids=None,
                        attention_mask=b_input_mask, labels=b_labels)
        # get the loss
        loss = outputs[0]
        # Perform a backward pass to calculate the gradients.
        loss.backward()
        # track train loss
        total_loss += loss.item()
        # Clip the norm of the gradient
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        # update parameters
        optimizer.step()
        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / len(train_dataloader)
    print("Average train loss: {}".format(avg_train_loss))

    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)


    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.

    # Put the model into evaluation mode
    model.eval()
    # Reset the validation loss for this epoch.
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    predictions , true_labels = [], []
    for batch in valid_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        # Telling the model not to compute or store gradients,
        # saving memory and speeding up validation
        with torch.no_grad():
            # Forward pass, calculate logit predictions.
            # This will return the logits rather than the loss because we have not provided labels.
            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask, labels=b_labels)
        # Move logits and labels to CPU
        logits = outputs[1].detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Calculate the accuracy for this batch of test sentences.
        eval_loss += outputs[0].mean().item()
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        true_labels.extend(label_ids)

    eval_loss = eval_loss / len(valid_dataloader)
    validation_loss_values.append(eval_loss)
    print("Validation loss: {}".format(eval_loss))
    pred_tags = [tag_values[p_i] for p, l in zip(predictions, true_labels)
                                 for p_i, l_i in zip(p, l) if tag_values[l_i] != "PAD"]
    valid_tags = [tag_values[l_i] for l in true_labels
                                  for l_i in l if tag_values[l_i] != "PAD"]
    
    print("val on val")
    f1_val = get_f1_score_on_test_data(model, val)
    print("val on holdout")
    f1 = get_f1_score_on_test_data(model, holdout)
    if (f1_val >= max_f1) & (f1_val > 0):

        #f1 = get_f1_score_on_test_data(model, holdout)        
        print("saving model:", f1, f1_val,max_f1)
        max_f1 = f1_val
        holdout_f1 = f1
        torch.save(model.state_dict(), MODEL_PATH)
        f1_train = get_f1_score_on_test_data(model, train)
        print("Train f1", f1_train)
        
#     print("Validation Accuracy: {}".format(accuracy_score(pred_tags, valid_tags)))
#     print("Validation F1-Score: {}".format(f1_score(pred_tags, valid_tags)))
    print()

100%|███████████████████████████████████████████| 60/60 [01:16<00:00,  1.28s/it]


Average train loss: 0.6761328076478094
Validation loss: 0.4444347341855367
val on val
precision: 0.0 | recall 0.0 | f1_score 0
val on holdout


Epoch:   5%|█▊                                   | 1/20 [01:29<28:19, 89.45s/it]

precision: 0.0 | recall 0.0 | f1_score 0



100%|███████████████████████████████████████████| 60/60 [01:17<00:00,  1.29s/it]


Average train loss: 0.3033017841167748
Validation loss: 0.27279622706895074
val on val
precision: 0.0 | recall 0.0 | f1_score 0
val on holdout


Epoch:  10%|███▋                                 | 2/20 [02:57<26:39, 88.86s/it]

precision: 0.0 | recall 0.0 | f1_score 0



100%|███████████████████████████████████████████| 60/60 [01:16<00:00,  1.27s/it]


Average train loss: 0.1896519787425253
Validation loss: 0.18715094078021746
val on val
precision: 0.1875 | recall 0.375 | f1_score 0.25
val on holdout
precision: 0.12307692307692308 | recall 0.19047619047619047 | f1_score 0.14953271028037382
saving model: 0.14953271028037382 0.25 0


Epoch:  15%|█████▌                               | 3/20 [04:34<26:06, 92.17s/it]

precision: 0.2608695652173913 | recall 0.375 | f1_score 0.30769230769230765
Train f1 0.30769230769230765



100%|███████████████████████████████████████████| 60/60 [01:14<00:00,  1.25s/it]


Average train loss: 0.11539823771066343
Validation loss: 0.25113359331541385
val on val
precision: 0.08333333333333333 | recall 0.2 | f1_score 0.11764705882352941
val on holdout


Epoch:  20%|███████▍                             | 4/20 [05:59<23:53, 89.59s/it]

precision: 0.18055555555555555 | recall 0.325 | f1_score 0.23214285714285715



100%|███████████████████████████████████████████| 60/60 [01:14<00:00,  1.24s/it]


Average train loss: 0.0942740190618982
Validation loss: 0.147504563141653
val on val
precision: 0.4117647058823529 | recall 1.0 | f1_score 0.5833333333333334
val on holdout
precision: 0.21428571428571427 | recall 0.6666666666666666 | f1_score 0.3243243243243243
saving model: 0.3243243243243243 0.5833333333333334 0.25


Epoch:  25%|█████████▎                           | 5/20 [07:33<22:47, 91.18s/it]

precision: 0.41935483870967744 | recall 0.7878787878787878 | f1_score 0.5473684210526315
Train f1 0.5473684210526315



100%|███████████████████████████████████████████| 60/60 [01:15<00:00,  1.26s/it]


Average train loss: 0.0599842900444249
Validation loss: 0.19101326429323914
val on val
precision: 0.3 | recall 1.0 | f1_score 0.4615384615384615
val on holdout


Epoch:  30%|███████████                          | 6/20 [08:59<20:53, 89.53s/it]

precision: 0.16806722689075632 | recall 0.6451612903225806 | f1_score 0.2666666666666667



100%|███████████████████████████████████████████| 60/60 [01:14<00:00,  1.24s/it]


Average train loss: 0.05552342402782718
Validation loss: 0.2886834773501808
val on val
precision: 0.4117647058823529 | recall 1.0 | f1_score 0.5833333333333334
val on holdout
precision: 0.2545454545454545 | recall 0.717948717948718 | f1_score 0.3758389261744966
saving model: 0.3758389261744966 0.5833333333333334 0.5833333333333334


Epoch:  35%|████████████▉                        | 7/20 [10:34<19:46, 91.25s/it]

precision: 0.6296296296296297 | recall 0.9444444444444444 | f1_score 0.7555555555555556
Train f1 0.7555555555555556



100%|███████████████████████████████████████████| 60/60 [01:18<00:00,  1.31s/it]


Average train loss: 0.047494667185189125
Validation loss: 0.34538539645582206
val on val
precision: 0.47368421052631576 | recall 1.0 | f1_score 0.6428571428571429
val on holdout
precision: 0.23529411764705882 | recall 0.717948717948718 | f1_score 0.3544303797468355
saving model: 0.3544303797468355 0.6428571428571429 0.5833333333333334


Epoch:  40%|██████████████▊                      | 8/20 [12:16<18:54, 94.58s/it]

precision: 0.6551724137931034 | recall 1.0 | f1_score 0.7916666666666666
Train f1 0.7916666666666666



100%|███████████████████████████████████████████| 60/60 [01:18<00:00,  1.31s/it]


Average train loss: 0.026732550378073937
Validation loss: 0.30974295511453726
val on val
precision: 0.5 | recall 1.0 | f1_score 0.6666666666666666
val on holdout
precision: 0.23728813559322035 | recall 0.717948717948718 | f1_score 0.356687898089172
saving model: 0.356687898089172 0.6666666666666666 0.6428571428571429


Epoch:  45%|████████████████▋                    | 9/20 [13:55<17:36, 96.07s/it]

precision: 0.6 | recall 0.972972972972973 | f1_score 0.7422680412371134
Train f1 0.7422680412371134



100%|███████████████████████████████████████████| 60/60 [01:14<00:00,  1.24s/it]


Average train loss: 0.01644582487618512
Validation loss: 0.1938777434991304
val on val
precision: 0.3333333333333333 | recall 1.0 | f1_score 0.5
val on holdout


Epoch:  50%|██████████████████                  | 10/20 [15:22<15:31, 93.14s/it]

precision: 0.24193548387096775 | recall 0.8571428571428571 | f1_score 0.37735849056603776



100%|███████████████████████████████████████████| 60/60 [01:22<00:00,  1.38s/it]


Average train loss: 0.011206025500359828
Validation loss: 0.22421969663143196
val on val
precision: 0.3333333333333333 | recall 1.0 | f1_score 0.5
val on holdout


Epoch:  55%|███████████████████▊                | 11/20 [16:59<14:08, 94.22s/it]

precision: 0.23770491803278687 | recall 0.8055555555555556 | f1_score 0.36708860759493667



100%|███████████████████████████████████████████| 60/60 [01:20<00:00,  1.35s/it]


Average train loss: 0.011805310508376958
Validation loss: 0.19368935265301843
val on val
precision: 0.38095238095238093 | recall 1.0 | f1_score 0.5517241379310345
val on holdout


Epoch:  60%|█████████████████████▌              | 12/20 [18:32<12:30, 93.86s/it]

precision: 0.2601626016260163 | recall 0.8648648648648649 | f1_score 0.4



100%|███████████████████████████████████████████| 60/60 [01:17<00:00,  1.29s/it]


Average train loss: 0.006989277739194222
Validation loss: 0.2412122503403225
val on val
precision: 0.3888888888888889 | recall 1.0 | f1_score 0.56
val on holdout


Epoch:  65%|███████████████████████▍            | 13/20 [20:03<10:51, 93.03s/it]

precision: 0.24369747899159663 | recall 0.7837837837837838 | f1_score 0.37179487179487175



100%|███████████████████████████████████████████| 60/60 [01:20<00:00,  1.34s/it]


Average train loss: 0.0062565624074826095
Validation loss: 0.2616001856835889
val on val
precision: 0.4444444444444444 | recall 1.0 | f1_score 0.6153846153846153
val on holdout


Epoch:  70%|█████████████████████████▏          | 14/20 [21:35<09:17, 92.93s/it]

precision: 0.2457627118644068 | recall 0.7837837837837838 | f1_score 0.3741935483870968



100%|███████████████████████████████████████████| 60/60 [01:20<00:00,  1.34s/it]


Average train loss: 0.00479124087226713
Validation loss: 0.2984132166066653
val on val
precision: 0.3888888888888889 | recall 1.0 | f1_score 0.56
val on holdout


Epoch:  75%|███████████████████████████         | 15/20 [23:08<07:44, 92.80s/it]

precision: 0.25 | recall 0.7837837837837838 | f1_score 0.3790849673202614



100%|███████████████████████████████████████████| 60/60 [01:25<00:00,  1.42s/it]


Average train loss: 0.008106452612143282
Validation loss: 0.2555449088443614
val on val
precision: 0.3888888888888889 | recall 1.0 | f1_score 0.56
val on holdout


Epoch:  80%|████████████████████████████▊       | 16/20 [24:46<06:17, 94.38s/it]

precision: 0.24166666666666667 | recall 0.8055555555555556 | f1_score 0.3717948717948718



100%|███████████████████████████████████████████| 60/60 [01:23<00:00,  1.40s/it]


Average train loss: 0.00104408281428429
Validation loss: 0.24524865201553137
val on val
precision: 0.3888888888888889 | recall 1.0 | f1_score 0.56
val on holdout


Epoch:  85%|██████████████████████████████▌     | 17/20 [26:22<04:45, 95.00s/it]

precision: 0.24166666666666667 | recall 0.8055555555555556 | f1_score 0.3717948717948718



100%|███████████████████████████████████████████| 60/60 [01:21<00:00,  1.35s/it]


Average train loss: 0.006783764894498745
Validation loss: 0.2456383436122754
val on val
precision: 0.3684210526315789 | recall 1.0 | f1_score 0.5384615384615384
val on holdout


Epoch:  90%|████████████████████████████████▍   | 18/20 [27:55<03:08, 94.31s/it]

precision: 0.24793388429752067 | recall 0.8571428571428571 | f1_score 0.38461538461538464



100%|███████████████████████████████████████████| 60/60 [01:19<00:00,  1.33s/it]


Average train loss: 0.003981005276182259
Validation loss: 0.23609816715543275
val on val
precision: 0.35 | recall 1.0 | f1_score 0.5185185185185185
val on holdout


Epoch:  95%|██████████████████████████████████▏ | 19/20 [29:27<01:33, 93.55s/it]

precision: 0.24390243902439024 | recall 0.8333333333333334 | f1_score 0.37735849056603776



100%|███████████████████████████████████████████| 60/60 [01:17<00:00,  1.30s/it]


Average train loss: 0.0019264135176247994
Validation loss: 0.2375902030391444
val on val
precision: 0.35 | recall 1.0 | f1_score 0.5185185185185185
val on holdout


Epoch: 100%|████████████████████████████████████| 20/20 [30:56<00:00, 92.83s/it]

precision: 0.2459016393442623 | recall 0.8333333333333334 | f1_score 0.379746835443038






### Evauation

In [38]:
holdout_f1

0.356687898089172

In [39]:
get_f1_score_on_test_data(model, train)

precision: 0.5873015873015873 | recall 1.0 | f1_score 0.74


0.74

In [40]:
get_f1_score_on_test_data(model, holdout)

precision: 0.2459016393442623 | recall 0.8333333333333334 | f1_score 0.379746835443038


0.379746835443038

In [41]:
generate_outputs(model, "do you know if elmos have a dress code")

'elmos'

In [42]:
generate_outputs(model, 'find me brazilian food with on location parking')

'brazilian food'

In [43]:
# MODEL_PATH = "test_local_model"

In [44]:
model = BertForTokenClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(tag2idx),
    output_attentions = False,
    output_hidden_states = False
)
model.load_state_dict(torch.load(MODEL_PATH))

#model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

<All keys matched successfully>

In [45]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [46]:
get_f1_score_on_test_data(model, holdout)

precision: 0.4044943820224719 | recall 0.9 | f1_score 0.5581395348837209


0.5581395348837209