In [4]:
import numpy as np
import pandas as pd
import transformers 
from sklearn.model_selection import KFold
import re
import json
from utliss import evaluate_predictions, ENTITIES,LABEL2ID,df
from transformers import (BertForTokenClassification, AutoTokenizer, Trainer,
                          TrainingArguments, DataCollatorForTokenClassification)
from datasets import Dataset

In [6]:
# %run utliss.ipynb -- quiet

In [7]:
# from utliss import LABEL2ID
CONFIG ={
    'bert_type' : 'distilbert-base-cased',
    'label2id' : LABEL2ID,
    'training_args' : {
        "output_dir": './bert-checkpoints',
        "learning_rate": 2e-5,
        "per_device_train_batch_size": 16,
        "per_device_eval_batch_size": 32,
        "num_train_epochs": 30,
        "weight_decay": 0.01
                      },
    'only_first_token': False,
    'max_length' : 128
}

In [9]:
def tokenize_and_allign(reciepe, labels, tokenizer, label_ids, only_first_token):
    """
    :param recipes: list of lists of words from a recipe
    :param labels: list of lists of labels from a recipe
    :return: a dictionary of tokens of recipes and a optional allign token labels

    """
    
    tokens = tokenizer(reciepe, truncation = True, is_split_into_words = True, max_length = 128)
    aligned_tokens_entities = []

    for reciepe_id in range(len(reciepe)):
        if not labels: break
        word_ids = tokens.word_ids(reciepe_id)
        prev_id = None
        curr_entities = []
        for idx in word_ids:

            if idx!= prev_id:
                curr_entities.append(-100 if idx == None else label_ids["B-"+labels[reciepe_id][idx]])
            elif idx == None:
                curr_entities.append(-100)
            else:
                curr_entities.append(-100 if only_first_token else label_ids["I-"+labels[reciepe_id][idx]])
            prev_id = idx
        aligned_tokens_entities.append(curr_entities)
    if labels:
        tokens['labels']  = aligned_tokens_entities
    tokens['recipes'] = reciepe
    return tokens

model building

In [10]:
def check_if_entity_correctly_began(entity, prev_entity):
    """
    This function checks if "I-" entity is preceded with "B-" or "I-". For
    example, "I-FOOD" should not happen after "O" or after "B-QUANT".
    :param entity:
    :param prev_entity:
    :return: bool
    """
    if "I-" in entity and re.sub(r"[BI]-", "", entity) != \
            re.sub(r"[BI]-", "", prev_entity):
        return False
    return True


def token_to_entity_predictions(text_split_words, text_split_tokens,
                                token_labels, id2label):
    """
    Transform token (subword) predictions into word predictions.
    :param text_split_words: list of words from one recipe, eg. ["I", "eat",
    "chicken"] (the ones that go to tokenizer)
    :param text_split_tokens: list of tokens from one recipe, eg. ["I", "eat",
    "chic", "##ken"] (the ones that arise
    from input decoding)
    :param token_labels: list of labels associated with each token from
    text_split_tokens
    :param id2label: a mapping from ids (0, 1, ...) to labels ("B-FOOD",
    "I-FOOD", ...)
    :return: a list of entities associated with each word from text_split_words,
    ie. entities extracted from a recipe
    """

    word_idx = 0
    word_entities = []
    word_from_tokens = ""
    word_entity = ""
    prev_word_entity = ""

    for token_label, token in zip(token_labels, text_split_tokens):
        if token in ["[SEP]", "[CLS]"]:
            continue
        word_from_tokens += re.sub(r"^##", "", token)
        # take the entity associated with the first token (subword)
        word_entity = id2label[token_label] if word_entity == "" \
            else word_entity

        if word_from_tokens == text_split_words[word_idx] or\
                word_from_tokens == "[UNK]":
            word_idx += 1
            # replace entities containing "I-" that do not have a predecessor
            # with "B-"
            word_entity = "O" if not \
                check_if_entity_correctly_began(word_entity, prev_word_entity) \
                else word_entity
            word_entities.append(word_entity)
            word_from_tokens = ""
            prev_word_entity = word_entity
            word_entity = ""

    return word_entities

In [11]:
class TastyModel():
    def __init__(self, config):
        self.config = config
        self.tokenizer = AutoTokenizer.from_pretrained(self.config['bert_type'])
        self.id2label = {v:k for k,v in CONFIG['label2id'].items()}
        self.label2id = self.config['label2id']

        model = BertForTokenClassification.from_pretrained(self.config['bert_type'],
            num_labels=len(self.config["label2id"]),
            label2id=self.label2id,
            id2label=self.id2label,
            classifier_dropout=0.2
        )
        training_args = TrainingArguments(**self.config['training_args'])

        self.trainer = Trainer(
            model=model,
            args=training_args,
            tokenizer=self.tokenizer,
            data_collator=DataCollatorForTokenClassification(
                tokenizer=self.tokenizer, max_length= self.config['max_length'],padding= 'max_length')
                )
    def train(self, train_reciepe, train_entities):

        _, train_dataset = self.prepare_data(train_reciepe, train_entities)

        self.trainer.train_dataset = train_dataset

        self.trainer.train()
    
    def evaluate(self, eval_reciepe, eval_entities):
        
        eval_preditions = self.predict(eval_reciepe)
        results = evaluate_predictions(eval_entities,eval_preditions)
        return results
        

    def predict(self, recipes):

        predicted_entities = []

        data, dataset = self.prepare_data(recipes,[])

        preds = self.trainer.predict(dataset)

        token_probs = preds[0]

        token_labels = token_probs.argmax(axis=2)

        num_of_recipes = dataset.num_rows

        for recipe_idx in range(num_of_recipes):
            text_split_words = recipes[recipe_idx]
            text_split_tokens = self.tokenizer.convert_ids_to_tokens(
                data["input_ids"][recipe_idx])
            word_entities = token_to_entity_predictions(
                    text_split_words,
                    text_split_tokens,
                    token_labels[recipe_idx],
                    self.id2label
                )
            predicted_entities.append(word_entities)
        
        return predicted_entities
            


        
    def prepare_data(self, reciepe, labels):
        data = tokenize_and_allign(reciepe = reciepe, labels = labels, tokenizer = self.tokenizer, label_ids = self.config['label2id'], 
                                   only_first_token= self.config['only_first_token'])
        
        dataset = Dataset.from_dict(data) ## give the dataset in shape of(700,4) 700 rows and 4 columns are [input_ids, attention_mask, entites, reciepes]
        
        return data, dataset

In [12]:
def cross_validate(no_of_folds,seed):

    CONFIG["training_args"]["seed"] = seed

    bio_recipes, bio_entities = df.entities.tolist(), df.NER_tags.tolist()

    kf = KFold(n_splits = no_of_folds, shuffle=True)
    cross_val_results = {}

    for fold_id, (train_index, test_index) in enumerate(kf.split(bio_entities)):
        tr_recipes, vl_recipes = [bio_recipes[idx] for idx in train_index], \
                                 [bio_recipes[idx] for idx in test_index]
        tr_entities, vl_entities = [bio_entities[idx] for idx in train_index], \
                                   [bio_entities[idx] for idx in test_index]

        model = TastyModel(CONFIG)
        model.train(tr_recipes, tr_entities)
        results = model.evaluate(vl_recipes, vl_entities)
        print(results)
        cross_val_results[fold_id] = results

    with open("bert_cross_val_results.json", "w") as json_file:
        json.dump(cross_val_results, json_file, indent=4)

    # aggregate and print results
    cross_val_results_aggregated = {
        entity: {"precision": [], "recall": [], "f1": []} for entity in
        ENTITIES + ["all"]
    }

    print(f"{'entity':^20s}{'precision':^15s}{'recall':^15s}{'f1-score':^15s}")
    for entity in cross_val_results_aggregated.keys():
        print(f"{entity:^20s}", end="")
        for metric in cross_val_results_aggregated[entity].keys():
            for fold_id in range(no_of_folds):
                cross_val_results_aggregated[entity][metric].append(
                    cross_val_results[fold_id][entity][metric]
                )

            mean = np.mean(cross_val_results_aggregated[entity][metric])
            mean = int(mean * 1000) / 1000
            std = np.std(cross_val_results_aggregated[entity][metric])
            std = int(std * 1000) / 1000 + 0.001 * \
                  round(std - int(std * 1000) / 1000)
            print(f"{mean:^2.3f} +- {std:^2.3f} ", end="")
        print()

In [13]:
r = [5]
if r:
    print('df')

df


In [14]:
if __name__ == "__main__":
    cross_validate(10,42)

You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at distilbert-base-cased were not used when initializing BertForTokenClassification: ['distilbert.transformer.layer.3.ffn.lin2.weight', 'distilbert.transformer.layer.4.ffn.lin1.weight', 'distilbert.transformer.layer.4.attention.out_lin.weight', 'distilbert.transformer.layer.5.attention.v_lin.bias', 'distilbert.transformer.layer.2.ffn.lin1.bias', 'distilbert.transformer.layer.4.attention.q_lin.weight', 'distilbert.transformer.layer.5.attention.v_lin.weight', 'distilbert.transformer.layer.1.ffn.lin1.bias', 'vocab_layer_norm.bias', 'distilbert.transformer.layer.1.attention.v_lin.bias', 'distilbert.transformer.layer.5.ffn.lin1.weight', 'distilbert.transformer.layer.0.ffn.lin1.weight', 'distilbert.transformer.layer.3.attention.q_lin.bias', 'distilbert.transformer.layer.4.ffn.lin2.bias', 'distilbert.tra

Step,Training Loss
