<a href="https://colab.research.google.com/github/sjpark0605/NLP-FYP/blob/main/COMP0029.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Section 1: BERT for Food Recipe Named Entity Recognition

## 1.1 Data Processing

In [None]:
# FLAGS
GENERATE_RECIPE_CSV = False

In [None]:
# Load the Drive helper and mount
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
%%capture
!pip install datasets evaluate transformers[sentencepiece] seqeval accelerate

In [None]:
# Imports for Data Processing
import glob
import csv
import pandas as pd
import torch
from datasets import Dataset, ClassLabel, Sequence, DatasetDict

In [None]:
# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
project_dir = '/content/drive/MyDrive/COMP0029/'

In [None]:
if GENERATE_RECIPE_CSV:
  # One CSV is written per corpus; 'r-300' is the union of the r-100 and r-200 files.
  corpus_list = ['r-100', 'r-200', 'r-300']
  for target_corpus in corpus_list:
    if target_corpus != 'r-300':
      recipe_files = glob.glob(project_dir + target_corpus + '/*.list')
    else:
      recipe_files = glob.glob(project_dir + 'r-100/*.list')
      recipe_files += glob.glob(project_dir + 'r-200/*.list')
    # glob returns files in arbitrary order; sort so sentence numbering is reproducible.
    recipe_files = sorted(recipe_files)

    csv_path = project_dir + target_corpus + '-recipe-ner-data.csv'

    # newline="" is what the csv module expects for files it writes to; the context managers
    # guarantee both handles are closed even if a malformed line raises.
    with open(csv_path, "w", encoding="utf8", newline="") as recipe_ner_data_csv:
      writer = csv.writer(recipe_ner_data_csv)

      header = ['Sentence Number', 'Word', 'POS', 'Label']
      writer.writerow(header)

      sentence_no = 1

      for file in recipe_files:
        with open(file, "r", encoding="utf8") as recipe_data:
          for line in recipe_data:
            # Skip blank lines (e.g. a trailing newline at the end of a file).
            if not line.strip():
              continue

            # Columns of a .list line: step, sentence, char, word, POS, entity label.
            items = line.split(" ")
            word = items[3]
            pos = items[4]
            label = items[5].replace("\n", "")

            row = ["Sentence_" + str(sentence_no), word, pos, label]
            writer.writerow(row)

            # A token whose POS tag is '.' closes the current sentence.
            if pos == '.':
              sentence_no += 1

In [None]:
# R-300 dataset: the token-level CSV generated from the r-100 and r-200 corpora combined.
df = pd.read_csv(project_dir + 'r-300-recipe-ner-data.csv')
pos_list = df['POS'].unique()

# Alphabetical label order, except that 'O' is moved to the very end of the list.
label_list = sorted(df['Label'].unique())
label_list.append(label_list.pop(label_list.index('O')))

# Collapse the token rows into one row per sentence, each column holding a list of values.
grouped = (
    df.groupby('Sentence Number')
      .agg({'Word': list, 'POS': list, 'Label': list})
      .reset_index(drop=True)
      .rename(columns={"Word": "tokens", "POS": "pos", "Label": "ner_tags"})
)

# Convert the string tags into integer class ids.
dataset = Dataset.from_pandas(grouped)
for column, names in (("pos", pos_list), ("ner_tags", label_list)):
  dataset = dataset.cast_column(column, Sequence(ClassLabel(names=list(names))))

Casting the dataset:   0%|          | 0/2752 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/2752 [00:00<?, ? examples/s]

In [None]:
# Fixed seed so that the train/validation split, and therefore every metric reported
# further down, is identical after "Restart & Run All".
SPLIT_SEED = 42

dataset = dataset.shuffle(seed=SPLIT_SEED)
split_dataset = dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)

# train_test_split names the held-out part "test"; it is used as the validation set here.
corpus_datasets = DatasetDict({
    "train": split_dataset["train"],
    "valid": split_dataset["test"]
})

In [None]:
# Class names of the NER tags, in class-id order.
ner_feature = corpus_datasets["train"].features["ner_tags"]
label_names = ner_feature.feature.names

# Entity types without their "-B" / "-I" position marker, de-duplicated (order is arbitrary).
pure_label_names = list({name.replace("-B", "").replace("-I", "") for name in label_names})

In [None]:
from transformers import AutoTokenizer

model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
inputs = tokenizer(corpus_datasets["train"][0]["tokens"], is_split_into_words=True, truncation=True, max_length=75)

In [None]:
def align_labels_with_tokens(labels, word_ids):
  """Expand word-level labels to one label per tokenizer token.

  Args:
    labels: label ids, one per original word.
    word_ids: for every token, the index of the word it came from, or None for
      special tokens.

  Returns:
    A list with exactly ``len(word_ids)`` entries. The first token of each word
    keeps the word's label; special tokens and the remaining sub-tokens of a
    word get -100 so the loss ignores them.
  """
  new_labels = []
  current_word = None
  for word_id in word_ids:
    if word_id is None:
      # Special token
      new_labels.append(-100)
    elif word_id != current_word:
      # Start of a new word!
      new_labels.append(labels[word_id])
    else:
      # Same word as previous token. Always emit a placeholder: previously nothing was
      # appended for continuation tokens of "-I" and "O" words, which made the label list
      # shorter than the token list and shifted every later label one position to the left.
      new_labels.append(-100)
    current_word = word_id

  return new_labels

In [None]:
def tokenize_and_align_labels(examples):
  """Tokenize a batch of pre-split sentences and attach token-level labels.

  Reads the "tokens" and "ner_tags" entries of ``examples``, runs the
  notebook-level ``tokenizer`` on the tokens and stores the aligned label
  lists under "labels" in the returned encoding.
  """
  encoded = tokenizer(
      examples["tokens"], truncation=True, is_split_into_words=True, max_length=512
  )
  # word_ids(row) maps every token of sentence `row` back to its source word.
  encoded["labels"] = [
      align_labels_with_tokens(word_labels, encoded.word_ids(row))
      for row, word_labels in enumerate(examples["ner_tags"])
  ]
  return encoded

In [None]:
tokenized_datasets = corpus_datasets.map(
  tokenize_and_align_labels,
  batched=True,
  remove_columns=corpus_datasets["train"].column_names,
)

Map:   0%|          | 0/2201 [00:00<?, ? examples/s]

Map:   0%|          | 0/551 [00:00<?, ? examples/s]

In [None]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
import evaluate

metric = evaluate.load("seqeval")

In [None]:
# Lookup tables between class ids and label names, in both directions.
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in id2label.items()}

In [None]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
  tokenized_datasets["train"],
  shuffle=True,
  collate_fn=data_collator,
  batch_size=32,
)
eval_dataloader = DataLoader(
  tokenized_datasets["valid"], 
  collate_fn=data_collator, 
  batch_size=32,
)

In [None]:
from transformers import AutoModelForTokenClassification

ner_model = AutoModelForTokenClassification.from_pretrained(
  model_checkpoint,
  id2label=id2label,
  label2id=label2id,
)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cas

In [None]:
from torch.optim import AdamW

optimizer = AdamW(ner_model.parameters(), lr=3e-5, eps=1e-8, weight_decay=0.1)

In [None]:
from accelerate import Accelerator

accelerator = Accelerator()
ner_model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
  ner_model, optimizer, train_dataloader, eval_dataloader
)

In [None]:
print(len(train_dataloader))

69


In [None]:
from transformers import get_scheduler

num_train_epochs = 20
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
  "linear",
  optimizer=optimizer,
  num_warmup_steps=0,
  num_training_steps=num_training_steps,
)

In [None]:
def postprocess(predictions, labels):
    """Turn batched id tensors into lists of label strings.

    Positions whose reference label is -100 are dropped from both outputs.
    Ids are translated through the notebook-level ``label_names``.

    Returns:
        (true_predictions, true_labels): two lists of per-sequence string lists.
    """
    pred_rows = predictions.detach().cpu().clone().numpy()
    label_rows = labels.detach().cpu().clone().numpy()

    # Reference labels: keep everything that is not the ignore index.
    true_labels = []
    for label_row in label_rows:
        true_labels.append([label_names[l] for l in label_row if l != -100])

    # Predictions: keep only the positions where the reference label is kept.
    true_predictions = []
    for pred_row, label_row in zip(pred_rows, label_rows):
        kept = []
        for p, l in zip(pred_row, label_row):
            if l != -100:
                kept.append(label_names[p])
        true_predictions.append(kept)

    return true_predictions, true_labels

In [None]:
print(df['Label'].value_counts())

O        18569
Ac-B      4956
F-B       4850
T-B       1860
F-I       1225
Sf-B      1033
D-I       1014
St-B       868
Ac-I       671
D-B        575
T-I        554
Q-B        494
St-I       386
Sf-I       310
Af-B       265
Q-I        175
Ac2-B      173
Ac2-I      145
Af-I        77
At-B        14
At-I        10
Name: Label, dtype: int64


In [None]:
def obtain_weights():
  """Return inverse-frequency class weights, one per entry of ``label_names``.

  The weight of a label is 1 / (number of rows of ``df`` carrying that label).
  The tensor is placed on the GPU when one is available and on the CPU
  otherwise, so the function no longer fails on CPU-only runtimes.

  NOTE: this reads the notebook-level ``df``; it must still be the NER token
  table when this is called (a later cell rebinds ``df``).
  """
  # Count once instead of recomputing value_counts() for every label.
  label_counts = df['Label'].value_counts()
  weights = [float(1 / label_counts[label]) for label in label_names]

  target_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  return torch.tensor(weights).to(target_device)


In [None]:
from tqdm.auto import tqdm
from torch import nn
import torch

# One progress-bar tick per optimizer step across all epochs.
progress_bar = tqdm(range(num_training_steps))
# loss_fct = nn.CrossEntropyLoss(weight=obtain_weights())

# Each epoch: one full pass over train_dataloader, then one full pass over eval_dataloader.
for epoch in range(num_train_epochs):
    
    # Training
    train_loss_val = 0

    ner_model.train()
    for batch in train_dataloader:
        # NOTE(review): `labels` and `logits` are read but not used in this loop;
        # the loss that is optimised is the one the model returns.
        labels = batch.get("labels")
        outputs = ner_model(**batch)

        logits = outputs.get("logits")
        loss = outputs.loss

        train_loss_val += loss.item()

        accelerator.backward(loss)

        # The scheduler is stepped once per batch, matching num_training_steps.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Mean of the per-batch training losses for this epoch.
    print(f"Training Loss: {train_loss_val / len(train_dataloader)}")

    # Evaluation
    eval_loss_val = 0

    ner_model.eval()
    for batch in eval_dataloader:
        with torch.no_grad():
            outputs = ner_model(**batch)
            
        eval_loss_val += outputs.get("loss").item()

        # Highest-scoring class per token.
        predictions = outputs.logits.argmax(dim=-1)
        labels = batch["labels"]

        # Necessary to pad predictions and labels for being gathered
        predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=-100)
        labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

        predictions_gathered = accelerator.gather(predictions)
        labels_gathered = accelerator.gather(labels)

        # Drop ignored (-100) positions, convert ids to label strings, and accumulate in the metric.
        true_predictions, true_labels = postprocess(predictions_gathered, labels_gathered)
        metric.add_batch(predictions=true_predictions, references=true_labels)


    # Mean of the per-batch validation losses for this epoch.
    print(f"Validation Loss: {eval_loss_val / len(eval_dataloader)}")

    # suffix=True: the labels carry the B/I marker after the entity type (e.g. "Ac-B").
    # `results` from the last epoch stays available to later cells.
    results = metric.compute(suffix=True)
    print(
        f"epoch {epoch}:",
        {
            f"overall_{key}": results[f"overall_{key}"]
            for key in ["precision", "recall", "f1", "accuracy"]
        },
        "\n"
    )

  0%|          | 0/1380 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Training Loss: 1.2482141765995303
Validation Loss: 0.6984691106610827


  _warn_prf(average, modifier, msg_start, len(result))


epoch 0: {'overall_precision': 0.71724818959842, 'overall_recall': 0.7312080536912752, 'overall_f1': 0.7241608507809902, 'overall_accuracy': 0.8107631828656635} 

Training Loss: 0.6273804725944132
Validation Loss: 0.5722763902611203
epoch 1: {'overall_precision': 0.7633175994605529, 'overall_recall': 0.7597315436241611, 'overall_f1': 0.7615203498150016, 'overall_accuracy': 0.83529890199268} 

Training Loss: 0.4949020168919494
Validation Loss: 0.5154689368274477
epoch 2: {'overall_precision': 0.7762982689747004, 'overall_recall': 0.7825503355704698, 'overall_f1': 0.7794117647058825, 'overall_accuracy': 0.8456011928968415} 

Training Loss: 0.414391718480898
Validation Loss: 0.49143221808804405
epoch 3: {'overall_precision': 0.7802600866955652, 'overall_recall': 0.785234899328859, 'overall_f1': 0.78273958855996, 'overall_accuracy': 0.8485834350006778} 

Training Loss: 0.3417281940363456
Validation Loss: 0.49222788876957363
epoch 4: {'overall_precision': 0.7760078023407022, 'overall_recall

In [None]:
# Evaluation Metric of Final Iteration
# Prints precision / recall / F1 per entity type, read from `results`, i.e. the value
# metric.compute() returned in the last epoch of the training loop.
for label in pure_label_names:
    # "O" and "At" are skipped.
    # NOTE(review): presumably "At" is excluded because it is very rare (14 "At-B" tokens in
    # the label counts) and may be missing from `results` - confirm.
    if label != "O" and label != "At":
      print(
          f"{label}:",
          {
              key: results[label][key]
              for key in ["precision", "recall", "f1"]
          },
      )

Q: {'precision': 0.5833333333333334, 'recall': 0.7078651685393258, 'f1': 0.6395939086294417}
Ac2: {'precision': 0.3269230769230769, 'recall': 0.4722222222222222, 'f1': 0.38636363636363635}
Af: {'precision': 0.38636363636363635, 'recall': 0.2982456140350877, 'f1': 0.33663366336633666}
Ac: {'precision': 0.8856848609680742, 'recall': 0.8775510204081632, 'f1': 0.8815991799077396}
St: {'precision': 0.7877094972067039, 'recall': 0.8011363636363636, 'f1': 0.7943661971830986}
D: {'precision': 0.7350427350427351, 'recall': 0.8037383177570093, 'f1': 0.7678571428571429}
T: {'precision': 0.8403361344537815, 'recall': 0.8403361344537815, 'f1': 0.8403361344537815}
F: {'precision': 0.8206686930091185, 'recall': 0.8376421923474664, 'f1': 0.8290685772773797}
Sf: {'precision': 0.5639810426540285, 'recall': 0.5721153846153846, 'f1': 0.568019093078759}


# Section 2: BERT for Food Recipe Edge Classification

## 2.1 Data Processing

**Structure of List Files**:
`Step, Sentence, Char, Word, POS, Entity`

**Structure of Flow Files**:
`Step, Sentence, Char, Label, Step, Sentence, Char`

In [None]:
import numpy as np

In [None]:
def construct_key(items):
  """Build the "step;sentence;char" lookup key from the first three entries of ``items``."""
  step, sentence, char = items[0], items[1], items[2]
  return ";".join((step, sentence, char))

In [None]:
def construct_word_dict(ner_lines):
  """Map each token position key ("step;sentence;char") to the word at that position.

  Every line is split on single spaces; the first three fields form the key
  and the fourth field is the word.
  """
  word_dict = {}

  for raw_line in ner_lines:
    fields = raw_line.split(" ")
    # Kept from the original: touching field 5 makes lines with fewer than six fields fail.
    fields[5] = fields[5].replace("\n", "")
    word_dict[construct_key(fields[:3])] = fields[3]

  return word_dict

In [None]:
def construct_ner_dict(ner_lines):
  """Map each token position key ("step;sentence;char") to its entity tag.

  The tag is the sixth space-separated field of the line with newline
  characters removed.
  """
  ner_dict = {}

  for fields in (raw_line.split(" ") for raw_line in ner_lines):
    ner_dict[construct_key(fields[:3])] = fields[5].replace("\n", "")

  return ner_dict

In [None]:
def append_relation_set(flow_lines, ner_dict, relation_set):
  """Add the "source_tag->dest_tag" string of every flow edge to ``relation_set`` (in place).

  Fields 0-2 of a flow line locate the source token and fields 4-6 the
  destination token; their tags are looked up in ``ner_dict``.
  """
  for raw_line in flow_lines:
    fields = raw_line.split(" ")
    fields[6] = fields[6].replace("\n", "")

    src_tag = ner_dict[construct_key(fields[:3])]
    dst_tag = ner_dict[construct_key(fields[4:])]

    relation_set.add(src_tag + "->" + dst_tag)

In [None]:
def construct_visited_dict(flow_lines):
  """Map "source_key->dest_key" to the edge label (field 3) for every flow line."""
  visited_dict = {}

  for fields in (raw_line.split(" ") for raw_line in flow_lines):
    fields[6] = fields[6].replace("\n", "")

    edge_key = construct_key(fields[:3]) + '->' + construct_key(fields[4:])
    visited_dict[edge_key] = fields[3]

  return visited_dict

In [None]:
def construct_sentence(ner_lines, position):
  """Return the space-joined words of the sentence identified by ``position``.

  ``position[0]`` and ``position[1]`` are compared with the first two fields
  (step, sentence) of every line. Only the first contiguous run of matching
  lines is used; an empty string is returned when nothing matches.
  """
  words = []

  for raw_line in ner_lines:
    fields = raw_line.split(" ")
    if position[0] == fields[0] and position[1] == fields[1]:
      words.append(fields[3])
    elif words:
      # The matching run has ended; later lines are ignored.
      break

  return " ".join(words)

In [None]:
def compute_potential_pairs(ner_lines, ner_dict, relation_set):
  """List every ordered pair of entity-start positions whose tag pair is a known relation.

  A position is collected for each line whose tag is not "O" and does not
  contain "-I". For every ordered pair of distinct collected positions the
  string "source_tag->dest_tag" is formed and the pair is kept when that
  string is in ``relation_set``.

  Returns:
    A list of ``[source_position, dest_position]`` lists, where a position is
    a ``(step, sentence, char)`` tuple of strings.
  """
  positions = []

  for raw_line in ner_lines:
    fields = raw_line.split(" ")
    tag = fields[5].replace("\n", "")

    if tag == "O" or "-I" in tag:
      continue
    positions.append((fields[0], fields[1], fields[2]))

  pairs = []
  for src_idx, src_pos in enumerate(positions):
    for dst_idx, dst_pos in enumerate(positions):
      if src_idx == dst_idx:
        continue

      src_tag = ner_dict[construct_key(src_pos)]
      dst_tag = ner_dict[construct_key(dst_pos)]

      if src_tag + "->" + dst_tag in relation_set:
        pairs.append([src_pos, dst_pos])

  return pairs

In [None]:
def append_word_pairs(pairs, word_dict, word_pairs):
  """Append "source_word dest_word" to ``word_pairs`` (in place) for every position pair."""
  for pair in pairs:
    source_word = word_dict[construct_key(pair[0])]
    dest_word = word_dict[construct_key(pair[1])]
    word_pairs.append(" ".join((source_word, dest_word)))

In [None]:
def append_sentence_pairs(pairs, ner_lines, sentence_pairs):
  """Append the sentence context of every position pair to ``sentence_pairs`` (in place).

  When both tokens lie in the same (step, sentence) the context is that one
  sentence; otherwise it is the source sentence, a space, and the destination
  sentence.

  Args:
    pairs: iterable of ``[source_position, dest_position]``.
    ner_lines: lines of the recipe's .list file.
    sentence_pairs: list that receives one string per pair.
  """
  # Many pairs share the same sentences, so each sentence is rebuilt only once instead of
  # rescanning ner_lines for every single pair.
  sentence_cache = {}

  def sentence_at(position):
    if position not in sentence_cache:
      sentence_cache[position] = construct_sentence(ner_lines, position)
    return sentence_cache[position]

  for pair in pairs:
    source_position = (pair[0][0], pair[0][1])
    dest_position = (pair[1][0], pair[1][1])

    sentence = sentence_at(source_position)
    if source_position != dest_position:
      sentence += " " + sentence_at(dest_position)

    sentence_pairs.append(sentence)

In [None]:
def append_labels(pairs, visited_dict, labels):
  """Append the edge label of every position pair to ``labels`` (in place).

  The label is looked up under "source_key->dest_key" in ``visited_dict``;
  pairs without an entry get the label 'x'.
  """
  for pair in pairs:
    edge_key = construct_key(pair[0]) + "->" + construct_key(pair[1])
    labels.append(visited_dict[edge_key] if edge_key in visited_dict else 'x')

In [None]:
def construct_df(word_pairs, sentence_pairs, labels):
  """Assemble the three parallel lists into a DataFrame.

  The result has the columns 'Word Pairs', 'Sentence Pairs' and 'Label', one
  row per list index.
  """
  column_names = ['Word Pairs', 'Sentence Pairs', 'Label']
  columns = [np.array(values) for values in (word_pairs, sentence_pairs, labels)]

  # Stack the 1-D arrays side by side so that each becomes one column.
  return pd.DataFrame(np.column_stack(columns), columns=column_names)


In [None]:
def construct_relation_set(target_corpus):
  """Collect every "source_tag->dest_tag" combination that occurs as an edge in a corpus.

  For each ``*.flow`` file under ``project_dir + target_corpus`` the sibling
  ``.list`` file (same path, different extension) supplies the entity tags.

  Args:
    target_corpus: corpus directory name, e.g. 'r-100'.

  Returns:
    A set of "source_tag->dest_tag" strings.
  """
  relation_set = set()

  for flow_path in glob.glob(project_dir + target_corpus + '/*.flow'):
    # Swap only the trailing extension; str.replace would also rewrite a ".flow" that
    # happens to appear earlier in the path.
    ner_path = flow_path[:-len(".flow")] + ".list"

    # Context managers close the files even when parsing raises.
    with open(flow_path, "r", encoding="utf8") as recipe_flow_data:
      flow_lines = recipe_flow_data.readlines()
    with open(ner_path, "r", encoding="utf8") as recipe_ner_data:
      ner_lines = recipe_ner_data.readlines()

    ner_dict = construct_ner_dict(ner_lines)
    append_relation_set(flow_lines, ner_dict, relation_set)

  return relation_set

In [None]:
# Extra Edge Label: "s" - used when an object is removed
def construct_data(target_corpus, relation_set):
  """Build the edge-classification table for a corpus.

  For every ``*.flow`` file under ``project_dir + target_corpus`` (and its
  sibling ``.list`` file) all candidate token pairs allowed by
  ``relation_set`` are generated. Each candidate contributes its two words,
  its sentence context, and its edge label ('x' when the flow file has no
  edge for the pair).

  Args:
    target_corpus: corpus directory name, e.g. 'r-100'.
    relation_set: set of admissible "source_tag->dest_tag" strings.

  Returns:
    A DataFrame with the columns 'Word Pairs', 'Sentence Pairs' and 'Label'.
  """
  word_pairs = []
  sentence_pairs = []
  labels = []

  for flow_path in glob.glob(project_dir + target_corpus + '/*.flow'):
    # Swap only the trailing extension; str.replace would also rewrite a ".flow" that
    # happens to appear earlier in the path.
    ner_path = flow_path[:-len(".flow")] + ".list"

    # Context managers close the files even when parsing raises.
    with open(flow_path, "r", encoding="utf8") as recipe_flow_data:
      flow_lines = recipe_flow_data.readlines()
    with open(ner_path, "r", encoding="utf8") as recipe_ner_data:
      ner_lines = recipe_ner_data.readlines()

    word_dict = construct_word_dict(ner_lines)
    ner_dict = construct_ner_dict(ner_lines)

    # Edges that actually exist in this recipe, keyed by "source_key->dest_key".
    visited_dict = construct_visited_dict(flow_lines)

    pairs = compute_potential_pairs(ner_lines, ner_dict, relation_set)

    append_word_pairs(pairs, word_dict, word_pairs)
    append_sentence_pairs(pairs, ner_lines, sentence_pairs)
    append_labels(pairs, visited_dict, labels)

  return construct_df(word_pairs, sentence_pairs, labels)

In [None]:
def undersample(df, label='x', divisor=1.1, seed=None):
  """Randomly drop a share of the rows whose 'Label' equals ``label``.

  ``int(n / divisor)`` of the ``n`` matching rows are removed, so the default
  divisor of 1.1 removes roughly 91% of them (not half). Rows with other
  labels are never touched and the input frame is not modified.

  Args:
    df: DataFrame with a 'Label' column.
    label: the over-represented label to thin out.
    divisor: controls how many matching rows are removed (see above).
    seed: optional seed for a reproducible selection; when None the global
      NumPy random state is used, as before.

  Returns:
    A new DataFrame with the selected rows removed and a fresh 0..n-1 index.
  """
  # Get indices of rows that match the label
  match_indices = df.index[df['Label'] == label]
  n_delete = int(len(match_indices) / divisor)

  # Nothing to remove (also covers frames without any matching row).
  if n_delete == 0:
    return df.reset_index(drop=True)

  # Randomly select the rows to delete, without replacement
  if seed is None:
    delete_indices = np.random.choice(match_indices, size=n_delete, replace=False)
  else:
    rng = np.random.default_rng(seed)
    delete_indices = rng.choice(match_indices, size=n_delete, replace=False)

  return df.drop(delete_indices).reset_index(drop=True)


In [None]:
target_corpus = 'r-100'

relation_set = construct_relation_set(target_corpus)
print(len(relation_set))
df = construct_data(target_corpus, relation_set)

65


In [None]:
df.head()
ratio = df['Label'].value_counts()['x'] / df['Label'].value_counts().sum()
ratio

0.9827075344489197

In [None]:
df = undersample(df)
df.head()
ratio = df['Label'].value_counts()['x'] / df['Label'].value_counts().sum()
ratio

0.8378274585104923

In [None]:
edge_label_list = df['Label'].unique()
edge_label_list

array(['x', 't-comp', 't', 'o', 'a', 'v-tm', 'f-eq', 'd', 'f-part-of',
       'f-comp', 't-eq', 'a-eq', 'v', 't-part-of', 's'], dtype=object)

In [None]:
dataset = Dataset.from_pandas(df)
# class_encode_column() builds the ClassLabel itself from the sorted unique values of the
# column. Its second positional parameter is `include_nulls`, so the ClassLabel object that
# used to be passed there was never applied as a feature.
dataset = dataset.class_encode_column("Label")
# The feature that is actually in effect (names in alphabetical order).
ClassLabels = dataset.features["Label"]

Casting to class labels:   0%|          | 0/36455 [00:00<?, ? examples/s]

In [None]:
# Fixed seed so that the stratified train/validation split, and therefore the scores
# reported below, are identical after "Restart & Run All".
FLOW_SPLIT_SEED = 42

dataset = dataset.shuffle(seed=FLOW_SPLIT_SEED)
split_dataset = dataset.train_test_split(
    test_size=0.2, stratify_by_column="Label", seed=FLOW_SPLIT_SEED
)

# train_test_split names the held-out part "test"; it is used as the validation set here.
r_100_flow_datasets = DatasetDict({
    "train": split_dataset["train"],
    "valid": split_dataset["test"],
})

In [None]:
def tokenize_function(data):
    """Tokenize a batch as (word pair, sentence context) text pairs with truncation.

    Uses whatever the notebook-level ``tokenizer`` is bound to at call time.
    """
    word_pairs = data["Word Pairs"]
    sentence_context = data["Sentence Pairs"]
    return tokenizer(word_pairs, sentence_context, truncation=True)

In [None]:
from transformers import AutoTokenizer, DataCollatorWithPadding

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

tokenized_datasets = r_100_flow_datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/29164 [00:00<?, ? examples/s]

Map:   0%|          | 0/7291 [00:00<?, ? examples/s]

In [None]:
tokenized_datasets = tokenized_datasets.remove_columns(["Word Pairs", "Sentence Pairs"])
tokenized_datasets = tokenized_datasets.rename_column("Label", "labels")
tokenized_datasets.set_format("torch")
tokenized_datasets["train"].column_names

['labels', 'input_ids', 'token_type_ids', 'attention_mask']

In [None]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    tokenized_datasets["train"], shuffle=True, batch_size=128, collate_fn=data_collator
)
eval_dataloader = DataLoader(
    tokenized_datasets["valid"], batch_size=128, collate_fn=data_collator
)

In [None]:
from transformers import AutoModelForSequenceClassification

flow_model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=len(edge_label_list))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [None]:
label_names = tokenized_datasets["train"].features["labels"].names
label_names

['a',
 'a-eq',
 'd',
 'f-comp',
 'f-eq',
 'f-part-of',
 'o',
 's',
 't',
 't-comp',
 't-eq',
 't-part-of',
 'v',
 'v-tm',
 'x']

In [None]:
import evaluate

metric = evaluate.load("seqeval")

In [None]:
def postprocess(predictions, labels):
    """Convert 1-D tensors of class ids into lists of single-element label lists.

    This redefines the token-level ``postprocess`` from Section 1. Here every
    example has exactly one label, so each id becomes ``[label_name]``.

    Returns:
        (true_predictions, true_labels)
    """
    pred_ids = predictions.detach().cpu().clone().numpy()
    label_ids = labels.detach().cpu().clone().numpy()

    # Wrap each name in a list so the result is a list of one-item sequences.
    true_predictions = []
    for pred_id in pred_ids:
        true_predictions.append([label_names[pred_id]])

    true_labels = []
    for label_id in label_ids:
        true_labels.append([label_names[label_id]])

    return true_predictions, true_labels

In [None]:
def evaluate(dataloader_val):
    """Run ``flow_model`` over a dataloader without gradient tracking.

    NOTE: this function's name shadows the ``evaluate`` library module that an
    earlier cell imports under the same name.

    Args:
        dataloader_val: iterable of batches holding 'input_ids',
            'attention_mask', 'labels' and, optionally, 'token_type_ids'.

    Returns:
        (loss_val_avg, predictions, true_vals): the mean per-batch loss, the
        logits of all batches concatenated along axis 0, and the matching
        label ids.
    """
    flow_model.eval()

    loss_val_total = 0
    predictions, true_vals = [], []

    for batch in dataloader_val:
        inputs = {
                  'input_ids':      batch['input_ids'],
                  'attention_mask': batch['attention_mask'],
                  'labels':         batch['labels'],
                 }
        # Training feeds the whole batch, including the segment ids that separate the word
        # pair from its sentence context. Forward them here as well so the model is
        # evaluated on the same inputs it was trained on.
        if 'token_type_ids' in batch:
            inputs['token_type_ids'] = batch['token_type_ids']

        with torch.no_grad():
            outputs = flow_model(**inputs)

        # With labels supplied, the first two outputs are the loss and the logits.
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)

    loss_val_avg = loss_val_total/len(dataloader_val)

    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)

    return loss_val_avg, predictions, true_vals

In [None]:
from sklearn.metrics import f1_score

def f1_score_func(preds, labels):
    """Weighted F1 between the arg-max class of ``preds`` (along axis 1) and ``labels``."""
    predicted_classes = np.argmax(preds, axis=1).flatten()
    true_classes = labels.flatten()
    return f1_score(true_classes, predicted_classes, average='weighted')

def accuracy_per_class(preds, labels):
    """Print, for every class present in ``labels``, how many of its examples were predicted correctly.

    Class ids are translated to names through the notebook-level ``label_names``.
    """
    class_name_by_id = dict(enumerate(label_names))

    predicted = np.argmax(preds, axis=1).flatten()
    actual = labels.flatten()

    for class_id in np.unique(actual):
        # Restrict both arrays to the examples whose true class is class_id.
        of_class = actual == class_id
        class_predictions = predicted[of_class]
        n_correct = len(class_predictions[class_predictions == class_id])
        n_total = len(actual[of_class])
        print(f'Class: {class_name_by_id[class_id]}')
        print(f'Accuracy: {n_correct}/{n_total}\n')

In [None]:
from tqdm.auto import tqdm
from accelerate import Accelerator
# transformers.AdamW is deprecated in favour of the PyTorch implementation, which Section 1
# of this notebook already uses.
from torch.optim import AdamW
from transformers import AutoModelForSequenceClassification, get_scheduler
from sklearn.metrics import f1_score

accelerator = Accelerator()

# Start from a freshly initialised classifier so that re-running this cell restarts training.
flow_model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=len(edge_label_list))
# eps and weight_decay are pinned to the defaults of the former transformers.AdamW so the
# optimisation settings stay the same (torch.optim.AdamW defaults differ).
optimizer = AdamW(flow_model.parameters(), lr=3e-5, eps=1e-6, weight_decay=0.0)

train_dl, eval_dl, flow_model, optimizer = accelerator.prepare(
    train_dataloader, eval_dataloader, flow_model, optimizer
)

# Linear decay of the learning rate to zero over all optimizer steps, without warm-up.
num_epochs = 3
num_training_steps = num_epochs * len(train_dl)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_epochs):
    flow_model.train()
    for batch in train_dl:
        # The batch already contains the labels, so the model returns the loss itself.
        outputs = flow_model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Validation after every epoch; tqdm.write keeps the progress bar intact.
    val_loss, predictions, true_vals = evaluate(eval_dl)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

  0%|          | 0/684 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Validation loss: 0.455494365148377
F1 Score (Weighted): 0.85451795169885
Validation loss: 0.38150714050259504
F1 Score (Weighted): 0.875600271104511
Validation loss: 0.3592918371422249
F1 Score (Weighted): 0.8795466965402874


In [None]:
_, predictions, true_vals = evaluate(eval_dl)
val_f1 = f1_score_func(predictions, true_vals)
accuracy_per_class(predictions, true_vals)

Class: a
Accuracy: 32/68

Class: a-eq
Accuracy: 1/21

Class: d
Accuracy: 17/131

Class: f-comp
Accuracy: 0/19

Class: f-eq
Accuracy: 2/93

Class: f-part-of
Accuracy: 0/49

Class: o
Accuracy: 146/199

Class: s
Accuracy: 0/6

Class: t
Accuracy: 359/446

Class: t-comp
Accuracy: 18/65

Class: t-eq
Accuracy: 0/27

Class: t-part-of
Accuracy: 0/8

Class: v
Accuracy: 0/12

Class: v-tm
Accuracy: 17/38

Class: x
Accuracy: 5946/6109

