# Install and import requirements

In [1]:
# Install required libraries
!pip install datasets transformers seqeval scipy tqdm

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m23.5 MB/s[0m eta [3

In [11]:
import json

import pandas as pd

import torch

from scipy.stats import spearmanr

from tqdm import tqdm

import numpy as np

from collections import Counter

from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from sklearn.model_selection import train_test_split

from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForTokenClassification, EarlyStoppingCallback

from datasets import Dataset

import re

# Upload data

*  validation data - 'mushroom.en-val.v2.jsonl'
*  sample label files - 'en_train_labeled.jsonl'

In [4]:
# Upload validation data and sample label files
from google.colab import files
uploaded = files.upload()  # Upload your dataset files

Saving mushroom.en-val.v2.jsonl to mushroom.en-val.v2.jsonl


In [6]:
import json

def load_jsonl_file_to_records(filename):
    records = []
    try:
        with open(filename, 'r', encoding='utf-8') as f:
            for line in f:
                records.append(json.loads(line))

        return pd.DataFrame(records)
    except Exception as e:
        print(f"Error reading {filename}: {e}")
        return None

In [8]:
train_df = load_jsonl_file_to_records('en_train_labeled.jsonl')
#train_df2 = pd.read_csv("test_out.csv")

test_df = load_jsonl_file_to_records('mushroom.en-val.v2.jsonl')

## Remove responses with "I'm sorry" form train data

In [10]:
print("Length of train file:", len(train_df))
print("Length of validation file:", len(test_df))

Length of train file: 809
Length of validation file: 50


In [12]:
# Define the regex pattern to match "I'm sorry"
pattern = re.compile(r"I'm sorry")

# Apply the pattern match to the train_df
train_df["model_output_text_match"] = train_df["model_output_text"].apply(lambda x: bool(pattern.search(x)))

# Filter out the expressions that contain the pattern
train_df = train_df[~train_df["model_output_text_match"]]

# Drop the helper column used for matching
train_df = train_df.drop(columns=["model_output_text_match"])

In [13]:
print("Length of train file:", len(train_df))
print("Length of validation file:", len(test_df))

Length of train file: 762
Length of validation file: 50


## Only leave model input, output text and hard labels in train data

In [14]:
train_df = train_df[["model_input", "model_output_text", "hard_labels"]]

In [15]:
train_df.head()

Unnamed: 0,model_input,model_output_text,hard_labels
0,Do all arthropods have antennae?,"Yes, all insects and arachnids (including spi...",[]
1,Do all arthropods have antennae?,"Yes, all insects and arachnids have at least ...","[[22, 31]]"
2,Do all arthropods have antennae?,"Yes, all insects and arachnids (including spi...","[[52, 61], [67, 72], [100, 107], [43, 50], [22..."
3,Do all arthropods have antennae?,"Yes, all insects and arachnids (including spi...","[[67, 72], [43, 50], [52, 61], [22, 31]]"
4,Do all arthropods have antennae?,"Yes, all insects and arachnids (including spi...","[[43, 50], [67, 72], [52, 61], [22, 31]]"


In [16]:
test_df.head()

Unnamed: 0,id,lang,model_input,model_output_text,model_id,soft_labels,hard_labels,model_output_logits,model_output_tokens
0,val-en-1,EN,What did Petra van Staveren win a gold medal for?,Petra van Stoveren won a silver medal in the 2...,tiiuae/falcon-7b-instruct,"[{'start': 10, 'prob': 0.2, 'end': 12}, {'star...","[[25, 31], [45, 49], [69, 83]]","[-5.5669536591, -11.90533638, -13.0743436813, ...","[Pet, ra, Ġvan, ĠSto, ve, ren, Ġwon, Ġa, Ġsilv..."
1,val-en-2,EN,How many genera does the Erysiphales order con...,The Elysiphale order contains 5 genera.,tiiuae/falcon-7b-instruct,"[{'start': 4, 'prob': 0.2, 'end': 14}, {'start...","[[30, 31]]","[-6.199614048, -13.7564926147, -14.0058326721,...","[The, ĠE, lys, iph, ale, Ġorder, Ġcontains, Ġ,..."
2,val-en-3,EN,Do all arthropods have antennae?,"Yes, all arachnids have antennas. However, not...",tiiuae/falcon-7b-instruct,"[{'start': 0, 'prob': 0.6, 'end': 3}, {'start'...","[[0, 3], [9, 18], [63, 70], [78, 87]]","[-4.8190689087, -16.5279369354, -10.1344690323...","[Yes, ,, Ġall, Ġar, ach, n, ids, Ġhave, Ġanten..."
3,val-en-4,EN,When did Chance the Rapper debut?,Chance the rapper debuted in 2011.,tiiuae/falcon-7b-instruct,"[{'start': 18, 'prob': 0.0909090909, 'end': 29...","[[29, 33]]","[-6.2555613518, -10.9392805099, -13.1335763931...","[Ch, ance, Ġthe, Ġrapper, Ġdebuted, Ġin, Ġ, 20..."
4,val-en-5,EN,What is the UN Sustainable Development Goal 11...,The UN's Sustainable City initiative defines a...,tiiuae/falcon-7b-instruct,"[{'start': 0, 'prob': 0.0909090909, 'end': 9},...","[[70, 227]]","[-3.9737114906, -12.9197320938, -7.2846975327,...","[The, ĠUN, ', s, ĠSustainable, ĠCity, Ġinitiat..."


# Train

In [17]:
from transformers import BertForTokenClassification, AutoTokenizer, AutoModelForTokenClassification

# Tokenizer & Model Setup
model_checkpoint = "roberta-base"
tokenizer =  AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=2)


# Convert Pandas DataFrame to Huggingface Dataset
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Tokenize and align labels

In [18]:
def tokenize_and_align_labels(example):
    """
    Tokenize and align word-level labels with subword tokens for a combined input-output scenario.
    Parameters:
    example (dict): A dictionary containing:
                     - "model_input": input question.
                     - "model_output_text": generated response.
                     - "hard_labels": list of (start, end) for words in the output.
    Returns:
    tokenized_inputs (dict): Tokenized input with aligned labels.
    """
    concatenated_text = example['model_input'] + example['model_output_text']
    max_length = 256

    tokenized_inputs = tokenizer(
        concatenated_text,
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_offsets_mapping=True
    )

    tokens = tokenizer.convert_ids_to_tokens(tokenized_inputs['input_ids'])
    offsets = tokenized_inputs['offset_mapping']

    aligned_labels = [-100] * len(tokenized_inputs['input_ids'])
    input_length =len(example['model_input'])
    output_start_idx = len(tokenizer(example['model_input'], truncation=True)['input_ids'])
    #print(output_start_idx)

    for i, (start_char, end_char) in enumerate(offsets):
        # Ignore special tokens and input tokens
        if i < output_start_idx -1 or start_char == end_char:
            continue

        label = 0
        for start, end in example['hard_labels']:
            if start_char >= (start + input_length) and end_char <= (end + input_length):
                label = 1
                break
        if i < len(aligned_labels):
            aligned_labels[i] = label

    tokenized_inputs['labels'] = aligned_labels
    tokenized_inputs['input_length'] = input_length
    tokenized_inputs['output_length'] = len(example['model_output_text'])
    tokenized_inputs['offsets'] = offsets

    return tokenized_inputs

In [19]:
# Tokenize datasets and align labels
train_dataset = train_dataset.map(tokenize_and_align_labels, batched=False)
test_dataset = test_dataset.map(tokenize_and_align_labels, batched=False)

Map:   0%|          | 0/762 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

In [20]:
import numpy as np
import torch.nn.functional as F
from scipy.stats import spearmanr
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

def recompute_hard_labels(soft_labels):
    """Optionally, infer hard labels from the soft labels provided"""
    hard_labels = []
    prev_end = -1
    for start, end in (
        (lbl['start'], lbl['end'])
        for lbl in sorted(soft_labels, key=lambda span: (span['start'], span['end']))
        if lbl['prob'] > 0.5
    ):
        if start == prev_end:
            hard_labels[-1][-1] = end
        else:
            hard_labels.append([start, end])
        prev_end = end
    return hard_labels

def score_iou(ref_dict, pred_dict):
    """Computes intersection-over-union between reference and predicted hard labels for a single datapoint."""
    assert ref_dict['id'] == pred_dict['id']
    ref_indices = {idx for span in ref_dict['hard_labels'] for idx in range(*span)}
    pred_indices = {idx for span in pred_dict['hard_labels'] for idx in range(*span)}
    if not pred_indices and not ref_indices: return 1.
    return len(ref_indices & pred_indices) / len(ref_indices | pred_indices)

def score_cor(ref_dict, pred_dict):
    """Computes Spearman correlation between predicted and reference soft labels for a single datapoint."""
    assert ref_dict['id'] == pred_dict['id']
    ref_vec = [0.] * ref_dict['text_len']
    pred_vec = [0.] * ref_dict['text_len']
    for span in ref_dict['soft_labels']:
        for idx in range(span['start'], min(span['end'], ref_dict['text_len'])):
            ref_vec[idx] = span['prob']
    for span in pred_dict['soft_labels']:
        for idx in range(span['start'], min(span['end'], ref_dict['text_len'])):
            pred_vec[idx] = span['prob']
    if len({round(flt, 8) for flt in pred_vec}) == 1 or len({round(flt, 8) for flt in ref_vec}) == 1:
        return float(len({round(flt, 8) for flt in ref_vec}) == len({round(flt, 8) for flt in pred_vec}))
    return spearmanr(ref_vec, pred_vec).correlation

def compute_metrics(p, dataset):
    preds = p.predictions.argmax(-1)  # Get predicted labels (indices)
    logits = p.predictions  # The raw logits for soft label evaluation
    labels = p.label_ids

    # Assuming dataset has input_length, offset_mapping, and output_length
    input_length = dataset['input_length']
    offsets = dataset['offset_mapping']
    output_length = dataset['output_length']

    # Calculate probabilities using softmax
    probabilities = F.softmax(torch.tensor(logits), dim=-1).numpy()
    probabilities = probabilities [:, :, 1]

    # Flatten the predictions and labels (ignore the -100 label used for padding/special tokens)
    prediction_labels = [pred for pred, label in zip(preds.flatten(), labels.flatten()) if label != -100]
    true_labels = [label for label in labels.flatten() if label != -100]
    '''
    # Debugging prints
    print("Logits shape:", logits.shape)
    print("Predictions shape:", preds.shape)
    print("Labels shape:", labels.shape)
    print("Offsets:", offsets)
    print("Input lengths:", input_length)
    print("Output lengths:", output_length)
    '''

    precision, recall, f1, _ = precision_recall_fscore_support(true_labels, prediction_labels, average='binary')
    acc = accuracy_score(true_labels, prediction_labels)

    # Prepare reference and predicted soft labels for custom evaluation (IoU/Spearman)
    soft_labels_ref = []
    soft_labels_pred = []

    for i in range(len(preds)):
        ref_dict = {
            'id': i,
            'soft_labels': dataset['soft_labels'][i],
            'hard_labels': dataset['hard_labels'][i],
            'text_len': output_length[i]

        }
        pred_dict = {
            'id': i,
            'soft_labels': [{'start': (s - input_length[i]), 'end': (e - input_length[i]), 'prob': pred_prob}
                            for (pred_prob, (s, e), label) in zip(probabilities[i], offsets[i], labels[i]) if label != -100],
            'hard_labels': recompute_hard_labels([{'start': (s - input_length[i]), 'end': (e - input_length[i]), 'prob': pred_prob}
                                                  for (pred_prob, (s, e), label) in zip(probabilities[i], offsets[i], labels[i]) if label != -100]),
            'text_len': output_length[i]
        }
        soft_labels_ref.append(ref_dict)
        soft_labels_pred.append(pred_dict)
        '''
        # More debugging prints
        print(f"\nExample {i}:")
        print("Predicted soft labels:", pred_dict['soft_labels'])
        print("Reference soft labels:", ref_dict['soft_labels'])
        '''

    # Calculate IoU and Spearman correlation scores
    ious = np.array([score_iou(r, d) for r, d in zip(soft_labels_ref, soft_labels_pred)])
    cors = np.array([score_cor(r, d) for r, d in zip(soft_labels_ref, soft_labels_pred)])

    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'iou_mean': f'{ious.mean():.8f} ± {ious.std():.8f}',
        'spearman_mean': f'{cors.mean():.8f} ± {cors.std():.8f}',
    }

In [21]:
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback

# Define Training Arguments with Early Stopping
training_args = TrainingArguments(
    output_dir="./results",           # Directory for model output
    save_strategy="epoch",            # Save model at the end of each epoch
    eval_strategy="epoch",             # Evaluate model at the end of each epoch
    learning_rate=5e-5,
    per_device_train_batch_size=18,
    per_device_eval_batch_size=8,
    num_train_epochs=10,              # Number of training epochs
    weight_decay=0.01,                # Regularization
    logging_dir='./logs',             # Directory for logging
    logging_steps=10,
    load_best_model_at_end=True,      # Load the best model at the end
    metric_for_best_model='f1',       # Metric to compare models
    greater_is_better=True,           # Specify if the metric is better when greater
)

# Initialize the Trainer object with EarlyStoppingCallback
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=lambda p: compute_metrics(p, test_dataset),  # Pass dataset to metrics
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)


# Train the model
trainer.train()

# Evaluate the model
metrics = trainer.evaluate()
print(metrics)

  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: [32m[41mERROR[0m API key must be 40 characters long, yours was 164


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Iou Mean,Spearman Mean
1,0.3988,0.462991,0.803944,0.421893,0.66787,0.308333,0.34400523 ± 0.32246760,0.47083777 ± 0.22565497


KeyboardInterrupt: 