In [1]:
import json
import copy
import gc
import os
import re
from collections import defaultdict
from pathlib import Path

import torch
from torch import nn
import numpy as np
import pandas as pd
from spacy.lang.en import English
from transformers.tokenization_utils import PreTrainedTokenizerBase
from transformers import AutoModelForTokenClassification, AutoTokenizer
from transformers.trainer import Trainer
from transformers.training_args import TrainingArguments
from transformers.trainer_utils import EvalPrediction
from transformers.data.data_collator import DataCollatorForTokenClassification
from datasets import Dataset, DatasetDict, concatenate_datasets
import wandb

2025-04-13 18:12:39.510638: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-04-13 18:12:39.510747: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-04-13 18:12:39.639998: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


## Config & Parameters

In [2]:
# basic random seed
import os 
import random
import numpy as np 

DEFAULT_RANDOM_SEED = 29

def seedBasic(seed=DEFAULT_RANDOM_SEED):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    
# torch random seed
import torch
def seedTorch(seed=DEFAULT_RANDOM_SEED):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
      
# basic + tensorflow + torch 
def seedEverything(seed=DEFAULT_RANDOM_SEED):
    seedBasic(seed)
    seedTorch(seed)

In [3]:
TRAINING_MODEL_PATH = "FacebookAI/xlm-roberta-large"
TRAINING_MAX_LENGTH = 768
EVAL_MAX_LENGTH = 768
CONF_THRESH = 0.6
LR = 2.5e-5
LR_SCHEDULER_TYPE = "linear"
NUM_EPOCHS = 25
BATCH_SIZE = 8
EVAL_BATCH_SIZE = 8
GRAD_ACCUMULATION_STEPS = 16 // BATCH_SIZE
WARMUP_RATIO = 0.1
WEIGHT_DECAY = 0.01
AMP = True
FREEZE_EMBEDDING = False
FREEZE_LAYERS = 2
N_SPLITS = 4
NEGATIVE_RATIO = 0.3  # down sample ratio of negative samples in the training set
OUTPUT_DIR = "output"
Path(OUTPUT_DIR).mkdir(exist_ok=True)

In [4]:
args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    fp16=AMP,
    learning_rate=LR,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACCUMULATION_STEPS,
    report_to="none",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    metric_for_best_model="f1",
    greater_is_better=True,
    load_best_model_at_end=True,
    overwrite_output_dir=True,
    lr_scheduler_type=LR_SCHEDULER_TYPE,
    warmup_ratio=WARMUP_RATIO,
    weight_decay=WEIGHT_DECAY,
)

In [5]:
from collections import defaultdict

def bundle_entities(tokens, labels):
    entity_dict = defaultdict(list)
    current_entity_tokens = []
    current_label = None

    for token, label in zip(tokens, labels):
        if label == 'O':  # Reset when outside an entity
            if current_entity_tokens and current_label:
                entity_dict[current_label].append(" ".join(current_entity_tokens))
            current_entity_tokens = []
            current_label = None
            continue

        # If we encounter a new entity (either 'B-' or 'I-' type)
        entity = label

        # If we are starting a new entity, process the previous one first
        if current_entity_tokens and current_label != entity:
            entity_dict[current_label].append(" ".join(current_entity_tokens))
            current_entity_tokens = []  # Reset the tokens for the new entity

        # Add the current token to the current entity
        current_entity_tokens.append(token)
        current_label = entity

    # Add last entity if it exists
    if current_entity_tokens and current_label:
        entity_dict[current_label].append(" ".join(current_entity_tokens))

    return dict(entity_dict)

## Dataset Preparation

In [6]:
import os

In [7]:
with open("/kaggle/input/workshop-task-acl/SOMD2025-PhaseI/train_texts.txt", "r", encoding="utf-8") as file:
    text = file.read()

text_list = text.split('\n')

with open("/kaggle/input/workshop-task-acl/SOMD2025-PhaseI/train_entities.txt", "r", encoding="utf-8") as file:
    text = file.read()

labels_list = text.split('\n')

In [8]:
adf = pd.read_csv('/kaggle/input/workshop-task-acl/augmented_gemma_2_9b.csv')

In [9]:
df = pd.DataFrame()
df['train_text'] = text_list
df['train_labels'] = labels_list

In [10]:
df['train_text_list'] = df['train_text'].str.split(' ')
df['train_labels_list'] = df['train_labels'].str.split(' ')

In [11]:
df['document'] = np.arange(len(df))
df['train_labels_list'] = df['train_labels_list'].apply(lambda tags: [tag.replace('I-', 'B-') for tag in tags])
df['unique_labels'] = df['train_labels_list'].apply(lambda x: ' '.join(list(set(x))))

In [12]:
all_labels = [
    'B-Extension',
    'I-Extension',
    'B-Application',
    'I-Application',
    'B-Abbreviation',
    'B-Citation',
    'I-Citation',
    'B-SoftwareCoreference',
    'I-SoftwareCoreference',
    'B-URL',
    'I-URL',
    'B-AlternativeName', 
    'I-AlternativeName',
    'B-OperatingSystem',
    'I-OperatingSystem',
    'B-Developer',
    'I-Developer',
    'O',
    'B-License',
    'I-License',
    'B-PlugIn',
    'I-PlugIn',
    'B-Release',
    'I-Release',
    'B-ProgrammingEnvironment',
    'I-ProgrammingEnvironment',
    'B-Version',
    'I-Version'
]

id2label = {i: l for i, l in enumerate(all_labels)}
label2id = {v: k for k, v in id2label.items()}
target = [l for l in all_labels if l != "O"]

In [13]:
# adf = adf.drop(columns = ['labels'])
# adf = adf.sort_values('text_id').reset_index(drop = True)
# adf['train_labels'] = df['train_labels'].values
# adf['train_labels'] = adf['train_labels'].fillna('O')
# adf['train_labels_list'] = adf['train_labels'].str.split(' ')
# adf['augmented_text'] = adf['augmented_text'].apply(lambda x: x.strip())
# adf['augmented_text_list'] = adf['augmented_text'].str.split(' ')
# adf['original_text'] = adf['original_text'].fillna('')
# adf['original_text_1_list'] = adf['original_text'].str.split(' ')

# l1 = []
# for i in range(0, len(adf)):
#     l1.append(bundle_entities(adf.values[i][6], adf.values[i][4]))

# aug_lab = []
# for i in range(len(l1)):
#     labels = {value: key for key, values in l1[i].items() for value in values}
#     aug_lab.append(labels)

# labels = []
# for i in range(len(adf)):
#     samp1 = adf['augmented_text_list'].values[i]
#     labeled_tokens = ' '.join([aug_lab[i].get(token, "O") for token in samp1])
#     labels.append(labeled_tokens)

# adf['labels'] = pd.Series(labels)

# del l1, aug_lab, labels
# gc.collect()

# adf = adf[['augmented_text', 'labels', 'augmented_text_list']]
# adf['labels_list'] = adf['labels'].str.split(' ')
# adf['unique_labels'] = adf['labels_list'].apply(lambda x: ' '.join(list(set(x))))
# adf['document'] = np.arange(len(adf))
# adf.columns = df.columns

In [14]:
adf

Unnamed: 0,text_id_1,text_id_2,original_text_1,original_text_2,labels_1,labels_2,augmented_text,labels
0,574,1058,Actigraphs were initialised for each child usi...,Reliability analysis of the data and univariat...,O O O O O O O O O O O O O O O O O O B-Applicat...,O O O O O O O O O O O B-Application O B-Versio...,Reliability analysis of the data and univariat...,O O O O O O O O O O O O O O O O O O O O O O O ...
1,656,210,Divergence times of L . neilli and L . edwards...,Statistical analysis was performed using Graph...,O O O O O O O O O O O O O O O O O O O O O O O ...,O O O O O B-Developer B-Application O O O B-Ve...,Divergence times of *L. neilli* and *L. edward...,O O O O O O O O O O O O O O O O O O O O O O O ...
2,685,1034,All the analyses were performed using STATA 13...,Data were analyzed with the Statistical Packag...,O O O O O O B-Application B-Version O B-Develo...,O O O O O B-Application I-Application I-Applic...,Data were analyzed using both STATA 13.1 (Stat...,O O O O O O O O O O O O O O O O O O O O O O O ...
3,986,840,All analyses were exploratory and p values sho...,"These AOI were automatically counted , and dat...",O O O O O O O O O O O O O,O O O O O O O O O O O O O O O O O O B-Develope...,"Exploratory analyses were conducted, and p-val...",O O O O O O O O O O O O O O O O O O O O O O O ...
4,906,574,The h2oEnsemble package version 0.1.8 [36] was...,Actigraphs were initialised for each child usi...,O B-PlugIn O O B-Version B-Citation O O O O O ...,O O O O O O O O O O O O O O O O O O B-Applicat...,The h2oEnsemble package version 0.1.8 [36] was...,O O O O O O O O O O O O O O O O O O O O O O O ...
...,...,...,...,...,...,...,...,...
1145,321,437,Statistical analyses were performed under a 5 ...,Analyses were conducted using SPSS version 24....,O O O O O O O O O O O O O B-Developer B-Applic...,O O O O B-Application O B-Version O B-Programm...,Statistical analyses were conducted using SPSS...,O O O O O O O O O O O O O O O O O O O O O O O ...
1146,323,988,For neuroimaging data pre - processing we used...,Statistical analysis was conducted with the St...,O O O O O O O O O B-Application I-Application ...,O O O O O O B-Application I-Application I-Appl...,We performed neuroimaging data pre-processing ...,O O O O O O O O O O O O O O O O O O O O O O O ...
1147,394,251,All analyses were performed with SPSS for Wind...,Both enumerators and operators used the same s...,O O O O O B-Application O B-OperatingSystem O ...,O O O O O O O O O O B-Application O B-Develope...,Statistical analyses were conducted using SPSS...,O O O O O O O O O O O O O O O O O O O O O O O ...
1148,146,844,All analyses were performed using GraphPad Pri...,The proportion of null alleles ( NA ) at each ...,O O O O O B-Developer B-Application B-Version ...,O O O O O O O O O O O O O O O O O O B-Applicat...,All analyses were conducted using GraphPad Pri...,O O O O O O O O O O O O O O O O O O O O O O O ...


In [15]:
adf['original_text_1'] = adf['original_text_1'].fillna('')
adf['original_text_2'] = adf['original_text_2'].fillna('')
adf['original_text_1_list'] = adf['original_text_1'].str.split(' ')
adf['original_text_2_list'] = adf['original_text_2'].str.split(' ')
adf['labels_1'] = adf['labels_1'].fillna('O')
adf['labels_2'] = adf['labels_2'].fillna('O')
adf['augmented_text'] = adf['augmented_text'].fillna('')
adf['labels_1_list'] = adf['labels_1'].str.split(' ')
adf['labels_2_list'] = adf['labels_2'].str.split(' ')
adf['augmented_text'] = adf['augmented_text'].apply(lambda x: x.strip())
adf['augmented_text_list'] = adf['augmented_text'].str.split(' ')

l2 = []
for i in range(0, len(adf)):
    l2.append(bundle_entities(adf.values[i][9], adf.values[i][11]))

l1 = []
for i in range(0, len(adf)):
    l1.append(bundle_entities(adf.values[i][8], adf.values[i][10]))

aug_lab = []
for i in range(len(l1)):
    merged_labels = {}
    for key in set(l1[i].keys()).union(set(l2[i].keys())):
        merged_labels[key] = l1[i].get(key, []) + l2[i].get(key, [])
    labels = {value: key for key, values in merged_labels.items() for value in values}
    aug_lab.append(labels)

labels = []
for i in range(len(adf)):
    samp1 = adf['augmented_text_list'].values[i]
    labeled_tokens = ' '.join([aug_lab[i].get(token, "O") for token in samp1])
    labels.append(labeled_tokens)

adf['labels'] = pd.Series(labels)

del l1, l2, aug_lab, merged_labels, labels
gc.collect()

adf = adf[['augmented_text', 'labels', 'augmented_text_list']]
adf['labels_list'] = adf['labels'].str.split(' ')
adf['unique_labels'] = adf['labels_list'].apply(lambda x: ' '.join(list(set(x))))
adf['document'] = np.arange(len(adf))
adf.columns = df.columns

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adf['labels_list'] = adf['labels'].str.split(' ')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adf['unique_labels'] = adf['labels_list'].apply(lambda x: ' '.join(list(set(x))))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adf['document'] = np.arange(len(adf))


In [16]:
# with open("/kaggle/input/workshop-task-acl/SOMD2025-PhaseI/train_relations.txt", "r", encoding="utf-8") as file:
#     text = file.read()

# text_list = text.split('\n')

# # Process the data safely
# records = []
# for i, row in enumerate(text_list):
#     relations = row.split('; ')
    
#     if len(relations[0])!=0:
#         for relation in relations:
#             parts = relation.split('\t')
#             rel_type, ent1, ent2 = parts
#             records.append({'Relation': rel_type, 'Entity_1': int(ent1), 'Entity_2': int(ent2), 'document': i})
#     else:
#         records.append({'Relation': np.nan, 'Entity_1': np.nan, 'Entity_2': np.nan, 'document': i})
            

# # # Convert to DataFrame
# relations = pd.DataFrame(records)
# relations['Relation'] = relations['Relation'].fillna('no_relation')
# relations['Entity_1'] = relations['Entity_1'].fillna(0).astype(int)
# relations['Entity_2'] = relations['Entity_2'].fillna(0).astype(int)

# df2 = df[['train_text_list', 'train_labels_list', 'document']]
# relations = relations.join(df2.set_index('document'), on = 'document')

# relations['entity_type_1'] = relations.apply(lambda row: row['train_labels_list'][row['Entity_1']], axis = 1)
# relations['entity_type_2'] = relations.apply(lambda row: row['train_labels_list'][row['Entity_2']], axis = 1)

# relations_null = relations[relations['Relation']=='no_relation'].reset_index(drop = True)
# relations_nn = relations[relations['Relation']!='no_relation'].reset_index(drop = True)

# relations_nn['entity_1_indexs'] = relations_nn.apply(
#     lambda row: [row['Entity_1']] + [
#         i for i in range(row['Entity_1'] + 1, len(row['train_labels_list'])) 
#         if row['train_labels_list'][i].startswith(f"I-{row['entity_type_1'].split('-')[1]}")
#     ], axis=1
# )

# relations_nn['entity_2_indexs'] = relations_nn.apply(
#     lambda row: [row['Entity_2']] + [
#         i for i in range(row['Entity_2'] + 1, len(row['train_labels_list'])) 
#         if row['train_labels_list'][i].startswith(f"I-{row['entity_type_2'].split('-')[1]}")
#     ], axis=1
# )

# relations_nn['entity_1_text'] = relations_nn.apply(
#     lambda row: " ".join([row['train_text_list'][idxs] for idxs in row['entity_1_indexs']]), axis=1)

# relations_nn['entity_2_text'] = relations_nn.apply(
#     lambda row: " ".join([row['train_text_list'][idxs] for idxs in row['entity_2_indexs']]), axis=1)

# relations = pd.concat([relations_nn, relations_null]).reset_index(drop = True)

# relations['entity_1_indexs'] = relations['entity_1_indexs'].fillna('null')
# relations['entity_2_indexs'] = relations['entity_2_indexs'].fillna('null')
# relations['entity_1_text'] = relations['entity_1_text'].fillna('O')
# relations['entity_2_text'] = relations['entity_2_text'].fillna('O')

# relations = relations[['Relation', 'entity_type_1', 'entity_type_2', 'entity_1_text', 'entity_2_text', 'document']]
# relations = relations.sort_values('document').reset_index(drop = True)
# relations['relation_text'] = relations.apply(lambda row: row['entity_type_1'] + ' of ' + row['entity_1_text'] + ', ' + row['entity_type_2'] + ' of ' + row['entity_2_text'], axis = 1)
# relations = relations[['Relation', 'relation_text', 'document']]

# relations.to_csv('relations.csv', index = False)

## Tokenization

In [17]:
class CustomTokenizer:
    def __init__(self, tokenizer: PreTrainedTokenizerBase, label2id: dict, max_length: int) -> None:
        self.tokenizer = tokenizer
        self.label2id = label2id
        self.max_length = max_length

    def __call__(self, example: dict) -> dict:
        # rebuild text from tokens
        text, labels, token_map = [], [], []

        for idx, (t, l) in enumerate(
            zip(example["tokens"], example["provided_labels"])
        ):
            text.append(t)
            labels.extend([l] * len(t))
            token_map.extend([idx]*len(t))

        text = "".join(text)
        labels = np.array(labels)

        # actual tokenization
        tokenized = self.tokenizer(
            "".join(text),
            return_offsets_mapping=True,
            truncation=True,
            max_length=self.max_length
        )

        token_labels = []

        for start_idx, end_idx in tokenized.offset_mapping:
            # CLS token
            if start_idx == 0 and end_idx == 0:
                token_labels.append(self.label2id["O"])
                continue

            # case when token starts with whitespace
            if text[start_idx].isspace():
                start_idx += 1

            token_labels.append(self.label2id[labels[start_idx]])

        length = len(tokenized.input_ids)

        return {**tokenized, "labels": token_labels, "length": length, "token_map": token_map}

## Instantiate the dataset

In [18]:
tokenizer = AutoTokenizer.from_pretrained(TRAINING_MODEL_PATH)
train_encoder = CustomTokenizer(tokenizer=tokenizer, label2id=label2id, max_length=TRAINING_MAX_LENGTH)
eval_encoder = CustomTokenizer(tokenizer=tokenizer, label2id=label2id, max_length=EVAL_MAX_LENGTH)

ds = DatasetDict()

for key, data in zip(["original", "aug"], [df, adf]):
    ds[key] = Dataset.from_dict({
        "full_text": df['train_text'].values.tolist(),
        "tokens": df['train_text_list'].values.tolist(),
        "document": [str(x) for x in df['document'].values.tolist()],
        "provided_labels": df['train_labels_list'].values.tolist()})

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

## Metrics

In [19]:
def find_span(target: list[str], document: list[str]) -> list[list[int]]:
    idx = 0
    spans = []
    span = []

    for i, token in enumerate(document):
        if token != target[idx]:
            idx = 0
            span = []
            continue
        span.append(i)
        idx += 1
        if idx == len(target):
            spans.append(span)
            span = []
            idx = 0
            continue

    return spans


class PRFScore:
    """A precision / recall / F score."""

    def __init__(
        self,
        *,
        tp: int = 0,
        fp: int = 0,
        fn: int = 0,
    ) -> None:
        self.tp = tp
        self.fp = fp
        self.fn = fn

    def __len__(self) -> int:
        return self.tp + self.fp + self.fn

    def __iadd__(self, other):  # in-place add
        self.tp += other.tp
        self.fp += other.fp
        self.fn += other.fn
        return self

    def __add__(self, other):
        return PRFScore(
            tp=self.tp + other.tp, fp=self.fp + other.fp, fn=self.fn + other.fn
        )

    def score_set(self, cand: set, gold: set) -> None:
        self.tp += len(cand.intersection(gold))
        self.fp += len(cand - gold)
        self.fn += len(gold - cand)

    @property
    def precision(self) -> float:
        return self.tp / (self.tp + self.fp + 1e-100)

    @property
    def recall(self) -> float:
        return self.tp / (self.tp + self.fn + 1e-100)

    @property
    def f1(self) -> float:
        p = self.precision
        r = self.recall
        return 2 * ((p * r) / (p + r + 1e-100))

    @property
    def f1(self) -> float:
        beta = 1
        p = self.precision
        r = self.recall

        fbeta = (1+(beta**2))*p*r / ((beta**2)*p + r + 1e-100)
        return fbeta

    def to_dict(self) -> dict[str, float]:
        return {"p": self.precision, "r": self.recall, "f1": self.f1}

In [20]:
class MetricsComputer:
    nlp = English()

    def __init__(self, eval_ds: Dataset, label2id: dict, conf_thresh: float = 0.9) -> None:
        self.ds = eval_ds.remove_columns("labels").rename_columns({"provided_labels": "labels"})
        self.gt_df = self.create_gt_df(self.ds)
        self.label2id = label2id
        self.confth = conf_thresh

    def __call__(self, eval_preds: EvalPrediction) -> dict:
        pred_df = self.create_pred_df(eval_preds.predictions)
        return self.compute_metrics_from_df(self.gt_df, pred_df)

    @staticmethod
    def create_gt_df(ds: Dataset):
        gt = []
        for row in ds:
            for token_idx, (token, label) in enumerate(zip(row["tokens"], row["labels"])):
                if label == "O":
                    continue
                gt.append(
                    {"document": row["document"], "token": token_idx, "label": label, "token_str": token}
                )
        gt_df = pd.DataFrame(gt)
        gt_df["row_id"] = gt_df.index

        return gt_df

    def create_pred_df(self, logits: np.ndarray) -> pd.DataFrame:
        """
        Note:
            Thresholing is doen on logits instead of softmax, which could find better models on LB.
        """
        prediction = logits
        o_index = self.label2id["O"]
        preds = prediction.argmax(-1)
        preds_without_o = prediction.copy()
        preds_without_o[:,:,o_index] = 0
        preds_without_o = preds_without_o.argmax(-1)
        o_preds = prediction[:,:,o_index]
        preds_final = np.where(o_preds < self.confth, preds_without_o , preds)

        pairs = set()
        processed = []

        # Iterate over document
        for p_doc, token_map, offsets, tokens, doc in zip(
            preds_final, self.ds["token_map"], self.ds["offset_mapping"], self.ds["tokens"], self.ds["document"]
        ):
            # Iterate over sequence
            for p_token, (start_idx, end_idx) in zip(p_doc, offsets):
                label_pred = id2label[p_token]

                if start_idx + end_idx == 0:
                    # [CLS] token i.e. BOS
                    continue

                if token_map[start_idx] == -1:
                    start_idx += 1

                # ignore "\n\n"
                while start_idx < len(token_map) and tokens[token_map[start_idx]].isspace():
                    start_idx += 1

                if start_idx >= len(token_map):
                    break

                token_id = token_map[start_idx]
                pair = (doc, token_id)

                # ignore "O", preds, phone number and  email
                # if label_pred in ("O", "B-EMAIL", "B-PHONE_NUM", "I-PHONE_NUM") or token_id == -1:
                #     continue
                if label_pred in ("O") or token_id == -1:
                    continue

                if pair in pairs:
                    continue

                processed.append(
                    {"document": doc, "token": token_id, "label": label_pred, "token_str": tokens[token_id]}
                )
                pairs.add(pair)

        # pred_df = pd.DataFrame(processed + self.emails + self.phone_nums)
        pred_df = pd.DataFrame(processed)
        pred_df["row_id"] = list(range(len(pred_df)))

        return pred_df

    def compute_metrics_from_df(self, gt_df, pred_df):
        """
        Compute the LB metric (lb) and other auxiliary metrics
        """

        references = {(row.document, row.token, row.label) for row in gt_df.itertuples()}
        predictions = {(row.document, row.token, row.label) for row in pred_df.itertuples()}

        score_per_type = defaultdict(PRFScore)
        references = set(references)

        for ex in predictions:
            pred_type = ex[-1] # (document, token, label)
            if pred_type != 'O':
                pred_type = pred_type[2:] # avoid B- and I- prefix

            if pred_type not in score_per_type:
                score_per_type[pred_type] = PRFScore()

            if ex in references:
                score_per_type[pred_type].tp += 1
                references.remove(ex)
            else:
                score_per_type[pred_type].fp += 1

        for doc, tok, ref_type in references:
            if ref_type != 'O':
                ref_type = ref_type[2:] # avoid B- and I- prefix

            if ref_type not in score_per_type:
                score_per_type[ref_type] = PRFScore()
            score_per_type[ref_type].fn += 1

        totals = PRFScore()

        for prf in score_per_type.values():
            totals += prf

        return {
            "precision": totals.precision,
            "recall": totals.recall,
            "f1": totals.f1,
            **{
                f"{v_k}-{k}": v_v
                for k in set([l[2:] for l in self.label2id.keys() if l!= 'O'])
                for v_k, v_v in score_per_type[k].to_dict().items()
            },
        }

## Model

In [21]:
class ModelInit:
    def __init__(
        self,
        checkpoint: str,
        id2label: dict,
        label2id: dict,
        freeze_embedding: bool,
        freeze_layers: int,
    ) -> None:
        self.model = AutoModelForTokenClassification.from_pretrained(
            checkpoint,
            num_labels=len(id2label),
            id2label=id2label,
            label2id=label2id,
            ignore_mismatched_sizes=True
        )
        # for param in self.model.deberta.embeddings.parameters():
        #     param.requires_grad = False if freeze_embedding else True
        # for layer in self.model.deberta.encoder.layer[:freeze_layers]:
        #     for param in layer.parameters():
        #         param.requires_grad = False
        self.weight = copy.deepcopy(self.model.state_dict())

    def __call__(self):
        self.model.load_state_dict(self.weight)
        return self.model

model_init = ModelInit(
    TRAINING_MODEL_PATH,
    id2label=id2label,
    label2id=label2id,
    freeze_embedding=FREEZE_EMBEDDING,
    freeze_layers=FREEZE_LAYERS,
)

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at FacebookAI/xlm-roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Split 
Split the original dataset into 4 folds according to `document % 4` <br>
Only uses the first 30% of negative samples in the training set but they are NOT excluded from the eval set to make sure cross-evalidation is done on the entire training dataset.

In [22]:
id2label

{0: 'B-Extension',
 1: 'I-Extension',
 2: 'B-Application',
 3: 'I-Application',
 4: 'B-Abbreviation',
 5: 'B-Citation',
 6: 'I-Citation',
 7: 'B-SoftwareCoreference',
 8: 'I-SoftwareCoreference',
 9: 'B-URL',
 10: 'I-URL',
 11: 'B-AlternativeName',
 12: 'I-AlternativeName',
 13: 'B-OperatingSystem',
 14: 'I-OperatingSystem',
 15: 'B-Developer',
 16: 'I-Developer',
 17: 'O',
 18: 'B-License',
 19: 'I-License',
 20: 'B-PlugIn',
 21: 'I-PlugIn',
 22: 'B-Release',
 23: 'I-Release',
 24: 'B-ProgrammingEnvironment',
 25: 'I-ProgrammingEnvironment',
 26: 'B-Version',
 27: 'I-Version'}

In [23]:
# split according to document id
folds = [
    (
        np.array([i for i, d in enumerate(ds["original"]["document"]) if int(d) % N_SPLITS != s]),
        np.array([i for i, d in enumerate(ds["original"]["document"]) if int(d) % N_SPLITS == s])
    )
    for s in range(N_SPLITS)
]

negative_idxs = [i for i, labels in enumerate(ds["original"]["provided_labels"]) if not any(np.array(labels) != "O")]
exclude_indices = negative_idxs[int(len(negative_idxs) * NEGATIVE_RATIO):]

## Train
Performs cross-validation and save the best checkpoint's metrics as json.

In [24]:
for fold_idx, (train_idx, eval_idx) in enumerate(folds):
    if fold_idx==2:
        print('fold_idx: ', fold_idx)
        args.run_name = f"fold-{fold_idx}"
        args.output_dir = os.path.join(OUTPUT_DIR, f"fold_{fold_idx}")
        original_ds = ds["original"].select([i for i in train_idx if i not in exclude_indices])
        train_ds = concatenate_datasets([original_ds, ds["aug"]])
        train_ds = train_ds.map(train_encoder, num_proc=os.cpu_count())
        eval_ds = ds["original"].select(eval_idx)
        eval_ds = eval_ds.map(eval_encoder, num_proc=os.cpu_count())
        trainer = Trainer(
            args=args,
            model_init=model_init,
            train_dataset=train_ds,
            eval_dataset=eval_ds,
            tokenizer=tokenizer,
            compute_metrics=MetricsComputer(eval_ds=eval_ds, label2id=label2id),
            data_collator=DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=16),
        )
        trainer.train()
        eval_res = trainer.evaluate(eval_dataset=eval_ds)
        with open(os.path.join(args.output_dir, "eval_result.json"), "w") as f:
            json.dump(eval_res, f)
        del trainer
        gc.collect()
        torch.cuda.empty_cache()
    else:
        pass

fold_idx:  2
     

#0:   0%|          | 0/490 [00:00<?, ?ex/s]

 

#1:   0%|          | 0/490 [00:00<?, ?ex/s]

 

#2:   0%|          | 0/490 [00:00<?, ?ex/s]

 

#3:   0%|          | 0/490 [00:00<?, ?ex/s]

     

#0:   0%|          | 0/72 [00:00<?, ?ex/s]

  

#1:   0%|          | 0/72 [00:00<?, ?ex/s]

#2:   0%|          | 0/72 [00:00<?, ?ex/s]

 

#3:   0%|          | 0/71 [00:00<?, ?ex/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,P-release,R-release,F1-release,P-citation,R-citation,F1-citation,P-softwarecoreference,R-softwarecoreference,F1-softwarecoreference,P-application,R-application,F1-application,P-alternativename,R-alternativename,F1-alternativename,P-url,R-url,F1-url,P-license,R-license,F1-license,P-programmingenvironment,R-programmingenvironment,F1-programmingenvironment,P-extension,R-extension,F1-extension,P-abbreviation,R-abbreviation,F1-abbreviation,P-developer,R-developer,F1-developer,P-version,R-version,F1-version,P-operatingsystem,R-operatingsystem,F1-operatingsystem,P-plugin,R-plugin,F1-plugin
0,No log,0.27721,0.732272,0.722857,0.727534,0.0,0.0,0.0,0.891667,0.856,0.873469,0.0,0.0,0.0,0.609677,0.928747,0.736125,0.0,0.0,0.0,0.6,0.983607,0.745342,0.0,0.0,0.0,1.0,0.101449,0.184211,0.0,0.0,0.0,0.0,0.0,0.0,0.902527,0.847458,0.874126,0.813953,0.954545,0.878661,0.0,0.0,0.0,0.0,0.0,0.0
2,No log,0.074565,0.912894,0.950714,0.931421,0.964286,0.870968,0.915254,0.932331,0.992,0.96124,0.0,0.0,0.0,0.896163,0.97543,0.934118,0.9,0.75,0.818182,0.655914,1.0,0.792208,0.931034,1.0,0.964286,0.968254,0.884058,0.924242,0.7,0.466667,0.56,1.0,0.909091,0.952381,0.942308,0.99661,0.968699,0.981651,0.972727,0.977169,0.916667,1.0,0.956522,0.917808,0.736264,0.817073
4,0.445100,0.025,0.976462,0.977857,0.977159,1.0,0.935484,0.966667,0.992,0.992,0.992,1.0,1.0,1.0,0.985185,0.980344,0.982759,0.857143,1.0,0.923077,0.802632,1.0,0.890511,0.931034,1.0,0.964286,1.0,0.898551,0.946565,1.0,0.866667,0.928571,1.0,1.0,1.0,0.989831,0.989831,0.989831,1.0,0.981818,0.990826,1.0,1.0,1.0,0.956044,0.956044,0.956044
6,0.445100,0.009312,0.988539,0.985714,0.987124,0.966667,0.935484,0.95082,0.984127,0.992,0.988048,1.0,1.0,1.0,0.982968,0.992629,0.987775,1.0,1.0,1.0,0.983871,1.0,0.99187,0.964286,1.0,0.981818,0.984375,0.913043,0.947368,1.0,1.0,1.0,1.0,1.0,1.0,0.996599,0.99322,0.994907,0.990868,0.986364,0.98861,1.0,1.0,1.0,1.0,0.967033,0.98324
8,0.019900,0.003992,0.992143,0.992143,0.992143,0.96875,1.0,0.984127,0.968992,1.0,0.984252,0.75,1.0,0.857143,0.997531,0.992629,0.995074,1.0,1.0,1.0,1.0,1.0,1.0,0.964286,1.0,0.981818,1.0,0.927536,0.962406,0.9375,1.0,0.967742,0.916667,1.0,0.956522,1.0,1.0,1.0,0.995413,0.986364,0.990868,1.0,1.0,1.0,1.0,1.0,1.0
10,0.019900,0.001677,0.996421,0.994286,0.995352,0.96875,1.0,0.984127,1.0,1.0,1.0,0.6,1.0,0.75,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.931034,1.0,0.964286,1.0,0.927536,0.962406,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.986364,0.993135,1.0,1.0,1.0,1.0,1.0,1.0
12,0.004900,0.000167,0.999282,0.993571,0.996418,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.984615,0.927536,0.955224,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.986364,0.993135,1.0,1.0,1.0,1.0,0.989011,0.994475
14,0.004900,0.001619,0.998564,0.993571,0.996062,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.968254,1.0,0.983871,1.0,1.0,1.0,1.0,0.927536,0.962406,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.99661,0.998302,1.0,0.986364,0.993135,1.0,1.0,1.0,1.0,1.0,1.0
16,0.001700,3.2e-05,1.0,0.994286,0.997135,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.927536,0.962406,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.986364,0.993135,1.0,1.0,1.0,1.0,1.0,1.0
18,0.001700,2e-05,1.0,0.994286,0.997135,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.927536,0.962406,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.986364,0.993135,1.0,1.0,1.0,1.0,1.0,1.0
