In [1]:
import os
# Restrict the CUDA devices visible to this kernel to device index 0.
os.environ['CUDA_VISIBLE_DEVICES'] = "0"

In [2]:
import ijson
from datasets import Dataset , concatenate_datasets , DatasetDict
from decimal import Decimal


def safe_convert(value):
    """Recursively replace every ``Decimal`` inside ``value`` with a ``float``.

    Dicts and lists are rebuilt with each nested element converted; any
    other value (including tuples) is returned unchanged.
    """
    if isinstance(value, Decimal):
        return float(value)
    if isinstance(value, dict):
        return {key: safe_convert(item) for key, item in value.items()}
    if isinstance(value, list):
        return list(map(safe_convert, value))
    return value

def stream_json_array(file_path):
    """Lazily yield the elements of the top-level JSON array in ``file_path``.

    The file is opened in binary mode and parsed incrementally with
    ``ijson.items(..., 'item')``; each element is passed through
    ``safe_convert`` before being yielded.
    """
    with open(file_path, 'rb') as handle:
        elements = ijson.items(handle, 'item')
        yield from map(safe_convert, elements)

# Build both datasets by streaming the JSON arrays from disk.
# The generator function is passed directly with its argument in `gen_kwargs`
# (the documented `from_generator` form) instead of being wrapped in a lambda
# closure, so the file path is an explicit, inspectable input of the dataset.
tqa_dataset = Dataset.from_generator(
    stream_json_array,
    gen_kwargs={"file_path": "./dataset/document_tqa_train.json"},
)
wiki_dataset = Dataset.from_generator(
    stream_json_array,
    gen_kwargs={"file_path": "./dataset/document_two_wiki_qa_train.json"},
)

Loading dataset shards:   0%|          | 0/18 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/21 [00:00<?, ?it/s]

In [3]:
# Passed as `max_length` to the tokenizer when encoding question/passage pairs.
max_length = 512
# Passed as `stride` to the tokenizer for the overflowing windows of long passages.
stride = 256
# Minimum number of positive AND of negative contexts a TQA example needs to be
# kept by the filter, and the cap on positive contexts taken per TQA example.
limit_data_per_sample = 5

In [4]:
def _has_both_context_kinds(example, flag, minimum):
    """Check that ``example["ctxs"]`` holds enough flagged and unflagged contexts.

    Returns True when at least ``minimum`` contexts have a truthy ``ctx[flag]``
    and at least ``minimum`` contexts have a falsy one.
    """
    contexts = example["ctxs"]
    flagged = sum(ctx[flag] for ctx in contexts)
    unflagged = sum(not ctx[flag] for ctx in contexts)
    return flagged >= minimum and unflagged >= minimum


# TQA: keep examples with at least `limit_data_per_sample` contexts of each kind.
tqa_dataset = tqa_dataset.filter(
    lambda example: _has_both_context_kinds(example, "has_answer", limit_data_per_sample)
)

# Wiki: keep examples with at least one gold and one non-gold context.
wiki_dataset = wiki_dataset.filter(
    lambda example: _has_both_context_kinds(example, "gold_document", 1)
)

In [5]:
# Hold out 20% of each corpus as a "test" split; both splits use the same fixed seed.
split_options = {"test_size": 0.2, "seed": 42}
tqa_dataset = tqa_dataset.train_test_split(**split_options)
wiki_dataset = wiki_dataset.train_test_split(**split_options)

In [6]:
tqa_dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'answers', 'ctxs'],
        num_rows: 87840
    })
    test: Dataset({
        features: ['question', 'answers', 'ctxs'],
        num_rows: 21961
    })
})

In [7]:
wiki_dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'answers', 'ctxs'],
        num_rows: 54516
    })
    test: Dataset({
        features: ['question', 'answers', 'ctxs'],
        num_rows: 13630
    })
})

In [8]:
from transformers import AutoTokenizer

# Checkpoint identifier handed to `from_pretrained`; also used later to build the model.
model_checkpoint = "answerdotai/ModernBERT-base"
# Tokenizer loaded from the same checkpoint as the encoder.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [9]:

def preprocess_tqa_training_examples(examples):
    """Tokenize one TQA example into balanced positive/negative passage windows.

    Intended for ``map(batched=True, batch_size=1)``: only the first
    (index 0) example of the incoming batch is read.

    Up to ``limit_data_per_sample`` contexts with ``has_answer`` set are used
    as positives (label 1) and at most the same number of contexts without it
    as negatives (label 0). Each (question, passage text) pair is tokenized
    with overflow, so a long passage produces several windows.

    Returns:
        The tokenizer output with a ``"labels"`` list added, holding one
        label per produced window.
    """
    contexts = examples["ctxs"][0]
    true_samples = [ctx for ctx in contexts if ctx["has_answer"]][:limit_data_per_sample]
    false_samples = [ctx for ctx in contexts if not ctx["has_answer"]][:len(true_samples)]
    all_sample = true_samples + false_samples

    inputs = tokenizer(
        [examples["question"][0]] * len(all_sample),
        [ctx["text"] for ctx in all_sample],
        truncation="only_second",
        max_length=max_length,
        stride=stride,
        return_overflowing_tokens=True,
        padding="max_length",
    )

    # One entry per produced window, giving the index of the passage it came from.
    sample_map = inputs.pop("overflow_to_sample_mapping")
    predict_label = [1] * len(true_samples) + [0] * len(false_samples)

    # Every window inherits the label of its source passage.
    # NOTE(review): a window of a positive passage is labelled 1 even if the
    # answer text falls outside that particular window - confirm this is intended.
    inputs["labels"] = [predict_label[sample_idx] for sample_idx in sample_map]
    return inputs

In [10]:
# Tokenize both TQA splits. `batch_size=1` is required because the
# preprocessing function only reads index 0 of each batch; the original
# columns are dropped since each source row becomes several window rows.
process_tqa_dataset = tqa_dataset.map(
    function=preprocess_tqa_training_examples,
    batched=True,
    batch_size=1,
    num_proc=4,
    remove_columns=tqa_dataset["train"].column_names,
)

In [11]:
process_tqa_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 878402
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 219612
    })
})

In [12]:

def preprocess_wiki_training_examples(examples):
    """Tokenize one wiki example into gold/non-gold passage windows.

    Only the first (index 0) example of the batch is read. All contexts with
    ``gold_document`` set become positives (label 1); at most the same number
    of remaining contexts become negatives (label 0). Every window produced
    by the overflowing tokenization gets the label of its source passage.
    """
    question = examples["question"][0]

    gold, distractors = [], []
    for ctx in examples["ctxs"][0]:
        (gold if ctx["gold_document"] else distractors).append(ctx)
    distractors = distractors[:len(gold)]
    passages = gold + distractors

    encoded = tokenizer(
        [question] * len(passages),
        [ctx["text"] for ctx in passages],
        truncation="only_second",
        max_length=max_length,
        stride=stride,
        return_overflowing_tokens=True,
        padding="max_length",
    )

    # Maps each produced window back to the index of its source passage.
    window_to_passage = encoded.pop("overflow_to_sample_mapping")
    passage_labels = [1] * len(gold) + [0] * len(distractors)

    encoded["labels"] = [
        passage_labels[window_to_passage[window_idx]]
        for window_idx, _ in enumerate(encoded["input_ids"])
    ]
    return encoded

In [13]:
# Tokenize both wiki splits. `batch_size=1` is required because the
# preprocessing function only reads index 0 of each batch; the original
# columns are dropped since each source row becomes several window rows.
process_wiki_dataset = wiki_dataset.map(
    function=preprocess_wiki_training_examples,
    batched=True,
    batch_size=1,
    num_proc=4,
    remove_columns=wiki_dataset["train"].column_names,
)

In [14]:
process_wiki_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 207717
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 51662
    })
})

In [15]:
# Merge the two tokenized corpora split by split; the per-corpus "test"
# splits together form the combined "validation" split.
process_dataset = DatasetDict({
    merged_split: concatenate_datasets(
        [process_tqa_dataset[source_split], process_wiki_dataset[source_split]]
    )
    for merged_split, source_split in (('train', 'train'), ('validation', 'test'))
})

In [16]:
process_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1086119
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 271274
    })
})

In [17]:
import evaluate
import numpy as np
import torch

def compute_metrics(eval_preds):
    """Compute binary accuracy from raw (pre-sigmoid) classifier logits.

    Args:
        eval_preds: Anything that unpacks into ``(logits, labels)``. Both are
            flattened to 1-D, so logits shaped ``(N,)`` or ``(N, 1)`` are
            accepted.

    Returns:
        ``{"accuracy": float}`` - the fraction of examples whose thresholded
        prediction equals the label.
    """
    logits, labels = eval_preds
    logits = np.asarray(logits).reshape(-1)
    labels = np.asarray(labels).reshape(-1)

    # sigmoid(x) > 0.5 exactly when x > 0, so the logits are thresholded
    # directly instead of round-tripping through a torch tensor.
    predictions = (logits > 0).astype(int)

    # Computed with NumPy so no metric object has to be re-loaded on every
    # evaluation call.
    return {"accuracy": float((predictions == labels).mean())}

2025-04-16 18:41:54.784012: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744828914.792040   60189 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744828914.794456   60189 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-16 18:41:54.804065: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [18]:
from transformers import AutoModel , Trainer
import torch.nn.functional as F
import torch.nn as nn
import torch

class GELUActivation(nn.Module):
    """
    Original Implementation of the GELU activation function in Google BERT repo when initially created. For
    information: OpenAI GPT's GELU is slightly different (and gives slightly different results): 0.5 * x * (1 +
    torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) This is now written in C in nn.functional
    Also see the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415

    Args:
        use_gelu_python: When True, use the pure-Python erf-based formula in
            ``_gelu_python``; otherwise (default) use ``nn.functional.gelu``.
    """

    def __init__(self, use_gelu_python: bool = False):
        super().__init__()
        if use_gelu_python:
            self.act = self._gelu_python
        else:
            self.act = nn.functional.gelu

    def _gelu_python(self, input):
        """Exact GELU: ``x * 0.5 * (1 + erf(x / sqrt(2)))``."""
        # Imported here because the surrounding cell does not import `math`;
        # without it this code path raised NameError.
        import math

        return input * 0.5 * (1.0 + torch.erf(input / math.sqrt(2.0)))

    def forward(self, input):
        """Apply the activation selected in ``__init__`` to ``input``."""
        return self.act(input)
    
class ValidatorModel(nn.Module):
    """Encoder with a mean-pooling head that emits one binary logit per input.

    The head applies dense -> GELU -> LayerNorm -> dropout -> linear(1) to the
    attention-masked mean of the encoder's last hidden state.

    Args:
        checkpoint: Identifier passed to ``AutoModel.from_pretrained``.
    """

    def __init__(self, checkpoint):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(checkpoint)
        config = self.encoder.config
        self.classifier = nn.Linear(config.hidden_size, 1)
        # Not used by forward() (which works on raw logits via `cls_loss_fn`);
        # kept only so the attribute remains available.
        self.loss_fn = nn.BCELoss()

        self.dense = nn.Linear(config.hidden_size, config.hidden_size, config.classifier_bias)
        self.act = GELUActivation()
        self.norm = nn.LayerNorm(config.hidden_size, eps=config.norm_eps, bias=config.norm_bias)
        self.drop = torch.nn.Dropout(config.classifier_dropout)

        # Loss applied to the raw classification logits in forward().
        self.cls_loss_fn = nn.BCEWithLogitsLoss()
        # Not used by forward(); kept only so the attribute remains available.
        self.qa_loss_fn = nn.CrossEntropyLoss(ignore_index=-100)

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the encoder and classification head.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: Mask forwarded to the encoder and used as the
                pooling weights over the sequence dimension (dim 1).
            labels: Optional targets; cast to float for ``BCEWithLogitsLoss``.

        Returns:
            Dict with ``"loss"`` (``None`` when ``labels`` is ``None``) and
            ``"logits"`` (classifier output with the last dimension squeezed).
        """
        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_state = outputs[0]

        # Masked mean over the sequence. The token count is clamped to at
        # least 1 so a row whose mask is all zeros pools to zeros instead of
        # producing NaN from 0 / 0.
        token_counts = attention_mask.sum(dim=1, keepdim=True).clamp(min=1)
        pooling = (last_hidden_state * attention_mask.unsqueeze(-1)).sum(dim=1) / token_counts

        pooled_output = self.norm(self.act(self.dense(pooling)))
        pooled_output = self.drop(pooled_output)
        logits = self.classifier(pooled_output)
        cls_logits = logits.squeeze(-1)

        loss = None
        if labels is not None:
            loss = self.cls_loss_fn(cls_logits, labels.float())

        return {
            "loss": loss,
            "logits": cls_logits,
        }

In [19]:
# Build the classifier on top of the pretrained encoder named by `model_checkpoint`.
model = ValidatorModel(model_checkpoint)

You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.


In [20]:
from transformers import TrainingArguments

# Training configuration: evaluate and checkpoint once per epoch, train for
# three epochs in bf16, and report to Weights & Biases.
args = TrainingArguments(
    output_dir="modernbert-finetuned-triviaqa-wiki-noqa",
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    bf16=True,
    eval_strategy="epoch",
    save_strategy="epoch",
    report_to="wandb",
    # push_to_hub=True,
)

In [21]:
# Wire the model, arguments, datasets and metric function into the Trainer.
# The tokenizer is passed via `processing_class`; the `tokenizer=` keyword is
# deprecated and appears to be what produced the warning in this cell's output.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=process_dataset["train"],
    eval_dataset=process_dataset["validation"],
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


[2025-04-16 18:41:57,719] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/home/ryuinw123/miniconda3/envs/deep/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/home/ryuinw123/miniconda3/envs/deep/compiler_compat/ld: /usr/local/cuda-11.8/lib64/libcufile.so: undefined reference to `std::runtime_error::~runtime_error()@GLIBCXX_3.4'
/home/ryuinw123/miniconda3/envs/deep/compiler_compat/ld: /usr/local/cuda-11.8/lib64/libcufile.so: undefined reference to `__gxx_personality_v0@CXXABI_1.3'
/home/ryuinw123/miniconda3/envs/deep/compiler_compat/ld: /usr/local/cuda-11.8/lib64/libcufile.so: undefined reference to `std::ostream::tellp()@GLIBCXX_3.4'
/home/ryuinw123/miniconda3/envs/deep/compiler_compat/ld: /usr/local/cuda-11.8/lib64/libcufile.so: undefined reference to `std::string::substr(unsigned long, unsigned long) const@GLIBCXX_3.4'
/home/ryuinw123/miniconda3/envs/deep/compiler_compat/ld: /usr/local/cuda-11.8/lib64/libcufile.so: undefined reference to `std::string::_M_replace_aux(unsigned long, unsigned long, 

In [22]:
# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mryuinw123[0m ([33mllm-courseville[0m). Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2566,0.278833,0.891678
2,0.1197,0.237677,0.927553
3,0.0506,0.298223,0.939769


TrainOutput(global_step=101826, training_loss=0.18841038917588107, metrics={'train_runtime': 13372.4741, 'train_samples_per_second': 243.661, 'train_steps_per_second': 7.615, 'total_flos': 0.0, 'train_loss': 0.18841038917588107, 'epoch': 3.0})

Error in callback <bound method _WandbInit._pause_backend of <wandb.sdk.wandb_init._WandbInit object at 0x7ff435e8d2a0>> (for post_run_cell), with arguments args (<ExecutionResult object at 7ff435e8f610, execution_count=22 error_before_exec=None error_in_exec=None info=<ExecutionInfo object at 7ff435e8ffd0, raw_cell="trainer.train()
" store_history=True silent=False shell_futures=True cell_id=bc26f2e8-23fd-4431-a70e-f4a6e04eceb8> result=TrainOutput(global_step=101826, training_loss=0.18841038917588107, metrics={'train_runtime': 13372.4741, 'train_samples_per_second': 243.661, 'train_steps_per_second': 7.615, 'total_flos': 0.0, 'train_loss': 0.18841038917588107, 'epoch': 3.0})>,),kwargs {}:


TypeError: _WandbInit._pause_backend() takes 1 positional argument but 2 were given