# Finetune answerdotai/ModernBERT-base with chunked_opinion:relevant:irrelevant query triples

1. The opinions were pre-chunked to at most 8192 tokens
2. The queries were generated using gpt-4o
3. The finetuning is run in Google Colab with a T4 GPU

# Import Libraries

In [1]:
import pandas as pd

#%pip install datasets -q
from datasets import Dataset

#%pip install flash-attn
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
    SentenceTransformerModelCardData,
)
from sentence_transformers.losses import MultipleNegativesRankingLoss
from sentence_transformers.training_args import BatchSamplers
from sentence_transformers.evaluation import TripletEvaluator

In [2]:
import torch

print(torch.zeros(1).cuda())
print(torch.cuda.is_available())
print(torch.cuda.current_device())

torch.set_float32_matmul_precision('high')

tensor([0.], device='cuda:0')
True
0


# Load the data

In [3]:
path = ""

In [4]:
train = pd.read_csv(f"{path}outputs/5.finetune_train_8192.csv")
val = pd.read_csv(f"{path}outputs/5.finetune_val_8192.csv")

# Take a look at the token count distribution

In [5]:
train["chunk_size"].describe()

count     351.000000
mean     3019.421652
std      2468.817057
min        68.000000
25%       936.000000
50%      2303.000000
75%      4599.500000
max      7800.000000
Name: chunk_size, dtype: float64

In [6]:
val["chunk_size"].describe()

count      95.000000
mean     1809.242105
std      1663.952822
min        79.000000
25%       578.500000
50%      1405.000000
75%      2454.500000
max      7772.000000
Name: chunk_size, dtype: float64

# Generate dataset objects

In [7]:
cols = ["chunked_opinion", "relevant", "irrelevant"]
train = train[cols]
val = val[cols]

In [8]:
renames = {"chunked_opinion":"anchor", "relevant":"positive", "irrelevant":"negative"}
train = train.rename(columns=renames)
val = val.rename(columns=renames)

In [9]:
train_dataset = Dataset.from_pandas(train, split="train")
eval_dataset = Dataset.from_pandas(val, split="test")

In [10]:
train_dataset

Dataset({
    features: ['anchor', 'positive', 'negative'],
    num_rows: 351
})

In [11]:
eval_dataset

Dataset({
    features: ['anchor', 'positive', 'negative'],
    num_rows: 95
})

# Train the model

In [12]:
BASE_MODEL = "answerdotai/ModernBERT-base"
BASE_MODEL_NAME = BASE_MODEL.split("/")[-1]

In [13]:
# Load the model
model = SentenceTransformer(
    BASE_MODEL,
    model_card_data=SentenceTransformerModelCardData(
        language="en",
        license="apache-2.0",
        model_name=f"{BASE_MODEL_NAME} trained on triplets",
    )
)

# Define loss function
loss = MultipleNegativesRankingLoss(model)

No sentence-transformers model found with name answerdotai/ModernBERT-base. Creating a new one with mean pooling.


In [14]:
model.to('cuda')

SentenceTransformer(
  (0): Transformer({'max_seq_length': 8192, 'do_lower_case': False}) with Transformer model: ModernBertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)

In [15]:
# Specify training arguments
args = SentenceTransformerTrainingArguments(
    output_dir=f"{path}models/{BASE_MODEL_NAME}",
    num_train_epochs=2,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    fp16=True,
    bf16=False,
    #gradient_accumulation_steps=4,
    #gradient_checkpointing=True,
    batch_sampler=BatchSamplers.NO_DUPLICATES,  # MultipleNegativesRankingLoss benefits from no duplicate samples in a batch
    # Optional tracking/debugging parameters:
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=2,
    report_to="none"

)

In [16]:
# Evaluate the base model before finetuning
dev_evaluator = TripletEvaluator(
    anchors=eval_dataset["anchor"],
    positives=eval_dataset["positive"],
    negatives=eval_dataset["negative"],
    name="dev",
)
dev_evaluator(model)

{'dev_cosine_accuracy': 0.9263157844543457}

In [17]:
# Create a trainer & train
trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    loss=loss,
    evaluator=dev_evaluator,

)
trainer.train()

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss,Validation Loss,Dev Cosine Accuracy
100,No log,0.149814,1.0
200,No log,0.124975,1.0
300,No log,0.138629,1.0


TrainOutput(global_step=352, training_loss=0.1721894307570024, metrics={'train_runtime': 357.1367, 'train_samples_per_second': 1.966, 'train_steps_per_second': 0.986, 'total_flos': 0.0, 'train_loss': 0.1721894307570024, 'epoch': 2.0})

In [18]:
# Evaluate the finetuned model
dev_evaluator = TripletEvaluator(
    anchors=eval_dataset["anchor"],
    positives=eval_dataset["positive"],
    negatives=eval_dataset["negative"],
    name="dev",
)
dev_evaluator(model)

{'dev_cosine_accuracy': 1.0}

In [19]:
# Save the trained model
model.save_pretrained(f"{path}models/{BASE_MODEL_NAME}/final")

In [21]:
# Push the trained model to hub
model.push_to_hub(f"rachelFLP/{BASE_MODEL_NAME}_finetune_8192", private=True)

model.safetensors:   0%|          | 0.00/596M [00:00<?, ?B/s]

'https://huggingface.co/rachelFLP/ModernBERT-base_finetune_8192/commit/efe3f37ad1f9986a54713094a4023d715b811cc5'