In [None]:
# %% [code]
# ENV BOOTSTRAP: protobuf crash guard, optional installs on Kaggle, version-proof TrainingArguments
import os, sys, socket, importlib, subprocess, inspect
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"

def internet_ok(host="pypi.org"):
    try:
        socket.gethostbyname(host)
        return True
    except Exception:
        return False

if internet_ok():
    subprocess.run(
        [
            sys.executable, "-m", "pip", "install", "-q", "-U",
            "protobuf<5", "pyarrow>=14,<20",
            "transformers==4.44.2", "datasets==2.19.0",
            "accelerate==1.0.1", "sentencepiece==0.2.0",
            "scikit-learn==1.5.2"
        ],
        check=False,
    )

def _imp(name):
    try:
        return importlib.import_module(name)
    except Exception as e:
        print(f"import {name} failed: {e}")
        return None

tfm = _imp("transformers")
dsets = _imp("datasets")
torch = _imp("torch")
spm = _imp("sentencepiece")

# Patch TrainingArguments to drop unknown keys on older transformers versions
try:
    from transformers import TrainingArguments as _TA
    sig = inspect.signature(_TA.__init__)
    need_patch = any(k not in sig.parameters for k in [
        "evaluation_strategy", "eval_strategy", "save_strategy",
        "gradient_checkpointing", "save_safetensors", "lr_scheduler_type"
    ])
    if need_patch:
        _Base = _TA
        class _PatchedTrainingArguments(_Base):  # type: ignore
            def __init__(self, *args, **kwargs):
                allowed = set(inspect.signature(_Base.__init__).parameters.keys())
                filtered = {k: v for k, v in kwargs.items() if k in allowed}
                dropped = sorted(set(kwargs) - set(filtered))
                if dropped:
                    print("TrainingArguments dropping unsupported keys:", dropped)
                super().__init__(*args, **filtered)
        import transformers as _m
        _m.TrainingArguments = _PatchedTrainingArguments
        print("TrainingArguments patched for compatibility")
except Exception as e:
    print("TrainingArguments patch skipped:", e)

if tfm: print("transformers:", tfm.__version__)
if dsets: print("datasets:", dsets.__version__)
if torch:
    print("torch:", torch.__version__, "| CUDA available:", torch.cuda.is_available())
    if torch.cuda.is_available():
        print("GPU:", torch.cuda.get_device_name(0))
print("Environment ready.")


In [1]:
# %% [code]
import os, json, random, numpy as np, pandas as pd, torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    DataCollatorWithPadding, TrainingArguments, Trainer, set_seed
)

SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED); set_seed(SEED)

MODEL_NAME = "google/rembert"   # RemBERT uses SentencePiece tokenizer
MAX_LEN    = 160                # accuracy/VRAM tradeoff

CANDIDATE_PATHS = [
    "/kaggle/input/model4-dataset/train_data.json",
]
DATA_PATH = next((p for p in CANDIDATE_PATHS if os.path.exists(p)), None)
assert DATA_PATH, f"Place train_data.json in one of: {CANDIDATE_PATHS}"
print("Data:", DATA_PATH)


2025-11-13 05:47:08.288599: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1763012828.468738     118 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1763012828.518201     118 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Data: /kaggle/input/model4-dataset/train_data.json


In [2]:
# %% [code]
with open(DATA_PATH, "r", encoding="utf-8") as f:
    data = json.load(f)

df = pd.DataFrame(data)
assert {"syllogism", "validity"}.issubset(df.columns)
df = df[["syllogism", "validity"]].dropna().rename(columns={"syllogism": "text", "validity": "label"})
df["label"] = df["label"].astype(int)

print(df.head(3))
print("Label distribution:", df["label"].value_counts(normalize=True).round(3).to_dict())


                                                text  label
0  All cars are a type of vehicle. No animal is a...      0
1  Nothing that is a soda is a juice. A portion o...      1
2  Everything that is a planet is a celestial bod...      0
Label distribution: {0: 0.5, 1: 0.5}


In [3]:
# %% [code]
train_df, val_df = train_test_split(
    df, test_size=0.15, random_state=SEED, stratify=df["label"]
)

ds = DatasetDict({
    "train": Dataset.from_pandas(train_df.reset_index(drop=True)),
    "validation": Dataset.from_pandas(val_df.reset_index(drop=True)),
})

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

def tok(batch):
    return tokenizer(batch["text"], truncation=True, max_length=MAX_LEN)

cols_to_remove = [c for c in ds["train"].column_names if c not in ["text", "label"]]
ds_tok = ds.map(tok, batched=True, remove_columns=cols_to_remove)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

print(ds_tok)


tokenizer_config.json:   0%|          | 0.00/263 [00:00<?, ?B/s]

sentencepiece.model:   0%|          | 0.00/4.70M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/156 [00:00<?, ?B/s]



Map:   0%|          | 0/816 [00:00<?, ? examples/s]

Map:   0%|          | 0/144 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 816
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 144
    })
})


In [5]:
# %% [code]
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    acc = accuracy_score(labels, preds)
    prec, rec, f1, _ = precision_recall_fscore_support(labels, preds, average="weighted", zero_division=0)
    return {"accuracy": acc, "precision": prec, "recall": rec, "f1": f1}


In [6]:
# %% [code]
from collections import Counter

num_labels = 2
counts = Counter(ds["train"]["label"])
freqs = np.array([counts.get(i, 0) for i in range(num_labels)], dtype=np.float32)
inv = 1.0 / np.clip(freqs, 1.0, None)
class_weights = torch.tensor(inv / inv.sum() * num_labels, dtype=torch.float32)

print("Class counts:", dict(counts))
print("Class weights:", class_weights.tolist())


Class counts: {0: 408, 1: 408}
Class weights: [1.0, 1.0]


In [7]:
# %% [code]
# Fine-tuning for accuracy: unfreeze last 3 blocks + pooler + classifier
# Layer-wise learning rates (higher on head), cosine schedule with warmup,
# class-weighted loss, small per-device batch with gradient accumulation.
import gc, inspect
from torch.optim import AdamW
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification
from transformers.trainer_callback import TrainerCallback

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
OUT_DIR = "outputs_rembert"

# Load model
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=num_labels)

# Memory guards
if hasattr(model, "config") and hasattr(model.config, "use_cache"):
    model.config.use_cache = False
try:
    if hasattr(model, "gradient_checkpointing_enable"):
        sig = inspect.signature(model.gradient_checkpointing_enable)
        if "use_reentrant" in sig.parameters:
            model.gradient_checkpointing_enable(use_reentrant=False)
        else:
            model.gradient_checkpointing_enable()
    elif hasattr(model, "enable_input_require_grads"):
        model.enable_input_require_grads()
        if hasattr(model, "config"):
            model.config.gradient_checkpointing = True
except Exception:
    if hasattr(model, "config"):
        model.config.gradient_checkpointing = True

# Unfreeze strategy
N_TRAIN_LAYERS = 3
for p in model.rembert.parameters():
    p.requires_grad = False
for p in model.classifier.parameters():
    p.requires_grad = True
if hasattr(model.rembert, "pooler"):
    for p in model.rembert.pooler.parameters():
        p.requires_grad = True
enc_layers = getattr(model.rembert.encoder, "layer", None)
if enc_layers is not None and N_TRAIN_LAYERS > 0:
    for blk in enc_layers[-N_TRAIN_LAYERS:]:
        for p in blk.parameters():
            p.requires_grad = True

trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f"Trainable params: {trainable:,} / {total:,} ({100*trainable/total:.2f}%)")

# Hyperparameters
EPOCHS = 5
BSZ = 2
ACCUM = 16
LR_HEAD = 4e-5
LR_LAST = 2e-5
WD = 0.01
LOG_STEPS = 50

# TrainingArguments (version-proof eval key)
sig_ta = inspect.signature(TrainingArguments.__init__)
eval_key = "eval_strategy" if "eval_strategy" in sig_ta.parameters else "evaluation_strategy"
ta_args = {
    "output_dir": OUT_DIR,
    "per_device_train_batch_size": BSZ,
    "per_device_eval_batch_size": BSZ,
    "gradient_accumulation_steps": ACCUM,
    "num_train_epochs": EPOCHS,
    "weight_decay": WD,
    eval_key: "epoch",
    "save_strategy": "no",          # disable mid-run checkpoint writes
    "fp16": torch.cuda.is_available(),
    "report_to": "none",
    "logging_steps": LOG_STEPS,
    "gradient_checkpointing": True,
    "save_safetensors": False,      # always use .bin
    "eval_accumulation_steps": 32,
    "dataloader_num_workers": 0,
    "warmup_ratio": 0.1,
    "lr_scheduler_type": "cosine",
}
ta_args = {k: v for k, v in ta_args.items() if k in sig_ta.parameters}
args = TrainingArguments(**ta_args)

# Layer-wise LR param groups
no_decay = ("bias", "LayerNorm.weight", "layer_norm.weight")
param_groups = []

def add_group(named_params, lr):
    decay, nodecay = [], []
    for n, p in named_params:
        if not p.requires_grad:
            continue
        if any(nd in n for nd in no_decay):
            nodecay.append(p)
        else:
            decay.append(p)
    if decay:
        param_groups.append({"params": decay, "weight_decay": WD, "lr": lr})
    if nodecay:
        param_groups.append({"params": nodecay, "weight_decay": 0.0, "lr": lr})

add_group(model.classifier.named_parameters(), LR_HEAD)
if hasattr(model.rembert, "pooler"):
    add_group([(n, p) for n, p in model.rembert.pooler.named_parameters() if p.requires_grad], LR_LAST)
if enc_layers is not None:
    for blk in enc_layers[-N_TRAIN_LAYERS:]:
        add_group(blk.named_parameters(), LR_LAST)

# Weighted loss and custom optimizer
class WeightedTrainer(Trainer):
    def create_optimizer(self):
        if self.optimizer is None:
            self.optimizer = AdamW(param_groups, lr=LR_LAST)
        return self.optimizer

    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")
        outputs = model(**{k: v for k, v in inputs.items() if k != "labels"})
        logits = outputs.get("logits")
        loss_fn = torch.nn.CrossEntropyLoss(weight=class_weights.to(logits.device))
        loss = loss_fn(logits.view(-1, num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

trainer = WeightedTrainer(
    model=model,
    args=args,
    train_dataset=ds_tok["train"],
    eval_dataset=ds_tok["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Early stopping
class EarlyStopper(TrainerCallback):
    def __init__(self, metric="eval_f1", patience=2, greater_is_better=True):
        self.metric = metric
        self.patience = patience
        self.sign = 1.0 if greater_is_better else -1.0
        self.best = -float("inf") if greater_is_better else float("inf")
        self.bad = 0
    def on_evaluate(self, args, state, control, **kw):
        score = kw.get("metrics", {}).get(self.metric)
        if score is None:
            return
        if self.sign * score > self.sign * self.best:
            self.best = score
            self.bad = 0
        else:
            self.bad += 1
            if self.bad >= self.patience:
                print(f"Early stopping on {self.metric}. Best={self.best:.4f}")
                control.should_training_stop = True

trainer.add_callback(EarlyStopper(metric="eval_f1", patience=2, greater_is_better=True))

# Train
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
train_result = trainer.train()
train_result.metrics


config.json:   0%|          | 0.00/686 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.30G [00:00<?, ?B/s]

Some weights of RemBertForSequenceClassification were not initialized from the model checkpoint at google/rembert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Trainable params: 49,151,234 / 575,922,690 (8.53%)


  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
0,No log,0.688197,0.520833,0.655396,0.520833,0.388441
2,0.691600,0.680461,0.611111,0.615477,0.611111,0.6074
4,0.681300,0.679754,0.618056,0.667076,0.618056,0.587822


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


Early stopping on eval_f1. Best=0.6074


{'train_runtime': 109.5278,
 'train_samples_per_second': 37.251,
 'train_steps_per_second': 1.141,
 'total_flos': 442219409530752.0,
 'train_loss': 0.6843116455078125,
 'epoch': 4.901960784313726}

In [8]:
# %% [code]
import gc, torch
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()


In [9]:
# %% [code]
# 7) TRAIN — accuracy-focused fine-tuning (LLRD + class weights + early stop)
import os, gc, inspect
import torch
from torch.optim import AdamW
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers.trainer_callback import TrainerCallback

# ---- Memory/IO guards
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
OUT_DIR   = "outputs_rembert"
BEST_DIR  = "best_ckpt_rembert"  # we'll save at end only

# ---- Load model
num_labels = 2
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=num_labels)

# Disable KV cache during train
if hasattr(model, "config") and hasattr(model.config, "use_cache"):
    model.config.use_cache = False

# Gradient checkpointing (version-proof)
try:
    if hasattr(model, "gradient_checkpointing_enable"):
        sig = inspect.signature(model.gradient_checkpointing_enable)
        if "use_reentrant" in sig.parameters:
            model.gradient_checkpointing_enable(use_reentrant=False)
        else:
            model.gradient_checkpointing_enable()
    elif hasattr(model, "enable_input_require_grads"):
        model.enable_input_require_grads()
        if hasattr(model, "config"):
            model.config.gradient_checkpointing = True
except Exception:
    if hasattr(model, "config"):
        model.config.gradient_checkpointing = True

# ---- Unfreeze strategy: last 3 encoder blocks + pooler + classifier
FREEZE_ALL_BUT_LAST = True
N_TRAIN_LAYERS = 3  # try 2 if VRAM tight; 3 usually fits with our tiny batch/len
if FREEZE_ALL_BUT_LAST:
    # Freeze all
    for p in model.rembert.parameters():
        p.requires_grad = False
    # Unfreeze classifier
    for p in model.classifier.parameters():
        p.requires_grad = True
    # Unfreeze pooler if present
    if hasattr(model.rembert, "pooler"):
        for p in model.rembert.pooler.parameters():
            p.requires_grad = True
    # Unfreeze last N layers
    enc_layers = getattr(model.rembert.encoder, "layer", None)
    if enc_layers is not None and N_TRAIN_LAYERS > 0:
        for blk in enc_layers[-N_TRAIN_LAYERS:]:
            for p in blk.parameters():
                p.requires_grad = True

    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total     = sum(p.numel() for p in model.parameters())
    print(f"Trainable params: {trainable:,} / {total:,} ({100*trainable/total:.2f}%)")

# ---- Hyperparams (very VRAM-friendly)
EPOCHS    = 5           # allow more epochs; early stopping prevents overfit
BSZ       = 1           # tiny per-device batch
ACCUM     = 24          # effective batch ~24
LR_HEAD   = 3e-5        # higher LR for the head
LR_LAST   = 2e-5        # last encoder blocks/pooler
WD        = 0.01
LOG_STEPS = 50

# Prefer new eval key if supported
TA = TrainingArguments
sig_ta   = inspect.signature(TA.__init__)
eval_key = "eval_strategy" if "eval_strategy" in sig_ta.parameters else "evaluation_strategy"

desired_args = {
    "output_dir": OUT_DIR,
    "learning_rate": LR_LAST,           # base LR (we override with param groups below)
    "per_device_train_batch_size": BSZ,
    "per_device_eval_batch_size": BSZ,
    "gradient_accumulation_steps": ACCUM,
    "num_train_epochs": EPOCHS,
    "weight_decay": WD,
    eval_key: "epoch",
    "save_strategy": "no",              # no mid-run checkpoint writes
    "fp16": torch.cuda.is_available(),
    "report_to": "none",
    "logging_steps": LOG_STEPS,
    "gradient_checkpointing": True,
    "save_safetensors": False,          # avoid safetensors issue
    "eval_accumulation_steps": 32,
    "dataloader_num_workers": 0,
    "warmup_ratio": 0.1,                # warmup 10%
    "lr_scheduler_type": "cosine",      # cosine decay
}

supported_args = {k: v for k, v in desired_args.items() if k in sig_ta.parameters}
dropped = sorted(set(desired_args) - set(supported_args))
if dropped:
    print(" Dropping unsupported TrainingArguments keys:", dropped)

args = TA(**supported_args)

# ---- Build Layer-wise LR param groups (LLRD)
no_decay = ("bias", "LayerNorm.weight", "layer_norm.weight")
param_groups = []

def add_group(named_params, lr):
    decay, nodecay = [], []
    for n, p in named_params:
        if not p.requires_grad:
            continue
        (nodecay if any(nd in n for nd in no_decay) else decay).append(p)
    if decay:
        param_groups.append({"params": decay, "weight_decay": WD, "lr": lr})
    if nodecay:
        param_groups.append({"params": nodecay, "weight_decay": 0.0, "lr": lr})

# Head
add_group(model.classifier.named_parameters(), LR_HEAD)
# Pooler (if trainable)
if hasattr(model.rembert, "pooler"):
    trainable_pool = [(n, p) for n,p in model.rembert.pooler.named_parameters() if p.requires_grad]
    if trainable_pool:
        add_group(trainable_pool, LR_LAST)
# Last N blocks
if hasattr(model.rembert, "encoder") and hasattr(model.rembert.encoder, "layer"):
    layers = list(model.rembert.encoder.layer)
    for i, blk in enumerate(layers[-N_TRAIN_LAYERS:], 1):
        add_group(blk.named_parameters(), LR_LAST)

# ---- Custom Trainer with class-weighted loss + our optimizer
class WeightedTrainer(Trainer):
    def create_optimizer(self):
        if self.optimizer is None:
            self.optimizer = AdamW(param_groups, lr=LR_LAST)  # lr overridden by groups
        return self.optimizer

    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")
        outputs = model(**{k:v for k,v in inputs.items() if k != "labels"})
        logits = outputs.get("logits")
        # class_weights from cell 6a
        loss_fn = torch.nn.CrossEntropyLoss(weight=class_weights.to(logits.device))
        loss = loss_fn(logits.view(-1, num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

trainer = WeightedTrainer(
    model=model,
    args=args,
    train_dataset=ds_tok["train"],
    eval_dataset=ds_tok["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# ---- Early stopping (patience=2) — compatible fallback
class EarlyStopper(TrainerCallback):
    def __init__(self, metric="eval_f1", patience=2, greater_is_better=True):
        self.metric = metric
        self.patience = patience
        self.sign = 1.0 if greater_is_better else -1.0
        self.best = -float("inf") if greater_is_better else float("inf")
        self.bad_epochs = 0

    def on_evaluate(self, args, state, control, **kwargs):
        metrics = kwargs.get("metrics", {})
        score = metrics.get(self.metric)
        if score is None:
            return
        if self.sign * score > self.sign * self.best:
            self.best = score
            self.bad_epochs = 0
        else:
            self.bad_epochs += 1
            if self.bad_epochs >= self.patience:
                print(f" Early stopping on {self.metric}. Best={self.best:.4f}")
                control.should_training_stop = True

trainer.add_callback(EarlyStopper(metric="eval_f1", patience=2, greater_is_better=True))

# ---- Free cache, then train
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

train_result = trainer.train()
train_result.metrics


Some weights of RemBertForSequenceClassification were not initialized from the model checkpoint at google/rembert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Trainable params: 49,151,234 / 575,922,690 (8.53%)


  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.686428,0.541667,0.661194,0.541667,0.437367
2,0.689700,0.680303,0.555556,0.561277,0.555556,0.544934
3,0.682600,0.678272,0.631944,0.672771,0.631944,0.608836
4,0.682600,0.676272,0.611111,0.620401,0.611111,0.603462
5,0.675000,0.675601,0.590278,0.590295,0.590278,0.590258


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


 Early stopping on eval_f1. Best=0.6088


{'train_runtime': 217.6365,
 'train_samples_per_second': 18.747,
 'train_steps_per_second': 0.781,
 'total_flos': 414304370709840.0,
 'train_loss': 0.6814823936013614,
 'epoch': 5.0}

In [12]:
# %% [code]
from pprint import pprint

# Collect only evaluation logs (those that have eval_accuracy)
eval_logs = [
    e for e in trainer.state.log_history
    if isinstance(e, dict) and "eval_accuracy" in e
]

if eval_logs:
    # Find the entry with the highest eval_accuracy
    best = max(eval_logs, key=lambda x: x["eval_accuracy"])

    print("Best eval metrics across all epochs (by eval_accuracy):")
    pprint(best)

    print("\nSummary (best epoch):")
    print(f"Epoch    : {best.get('epoch', 'N/A')}")
    print(f"Accuracy : {best.get('eval_accuracy', 0):.4f}")
    print(f"F1       : {best.get('eval_f1', 0):.4f}")
    print(f"Precision: {best.get('eval_precision', 0):.4f}")
    print(f"Recall   : {best.get('eval_recall', 0):.4f}")
else:
    print("No eval logs with 'eval_accuracy' found. Running a fresh evaluation...\n")
    metrics = trainer.evaluate()
    pprint(metrics)

    print("\nSummary (fresh eval):")
    print(f"Accuracy : {metrics.get('eval_accuracy', 0):.4f}")
    print(f"F1       : {metrics.get('eval_f1', 0):.4f}")
    print(f"Precision: {metrics.get('eval_precision', 0):.4f}")
    print(f"Recall   : {metrics.get('eval_recall', 0):.4f}")


Best eval metrics across all epochs (by eval_accuracy):
{'epoch': 3.0,
 'eval_accuracy': 0.6319444444444444,
 'eval_f1': 0.6088360412075241,
 'eval_loss': 0.6782717108726501,
 'eval_precision': 0.6727709017428644,
 'eval_recall': 0.6319444444444444,
 'eval_runtime': 4.9851,
 'eval_samples_per_second': 28.886,
 'eval_steps_per_second': 28.886,
 'step': 102}

Summary (best epoch):
Epoch    : 3.0
Accuracy : 0.6319
F1       : 0.6088
Precision: 0.6728
Recall   : 0.6319


In [11]:
# %% [code]
import os, torch
SAVE_DIR = "model_validity_rembert_ft"
os.makedirs(SAVE_DIR, exist_ok=True)
m = trainer.model

try:
    m.save_pretrained(SAVE_DIR, safe_serialization=False, max_shard_size="500MB")
except TypeError:
    torch.save(m.state_dict(), os.path.join(SAVE_DIR, "pytorch_model.bin"))

tokenizer.save_pretrained(SAVE_DIR)
print("Saved to", SAVE_DIR)


Saved to model_validity_rembert_ft
