In [4]:
# Option A: just install accelerate
!pip install 'accelerate>=0.26.0'

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


You should consider upgrading via the '/home/venkatesh/.pyenv/versions/3.9.7/bin/python3.9 -m pip install --upgrade pip' command.[0m


In [8]:
import sys
!{sys.executable} -m pip install "accelerate>=0.26.0" transformers[torch]


Collecting accelerate>=0.26.0
  Downloading accelerate-1.6.0-py3-none-any.whl.metadata (19 kB)
Downloading accelerate-1.6.0-py3-none-any.whl (354 kB)
Installing collected packages: accelerate
Successfully installed accelerate-1.6.0


In [9]:
import accelerate, transformers
print("accelerate:", accelerate.__version__)
print("transformers[torch] extra is present:", "torch" in transformers.__version__)


accelerate: 1.6.0
transformers[torch] extra is present: False


In [4]:
# Option B: install the transformers[torch] extra (pulls in accelerate & torch support)
!pip install 'transformers[torch]'


You should consider upgrading via the '/home/venkatesh/.pyenv/versions/3.9.7/bin/python3.9 -m pip install --upgrade pip' command.[0m


In [1]:
import math
import numpy as np
import torch
import torch.nn as nn
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForMultipleChoice,
    TrainingArguments,
    Trainer,
    BertConfig,
    get_scheduler,
)
from sklearn.metrics import accuracy_score
from torch.optim import AdamW

# 0) Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 1) Load & subsample
ds       = load_dataset("casehold/casehold")
train_ds = ds["train"].shuffle(seed=42).select(range(10000))
val_ds   = ds["validation"].shuffle(seed=42).select(range(1000))
test_ds  = ds["test"].shuffle(seed=42).select(range(1000))

# 2) Tokenizer & model + tweak dropout
tokenizer = AutoTokenizer.from_pretrained("casehold/legalbert")
config    = BertConfig.from_pretrained(
    "casehold/legalbert",
    hidden_dropout_prob=0.3
)
model     = AutoModelForMultipleChoice.from_pretrained("casehold/legalbert", config=config)

# 3) Freeze + unfreeze top layers
for p in model.bert.parameters():
    p.requires_grad = False
for idx in [6,7,8,9,10,11]:
    for p in model.bert.encoder.layer[idx].parameters():
        p.requires_grad = True
for p in model.classifier.parameters():
    p.requires_grad = True
model.to(device)



  from .autonotebook import tqdm as notebook_tqdm
Some weights of BertForMultipleChoice were not initialized from the model checkpoint at casehold/legalbert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForMultipleChoice(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.3, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, ele

In [2]:
# 4) Build layer-wise optimizer
no_decay = ["bias", "LayerNorm.weight"]
grouped_parameters = [
    {
        "params": [p for n,p in model.named_parameters() if n.startswith("classifier")],
        "lr": 2e-5, "weight_decay": 0.01
    }
]
for layer_idx, lr in zip([11,10,9,8], [1.8e-5,1.6e-5,1.4e-5,1.2e-5]):
    for n,p in model.named_parameters():
        if f"bert.encoder.layer.{layer_idx}" in n:
            grouped_parameters.append({
                "params": [p],
                "lr": lr,
                "weight_decay": 0.0 if any(nd in n for nd in no_decay) else 0.01
            })
optimizer = AdamW(grouped_parameters)

# 5) Preprocess function
def preprocess_and_prune(ex):
    prompt  = ex["citing_prompt"]
    choices = [ex[f"holding_{i}"] for i in range(5)]
    enc = tokenizer(
        [prompt]*5, choices,
        truncation=True, padding="max_length",
        max_length=256,
        return_token_type_ids=False,
    )
    return {
        "input_ids":      enc["input_ids"],
        "attention_mask": enc["attention_mask"],
        "labels":         int(ex["label"]),
    }

train_tok = train_ds.map(preprocess_and_prune, batched=False, remove_columns=train_ds.column_names)
val_tok   = val_ds.map(preprocess_and_prune,   batched=False, remove_columns=val_ds.column_names)
test_tok  = test_ds.map(preprocess_and_prune,  batched=False, remove_columns=test_ds.column_names)

for split in (train_tok, val_tok, test_tok):
    split.set_format(type="torch", device=device)

# 6) Scheduler
training_args = TrainingArguments(
    output_dir="./results_legalbert",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    fp16=True,
    fp16_backend="auto",
    weight_decay=0.01,
    logging_dir="./logs_legalbert",
    report_to="none",
)
steps_per_epoch   = math.ceil(len(train_tok) / training_args.per_device_train_batch_size)
total_train_steps = training_args.num_train_epochs * steps_per_epoch
lr_scheduler = get_scheduler(
    name="cosine",
    optimizer=optimizer,
    num_warmup_steps=int(0.1 * total_train_steps),
    num_training_steps=total_train_steps,
)

# 7) Custom Trainer with Label Smoothing
class LabelSmoothingTrainer(Trainer):
    def __init__(self, label_smoothing=0.1, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.loss_fct = nn.CrossEntropyLoss(label_smoothing=label_smoothing)

    # add **kwargs so num_items_in_batch (and others) won't break
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss = self.loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss



Map: 100%|████████████████████████████████████████████████████████████████| 10000/10000 [00:21<00:00, 465.74 examples/s]


In [3]:
# 8) Metric
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    return {"accuracy": accuracy_score(p.label_ids, preds)}

# 9) Trainer
trainer = LabelSmoothingTrainer(
    label_smoothing=0.1,
    model=model,
    args=training_args,
    train_dataset=train_tok,
    eval_dataset=val_tok,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, lr_scheduler),
)

# 10) Train & evaluate
trainer.train()
train_metrics = trainer.evaluate(eval_dataset=train_tok)
val_metrics   = trainer.evaluate()       # uses val_tok
test_metrics  = trainer.predict(test_tok).metrics

print(f"▶️ Train acc: {train_metrics['eval_accuracy']:.4f}")
print(f"▶️ Val   acc: {val_metrics  ['eval_accuracy']:.4f}")
print(f"▶️ Test  acc: {test_metrics['test_accuracy']:.4f}")


  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy
1,1.5263,1.130871,0.643
2,1.0957,0.995525,0.689
3,1.0242,0.960726,0.711
4,0.989,0.955787,0.716
5,0.9652,0.954604,0.718
6,0.9467,0.957328,0.708
7,0.9445,0.95324,0.713
8,0.9297,0.951967,0.717
9,0.9259,0.95603,0.718
10,0.9179,0.955494,0.718


▶️ Train acc: 0.7726
▶️ Val   acc: 0.7180
▶️ Test  acc: 0.7280
