<a href="https://colab.research.google.com/github/salmanabbasi36/legal_agent/blob/main/lexglue_ledgar_distilbert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Colab cell 1: runtime = GPU
!pip install -q transformers datasets accelerate evaluate[sentencepiece] scikit-learn huggingface_hub

In [None]:
# Attach Google Drive to the Colab VM; this is interactive and asks for authorization.
from google.colab import drive

drive.mount(mountpoint='/content/drive')  # follow prompts

In [None]:
# Authenticate this kernel with the Hugging Face Hub (interactive token prompt).
# The credentials are needed by the later `push_to_hub` calls.
from huggingface_hub import login
login()

In [None]:
# --- Data: Hub dataset id and the configuration (task) to load ---
DATASET = "coastalcph/lex_glue"
DATA_CONFIG = "ledgar"

# --- Model: pretrained checkpoint and tokenizer truncation limit (in tokens) ---
MODEL_NAME = "distilbert-base-uncased"
MAX_LENGTH = 512

# --- Optimisation hyper-parameters consumed by TrainingArguments ---
LEARNING_RATE = 2e-5
NUM_EPOCHS = 3
BATCH_SIZE = 8  # used for both the train and eval per-device batch size

# --- Output: directory for checkpoints and the final saved model ---
OUTPUT_DIR = '/content/lexglue-ledgar-distilbert'

In [None]:
from datasets import load_dataset

# Fetch every split of the configured dataset, then show the split summary
# followed by the first raw training record.
ds = load_dataset(path=DATASET, name=DATA_CONFIG)
print(ds)

first_train_example = ds['train'][0]
print(first_train_example)

In [None]:
from transformers import AutoTokenizer
import numpy as np

# Inspect features to find text field & label field
features = ds['train'].features
print("Features keys:", features)

# heuristics for text column: prefer a conventional column name when present
possible_text_cols = ['text', 'sentence', 'prompt', 'excerpt', 'passage']
text_col = None
for col in possible_text_cols:
    if col in ds['train'].column_names:
        text_col = col
        break
# fallback: choose the first *string-typed* column. Testing only that the feature
# is a `Value` would also accept integer/float columns (ids, scores, ...).
if text_col is None:
    for k, v in features.items():
        if k != 'label' and getattr(v, 'dtype', None) in ('string', 'large_string'):
            text_col = k
            break
# Fail here with a clear message instead of a KeyError(None) inside preprocess.
if text_col is None:
    raise ValueError(f"Could not find a text column among {list(features)}")

print("Using text column:", text_col)

# label column detection
label_col = None
for c in ['label', 'labels', 'labels_ids', 'gold_label']:
    if c in ds['train'].column_names:
        label_col = c
        break
# fallback: pick first int/class column
if label_col is None:
    for k, v in features.items():
        if "ClassLabel" in str(v) or "Sequence" in str(v):
            label_col = k
            break
if label_col is None:
    raise ValueError(f"Could not find a label column among {list(features)}")
print("Using label column:", label_col)

# prepare labels
label_feature = features[label_col]
# A list-of-ClassLabel column also has "ClassLabel" in its repr, so a substring
# test would send multi-label columns to the single-label branch, where `.names`
# does not exist. Distinguish the cases by the attributes the features expose:
# a class-label feature carries `.names`, a list feature carries `.feature`.
inner_feature = getattr(label_feature, "feature", None)
if hasattr(label_feature, "names"):
    # one class id per example
    id2label = label_feature.names
    label2id = {l: i for i, l in enumerate(id2label)}
    num_labels = len(id2label)
    multi_label = False
elif hasattr(inner_feature, "names"):
    # list of class ids per example, with a declared label space
    id2label = inner_feature.names
    label2id = {l: i for i, l in enumerate(id2label)}
    num_labels = len(id2label)
    multi_label = True
else:
    # Could be multi-hot or list -> handle as multi-label
    # Label space is unknown here; preprocess infers its size from the data
    id2label = None
    label2id = None
    num_labels = None
    multi_label = True

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

def preprocess(examples):
    """Tokenize a batch of examples and attach a `labels` field.

    Intended for `Dataset.map(..., batched=True)`: `examples` maps column names
    to lists. Reads the notebook globals `tokenizer`, `text_col`, `label_col`,
    `multi_label`, `id2label`, `MAX_LENGTH` and `ds`, and may set `num_labels`.

    Returns the tokenizer encoding with an added `labels` entry:
      * single-label: the class ids, unchanged;
      * multi-label: one multi-hot float vector of length `num_labels` per example.
    """
    global num_labels

    # Truncate only. Padding is left to the batch collator (the Trainer pads
    # dynamically when it is given the tokenizer), which avoids storing and
    # processing MAX_LENGTH tokens for every short clause.
    enc = tokenizer(examples[text_col], truncation=True, max_length=MAX_LENGTH)
    labs = examples[label_col]

    if not multi_label and id2label is not None:
        enc["labels"] = list(labs)
        return enc

    # Multi-label: labels are lists of class ids.
    if num_labels is None:
        # Label space was not declared: estimate it as max label id + 1, looking
        # at every split so ids that only occur outside `train` are not dropped.
        maxid = 0
        for split in ds.values():
            for l in split[label_col]:
                if isinstance(l, list) and l:
                    maxid = max(maxid, max(l))
        num_labels = maxid + 1

    # Float targets: the multi-label loss (BCEWithLogitsLoss) rejects integer labels.
    enc["labels"] = [
        [1.0 if i in active else 0.0 for i in range(num_labels)]
        for active in (set(lab) for lab in labs)
    ]
    return enc

# Run preprocess over every split in batches, dropping the raw columns so only
# the encoded fields remain, then shuffle each split with a fixed seed.
raw_columns = ds['train'].column_names
tokenized = ds.map(preprocess, batched=True, remove_columns=raw_columns).shuffle(seed=42)
tokenized


In [None]:
from transformers import AutoModelForSequenceClassification
import torch

# Load the pretrained encoder with a classification head sized to the label space.
head_kwargs = {"num_labels": num_labels}
if multi_label:
    # For multi-label classification, set problem_type so Trainer uses BCEWithLogitsLoss
    head_kwargs["problem_type"] = "multi_label_classification"
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, **head_kwargs)

# Place the model on the GPU when one is visible, otherwise stay on the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
model.to(device)


In [None]:
import evaluate
import numpy as np
from sklearn.metrics import f1_score, accuracy_score

# NOTE(review): `metric_acc` is loaded here, but compute_metrics below computes
# accuracy with sklearn's accuracy_score and never references this object.
metric_acc = evaluate.load("accuracy")

def compute_metrics(pred):
    """Turn the Trainer's evaluation output into a dict of scores.

    `pred` unpacks into `(logits, labels)`. Which metrics are returned depends
    on the notebook-level `multi_label` flag:
      * single-label: `accuracy` and `f1_macro` over arg-max predictions;
      * multi-label: `f1_micro` and `f1_macro` over labels whose logit is > 0.
    """
    scores, gold = pred

    if not multi_label:
        hard = np.argmax(scores, axis=-1)
        return {
            "accuracy": accuracy_score(gold, hard),
            "f1_macro": f1_score(gold, hard, average="macro", zero_division=0),
        }

    # A logit above zero corresponds to a sigmoid probability above 0.5.
    hard = (scores > 0).astype(int)
    micro = f1_score(gold, hard, average="micro", zero_division=0)
    macro = f1_score(gold, hard, average="macro", zero_division=0)
    return {"f1_micro": micro, "f1_macro": macro}


In [None]:
import inspect

import torch
from transformers import TrainingArguments, Trainer

# Evaluate and checkpoint once per epoch; at the end reload the checkpoint with
# the best macro-F1 (single-label) or micro-F1 (multi-label).
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=NUM_EPOCHS,
    weight_decay=0.01,
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro" if not multi_label else "f1_micro",
    # Mixed precision needs a GPU; requesting it unconditionally makes
    # TrainingArguments raise on a CPU-only runtime.
    fp16=torch.cuda.is_available(),
)

# Newer transformers releases take the tokenizer as `processing_class` and
# deprecate the `tokenizer` keyword; pick whichever the installed Trainer accepts.
tokenizer_kwarg = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    # Fall back to the test split when the dataset has no validation split.
    eval_dataset=tokenized["validation"] if "validation" in tokenized else tokenized["test"],
    compute_metrics=compute_metrics,
    **{tokenizer_kwarg: tokenizer},
)


In [None]:
# Run fine-tuning. With the arguments configured above, the Trainer evaluates and
# saves a checkpoint every epoch and reloads the best checkpoint when training ends.
trainer.train()


In [None]:
# Score the model currently held by the Trainer on its evaluation split.
metrics = trainer.evaluate()
print(metrics)

# Write the model and the tokenizer files into OUTPUT_DIR.
save_dir = OUTPUT_DIR
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)
print("Saved to", save_dir)

# Optional: upload through the Trainer instead
# trainer.push_to_hub("my-lexglue-ledgar-distilbert")


In [None]:
# Second Hub login, placed right before the upload cell. `login()` was already
# called near the top of the notebook, so re-running this is only needed if that
# earlier authentication is no longer valid for pushing.
from huggingface_hub import login
login()   # it will ask: "Enter your token"


In [None]:
from huggingface_hub import HfApi
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Target repository on the Hugging Face Hub.
model_name = "SalmanAbbasi/lexglue-ledgar-distilbert"

# Attach the human-readable label names before uploading. The model was created
# without them, so the uploaded config would otherwise only carry the generic
# LABEL_<n> names.
if id2label is not None:
    model.config.id2label = dict(enumerate(id2label))
    model.config.label2id = {name: idx for idx, name in enumerate(id2label)}

model.push_to_hub(model_name)
tokenizer.push_to_hub(model_name)


In [None]:
from transformers import pipeline

# Load the fine-tuned checkpoint from the directory it was saved to. Using
# OUTPUT_DIR (instead of repeating the path) keeps this cell in sync with the
# config cell.
pipe = pipeline("text-classification", model=OUTPUT_DIR, tokenizer=OUTPUT_DIR)

text = "This clause sets out the obligations of each party regarding confidentiality."
print(pipe(text))


In [None]:
from datasets import load_dataset

# Reload the dataset (same id/config as the config cell) to read the class names.
ds = load_dataset(DATASET, DATA_CONFIG)
label_names = ds['train'].features['label'].names
print(label_names[:10])  # show first few labels

# Now decode your prediction
result = pipe(text)[0]
predicted = result['label']
if predicted in label_names:
    # The saved config already carries real names (e.g. after the config-update
    # cell has run and the pipeline was rebuilt): nothing to parse.
    label_id = label_names.index(predicted)
else:
    # Generic "LABEL_<n>" name: the class id is the trailing integer.
    label_id = int(predicted.split('_')[-1])
print(f"Predicted label: {label_names[label_id]} ({result['score']:.2%} confidence)")


In [None]:
# Update the saved config with id2label and label2id so the checkpoint in
# OUTPUT_DIR reports class names instead of LABEL_<n>.
# NOTE(review): `pipe` was built before this rewrite, so it presumably needs to
# be re-created to pick up the names.
from transformers import AutoConfig
config = AutoConfig.from_pretrained(OUTPUT_DIR)
config.id2label = dict(enumerate(label_names))
config.label2id = {name: i for i, name in enumerate(label_names)}
config.save_pretrained(OUTPUT_DIR)
