In [1]:
!pip install transformers datasets evaluate --quiet

In [15]:
import os
import wandb
import numpy as np
import pandas as pd
from sklearn import preprocessing

path = "data"
raw_data_sample_path = os.path.join(path, 'raw_data_sample.json')
sample_data = pd.read_json(raw_data_sample_path)

le = preprocessing.LabelEncoder()

# Fit the encoder to the data
sample_data["label"] = le.fit_transform(sample_data["answer"])

print(sample_data.shape)
print(f"Classes {len(le.classes_)}")
sample_data.head()

(9777, 8)
Classes 307


Unnamed: 0,index,raw_dataset_id,answer,inputs,prompt_template,prompt,task,label
0,0,51,suggestive,"{'index': '51', 'text': 'The mark ""Equine Tech...",A mark is generic if it is the common name for...,A mark is generic if it is the common name for...,abercrombie,302
1,1,92,fanciful,"{'index': '92', 'text': 'The mark ""Lanbe"" for ...",A mark is generic if it is the common name for...,A mark is generic if it is the common name for...,abercrombie,278
2,2,14,generic,"{'index': '14', 'text': 'The mark ""Cutlery"" fo...",A mark is generic if it is the common name for...,A mark is generic if it is the common name for...,abercrombie,281
3,3,71,arbitrary,"{'index': '71', 'text': 'The mark ""Shark"" for ...",A mark is generic if it is the common name for...,A mark is generic if it is the common name for...,abercrombie,260
4,4,60,arbitrary,"{'index': '60', 'text': 'The mark ""Sun"" for co...",A mark is generic if it is the common name for...,A mark is generic if it is the common name for...,abercrombie,260


In [16]:
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer

dataset = Dataset.from_pandas(sample_data)
dataset = dataset.train_test_split(test_size=0.2)
print(dataset)

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def preprocess_function(examples):
    return tokenizer(examples["prompt"], truncation=True)

tokenized_data = dataset.map(preprocess_function, batched=True)
tokenized_data

DatasetDict({
    train: Dataset({
        features: ['index', 'raw_dataset_id', 'answer', 'inputs', 'prompt_template', 'prompt', 'task', 'label', '__index_level_0__'],
        num_rows: 7821
    })
    test: Dataset({
        features: ['index', 'raw_dataset_id', 'answer', 'inputs', 'prompt_template', 'prompt', 'task', 'label', '__index_level_0__'],
        num_rows: 1956
    })
})


  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['index', 'raw_dataset_id', 'answer', 'inputs', 'prompt_template', 'prompt', 'task', 'label', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 7821
    })
    test: Dataset({
        features: ['index', 'raw_dataset_id', 'answer', 'inputs', 'prompt_template', 'prompt', 'task', 'label', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1956
    })
})

In [20]:
import evaluate
import numpy as np
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

accuracy = evaluate.load("accuracy")
label2id = dict(zip(le.classes_, range(len(le.classes_))))
id2label = dict(zip(range(len(le.classes_)), le.classes_))

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)


model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels=len(le.classes_), id2label=id2label, label2id=label2id
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
wandb.init(mode="disabled")

training_args = TrainingArguments(
    output_dir="legalbench_answer_classification",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.427268,0.508691
2,No log,1.272211,0.509714




TrainOutput(global_step=490, training_loss=1.8738884127869897, metrics={'train_runtime': 906.7752, 'train_samples_per_second': 17.25, 'train_steps_per_second': 0.54, 'total_flos': 4126853536819200.0, 'train_loss': 1.8738884127869897, 'epoch': 2.0})

In [27]:
trainer.evaluate(tokenized_data["train"])



{'eval_loss': 1.2489110231399536,
 'eval_accuracy': 0.5215445595192431,
 'eval_runtime': 155.0611,
 'eval_samples_per_second': 50.438,
 'eval_steps_per_second': 1.58,
 'epoch': 2.0}

In [28]:
trainer.evaluate(tokenized_data["test"])



{'eval_loss': 1.2722113132476807,
 'eval_accuracy': 0.5097137014314929,
 'eval_runtime': 39.5496,
 'eval_samples_per_second': 49.457,
 'eval_steps_per_second': 1.568,
 'epoch': 2.0}

In [None]:
from huggingface_hub import notebook_login

notebook_login()

In [25]:
trainer.push_to_hub()

events.out.tfevents.1708464029.65e4bd685465.34.1:   0%|          | 0.00/31.1k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/439M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.73k [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/prithviraj-maurya/legalbench_answer_classification/commit/d40d04b964e53a808bc58ec201c57d23fc1b970c', commit_message='End of training', commit_description='', oid='d40d04b964e53a808bc58ec201c57d23fc1b970c', pr_url=None, pr_revision=None, pr_num=None)