Running the GLUE baseline tests

In [1]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset, load_metric
import numpy as np
import pandas as pd

# Load the tokenizer and model from the current directory
tokenizer = DistilBertTokenizer.from_pretrained("./")
model = DistilBertForSequenceClassification.from_pretrained("./")

# List of GLUE tasks
tasks = ['cola', 'sst2', 'mrpc', 'stsb', 'qqp', 'mnli', 'qnli', 'rte', 'wnli']

# Define the compute metrics function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    metric = load_metric('glue', task)
    return metric.compute(predictions=predictions, references=labels)

results = []

for task in tasks:
    # Load dataset and metric
    try:
        dataset = load_dataset('glue', task)
        tokenized_datasets = dataset.map(lambda examples: tokenizer(examples['sentence1'], examples['sentence2'] if 'sentence2' in examples else examples['sentence'], padding='max_length', truncation=True), batched=True)
        tokenized_datasets = tokenized_datasets.remove_columns([col for col in tokenized_datasets['train'].column_names if col not in ['label', 'input_ids', 'attention_mask']])
        tokenized_datasets.set_format('torch')

        # Training Arguments
        training_args = TrainingArguments(
            output_dir=f'./results/{task}',
            per_device_eval_batch_size=64,
            do_train=False,
            do_eval=True
        )

        # Initialize Trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            compute_metrics=compute_metrics,
            eval_dataset=tokenized_datasets['validation_matched'] if task == 'mnli' else tokenized_datasets['validation']
        )

        # Evaluate
        eval_result = trainer.evaluate()
        eval_result['task'] = task
        results.append(eval_result)

        # Print out results
        print(f"Results for {task}:")
        for key, value in eval_result.items():
            print(f"{key}: {value}")
    except Exception as e:
        print(f"Failed to process {task}: {str(e)}")

# Save results to a CSV
df = pd.DataFrame(results)
df.to_csv('glue_evaluation_results.csv')
print("glue_evaluation_results.csv")


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at ./ and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/8551 [00:00<?, ? examples/s]

Failed to process cola: 'sentence1'


Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Failed to process sst2: 'sentence1'


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/7 [00:00<?, ?it/s]

  metric = load_metric('glue', task)
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Results for mrpc:
eval_loss: 0.6805465817451477
eval_accuracy: 0.6348039215686274
eval_f1: 0.764612954186414
eval_runtime: 58.8022
eval_samples_per_second: 6.939
eval_steps_per_second: 0.119
task: mrpc


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Failed to process stsb: expected scalar type Long but found Float


Map:   0%|          | 0/363846 [00:00<?, ? examples/s]

Failed to process qqp: 'sentence1'


Map:   0%|          | 0/392702 [00:00<?, ? examples/s]

Failed to process mnli: 'sentence1'


Map:   0%|          | 0/104743 [00:00<?, ? examples/s]

Failed to process qnli: 'sentence1'


  0%|          | 0/5 [00:00<?, ?it/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Results for rte:
eval_loss: 0.6973439455032349
eval_accuracy: 0.47653429602888087
eval_runtime: 39.544
eval_samples_per_second: 7.005
eval_steps_per_second: 0.126
task: rte


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/2 [00:00<?, ?it/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Results for wnli:
eval_loss: 0.7053480744361877
eval_accuracy: 0.352112676056338
eval_runtime: 10.7715
eval_samples_per_second: 6.591
eval_steps_per_second: 0.186
task: wnli
glue_evaluation_results.csv
