Running the SuperGLUE Baseline tests

In [1]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset, load_metric
import numpy as np
import pandas as pd

# Load the tokenizer and model from the current directory
tokenizer = DistilBertTokenizer.from_pretrained("./")
model = DistilBertForSequenceClassification.from_pretrained("./")

# List of SuperGLUE tasks
tasks = ['boolq', 'cb', 'copa', 'multirc', 'record', 'rte', 'wic', 'wsc']

# Define the compute metrics function for SuperGLUE
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    metric = load_metric('super_glue', task)
    return metric.compute(predictions=predictions, references=labels)

results = []

for task in tasks:
    # Load dataset and metric
    try:
        dataset = load_dataset('super_glue', task)
        # Handling different dataset structures in SuperGLUE
        if task in ['record', 'multirc']:
            # Special handling for tasks that require different processing
            continue  # For simplicity, skipping complex tasks like ReCoRD and MultiRC here
        else:
            # Prepare data similarly to GLUE
            tokenized_datasets = dataset.map(lambda examples: tokenizer(examples['question'], examples['passage'] if 'passage' in examples else examples['sentence'], padding="max_length", truncation=True), batched=True)
            tokenized_datasets = tokenized_datasets.remove_columns([col for col in tokenized_datasets['train'].column_names if col not in ['label', 'input_ids', 'attention_mask']])
            tokenized_datasets.set_format('torch')

        # Training Arguments
        training_args = TrainingArguments(
            output_dir=f'./results/{task}',
            per_device_eval_batch_size=64,
            do_train=False,
            do_eval=True
        )

        # Initialize Trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            compute_metrics=compute_metrics,
            eval_dataset=tokenized_datasets['validation']
        )

        # Evaluate
        eval_result = trainer.evaluate()
        eval_result['task'] = task
        results.append(eval_result)

        # Print out results
        print(f"Results for {task}:")
        for key, value in eval_result.items():
            print(f"{key}: {value}")
    except Exception as e:
        print(f"Failed to process {task}: {str(e)}")

# Save results to a CSV
df = pd.DataFrame(results)
df.to_csv('super_glue_evaluation_results.csv')
print("super_glue_evaluation_results.csv")


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at ./ and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/52 [00:00<?, ?it/s]

  metric = load_metric('super_glue', task)
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Results for boolq:
eval_loss: 0.6994401216506958
eval_accuracy: 0.43761467889908257
eval_runtime: 447.9487
eval_samples_per_second: 7.3
eval_steps_per_second: 0.116
task: boolq


Map:   0%|          | 0/250 [00:00<?, ? examples/s]

Failed to process cb: 'question'


Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Failed to process copa: 'sentence'


Map:   0%|          | 0/2490 [00:00<?, ? examples/s]

Failed to process rte: 'question'


Map:   0%|          | 0/5428 [00:00<?, ? examples/s]

Failed to process wic: 'question'


Map:   0%|          | 0/554 [00:00<?, ? examples/s]

Failed to process wsc: 'question'
super_glue_evaluation_results.csv
