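This notebook shows how to perform logit-ensembled predictions: the logits of several fine-tuned checkpoints are summed, and the argmax of the sum is taken as the predicted label.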

## Initial setup

In [1]:
# Use the %pip magic (not !pip) so the packages are installed into the
# environment of the running kernel rather than whichever pip is first on PATH.
%pip install datasets transformers -q

You should consider upgrading via the '/home/ec2-user/anaconda3/envs/pytorch_p38/bin/python -m pip install --upgrade pip' command.

In [3]:
import torch

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

## Dataset loading

In [4]:
from datasets import load_dataset

# Fetch every available split of the "emotion" dataset (downloaded on first use, then cached).
emotions = load_dataset(path="emotion")

Downloading builder script:   0%|          | 0.00/3.62k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/3.28k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/8.20k [00:00<?, ?B/s]

Downloading and preparing dataset emotion/default (download: 1.97 MiB, generated: 2.07 MiB, post-processed: Unknown size, total: 4.05 MiB) to /home/ec2-user/.cache/huggingface/datasets/emotion/default/0.0.0/348f63ca8e27b3713b6c04d723efe6d824a56fb3d1449794716c0f0296072705...


Downloading data:   0%|          | 0.00/1.66M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/204k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/207k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/16000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Dataset emotion downloaded and prepared to /home/ec2-user/.cache/huggingface/datasets/emotion/default/0.0.0/348f63ca8e27b3713b6c04d723efe6d824a56fb3d1449794716c0f0296072705. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

## Dataset preprocessing

In [5]:
from transformers import AutoTokenizer

# Base checkpoint name; it is also the prefix of the fine-tuned model ids loaded further down.
model_ckpt = "distilbert-base-uncased"
# Tokenizer matching the base checkpoint; used to encode every split of the dataset.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_ckpt)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [6]:
def tokenize(batch):
    """Tokenize the `text` column of a batch.

    Sequences are truncated and padded to the longest sequence in the batch
    that is passed in. Returns whatever the module-level `tokenizer` returns.
    """
    texts = batch["text"]
    encoded = tokenizer(texts, truncation=True, padding=True)
    return encoded


# Tokenize each split as ONE batch (`batch_size=None`). `tokenize` pads to the
# longest sequence in the batch it receives, so with the default 1000-row map
# batches different chunks of the same split can be padded to different lengths.
# Rows of unequal length cannot be stacked into a single tensor when the split is
# later re-batched with a different batch size.
emotions_encoded = emotions.map(tokenize, batched=True, batch_size=None)
# Return only the model inputs and the label, as torch tensors.
emotions_encoded.set_format("torch", columns=["input_ids", "attention_mask", "label"])

  0%|          | 0/16 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

## Metric computation utilities

In [10]:
def get_test_accuracy(models):
    """Build a batched `map` callback that predicts labels with a logit ensemble.

    Args:
        models: Non-empty collection of models. Each one is called as
            `model(**inputs)` and must return an object with a `.logits` tensor.

    Returns:
        A function `fn(batch)` that sums the logits of all models and returns
        `{"predicted_label": <numpy array of argmax indices>}`.

    Raises:
        ValueError: If `models` is empty (there is nothing to ensemble).
    """
    members = list(models)
    if not members:
        # Fail here with a clear message instead of an opaque `torch.stack`
        # error raised later from inside `map`.
        raise ValueError("`models` must contain at least one model to ensemble.")

    def fn(batch):
        # Keep only the columns the tokenizer declares as model inputs and
        # move them to the target device.
        inputs = {
            k: v.to(device)
            for k, v in batch.items()
            if k in tokenizer.model_input_names
        }
        # Ensembling: collect each member's logits without tracking gradients.
        with torch.no_grad():
            member_logits = [model(**inputs).logits for model in members]
        # Sum over the ensemble axis, then pick the highest-scoring class.
        summed_logits = torch.stack(member_logits, dim=0).sum(dim=0)
        pred_label = torch.argmax(summed_logits, dim=-1)
        return {"predicted_label": pred_label.cpu().numpy()}

    return fn


def compute_test_accuracy(models, split="validation"):
    """Return the accuracy of the ensemble `models` on one split.

    Predictions are added to `emotions_encoded[split]` in batches of 128 via
    the callback built by `get_test_accuracy`. The result is the fraction of
    rows whose `predicted_label` equals `label`.
    """
    predict_fn = get_test_accuracy(models)
    scored = emotions_encoded[split].map(predict_fn, batched=True, batch_size=128)

    # Switch to pandas output so both columns can be compared element-wise.
    scored.set_format("pandas")
    frame = scored[:][["label", "predicted_label"]]

    is_correct = frame["label"] == frame["predicted_label"]
    return sum(is_correct) / len(frame)

## Perform ensembling

In [8]:
from transformers import AutoModelForSequenceClassification

# Hyperparameters that identify each fine-tuned checkpoint. The i-th learning
# rate is paired with the i-th weight decay.
lrs = [3e-5, 2e-5, 6e-4, 1e-5, 3e-4]
wds = [1e-2, 1e-3, 3e-3, 2e-3, 3e-2]

models = []
for lr, wd in zip(lrs, wds):
    # The checkpoint id embeds the default string form of both values, with
    # the dot stripped from the weight decay (e.g. 0.01 -> "001").
    wd_suffix = str(wd).replace(".", "")
    model_name = f"{model_ckpt}-finetuned-emotion-lr-{lr}-wd-{wd_suffix}"
    model_id = "sayakpaul/" + model_name

    print(f"Loading checkpoint: {model_id}.")
    loaded = AutoModelForSequenceClassification.from_pretrained(model_id)
    models.append(loaded.to(device))
    print("Checkpoint loaded.")

Loading checkpoint: sayakpaul/distilbert-base-uncased-finetuned-emotion-lr-3e-05-wd-001.


Downloading:   0%|          | 0.00/883 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Checkpoint loaded.
Loading checkpoint: sayakpaul/distilbert-base-uncased-finetuned-emotion-lr-2e-05-wd-0001.


Downloading:   0%|          | 0.00/883 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Checkpoint loaded.
Loading checkpoint: sayakpaul/distilbert-base-uncased-finetuned-emotion-lr-0.0006-wd-0003.


Downloading:   0%|          | 0.00/883 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Checkpoint loaded.
Loading checkpoint: sayakpaul/distilbert-base-uncased-finetuned-emotion-lr-1e-05-wd-0002.


Downloading:   0%|          | 0.00/883 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Checkpoint loaded.
Loading checkpoint: sayakpaul/distilbert-base-uncased-finetuned-emotion-lr-0.0003-wd-003.


Downloading:   0%|          | 0.00/883 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Checkpoint loaded.


In [12]:
# Scores are computed on the `test` split because the `validation` split runs
# into: https://github.com/huggingface/datasets/issues/5179

# Maps ensemble size k -> accuracy of the first k loaded models.
ensemble_scores = {}

for num_members in range(1, len(models) + 1):
    members = models[:num_members]
    ensemble_scores[num_members] = compute_test_accuracy(members, "test")

print(ensemble_scores)

  0%|          | 0/16 [00:00<?, ?ba/s]

  0%|          | 0/16 [00:00<?, ?ba/s]

  0%|          | 0/16 [00:00<?, ?ba/s]

  0%|          | 0/16 [00:00<?, ?ba/s]

  0%|          | 0/16 [00:00<?, ?ba/s]

{1: 0.9125, 2: 0.908, 3: 0.9155, 4: 0.9135, 5: 0.928}
