In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer

model_name_or_path = "gpt2"
task = "mrpc"

padding_max_len = 128
if any(k in model_name_or_path for k in ("gpt", "opt", "bloom")):
    padding_side = "left"
else:
    padding_side = "right"

tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path,
    # padding_side=padding_side,  # TODO: left or right?
    max_length=padding_max_len,
)
if getattr(tokenizer, "pad_token_id") is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

datasets = load_dataset("glue", task)
# print(datasets)


def tokenize_function(examples):
    # max_length=None => use the model max length (it's actually the default)
    outputs = tokenizer(
        examples["sentence1"],
        examples["sentence2"],
        truncation=True,
        padding="max_length",  # TODO: Must pad to same length?
        max_length=padding_max_len,
    )
    return outputs


tokenized_datasets = datasets.map(
    tokenize_function,
    batched=True,
    remove_columns=["idx", "sentence1", "sentence2"],
)

# We also rename the 'label' column to 'labels' which is the expected name for labels by the models of the
# transformers library
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
print(tokenized_datasets)

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 1725
    })
})


In [2]:
import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification
from tqdm import tqdm
import evaluate


# TODO: Use Trainer interface.
def train(model, tokenized_train, tokenized_eval):
    device = "cuda"
    num_epochs = 1
    lr = 3e-4
    batch_size = 32

    optimizer = AdamW(params=model.parameters(), lr=lr)
    metric = evaluate.load("glue", task)

    # # Instantiate scheduler
    # lr_scheduler = get_linear_schedule_with_warmup(
    #     optimizer=optimizer,
    #     num_warmup_steps=0.06 * (len(train_dataloader) * num_epochs),
    #     num_training_steps=(len(train_dataloader) * num_epochs),
    # )

    # Instantiate dataloaders.
    def collate_fn(examples):
        return tokenizer.pad(
            examples,
            padding="max_length",
            max_length=padding_max_len,
            return_tensors="pt",
        )

    train_dataloader = DataLoader(
        tokenized_train,
        shuffle=True,
        collate_fn=collate_fn,
        batch_size=batch_size,
    )
    eval_dataloader = DataLoader(
        tokenized_eval,
        shuffle=False,
        collate_fn=collate_fn,
        batch_size=batch_size,
    )

    model.to(device)
    for epoch in range(num_epochs):
        model.train()
        for _, batch in enumerate(tqdm(train_dataloader)):
            batch.to(device)
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            # lr_scheduler.step()
            optimizer.zero_grad()

        model.eval()
        for _, batch in enumerate(tqdm(eval_dataloader)):
            batch.to(device)
            with torch.no_grad():
                outputs = model(**batch)
            predictions = outputs.logits.argmax(dim=-1)
            predictions, references = predictions, batch["labels"]
            metric.add_batch(
                predictions=predictions,
                references=references,
            )

        eval_metric = metric.compute()
        print(f"epoch {epoch}:", eval_metric)

In [3]:
# use all training data

model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path)
model.config.pad_token_id = model.config.eos_token_id

train(model, tokenized_datasets["train"], tokenized_datasets["validation"])

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 0/115 [00:00<?, ?it/s]You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 115/115 [01:19<00:00,  1.45it/s]
100%|██████████| 13/13 [00:03<00:00,  3.89it/s]

epoch 0: {'accuracy': 0.6862745098039216, 'f1': 0.8134110787172011}





In [4]:
# random sampling

import random

model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path)
model.config.pad_token_id = model.config.eos_token_id

data_train = tokenized_datasets["train"]
data_eval = tokenized_datasets["validation"]  # test

assert len(data_train) == 3668
candidate_indices = list(range(len(data_train)))

# train an initial model using labeled data
# TODO: always need this?
n_start = 68
labeled_indices = random.sample(candidate_indices, n_start)
train(model, data_train.select(labeled_indices), data_eval)
candidate_indices = [i for i in candidate_indices if i not in set(labeled_indices)]

budget = 360
num_rounds = len(candidate_indices) // budget
print(f"{budget=}")
print(f"{num_rounds=}")

for round in range(num_rounds):
    print(f"{round=}")

    # re-init model each round
    model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path)
    model.config.pad_token_id = model.config.eos_token_id

    # choose data to label
    chosen_indices = random.sample(candidate_indices, budget)

    # obtain labels
    labeled_indices += chosen_indices
    print(f"{len(labeled_indices)=}")

    # update model
    train(model, data_train.select(labeled_indices), data_eval)

    # remove labeled data from candidates
    candidate_indices = [i for i in candidate_indices if i not in set(labeled_indices)]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 3/3 [00:01<00:00,  2.20it/s]
100%|██████████| 13/13 [00:03<00:00,  4.10it/s]


epoch 0: {'accuracy': 0.6838235294117647, 'f1': 0.8122270742358079}
budget=360
num_rounds=10
round=0


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


len(labeled_indices)=428


100%|██████████| 14/14 [00:08<00:00,  1.73it/s]
100%|██████████| 13/13 [00:03<00:00,  4.00it/s]


epoch 0: {'accuracy': 0.6838235294117647, 'f1': 0.8122270742358079}
round=1


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


len(labeled_indices)=788


100%|██████████| 25/25 [00:14<00:00,  1.73it/s]
100%|██████████| 13/13 [00:03<00:00,  4.05it/s]


epoch 0: {'accuracy': 0.6985294117647058, 'f1': 0.8183161004431315}
round=2


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


len(labeled_indices)=1148


100%|██████████| 36/36 [00:21<00:00,  1.70it/s]
100%|██████████| 13/13 [00:03<00:00,  4.32it/s]


epoch 0: {'accuracy': 0.6911764705882353, 'f1': 0.810810810810811}
round=3


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


len(labeled_indices)=1508


100%|██████████| 48/48 [00:21<00:00,  2.25it/s]
100%|██████████| 13/13 [00:01<00:00,  6.64it/s]


epoch 0: {'accuracy': 0.7009803921568627, 'f1': 0.8184523809523809}
round=4


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


len(labeled_indices)=1868


100%|██████████| 59/59 [00:35<00:00,  1.65it/s]
100%|██████████| 13/13 [00:03<00:00,  3.93it/s]


epoch 0: {'accuracy': 0.6936274509803921, 'f1': 0.8169838945827232}
round=5


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


len(labeled_indices)=2228


100%|██████████| 70/70 [00:42<00:00,  1.63it/s]
100%|██████████| 13/13 [00:03<00:00,  3.86it/s]


epoch 0: {'accuracy': 0.678921568627451, 'f1': 0.7923930269413629}
round=6


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


len(labeled_indices)=2588


100%|██████████| 81/81 [00:49<00:00,  1.62it/s]
100%|██████████| 13/13 [00:03<00:00,  4.12it/s]


epoch 0: {'accuracy': 0.6985294117647058, 'f1': 0.8098918083462132}
round=7


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


len(labeled_indices)=2948


100%|██████████| 93/93 [00:56<00:00,  1.63it/s]
100%|██████████| 13/13 [00:02<00:00,  5.29it/s]


epoch 0: {'accuracy': 0.7009803921568627, 'f1': 0.8134556574923548}
round=8


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


len(labeled_indices)=3308


100%|██████████| 104/104 [00:58<00:00,  1.79it/s]
100%|██████████| 13/13 [00:03<00:00,  3.97it/s]


epoch 0: {'accuracy': 0.7107843137254902, 'f1': 0.802013422818792}
round=9


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


len(labeled_indices)=3668


100%|██████████| 115/115 [01:10<00:00,  1.62it/s]
100%|██████████| 13/13 [00:03<00:00,  3.82it/s]

epoch 0: {'accuracy': 0.6936274509803921, 'f1': 0.8085758039816232}





In [5]:
# active learning

import active_learn
import random

model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path)
model.config.pad_token_id = model.config.eos_token_id

data_train = tokenized_datasets["train"]
data_eval = tokenized_datasets["validation"]  # test

assert len(data_train) == 3668
candidate_indices = list(range(len(data_train)))

# train an initial model using labeled data
# TODO: always need this?
n_start = 68
labeled_indices = random.sample(candidate_indices, n_start)
train(model, data_train.select(labeled_indices), data_eval)
candidate_indices = [i for i in candidate_indices if i not in set(labeled_indices)]

budget = 360
num_rounds = len(candidate_indices) // budget
print(f"{budget=}")
print(f"{num_rounds=}")

for round in range(num_rounds):
    print(f"{round=}")

    # re-init model each round
    model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path)
    model.config.pad_token_id = model.config.eos_token_id

    # set up sampler
    model = active_learn.get_active_model(model)
    sampler = active_learn.ActiveSampler(
        "classification",
        model,
        budget,
        # TODO: labeled_data=X_labeled,
    )

    # TODO: add support to avoid conversion here.
    X_candidates = []
    for i in range(len(candidate_indices)):
        X_candidates.append(
            {
                "input_ids": data_train[i]["input_ids"],
                "attention_mask": data_train[i]["attention_mask"],
            }
        )
    print(f"{len(X_candidates)=}")

    # choose data to label
    indices = sampler.select(X_candidates)
    chosen_indices = [candidate_indices[i] for i in indices]
    print(f"{len(chosen_indices)=}")

    # obtain labels
    labeled_indices += chosen_indices
    print(f"{len(labeled_indices)=}")

    # update model
    train(model, data_train.select(labeled_indices), data_eval)

    # remove labeled data from candidates
    candidate_indices = [i for i in candidate_indices if i not in set(labeled_indices)]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 3/3 [00:01<00:00,  2.25it/s]
100%|██████████| 13/13 [00:03<00:00,  4.03it/s]


epoch 0: {'accuracy': 0.31862745098039214, 'f1': 0.0071428571428571435}
budget=360
num_rounds=10
round=0


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


len(X_candidates)=3600


  samps = torch.tensor(samps)


Start sampling ...
Sampling done
len(chosen_indices)=360
len(labeled_indices)=428


100%|██████████| 14/14 [00:08<00:00,  1.72it/s]
100%|██████████| 13/13 [00:03<00:00,  3.94it/s]


epoch 0: {'accuracy': 0.6838235294117647, 'f1': 0.8122270742358079}
round=1


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


len(X_candidates)=3240


  samps = torch.tensor(samps)


Start sampling ...
Sampling done
len(chosen_indices)=360
len(labeled_indices)=788


100%|██████████| 25/25 [00:15<00:00,  1.66it/s]
100%|██████████| 13/13 [00:03<00:00,  3.88it/s]


epoch 0: {'accuracy': 0.6985294117647058, 'f1': 0.8110599078341014}
round=2


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


len(X_candidates)=2880


  samps = torch.tensor(samps)


Start sampling ...
Sampling done
len(chosen_indices)=360
len(labeled_indices)=1148


100%|██████████| 36/36 [00:23<00:00,  1.55it/s]
100%|██████████| 13/13 [00:03<00:00,  3.98it/s]


epoch 0: {'accuracy': 0.6936274509803921, 'f1': 0.8091603053435115}
round=3


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


len(X_candidates)=2520


  samps = torch.tensor(samps)


Start sampling ...
Sampling done
len(chosen_indices)=360
len(labeled_indices)=1508


100%|██████████| 48/48 [00:30<00:00,  1.56it/s]
100%|██████████| 13/13 [00:03<00:00,  3.89it/s]


epoch 0: {'accuracy': 0.6838235294117647, 'f1': 0.8122270742358079}
round=4


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


len(X_candidates)=2160


  samps = torch.tensor(samps)


Start sampling ...
Sampling done
len(chosen_indices)=360
len(labeled_indices)=1868


100%|██████████| 59/59 [00:37<00:00,  1.58it/s]
100%|██████████| 13/13 [00:03<00:00,  3.81it/s]


epoch 0: {'accuracy': 0.6838235294117647, 'f1': 0.8122270742358079}
round=5


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


len(X_candidates)=1800


  samps = torch.tensor(samps)


Start sampling ...
Sampling done
len(chosen_indices)=360
len(labeled_indices)=2228


100%|██████████| 70/70 [00:44<00:00,  1.56it/s]
100%|██████████| 13/13 [00:03<00:00,  3.69it/s]


epoch 0: {'accuracy': 0.6862745098039216, 'f1': 0.8134110787172011}
round=6


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


len(X_candidates)=1440


  samps = torch.tensor(samps)


Start sampling ...
Sampling done
len(chosen_indices)=360
len(labeled_indices)=2588


100%|██████████| 81/81 [00:52<00:00,  1.55it/s]
100%|██████████| 13/13 [00:03<00:00,  3.96it/s]


epoch 0: {'accuracy': 0.7156862745098039, 'f1': 0.8268656716417911}
round=7


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


len(X_candidates)=1080


  samps = torch.tensor(samps)


Start sampling ...
Sampling done
len(chosen_indices)=360
len(labeled_indices)=2948


100%|██████████| 93/93 [00:59<00:00,  1.56it/s]
100%|██████████| 13/13 [00:03<00:00,  3.89it/s]


epoch 0: {'accuracy': 0.7083333333333334, 'f1': 0.819423368740516}
round=8


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


len(X_candidates)=720


  samps = torch.tensor(samps)


Start sampling ...
Sampling done
len(chosen_indices)=360
len(labeled_indices)=3308


100%|██████████| 104/104 [01:06<00:00,  1.57it/s]
100%|██████████| 13/13 [00:03<00:00,  3.77it/s]


epoch 0: {'accuracy': 0.7034313725490197, 'f1': 0.7986688851913478}
round=9


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


len(X_candidates)=360


  samps = torch.tensor(samps)


Start sampling ...
Sampling done
len(chosen_indices)=360
len(labeled_indices)=3668


100%|██████████| 115/115 [01:13<00:00,  1.57it/s]
100%|██████████| 13/13 [00:03<00:00,  3.72it/s]


epoch 0: {'accuracy': 0.6838235294117647, 'f1': 0.8122270742358079}
