In [1]:
from datasets import load_dataset, load_metric
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import Trainer, TrainingArguments
from transformers import AutoModelForSequenceClassification
from transformers import AdamW
from transformers import get_scheduler

import torch
from torch.utils.data import DataLoader

import numpy as np
from tqdm.auto import tqdm

In [2]:
raw_datasets = load_dataset("glue", "mrpc")
checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint) 

Reusing dataset glue (/home/jupyter/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [4]:
def tokenize_function(examples):
    return tokenizer(examples['sentence1'], examples['sentence2'], truncation=True)

In [5]:
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

#data_collator = DataCollatorWithPadding(tokenizer)

Loading cached processed dataset at /home/jupyter/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-ef8c899ca7c3fec8.arrow
Loading cached processed dataset at /home/jupyter/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-37545a200dc4ac99.arrow
Loading cached processed dataset at /home/jupyter/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-dd44c6a74fd984ee.arrow


In [6]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

In [7]:
tokenized_datasets = tokenized_datasets.remove_columns(['sentence1', 'sentence2', 'idx'])
tokenized_datasets = tokenized_datasets.rename_column('label', 'labels')
tokenized_datasets.set_format('torch')

In [8]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

In [9]:
data_collator = DataCollatorWithPadding(tokenizer)

In [10]:
train_dataloader = DataLoader(tokenized_datasets['train'],
                             shuffle=True,
                             batch_size=8,
                             collate_fn=data_collator)
eval_dataloader = DataLoader(tokenized_datasets['validation'],                             
                             batch_size=8,
                             collate_fn=data_collator)

In [11]:
for batch in train_dataloader:
    break
print({k:v.shape for k,v in batch.items()})

{'labels': torch.Size([8]), 'input_ids': torch.Size([8, 81]), 'token_type_ids': torch.Size([8, 81]), 'attention_mask': torch.Size([8, 81])}


In [12]:
checkpoint = "bert-base-cased"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [13]:
outputs = model(**batch)
print(outputs.loss, outputs.logits.shape)

tensor(1.0504, grad_fn=<NllLossBackward>) torch.Size([8, 2])


In [16]:
optimizer = AdamW(model.parameters(), lr=5e-5)



In [17]:
loss = outputs.loss
loss.backward()
optimizer.step()

optimizer.zero_grad()

In [18]:
outputs = model(**batch)
print(outputs.loss, outputs.logits.shape)

tensor(0.8815, grad_fn=<NllLossBackward>) torch.Size([8, 2])


In [19]:
num_epochs=3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler('linear',
                            optimizer=optimizer,
                            num_warmup_steps=0,
                            num_training_steps=num_training_steps)

In [20]:
lr_scheduler

<torch.optim.lr_scheduler.LambdaLR at 0x7fd51d6a9550>

In [21]:
device = torch.device('cuda')
model.to(device)
print(device)

cuda


In [22]:
progress_bar = tqdm(range(num_training_steps))

optimizer = AdamW(model.parameters(), lr=5e-5)

for epoch in range(num_epochs):
    model.train()
    for batch in train_dataloader:
        batch = {k:v.to(device) for k,v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/1377 [00:00<?, ?it/s]



In [23]:
metric = load_metric("accuracy")
model.eval()
for batch in eval_dataloader:
    batch = {k:v.to(device) for k,v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch['labels'])

metric.compute()

{'accuracy': 0.6838235294117647}