In [6]:
from datasets import load_dataset

dataset = load_dataset("emotion", cache_dir="./data")
train_data = dataset["train"]
test_data = dataset["test"]

In [7]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_train = train_data.map(tokenize_function, batched=True)
tokenized_test = test_data.map(tokenize_function, batched=True)

In [8]:
from torch.utils.data import DataLoader
import torch

def collate_fn(batch):
    return {
        'input_ids': torch.tensor([item['input_ids'] for item in batch]),
        'attention_mask': torch.tensor([item['attention_mask'] for item in batch]),
        'labels': torch.tensor([item['label'] for item in batch])
    }

train_dataloader = DataLoader(tokenized_train, shuffle=True, batch_size=8, collate_fn=collate_fn)
test_dataloader = DataLoader(tokenized_test, batch_size=8, collate_fn=collate_fn)


In [9]:
from transformers import BertForSequenceClassification

num_labels = len(train_data.features["label"].names)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
from torch.optim import AdamW
from tqdm.auto import tqdm
import torch
from sklearn.metrics import accuracy_score, f1_score
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


optimizer = AdamW(model.parameters(), lr=5e-5)

for epoch in range(3):  # number of epochs
    torch.cuda.empty_cache()
    model.train()
    for batch in tqdm(train_dataloader):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    model.eval()
    with torch.no_grad():
        y_true = []
        y_pred = []
        for batch in tqdm(test_dataloader):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            logits = outputs.logits
            y_true.extend(batch["labels"].cpu().numpy())
            y_pred.extend(torch.argmax(logits, axis=-1).cpu().numpy())
        print(f"Epoch {epoch}: Accuracy: {accuracy_score(y_true, y_pred)}, F1: {f1_score(y_true, y_pred, average='macro')}")

  0%|          | 8/2000 [01:12<5:01:50,  9.09s/it]


KeyboardInterrupt: 

: 

In [None]:


model.eval()
predictions, true_labels = [], []

for batch in test_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions.extend(logits.argmax(dim=-1).tolist())
    true_labels.extend(batch["labels"].tolist())

accuracy = accuracy_score(true_labels, predictions)
print(f"Test Accuracy: {accuracy}")


Test Accuracy: 0.922


In [1]:
model

NameError: ignored