In [10]:
import torch
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification
# from transformers import BertForSequenceClassification.from_pretrained as BertModel
# from transformers.BertTokenizer import from_pretrained as BertTokenizer


device = torch.device("cuda" if torch.cuda.is_available else "cpu")
print(f'Device: {device}')

Device: cuda


In [11]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3, return_dict=False)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [12]:
sentences = ['This is a positive sentence.', 'This is a negative sentence.', 'I am neutral.']
labels = [1,0,2]

In [13]:
input_ids = list()
attention_masks = list()
for sent in sentences:
    encoded_dict = tokenizer.encode_plus(sent, add_special_tokens=True, max_length=64, truncation=True, pad_to_max_length=True, 
                                         return_attention_mask=True, return_tensors='pt')
    
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])
    
    
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

dataset = TensorDataset(input_ids, attention_masks, labels)

train_size = int(0.8*len(dataset))
val_size = len(dataset) - train_size

train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

batch_size = 32
train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
validation_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=batch_size)

model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr = 2e-5)

epochs = 4

seed_val = 42
torch.manual_seed(seed_val)

<torch._C.Generator at 0x200967b7e90>

In [14]:
def flat_accuracy(preds, labels):
    pred_flat = torch.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    
    return torch.sum(pred_flat == labels_flat).item() / len(labels_flat)

In [15]:
for epoch in range(epochs):
    total_train_loss = 0
    total_train_accuracy = 0
    
    model.train()
    
    for step, batch in enumerate(train_dataloader):
        batch_input_ids = batch[0].to(device)
        batch_attention_masks = batch[1].to(device)
        batch_labels = batch[2].to(device)
        
        optimizer.zero_grad()
        
        loss, logits = model(batch_input_ids, attention_mask = batch_attention_masks,
                             labels = batch_labels)
        total_train_loss += loss.item()
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        total_train_accuracy += flat_accuracy(logits, batch_labels)
        
        optimizer.zero_grad()

        loss, logits = model(batch_input_ids, attention_mask=batch_attention_masks, labels=batch_labels)

        total_train_loss += loss.item()

        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()

        total_train_accuracy += flat_accuracy(logits, batch_labels)
        
    print(f'| Epoch: {epoch+1} | Training Loss: {avg_train_loss} | Training Accuracy: {avg_train_accuracy} |')
        

| Epoch: 1 | Training Loss: 1.3017181158065796 | Training Accuracy: 2.0 |
| Epoch: 2 | Training Loss: 1.3017181158065796 | Training Accuracy: 2.0 |
| Epoch: 3 | Training Loss: 1.3017181158065796 | Training Accuracy: 2.0 |
| Epoch: 4 | Training Loss: 1.3017181158065796 | Training Accuracy: 2.0 |


In [16]:
avg_train_loss = total_train_loss / len(train_dataloader)
avg_train_accuracy = total_train_accuracy / len(train_dataloader)

In [17]:
print(f'| Epoch: {epoch+1} | Training Loss: {avg_train_loss} | Training Accuracy: {avg_train_accuracy} |')

| Epoch: 4 | Training Loss: 1.5985829830169678 | Training Accuracy: 1.5 |


In [18]:
# Evaluate the model on the validation set
model.eval()

total_val_loss = 0
total_val_accuracy = 0

for batch in validation_dataloader:
    batch_input_ids = batch[0].to(device)
    batch_attention_masks = batch[1].to(device)
    batch_labels = batch[2].to(device)

    with torch.no_grad():
        loss, logits = model(batch_input_ids, attention_mask=batch_attention_masks, labels=batch_labels)

    total_val_loss += loss.item()
    total_val_accuracy += flat_accuracy(logits, batch_labels)

avg_val_loss = total_val_loss / len(validation_dataloader)
avg_val_accuracy = total_val_accuracy / len(validation_dataloader)


print(f'| Validation loss: {avg_val_loss} | Validation Accuracy: {avg_val_accuracy} |')

| Validation loss: 2.449448347091675 | Validation Accuracy: 0.0 |
