In [2]:
! nvidia-smi

Sat Apr  1 23:27:31 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 520.61.05    Driver Version: 520.61.05    CUDA Version: 11.8     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  On   | 00000000:0B:00.0 Off |                    0 |
| N/A   36C    P0    41W / 163W |      0MiB / 32768MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
import os
import time
import torch
imoprt torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split

from transformers import RobertaForSequenceClassification, RobertaTokenizer
from datasets import load_dataset
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [4]:
t = torch.cuda.get_device_properties(0).total_memory
BATCH_SIZE = int(np.floor(t/1e9))


In [5]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
dataset = load_dataset('ag_news')
print(dataset['train'].features['label'].names)
num_labels = len(dataset['train'].features['label'].names)

Found cached dataset ag_news (/home/sss9772/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548)


  0%|          | 0/2 [00:00<?, ?it/s]

['World', 'Sports', 'Business', 'Sci/Tech']


In [6]:
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=num_labels)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.

In [7]:
## input string, should classify as a business news
input_str = 'the quarterly results of jpmc look good'

inputs = tokenizer(input_str, return_tensors='pt')
labels = torch.tensor([1]).unsqueeze(0)
outputs = model(**inputs, labels=labels)
loss, logits = outputs[:2]
print(loss)
print(logits)

## Although the output we see, is label 1, i.e. sports news. DOESN'T MAKE SENSE
print(np.argmax(logits.detach().numpy()))

tensor(1.3094, grad_fn=<NllLossBackward0>)
tensor([[ 0.0483,  0.0896, -0.1313,  0.0305]], grad_fn=<AddmmBackward0>)
1


In [8]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [9]:
train_data = dataset['train'].map(lambda text: tokenizer(text['text'], padding="max_length", truncation=True, max_length=512), batched=True)
test_data = dataset['test'].map(lambda text: tokenizer(text['text'], padding="max_length", truncation=True, max_length=512), batched=True)

Loading cached processed dataset at /home/sss9772/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548/cache-9eea6d1172669b3b.arrow
Loading cached processed dataset at /home/sss9772/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548/cache-a399d57179296d98.arrow


In [10]:
train_data = train_data.shuffle(seed=42)

Loading cached shuffled indices for dataset at /home/sss9772/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548/cache-35881949ebb31f70.arrow


In [11]:
train_data.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
test_data.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

In [12]:
def compute_metrics(labels, preds):
    
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy' : acc,
        'f1' : f1,
        'precision' : precision,
        'recall': recall
    }

In [14]:


EPOCHS = 5

def train_model(trainingDataset, model, device, optimizer, loss):
    model.train()
    model.to(device)
    for epoch in range(EPOCHS):

        train_dataset, validation_dataset = random_split(dataset, [int(len(dataset)*0.8), len(dataset)- int(len(dataset)*0.8)])
        # 8 dataloader workers are we have a 8 core CPU
        train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, num_workers = 8, shuffle=True)
        validation_dataloader = DataLoader(validation_dataset, batch_size=BATCH_SIZE, num_workers= 8, shuffle=False)
        
        running_loss = 0.0
        n_samples = 0

        all_preds = []
        all_labels = []

        for input, labels in train_dataloader:
            # push data to compute device
            input = input.to(device)
            labels = labels.to(device)
            # predict the output
            pred = model(**inputs, labels=labels)
            # find the output label
            pred_labels = torch.argmax(pred, dim=-1)
            # compute the loss
            loss_vals = loss(pred_labels, labels)
            all_preds.extend(pred_labels.detach().to("cpu"))
            all_labels.extend(labels.detach().to("cpu"))

            optimizer.zero_grad()
            # backprop
            loss.backward()
            optimizer.step()
            # accumate the loss
            running_loss += loss_vals * input.size(0)
            n_samples += input.size(0)
        
        metrics = compute_metrics(all_labels, all_preds)
        print(f'Epoch {epoch+1} metrics Acc : {metrics['acc']} F1 : {metrics['f1']} Train Loss : {running_loss/n_samples}')
        evaluate_model(model, model, device)

            

def evaluate_model(dataloader, model, device):
    running_loss = 0.0
    n_samples = 0

    all_preds = []
    all_labels = []

    for input, labels in dataloader:
        # push data to compute device
        input = input.to(device)
        labels = labels.to(device)
        # predict the output
        pred = model(**inputs, labels=labels)
        # find the output label
        pred_labels = torch.argmax(pred, dim=-1)
        # compute the loss
        loss_vals = loss(pred_labels, labels)
        all_preds.extend(pred_labels.detach().to("cpu"))
        all_labels.extend(labels.detach().to("cpu"))

        # accumate the loss
        running_loss += loss_vals * input.size(0)
        n_samples += input.size(0)
    
    metrics = compute_metrics(all_labels, all_preds)
    print(f'Validation {epoch+1} metrics: Acc : {metrics['acc']} F1 : {metrics['f1']} Eval Loss : {running_loss/n_samples}')
        


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.

In [None]:
import gc

try:
    model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=num_labels)
    model.to(device)
    optimizer = nn.optim.Adam(model.parameters(), lr=1e-3)
    losss = nn.CrossEntropyLoss()
    train_model(train_data, model, device, optimizer, loss)
except Exception as e:
    print(e)
    gc.collect()
    torch.cuda.empty_cache()
   