In [2]:
! nvidia-smi

Sat Apr  1 23:27:31 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 520.61.05    Driver Version: 520.61.05    CUDA Version: 11.8     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  On   | 00000000:0B:00.0 Off |                    0 |
| N/A   36C    P0    41W / 163W |      0MiB / 32768MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
import os
import time
import torch

from transformers import RobertaForSequenceClassification, RobertaTokenizer, Trainer, TrainingArguments
from datasets import load_dataset

import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [4]:
t = torch.cuda.get_device_properties(0).total_memory
BATCH_SIZE = int(np.floor(t/1e9))


In [5]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
dataset = load_dataset('ag_news')
print(dataset['train'].features['label'].names)
num_labels = len(dataset['train'].features['label'].names)

Found cached dataset ag_news (/home/sss9772/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548)


  0%|          | 0/2 [00:00<?, ?it/s]

['World', 'Sports', 'Business', 'Sci/Tech']


In [6]:
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=num_labels)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.

In [7]:
## input string, should classify as a business news
input_str = 'the quarterly results of jpmc look good'

inputs = tokenizer(input_str, return_tensors='pt')
labels = torch.tensor([1]).unsqueeze(0)
outputs = model(**inputs, labels=labels)
loss, logits = outputs[:2]
print(loss)
print(logits)

## Although the output we see, is label 1, i.e. sports news. DOESN'T MAKE SENSE
print(np.argmax(logits.detach().numpy()))

tensor(1.3094, grad_fn=<NllLossBackward0>)
tensor([[ 0.0483,  0.0896, -0.1313,  0.0305]], grad_fn=<AddmmBackward0>)
1


In [8]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [9]:
train_data = dataset['train'].map(lambda text: tokenizer(text['text'], padding="max_length", truncation=True, max_length=512), batched=True)
test_data = dataset['test'].map(lambda text: tokenizer(text['text'], padding="max_length", truncation=True, max_length=512), batched=True)

Loading cached processed dataset at /home/sss9772/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548/cache-9eea6d1172669b3b.arrow
Loading cached processed dataset at /home/sss9772/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548/cache-a399d57179296d98.arrow


In [10]:
train_data = train_data.shuffle(seed=42)

Loading cached shuffled indices for dataset at /home/sss9772/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548/cache-35881949ebb31f70.arrow


In [11]:
train_data.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
test_data.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

In [12]:
def compute_metrics(predictor):
    labels = predictor.label_ids
    preds = predictor.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy' : acc,
        'f1' : f1,
        'precision' : precision,
        'recall': recall
    }

In [13]:
# default lr=5e-5
# fp16=True
run_name = 'roberta-classification-' + str(time.time())
training_args = TrainingArguments(
    output_dir = './output3',
    num_train_epochs=3,
    overwrite_output_dir=True,
    per_device_train_batch_size = BATCH_SIZE,
    per_device_eval_batch_size= BATCH_SIZE*2,
    gradient_accumulation_steps = 32,    
    evaluation_strategy = 'steps',
    save_strategy='steps',
    eval_steps=32,
    save_steps=64,
    disable_tqdm = False, 
    warmup_steps=100,
    logging_steps = 8,
    logging_dir='./logs1',
    dataloader_num_workers = 8,
    run_name = run_name,
    report_to='wandb',
    load_best_model_at_end=True,
    fp16=True 
)

In [14]:
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=num_labels)

trainer = Trainer(
            model=model,
            args=training_args,
            compute_metrics=compute_metrics,
            train_dataset=train_data,
            eval_dataset=test_data
           )

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.

In [15]:
import gc

try:
    trainer.train()   
except Exception as e:
    print(e)
    gc.collect()
    torch.cuda.empty_cache()
   

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mshashvatshah9[0m ([33mshashvat[0m). Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
0,0.2085,0.200851,0.936842,0.936689,0.936706,0.936842
1,0.1686,0.1641,0.946711,0.946759,0.946897,0.946711
2,0.1319,0.159227,0.948684,0.948588,0.948957,0.948684


In [16]:
inputs = inputs.to(device)
labels = labels.to(device)
outputs = model(**inputs, labels=labels)
loss, logits = outputs[:2]
print(loss)
print(logits)

## Although the output we see, is label 1, i.e. sports news. DOESN'T MAKE SENSE
print(np.argmax(logits.cpu().detach().numpy()))

tensor(7.6751, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([[-2.4960, -3.8934,  3.5045,  2.3540]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
2
