In [2]:
! nvidia-smi

Mon Apr  3 13:34:45 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 520.61.05    Driver Version: 520.61.05    CUDA Version: 11.8     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-PCIE...  On   | 00000000:06:00.0 Off |                    0 |
| N/A   28C    P0    23W / 250W |      0MiB / 32768MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
import os
import time
import torch

from transformers import RobertaForSequenceClassification, RobertaTokenizer, Trainer, TrainingArguments, models
from datasets import load_dataset

import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

2023-04-03 13:34:55.963128: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [4]:
t = torch.cuda.get_device_properties(0).total_memory
BATCH_SIZE = int(np.floor(t/1e9))


In [5]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
dataset = load_dataset('ag_news')
print(dataset['train'].features['label'].names)
num_labels = len(dataset['train'].features['label'].names)

Found cached dataset ag_news (/home/sss9772/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548)


  0%|          | 0/2 [00:00<?, ?it/s]

['World', 'Sports', 'Business', 'Sci/Tech']


In [6]:
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=num_labels)
config = models.roberta.configuration_roberta.RobertaConfig(hidden_size = 768, intermediate_size = 768)

model.roberta.encoder.layer[11].intermediate = models.roberta.modeling_roberta.RobertaIntermediate(config)
model.roberta.encoder.layer[11].output = models.roberta.modeling_roberta.RobertaOutput(config)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

In [7]:
## input string, should classify as a business news
input_str = 'the quarterly results of jpmc look good'

inputs = tokenizer(input_str, return_tensors='pt')
labels = torch.tensor([1]).unsqueeze(0)
outputs = model(**inputs, labels=labels)
loss, logits = outputs[:2]
print(loss)
print(logits)

## Although the output we see, is label 1, i.e. sports news. DOESN'T MAKE SENSE
print(np.argmax(logits.detach().numpy()))

tensor(1.0886, grad_fn=<NllLossBackward0>)
tensor([[-0.1574,  0.4531,  0.2560, -0.0478]], grad_fn=<AddmmBackward0>)
1


In [8]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [9]:
train_data = dataset['train'].map(lambda text: tokenizer(text['text'], padding="max_length", truncation=True, max_length=512), batched=True)
test_data = dataset['test'].map(lambda text: tokenizer(text['text'], padding="max_length", truncation=True, max_length=512), batched=True)

Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

In [10]:
train_data = train_data.shuffle(seed=42)

In [11]:
train_data.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
test_data.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

In [12]:
def compute_metrics(predictor):
    labels = predictor.label_ids
    preds = predictor.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy' : acc,
        'f1' : f1,
        'precision' : precision,
        'recall': recall
    }

In [13]:
# default lr=5e-5
# fp16=True
run_name = 'roberta-modified'
training_args = TrainingArguments(
    output_dir = './output4',
    num_train_epochs=5,
    overwrite_output_dir=True,
    per_device_train_batch_size = BATCH_SIZE,
    per_device_eval_batch_size= BATCH_SIZE*2,
    gradient_accumulation_steps = 32,    
    evaluation_strategy = 'steps',
    save_strategy='steps',
    eval_steps=32,
    save_steps=64,
    disable_tqdm = False, 
    warmup_steps=100,
    logging_steps = 8,
    logging_dir='./logs2',
    dataloader_num_workers = 8,
    run_name = run_name,
    report_to='wandb',
    load_best_model_at_end=True,
    fp16=True 
)

In [14]:
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=num_labels)

trainer = Trainer(
            model=model,
            args=training_args,
            compute_metrics=compute_metrics,
            train_dataset=train_data,
            eval_dataset=test_data
           )

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

In [None]:
import gc

try:
    trainer.train()   
except Exception as e:
    print(e)
    gc.collect()
    torch.cuda.empty_cache()
   

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mshashvatshah9[0m ([33mshashvat[0m). Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
32,1.0594,0.690595,0.864605,0.863724,0.865072,0.864605
64,0.2791,0.258326,0.918158,0.917845,0.917852,0.918158
96,0.2263,0.218401,0.924342,0.924417,0.927464,0.924342
128,0.1826,0.189403,0.938026,0.938031,0.938599,0.938026
160,0.1828,0.185101,0.938421,0.938196,0.94069,0.938421
192,0.1608,0.179823,0.939605,0.939516,0.941608,0.939605
224,0.1502,0.157955,0.949605,0.949612,0.949671,0.949605
256,0.1424,0.159934,0.946579,0.946547,0.947119,0.946579
288,0.1383,0.165937,0.945658,0.945588,0.946926,0.945658
320,0.1235,0.162032,0.947895,0.947743,0.947982,0.947895


In [None]:
inputs = inputs.to(device)
labels = labels.to(device)
outputs = model(**inputs, labels=labels)
loss, logits = outputs[:2]
print(loss)
print(logits)

## Although the output we see, is label 1, i.e. sports news. DOESN'T MAKE SENSE
print(np.argmax(logits.cpu().detach().numpy()))