In [1]:
! nvidia-smi

Fri Mar 31 15:17:45 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 520.61.05    Driver Version: 520.61.05    CUDA Version: 11.8     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  On   | 00000000:1C:00.0 Off |                    0 |
| N/A   40C    P0    40W / 300W |      0MiB / 16384MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
import os
import time
import torch

from transformers import EarlyStoppingCallback, RobertaForSequenceClassification, RobertaTokenizer, Trainer, TrainingArguments, SchedulerType
from datasets import load_dataset

import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

2023-03-31 15:18:00.025954: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
t = torch.cuda.get_device_properties(0).total_memory
BATCH_SIZE = int(np.floor(t/1e9))


In [4]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
dataset = load_dataset('ag_news')
print(dataset['train'].features['label'].names)
num_labels = len(dataset['train'].features['label'].names)

Found cached dataset ag_news (/home/sss9772/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548)


  0%|          | 0/2 [00:00<?, ?it/s]

['World', 'Sports', 'Business', 'Sci/Tech']


In [5]:
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=num_labels)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

In [6]:
## input string, should classify as a business news
input_str = 'the quarterly results of jpmc look good'

inputs = tokenizer(input_str, return_tensors='pt')
labels = torch.tensor([1]).unsqueeze(0)
outputs = model(**inputs, labels=labels)
loss, logits = outputs[:2]
print(loss)
print(logits)

## Although the output we see, is label 1, i.e. sports news. DOESN'T MAKE SENSE
print(np.argmax(logits.detach().numpy()))

tensor(1.4747, grad_fn=<NllLossBackward0>)
tensor([[ 0.0701, -0.0911,  0.0540, -0.0534]], grad_fn=<AddmmBackward0>)
0


In [7]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [8]:
train_data = dataset['train'].map(lambda text: tokenizer(text['text'], padding="max_length", truncation=True, max_length=512), batched=True)
test_data = dataset['test'].map(lambda text: tokenizer(text['text'], padding="max_length", truncation=True, max_length=512), batched=True)

Loading cached processed dataset at /home/sss9772/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548/cache-9eea6d1172669b3b.arrow
Loading cached processed dataset at /home/sss9772/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548/cache-a399d57179296d98.arrow


In [9]:
train_data = train_data.shuffle(seed=42)

Loading cached shuffled indices for dataset at /home/sss9772/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548/cache-35881949ebb31f70.arrow


In [10]:
train_data.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
test_data.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

In [11]:
def compute_metrics(predictor):
    labels = predictor.label_ids
    preds = predictor.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy' : acc,
        'f1' : f1,
        'precision' : precision,
        'recall': recall
    }

In [12]:
# default lr=5e-5
run_name = 'roberta-classification-' + str(time.time())
training_args = TrainingArguments(
    output_dir = './output1',
    num_train_epochs=3,
    overwrite_output_dir=True,
    per_device_train_batch_size = BATCH_SIZE,
    per_device_eval_batch_size= BATCH_SIZE*2,
    gradient_accumulation_steps = 32,    
    evaluation_strategy = 'steps',
    save_strategy='steps',
    eval_steps=32,
    save_steps=64,
    disable_tqdm = False, 
    lr_scheduler_type = SchedulerType.COSINE_WITH_RESTARTS,
    warmup_steps=100,
    logging_steps = 8,
    logging_dir='./logs',
    dataloader_num_workers = 8,
    run_name = run_name,
    report_to='wandb',
    load_best_model_at_end=True
)

In [13]:
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=num_labels)
early_stopping = EarlyStoppingCallback(early_stopping_patience= 2, early_stopping_threshold= 0.001)

trainer = Trainer(
            model=model,
            args=training_args,
            compute_metrics=compute_metrics,
            train_dataset=train_data,
            eval_dataset=test_data,
            callbacks=[early_stopping,]
           )

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

In [None]:
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mshashvatshah9[0m ([33mshashvat[0m). Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
32,1.1562,0.801898,0.866447,0.865791,0.867012,0.866447
64,0.2837,0.279785,0.909342,0.90907,0.909626,0.909342
96,0.2607,0.238252,0.923026,0.922758,0.923888,0.923026
128,0.2318,0.215825,0.928289,0.92805,0.92859,0.928289


In [15]:
model.load_state_dict(torch.load('./output/checkpoint-640/pytorch_model.bin'))


<All keys matched successfully>

In [22]:
inputs

{'input_ids': tensor([[   0,  627, 3472,  775,    9, 1236, 1685,  438,  356,  205,    2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [25]:
inputs = inputs.to(device)
labels = labels.to(device)
outputs = model(**inputs, labels=labels)
loss, logits = outputs[:2]
print(loss)
print(logits)

## Although the output we see, is label 1, i.e. sports news. DOESN'T MAKE SENSE
print(np.argmax(logits.cpu().detach().numpy()))

tensor(8.7311, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([[-1.6577, -4.0148,  4.6970,  0.6618]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
2


wandb: Waiting for W&B process to finish... (success).
wandb: \ 0.025 MB of 0.025 MB uploaded (0.000 MB deduped)