In [1]:
!pip install -q transformers datasets evaluate accelerate nvidia-ml-py3 bitsandbytes

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m41.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m30.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m219.1/219.1 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.3/104.3 MB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m61.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m11.8 MB/s[0m

In [2]:
!wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xf aclImdb_v1.tar.gz

--2023-05-14 18:33:26--  http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘aclImdb_v1.tar.gz’


2023-05-14 18:33:35 (9.45 MB/s) - ‘aclImdb_v1.tar.gz’ saved [84125825/84125825]



In [1]:
import numpy as np
from pynvml import *
from datasets import Dataset
from transformers import TrainingArguments, Trainer, logging
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import evaluate
import torch
import warnings
from pathlib import Path
from sklearn.model_selection import train_test_split

# Keep the benchmark output uncluttered: suppress every Python warning and
# lower the transformers logger (imported above as `logging`) to errors only.
# NOTE(review): this also hides deprecation warnings that may matter on upgrade.
warnings.filterwarnings("ignore")
logging.set_verbosity_error()

def get_dummy_dataset(seq_len=512, dataset_size=512):
    """Build a random, in-memory dataset for quick benchmarking runs.

    Args:
        seq_len: Number of token ids per example (default 512).
        dataset_size: Number of examples to generate (default 512).

    Returns:
        A ``datasets.Dataset`` with columns ``input_ids`` (random ints in
        [100, 30000)) and ``labels`` (random ints in {0, 1}), formatted to
        return PyTorch tensors.
    """
    dummy_data = {
        "input_ids": np.random.randint(100, 30000, (dataset_size, seq_len)),
        # randint's upper bound is exclusive: (0, 1) would make every label 0,
        # so use (0, 2) to actually draw from both classes.
        "labels": np.random.randint(0, 2, (dataset_size,)),
    }
    ds = Dataset.from_dict(dummy_data)
    ds.set_format("pt")
    return ds


class IMDbDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    ``encodings`` is a mapping from field name to an indexable collection of
    per-example values; ``labels`` is an indexable collection with one label
    per example. Indexing yields a dict of tensors: one entry per encoding
    field plus a ``labels`` entry.
    """

    def __init__(self, encodings, labels):
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        # The dataset size is defined by the number of labels.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        # Assigned last so it takes precedence over any same-named encoding field.
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample


def get_imdb_dataset(model_name, seed=None):
    """Load the extracted IMDb reviews and tokenize them for ``model_name``.

    Reads ``aclImdb/train`` and ``aclImdb/test`` (relative to the working
    directory), holds out 20% of the training reviews for validation, and
    tokenizes all three splits with truncation and padding.

    Args:
        model_name: Identifier passed to ``AutoTokenizer.from_pretrained``.
        seed: Optional ``random_state`` for the train/validation split.
            ``None`` (the default) keeps the split random on every call.

    Returns:
        Tuple ``(train_dataset, val_dataset, test_dataset)`` of ``IMDbDataset``.
    """
    def read_imdb_split(split_dir):
        """Return (texts, labels) for one split; label 1 = pos, 0 = neg."""
        split_dir = Path(split_dir)
        texts = []
        labels = []
        for label_dir in ["pos", "neg"]:
            label = 0 if label_dir == "neg" else 1
            # iterdir() yields entries in arbitrary order; sort so the example
            # order (and therefore a seeded split) is reproducible.
            for text_file in sorted((split_dir/label_dir).iterdir()):
                # Read explicitly as UTF-8 instead of relying on the
                # platform's default encoding.
                texts.append(text_file.read_text(encoding="utf-8"))
                labels.append(label)

        return texts, labels

    train_texts, train_labels = read_imdb_split('aclImdb/train')
    test_texts, test_labels = read_imdb_split('aclImdb/test')

    train_texts, val_texts, train_labels, val_labels = train_test_split(
        train_texts, train_labels, test_size=.2, random_state=seed
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    train_encodings = tokenizer(train_texts, truncation=True, padding=True)
    val_encodings = tokenizer(val_texts, truncation=True, padding=True)
    test_encodings = tokenizer(test_texts, truncation=True, padding=True)

    train_dataset = IMDbDataset(train_encodings, train_labels)
    val_dataset = IMDbDataset(val_encodings, val_labels)
    test_dataset = IMDbDataset(test_encodings, test_labels)

    return train_dataset, val_dataset, test_dataset

def print_gpu_utilization(device_index=0):
    """Print the memory currently in use on a GPU, in whole MB.

    Args:
        device_index: NVML index of the GPU to query (default 0).

    The value printed is the ``used`` field of NVML's memory info for that
    device, integer-divided by 1024**2.
    """
    nvmlInit()
    try:
        handle = nvmlDeviceGetHandleByIndex(device_index)
        info = nvmlDeviceGetMemoryInfo(handle)
    finally:
        # Pair every nvmlInit() with nvmlShutdown() so repeated calls do not
        # keep accumulating NVML initializations, even if the query fails.
        nvmlShutdown()
    print(f"GPU memory occupied: {info.used//1024**2} MB.")

def print_summary(model, result):
    """Report parameter count, training time, throughput and GPU memory.

    Args:
        model: Object whose ``parameters()`` yields items exposing ``numel()``.
        result: Object whose ``metrics`` mapping contains ``train_runtime``
            and ``train_samples_per_second``.
    """
    total_params = 0
    for param in model.parameters():
        total_params += param.numel()
    print(f"Model Parameters: {total_params}")

    run_metrics = result.metrics
    runtime = run_metrics['train_runtime']
    print(f"Time: {runtime:.2f}")
    throughput = run_metrics['train_samples_per_second']
    print(f"Samples/second: {throughput:.2f}")

    print_gpu_utilization()

def main(model_name='bert-large-uncased', grad_acc_steps=-1, grad_ckpt=False, fp16=False, optim='adamw', epochs=1):
    """Fine-tune ``model_name`` on IMDb once and print speed/memory statistics.

    Args:
        model_name: Checkpoint passed to the tokenizer and model loaders.
        grad_acc_steps: Gradient accumulation steps; ``-1`` leaves the
            ``TrainingArguments`` default untouched.
        grad_ckpt: Enable gradient checkpointing when truthy.
        fp16: Enable fp16 mixed-precision training when truthy.
        optim: Optimizer name forwarded to ``TrainingArguments``; the
            sentinel ``'adamw'`` leaves the library default untouched.
        epochs: Number of training epochs.

    Prints GPU memory before training and a summary afterwards, then drops
    its references to the model/trainer/datasets and empties the CUDA cache
    so consecutive calls start from a comparable state.
    """
    import gc  # local import: only needed for the cleanup at the end

    print("========== GPU Utilization at start ==========")
    print_gpu_utilization()

    # The evaluation-schedule keyword is spelled `eval_strategy` in newer
    # transformers releases and `evaluation_strategy` in older ones; pick
    # whichever this installed TrainingArguments dataclass declares.
    strategy_key = (
        "eval_strategy"
        if "eval_strategy" in getattr(TrainingArguments, "__dataclass_fields__", {})
        else "evaluation_strategy"
    )

    arguments = {
        "output_dir": "tmp",
        strategy_key: "steps",
        "num_train_epochs": epochs,
        "log_level": "error",
        "report_to": "none",
        "per_device_train_batch_size": 4,
    }

    ### HYPERPARAMETERS ###
    # Only override a setting when the caller asked for a non-default value.
    if fp16:
        arguments['fp16'] = True
    if grad_acc_steps != -1:
        arguments['gradient_accumulation_steps'] = grad_acc_steps
    if grad_ckpt:
        arguments['gradient_checkpointing'] = True
    if optim != 'adamw':
        arguments['optim'] = optim

    # ds = get_dummy_dataset()
    train_dataset, val_dataset, test_dataset = get_imdb_dataset(model_name)

    model = AutoModelForSequenceClassification.from_pretrained(model_name).to("cuda")

    metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])

    def compute_metrics(eval_pred):
        """Turn (logits, labels) from evaluation into the combined metrics."""
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)
        return metrics.compute(predictions=predictions, references=labels)

    training_args = TrainingArguments(**arguments)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        # Previously the metrics object was built but never used.
        compute_metrics=compute_metrics,
    )
    result = trainer.train()
    print_summary(model, result)

    # Release everything that may pin GPU memory *before* emptying the cache.
    # (The old `del` listed `ds`, which is never assigned here and therefore
    # raised UnboundLocalError at the end of every run.)
    del model, arguments, training_args, trainer, result
    del train_dataset, val_dataset, test_dataset
    gc.collect()
    torch.cuda.empty_cache()

In [None]:
# Baseline: batch size 4 with no gradient accumulation, checkpointing, fp16 or optimizer override.
main(model_name='bert-large-uncased')

GPU memory occupied: 258 MB.
{'train_runtime': 185.0935, 'train_samples_per_second': 2.766, 'train_steps_per_second': 0.692, 'train_loss': 0.013952209614217281, 'epoch': 1.0}
Model Parameters: 335143938
Time: 185.09
Samples/second: 2.77
GPU memory occupied: 14275 MB.


In [None]:
# Experiment: baseline + gradient accumulation over 4 steps.
main(model_name='bert-large-uncased', grad_acc_steps=4)

GPU memory occupied: 258 MB.


Downloading (…)lve/main/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

{'train_runtime': 150.2458, 'train_samples_per_second': 3.408, 'train_steps_per_second': 0.106, 'train_loss': 0.15585996210575104, 'epoch': 1.0}
Model Parameters: 335143938
Time: 150.25
Samples/second: 3.41
GPU memory occupied: 12363 MB.


In [None]:
# Experiment: gradient accumulation (4 steps) + gradient checkpointing.
main(model_name='bert-large-uncased', grad_acc_steps=4, grad_ckpt=True)

GPU memory occupied: 435 MB.
{'train_runtime': 230.5676, 'train_samples_per_second': 2.221, 'train_steps_per_second': 0.139, 'train_loss': 0.050934258848428726, 'epoch': 1.0}
Model Parameters: 335143938
Time: 230.57
Samples/second: 2.22
GPU memory occupied: 6283 MB.


In [None]:
# Experiment: gradient accumulation (4 steps) + gradient checkpointing + fp16 mixed precision.
main(model_name='bert-large-uncased', grad_acc_steps=4, grad_ckpt=True, fp16=True)

GPU memory occupied: 435 MB.
{'train_runtime': 80.4575, 'train_samples_per_second': 6.364, 'train_steps_per_second': 0.398, 'train_loss': 0.058523036539554596, 'epoch': 1.0}
Model Parameters: 335143938
Time: 80.46
Samples/second: 6.36
GPU memory occupied: 6723 MB.


In [None]:
# Experiment: all of the above, with the optimizer switched to Adafactor.
main(model_name='bert-large-uncased', grad_acc_steps=4, grad_ckpt=True, fp16=True, optim='adafactor')

GPU memory occupied: 439 MB.
{'train_runtime': 82.3458, 'train_samples_per_second': 6.218, 'train_steps_per_second': 0.389, 'train_loss': 0.08672035485506058, 'epoch': 1.0}
Model Parameters: 335143938
Time: 82.35
Samples/second: 6.22
GPU memory occupied: 4291 MB.
