# Fine-tune a classifier with/without Accelerate

When deploying training jobs on an EC2 instance or locally, the Accelerate library can substantially increase training speed with only a few additional lines of code. It automatically detects some of the distributed features of your environment and can divide work among multiple GPUs/TPUs, use DeepSpeed acceleration, simplify mixed-precision training, etc. Here are 2 examples:
- first, an example of training a content classifier in a single EC2, single GPU context
- second, an example of training a content classifier in a multiple GPU context with Accelerate

In [None]:
%%capture
## %%capture is a Jupyter cell magic, used here to suppress this cell's output

## install the libraries needed by this notebook
!pip install xgboost datasets transformers sentence_transformers nltk accelerate evaluate tqdm deepspeed

#### Load a dataset to train
*note: this dataset would be loaded in the training script if not run interactively, but for convenience we're loading it before training to only load once*

In [None]:
from datasets import load_dataset, ReadInstruction

## load a subset of the dataset for training and evaluation:
## only the first 5% of the "test" split is read
subset_split = ReadInstruction('test', to=5, unit='%')
dataset = load_dataset("UKPLab/toxic_conversations", split=subset_split)

## Single-server Example: model training without Accelerate/DeepSpeed

In [None]:
from accelerate import Accelerator
from transformers import AdamW, AutoModelForSequenceClassification, get_scheduler,DataCollatorWithPadding,AutoTokenizer
from torch.utils.data import DataLoader
import torch
import tqdm
import pandas as pd
import evaluate

### EXAMPLE 1 - single-server context
##
## the below code is aimed at training a content moderation model
## on a classification task for a single machine with a single GPU
## (e.g. a single EC2 p3.2xlarge)
## in a more production-like context this cell would be written to a .py file
## to be called at terminal as one training job
## (as one of potentially several, or a recurring job)
## note: this assumes data was loaded (loaded in above cell for convenience)

## metric, tokenizer, pretrained classifier and optimizer
metric = evaluate.load("accuracy")

tokenizer = AutoTokenizer.from_pretrained("martin-ha/toxic-comment-model")
model = AutoModelForSequenceClassification.from_pretrained("martin-ha/toxic-comment-model")
optimizer = AdamW(model.parameters(), lr=3e-5)

## tokenize each example, then drop the raw-text columns in one call
tokenized_ds = dataset.map(
    lambda example: tokenizer(example["text"], truncation=True, padding=True)
).remove_columns(["label_text", "text"])

## hold out 20% of the examples for evaluation
ds_train_test = tokenized_ds.train_test_split(test_size=0.2)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

train_dataloader = DataLoader(
    ds_train_test["train"], batch_size=8, shuffle=True, collate_fn=data_collator
)
eval_dataloader = DataLoader(
    ds_train_test["test"], batch_size=8, collate_fn=data_collator
)

## use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

## linear learning-rate decay over every training step, with no warmup
num_epochs = 3
num_training_steps = len(train_dataloader) * num_epochs
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

progress_bar = tqdm.tqdm(range(num_training_steps))

## training loop: one optimizer step and one scheduler step per batch
model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

## evaluation loop: gradients are not needed, so the whole loop runs under no_grad
model.eval()
with torch.no_grad():
    for batch in eval_dataloader:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        outputs = model(**batch)
        logits = outputs.logits
        predictions = logits.argmax(dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

## Distributed Example - adding the Accelerate library

In [None]:
from accelerate import Accelerator
from transformers import AdamW, AutoModelForSequenceClassification, get_scheduler,DataCollatorWithPadding,AutoTokenizer
from torch.utils.data import DataLoader
import torch
import tqdm
import pandas as pd
import evaluate

### EXAMPLE 2 - addition of Accelerate for performance gains in distributed context
##
## the below code is aimed at training a content moderation model
## on a classification task for multiple machines or a machine with multiple GPUs
## (e.g. a larger EC2 like a p3.16xlarge with 8x V100 GPUs)
## through addition of a handful of lines of code (noted with comments below)
## and removal of a handful of modules of code
## in a more production-like context this cell would be written to a .py file
## however here the training code is wrapped by a function and can be conveniently called
## for prototyping and demonstration purposes with a single line of code (in the cell below)
## e.g. accelerate.notebook_launcher(accelerated_training_fn, mixed_precision = 'fp16')
## note: this assumes data was loaded (loaded in an above cell for convenience)


def accelerated_training_fn():
    """Fine-tune and evaluate the toxic-comment classifier using Accelerate.

    Meant to be handed to ``accelerate.notebook_launcher`` (or placed in a
    script run from the terminal). Reads the module-level ``dataset`` that was
    loaded in an earlier cell.

    Lines that differ from the single-server example are flagged with
    ``# +`` (added for accelerate) or ``# -`` (removed for accelerate).

    Returns:
        The result of ``metric.compute()`` for the held-out split. The same
        result is also printed once via ``accelerator.print``.
    """

    metric = evaluate.load("accuracy")
    tokenizer = AutoTokenizer.from_pretrained("martin-ha/toxic-comment-model")
    model = AutoModelForSequenceClassification.from_pretrained("martin-ha/toxic-comment-model")
    optimizer = AdamW(model.parameters(), lr=3e-5)

    tokenized_ds = dataset.map(lambda x: tokenizer(x["text"], truncation=True, padding=True))
    tokenized_ds = tokenized_ds.remove_columns(['label_text', 'text'])

    # + seed added for accelerate: this function runs once per process, and an
    #   unseeded split could differ between processes (mixing train and test rows)
    ds_train_test = tokenized_ds.train_test_split(test_size=0.2, seed=42)
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    train_dataloader = DataLoader(
        ds_train_test['train'],
        shuffle=True,
        batch_size=8,
        collate_fn=data_collator,
    )

    eval_dataloader = DataLoader(
        ds_train_test['test'],
        batch_size=8,
        collate_fn=data_collator,
    )

    # + below added for accelerate
    accelerator = Accelerator()

    # - below removed for accelerate addition (prepare() handles device placement)
    # - device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    # - model.to(device)

    # + below added for accelerate
    train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare(
        train_dataloader, eval_dataloader, model, optimizer
    )

    num_epochs = 3
    num_training_steps = num_epochs * len(train_dataloader)
    lr_scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps
    )

    # + `disable=` added for accelerate: draw the bar on one process per machine only
    progress_bar = tqdm.tqdm(
        range(num_training_steps),
        disable=not accelerator.is_local_main_process,
    )

    model.train()
    for epoch in range(num_epochs):
        for batch in train_dataloader:
            # - below removed for accelerate addition
            # - batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss

            # - below removed for accelerate addition
            # - loss.backward()

            # + below added for accelerate
            accelerator.backward(loss)

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)

    model.eval()
    for batch in eval_dataloader:
        # - below removed for accelerate addition
        # - batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)

        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)

        # + below added for accelerate: collect predictions and labels from all
        #   processes so the metric covers the whole evaluation set
        predictions, references = accelerator.gather_for_metrics(
            (predictions, batch["labels"])
        )
        metric.add_batch(predictions=predictions, references=references)

    # inside a function the value is not displayed automatically (unlike the
    # last expression of a notebook cell), so print it once and return it
    results = metric.compute()
    accelerator.print(results)
    return results

Normally Accelerate is launched as a scripted job at the command line (e.g. with `accelerate launch`), but for convenience here we launch it from within the notebook.

In [None]:
from accelerate import notebook_launcher

import torch

# notebook_launcher must be told how many processes to start; without
# `num_processes` it raises a ValueError outside of Colab/Kaggle.
# Use one process per visible GPU, falling back to a single process.
# NOTE(review): a multi-GPU launch presumably requires that no earlier cell in
# this kernel session has initialised CUDA (e.g. the single-server example) —
# restart the kernel before launching if it has; confirm against the Accelerate docs.
num_processes = max(torch.cuda.device_count(), 1)

notebook_launcher(accelerated_training_fn, num_processes=num_processes, mixed_precision = 'fp16')

## Example Config .yml file when executing at the terminal

In [None]:
# NOTE: if running at the command line, accelerate benefits from generating a config file
# with the command "accelerate config" followed by answering prompts relevant for your environment

# Sample default_config.yaml for an EC2 server with 8 GPUs (like a p3.16xlarge).
# It is kept here as a plain string for reference; nothing in this cell writes it to disk.

default_yml_p3_16x = """
compute_environment: LOCAL_MACHINE
deepspeed_config:
  gradient_accumulation_steps: 1
  offload_optimizer_device: none
  offload_param_device: none
  zero3_init_flag: false
  zero_stage: 2
distributed_type: DEEPSPEED
downcast_bf16: 'no'
dynamo_config:
  dynamo_backend: INDUCTOR
machine_rank: 0
main_training_function: main
mixed_precision: fp16
num_machines: 1
num_processes: 8
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
"""


