In [1]:
import os

# Expose only GPU 0 to this process. This is set here, before the next cell
# imports torch, so the restriction is in place when CUDA libraries load.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Hugging Face access token, read from the environment so it is never
# hard-coded in the notebook. A missing *or empty* value is rejected up front
# with an actionable message instead of failing later during the model download.
HF_TOKEN = os.environ.get("HF_TOKEN")
if not HF_TOKEN:
    raise ValueError("Set HF_TOKEN environment variable equal to your access token")

In [2]:
import torch, math
from datasets import load_dataset
from peft import LoraConfig, PeftModel, get_peft_model
from transformers import (
    AutoModelForCausalLM, 
    AutoTokenizer,
    BitsAndBytesConfig,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    get_linear_schedule_with_warmup
)

# --- Base model and numeric formats ------------------------------------------
MODEL_ID = "meta-llama/Llama-2-7b-hf"
DTYPE = torch.bfloat16          # dtype passed to the model loader / quantization config
BNB_DTYPE = "nf4"               # 4-bit quantization type handed to BitsAndBytesConfig

# --- LoRA adapter -------------------------------------------------------------
LORA_R = 16
LORA_ALPHA = 32
LORA_BIAS = "none"

# --- Data ---------------------------------------------------------------------
DATASET_TEXT_FIELD = "text"     # name of the dataset column holding the raw text
SEQUENCE_LENGTH = 512           # upper bound on tokens per example

# --- Training run -------------------------------------------------------------
OUTPUT_DIR = "./training-runs-accelerate/"
BATCH_SIZE = 1                  # per-process micro-batch size
GRADIENT_ACCUMULATION_STEPS = 16
LEARNING_RATE = 1.41e-5
WEIGHT_DECAY = 0.0
NUM_TRAIN_EPOCHS = 1

In [3]:
from accelerate import Accelerator, DistributedType

# The accumulation count is given to the Accelerator so that the
# `accelerator.accumulate(model)` context used in the training loop knows how
# many micro-batches make up one optimizer update.
accelerator = Accelerator(gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS)

# Only the main process creates the output directory; `exist_ok=True` makes
# re-running this cell harmless.
if accelerator.is_main_process:
    os.makedirs(OUTPUT_DIR, exist_ok=True)

In [4]:
# Synchronisation point across processes before any of them continues with
# data loading (in a single-process run this returns immediately).
accelerator.wait_for_everyone()

In [5]:
# Load the OpenAssistant Guanaco dataset; its "train" and "test" splits are
# tokenized and used for training / evaluation further down.
dataset = load_dataset("timdettmers/openassistant-guanaco")

Repo card metadata block was not found. Setting CardData to empty.


In [6]:
# Tokenizer matching the base model; the access token is forwarded for the download.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN)
# Register a dedicated "<pad>" token so batches can be padded.
# NOTE(review): if "<pad>" is not already in the vocabulary this adds a new token
# id — the model's embedding table must be large enough to cover it; confirm.
tokenizer.add_special_tokens({"pad_token":"<pad>"})
# Pad on the left-hand side of each sequence.
tokenizer.padding_side = 'left'

In [7]:
# 4-bit (bitsandbytes) quantization settings for the frozen base model.
# The compute dtype keyword is `bnb_4bit_compute_dtype`; the previous spelling
# (`bnb_4bit_compute_type`) is not a recognised option, so DTYPE was never
# applied to the 4-bit matmuls.
quantization_config = BitsAndBytesConfig(
    load_in_8bit=False,
    load_in_4bit=True,
    bnb_4bit_compute_dtype=DTYPE,
    bnb_4bit_quant_type=BNB_DTYPE,
    bnb_4bit_use_double_quant=True,
)

# LoRA adapter configuration for causal language modelling.
peft_config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    bias=LORA_BIAS,
    task_type="CAUSAL_LM",
)

# Load the quantized base model.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=quantization_config,
    torch_dtype=DTYPE,
    token=HF_TOKEN,
)

# The tokenizer was extended with a "<pad>" token. Grow the embedding tables when
# the vocabulary is now larger than what the checkpoint provides; otherwise the
# pad id would index past the end of the embedding matrix as soon as a batch is
# actually padded (i.e. for any BATCH_SIZE > 1). This is done before wrapping
# with PEFT so the adapter sees the final shapes.
if len(tokenizer) > model.get_input_embeddings().num_embeddings:
    model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = tokenizer.pad_token_id

# Wrap the base model with trainable LoRA adapters.
model = get_peft_model(model, peft_config)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [8]:
# Truncation length: never exceed what the tokenizer reports as its maximum,
# and never exceed the configured SEQUENCE_LENGTH.
max_seq_len = min(tokenizer.model_max_length, SEQUENCE_LENGTH)
# Collator configured with mlm=False (no masked-language-modelling objective);
# it is used as the `collate_fn` of the dataloaders.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

def tokenize(element):
    """Tokenize a batch of dataset rows for causal-LM training.

    Args:
        element: A batch from ``Dataset.map(batched=True)``; the raw text is read
            from the ``DATASET_TEXT_FIELD`` column.

    Returns:
        A dict with ``input_ids`` and ``attention_mask`` lists, one entry per
        input row. Sequences are truncated to ``max_seq_len`` tokens and left
        unpadded (padding is deferred to the data collator).
    """
    # Use the configured column name instead of a hard-coded "text" so the
    # DATASET_TEXT_FIELD setting is the single source of truth.
    outputs = tokenizer(
        element[DATASET_TEXT_FIELD],
        truncation=True,
        padding=False,
        max_length=max_seq_len,
        return_overflowing_tokens=False,
        return_length=False,
    )

    return {
        "input_ids": outputs["input_ids"],
        "attention_mask": outputs["attention_mask"],
    }

train_dataset = dataset["train"]
eval_dataset = dataset["test"]


def _tokenize_split(split):
    """Tokenize one dataset split, dropping all of its original columns."""
    # No `batch_size` override: BATCH_SIZE is the *training* micro-batch size (1),
    # and passing it here made the batched tokenizer process a single row per
    # call. Since `tokenize` does not pad, the result is independent of the map
    # batch size, so the library default is used instead.
    return split.map(
        tokenize,
        batched=True,
        remove_columns=split.column_names,
        num_proc=16,
    )


# Let the main process run the map first; the other processes enter the block
# afterwards and execute the same calls.
with accelerator.main_process_first():
    tokenized_dataset_train = _tokenize_split(train_dataset)
    tokenized_dataset_eval = _tokenize_split(eval_dataset)

In [9]:
from torch.utils.data import DataLoader, RandomSampler

# Training batches are shuffled and collated/padded by `data_collator`.
train_dataloader = DataLoader(
    tokenized_dataset_train, shuffle=True, collate_fn=data_collator, batch_size=BATCH_SIZE
)

# Evaluation must iterate the held-out split. This previously reused
# `tokenized_dataset_train`, so the reported eval loss/perplexity was computed
# on training data.
eval_dataloader = DataLoader(
    tokenized_dataset_eval, collate_fn=data_collator, batch_size=BATCH_SIZE
)

# One optimizer update consumes GRADIENT_ACCUMULATION_STEPS micro-batches; a
# trailing partial group still counts as an update, hence the ceil.
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / GRADIENT_ACCUMULATION_STEPS)
max_train_steps = NUM_TRAIN_EPOCHS * num_update_steps_per_epoch

In [10]:
# Parameter-name fragments that are excluded from weight decay.
no_decay = ["bias", "input_layernorm.weight", "post_attention_layernorm.weight"]

# Only parameters that require gradients are given to the optimizer; frozen
# weights never receive gradients, so listing them only adds bookkeeping.
trainable_named_parameters = [
    (n, p) for n, p in model.named_parameters() if p.requires_grad
]

optimizer_grouped_parameters = [
    {
        "params": [p for n, p in trainable_named_parameters if not any(nd in n for nd in no_decay)],
        "weight_decay": WEIGHT_DECAY,
    },
    {
        "params": [p for n, p in trainable_named_parameters if any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,
    },
]
# Drop groups that ended up empty (e.g. when no trainable parameter matches `no_decay`).
optimizer_grouped_parameters = [
    group for group in optimizer_grouped_parameters if group["params"]
]

optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=LEARNING_RATE)

# The schedule horizon is expressed in optimizer updates. Once prepared by
# Accelerate, the scheduler only advances on steps where gradients are synced,
# so multiplying by GRADIENT_ACCUMULATION_STEPS stretched the horizon 16x and
# the learning rate barely decayed over the run.
# `max_train_steps` here is still derived from the un-prepared dataloader, which
# also covers the multi-process case where the prepared scheduler advances once
# per process on every update.
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=max_train_steps,
)

In [11]:
# Hand every training object to Accelerate; the returned objects replace the
# originals and are the ones used by the training loop. The order of the
# returned values matches the order of the arguments.
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
)

# We need to recalculate our total training steps as the size of the training dataloader may have changed.
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / GRADIENT_ACCUMULATION_STEPS)
max_train_steps = NUM_TRAIN_EPOCHS * num_update_steps_per_epoch
# Re-derive the epoch count from the recomputed totals. Because max_train_steps
# was just computed as NUM_TRAIN_EPOCHS * num_update_steps_per_epoch, this
# evaluates back to the same NUM_TRAIN_EPOCHS (for a non-empty dataloader).
NUM_TRAIN_EPOCHS = math.ceil(max_train_steps / num_update_steps_per_epoch)

In [12]:
from tqdm import tqdm

EVAL_STEPS = 10    # log (and optionally evaluate) every EVAL_STEPS optimizer updates
DO_EVAL = False    # was `FALSE`, an undefined name that raises NameError on a fresh kernel


def run_evaluation(model, dataloader):
    """Compute the mean loss and perplexity of `model` over `dataloader`.

    The model is put into eval mode for the pass and restored to its previous
    train/eval mode afterwards, so training can continue unaffected.

    Returns:
        (eval_loss, eval_perplexity) as Python floats. Perplexity is ``inf`` when
        ``exp(eval_loss)`` overflows; both are ``nan`` for an empty dataloader.
    """
    was_training = model.training
    model.eval()
    losses = []
    for eval_batch in dataloader:
        with torch.no_grad():
            eval_outputs = model(**eval_batch)
        # Repeat the scalar batch loss once per sample so the gathered tensor
        # is weighted per example across processes.
        losses.append(accelerator.gather_for_metrics(eval_outputs.loss.repeat(BATCH_SIZE)))
    model.train(was_training)

    if not losses:
        return float("nan"), float("nan")

    eval_loss = torch.mean(torch.cat(losses)).item()
    try:
        eval_perplexity = math.exp(eval_loss)
    except OverflowError:
        eval_perplexity = float("inf")
    return eval_loss, eval_perplexity


# Effective number of samples contributing to one optimizer update.
total_batch_size = BATCH_SIZE * accelerator.num_processes * GRADIENT_ACCUMULATION_STEPS
progress_bar = tqdm(range(max_train_steps), disable=not accelerator.is_local_main_process)

completed_steps = 0

for epoch in range(NUM_TRAIN_EPOCHS):
    model.train()

    # Running sum of micro-batch losses for the current epoch.
    total_loss = 0.

    for step, batch in enumerate(train_dataloader):
        # Inside `accumulate`, Accelerate decides whether this micro-batch ends
        # an accumulation window; optimizer/scheduler calls are made every time.
        with accelerator.accumulate(model):
            outputs = model(**batch)
            loss = outputs.loss
            total_loss += loss.detach().float()

            accelerator.backward(loss)
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

        # Only count a step when the accelerator actually performed an optimizer update.
        if not accelerator.sync_gradients:
            continue

        progress_bar.update(1)
        completed_steps += 1

        if completed_steps % EVAL_STEPS != 0:
            continue

        # `step` is 0-based, so `step + 1` micro-batches have been summed so far.
        # It is computed before evaluation and evaluation uses its own loop
        # variables, so the training `step`/`batch` are never clobbered.
        train_loss = total_loss.item() / (step + 1)

        if DO_EVAL:
            accelerator.print("STARTING EVAL")
            eval_loss, eval_perplexity = run_evaluation(model, eval_dataloader)

            accelerator.print(f"""
                                completed_steps: {completed_steps}
                                train_loss: {train_loss}
                                eval_loss:  {eval_loss}
                                eval_ppl:   {eval_perplexity}"""
            )
        else:
            accelerator.print(f"completed_steps: {completed_steps} // train_loss: {train_loss}")

progress_bar.close()

  0%|                                                                                             | 0/616 [00:00<?, ?it/s]You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  2%|█▎                                                                                | 10/616 [01:34<1:36:40,  9.57s/it]

KeyboardInterrupt: 