In [1]:
import os

os.environ["CUDA_VISIBLE_DEVICES"]="1"

try:
    HF_TOKEN = os.environ["HF_TOKEN"]
except:
    raise ValueError("Set HF_TOKEN enviornment variable equal to your access token")

In [2]:
import torch
from datasets import load_dataset
from peft import LoraConfig, PeftModel, get_peft_model
from transformers import (
    AutoModelForCausalLM, 
    AutoTokenizer,
    BitsAndBytesConfig,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)

MODEL_ID = "meta-llama/Llama-2-7b-hf"
DTYPE = torch.bfloat16
BNB_DTYPE = "nf4"
LORA_R = 16
LORA_ALPHA = 32
LORA_BIAS = "none"

DATASET_TEXT_FIELD = "text"
OUTPUT_DIR="./training-runs-direct/"
BATCH_SIZE=1
GRADIENT_ACCUMULATION_STEPS = 16
LEARNING_RATE = 1.41e-5
WEIGHT_DECAY=0.0
NUM_TRAIN_EPOCHS=1
SEQUENCE_LENGTH=512

In [3]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN)
tokenizer.add_special_tokens({"pad_token":"<pad>"})
tokenizer.padding_side = 'left'

In [4]:
quantization_config = BitsAndBytesConfig(
    load_in_8bit=False,
    load_in_4bit=True,
    bnb_4bit_compute_type=DTYPE,
    bnb_4bit_quant_type=BNB_DTYPE,
    bnb_4bit_use_double_quant=True
)

peft_config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    bias=LORA_BIAS,
    task_type="CAUSAL_LM"
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=quantization_config,
    torch_dtype=DTYPE,
    token=HF_TOKEN
)

model = get_peft_model(model, peft_config)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [5]:
dataset = load_dataset("timdettmers/openassistant-guanaco")

Repo card metadata block was not found. Setting CardData to empty.


In [6]:
max_seq_len = min(tokenizer.model_max_length, SEQUENCE_LENGTH)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

def tokenize(element):
    outputs = tokenizer(
        element["text"],
        truncation=True,
        padding=False,
        max_length=max_seq_len,
        return_overflowing_tokens=False,
        return_length=False,
    )

    return {
        "input_ids": outputs["input_ids"],
        "attention_mask": outputs["attention_mask"]
    }

train_dataset = dataset["train"]
eval_dataset = dataset["test"]

tokenized_dataset_train = train_dataset.map(
    tokenize,
    batched=True,
    remove_columns=train_dataset.column_names,
    num_proc=16,
    batch_size=BATCH_SIZE,
)

tokenized_dataset_eval = eval_dataset.map(
    tokenize,
    batched=True,
    remove_columns=eval_dataset.column_names,
    num_proc=16,
    batch_size=BATCH_SIZE,
)

In [7]:
%rm -rf training-runs-direct
%mkdir training-runs-direct

In [8]:
from torch.utils.data import DataLoader, RandomSampler

train_dataloader = DataLoader(
    tokenized_dataset_train, shuffle=True, collate_fn=data_collator, batch_size=BATCH_SIZE
)

In [9]:
train_dataloader

<torch.utils.data.dataloader.DataLoader at 0x7fadf816b760>

In [10]:
no_decay = ["bias", "input_layernorm.weight", "post_attention_layernorm.weight"]

optimizer_grouped_parameters = [
    {
        "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
        "weight_decay": WEIGHT_DECAY,
    },
    {
        "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,
    },
]

In [11]:
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=LEARNING_RATE)

In [12]:
from transformers import get_linear_schedule_with_warmup

lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=(len(train_dataloader) * NUM_TRAIN_EPOCHS),
)

In [13]:
EVAL_STEPS = 100

for epoch in range(NUM_TRAIN_EPOCHS):
    total_loss = 0.

    model.train()

    for step, batch in enumerate(train_dataloader):
        outputs = model(**batch)
        loss = outputs.loss

        loss.backward()

        total_loss += loss.detach().float()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        if step % EVAL_STEPS == 0:
            print(f"Step {step} / {len(train_dataloader)} : {(total_loss / step).float() :0.2f}")

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step 0 / 9846 : inf
Step 100 / 9846 : 1.61
Step 200 / 9846 : 1.49
Step 300 / 9846 : 1.46
Step 400 / 9846 : 1.43
Step 500 / 9846 : 1.42
Step 600 / 9846 : 1.41
Step 700 / 9846 : 1.39
Step 800 / 9846 : 1.39
Step 900 / 9846 : 1.38
Step 1000 / 9846 : 1.38
Step 1100 / 9846 : 1.37
Step 1200 / 9846 : 1.36
Step 1300 / 9846 : 1.36
Step 1400 / 9846 : 1.36
Step 1500 / 9846 : 1.36
Step 1600 / 9846 : 1.36
Step 1700 / 9846 : 1.36
Step 1800 / 9846 : 1.35
Step 1900 / 9846 : 1.35
Step 2000 / 9846 : 1.34
Step 2100 / 9846 : 1.34
Step 2200 / 9846 : 1.34
Step 2300 / 9846 : 1.34
Step 2400 / 9846 : 1.34
Step 2500 / 9846 : 1.33
Step 2600 / 9846 : 1.33
Step 2700 / 9846 : 1.33
Step 2800 / 9846 : 1.33
Step 2900 / 9846 : 1.33
Step 3000 / 9846 : 1.33
Step 3100 / 9846 : 1.33
Step 3200 / 9846 : 1.33
Step 3300 / 9846 : 1.33
Step 3400 / 9846 : 1.32
Step 3500 / 9846 : 1.32
Step 3600 / 9846 : 1.32
Step 3700 / 9846 : 1.32
Step 3800 / 9846 : 1.32
Step 3900 / 9846 : 1.32
Step 4000 / 9846 : 1.32
Step 4100 / 9846 : 1.32
Step 

In [14]:
model.save_pretrained("./training-runs-direct")

In [15]:
import gc

del optimizer
del batch 
del outputs

gc.collect()


<function torch.cuda.memory.empty_cache() -> None>

In [16]:
torch.cuda.empty_cache()

In [42]:
model.eval()

prompts = [
    "### Human: What do you think about ChatGPT?### Assistant:",
]

model_inputs = tokenizer(prompts, return_tensors="pt", padding=True)

In [45]:
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): Linear4bit(
                in_features=4096, out_features=4096, bias=False
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
              (v_proj): Linear4bit(
             