In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Checkpoint that both the tokenizer and the model are loaded from.
model_id = "EleutherAI/gpt-neox-20b"

# Quantization settings handed to `from_pretrained`: 4-bit loading, "nf4"
# quant type, double quantization on, bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

tokenizer = AutoTokenizer.from_pretrained(model_id)

# NOTE(review): device_map={"": 0} presumably pins the whole model to device 0
# -- confirm against the Accelerate device_map documentation.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map={"": 0},
    quantization_config=bnb_config,
)

  from .autonotebook import tqdm as notebook_tqdm
Downloading (…)okenizer_config.json: 100%|██████████| 156/156 [00:00<00:00, 329kB/s]
Downloading (…)olve/main/vocab.json: 100%|██████████| 1.08M/1.08M [00:00<00:00, 16.1MB/s]
Downloading (…)olve/main/merges.txt: 100%|██████████| 457k/457k [00:00<00:00, 95.1MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 2.11M/2.11M [00:00<00:00, 95.0MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 90.0/90.0 [00:00<00:00, 680kB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 613/613 [00:00<00:00, 3.36MB/s]
Downloading (…)fetensors.index.json: 100%|██████████| 60.4k/60.4k [00:00<00:00, 196MB/s]
Downloading shards:   0%|          | 0/46 [00:00<?, ?it/s]
Downloading (…)of-00046.safetensors:   0%|          | 0.00/926M [00:00<?, ?B/s][A
Downloading (…)of-00046.safetensors:   1%|          | 10.5M/926M [00:00<00:21, 43.6MB/s][A
Downloading (…)of-00046.safetensors:   3%|▎         | 31.5M/926M [00:00<00:11, 79.7MB/s][A
Downloadi

In [2]:
from peft import prepare_model_for_kbit_training

# Enable gradient checkpointing on the loaded model before it is prepared for
# k-bit training (presumably to lower activation memory at the cost of extra
# compute -- see the Transformers docs to confirm).
model.gradient_checkpointing_enable()
# Rebind `model` to whatever PEFT's preparation helper returns. This cell is
# self-referential (`model = f(model)`), so it assumes `model` still holds the
# freshly loaded quantized model from the first cell.
model = prepare_model_for_kbit_training(model)

In [3]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.

    Iterates over ``model.named_parameters()``, summing ``numel()`` for every
    parameter and, separately, for the parameters whose ``requires_grad`` is
    true. Prints both totals together with the trainable share in percent.

    Args:
        model: Any object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs, where each parameter provides
            ``numel()`` and a ``requires_grad`` attribute.

    Returns:
        None. The summary is written to standard output.

    Notes:
        The totals are exactly what ``numel()`` reports.
        NOTE(review): for packed / quantized weights ``numel()`` may not equal
        the logical parameter count -- confirm before quoting the totals.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # A model without any parameters would make the division below raise
    # ZeroDivisionError; report 0.0% for that case instead.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}"
    )

In [4]:
from peft import LoraConfig, get_peft_model

# LoRA adapter settings: rank 8, alpha 32, 5% dropout, no bias terms, applied
# to the modules named "query_key_value" for a causal-LM task.
config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["query_key_value"],
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Rebind `model` to the PEFT-wrapped model, then report how many of its
# parameters are left trainable.
model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 8650752 || all params: 10597552128 || trainable%: 0.08162971878329976


In [5]:
from datasets import load_dataset

def _tokenize_quotes(samples):
    """Run the notebook's tokenizer over the "quote" column of a batch."""
    return tokenizer(samples["quote"])


# Load the quotes dataset, then tokenize it in batches.
data = load_dataset("Abirate/english_quotes")
data = data.map(_tokenize_quotes, batched=True)

Downloading readme: 100%|██████████| 5.55k/5.55k [00:00<00:00, 5.69MB/s]
Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]
Downloading data:   0%|          | 0.00/647k [00:00<?, ?B/s][A
Downloading data: 100%|██████████| 647k/647k [00:00<00:00, 2.57MB/s][A
Downloading data files: 100%|██████████| 1/1 [00:00<00:00,  3.83it/s]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 976.78it/s]
Generating train split: 2508 examples [00:00, 6432.05 examples/s]
Map: 100%|██████████| 2508/2508 [00:00<00:00, 2557.17 examples/s]


In [6]:
import transformers

# The gpt-neo-x tokenizer needs a pad token assigned; reuse its EOS token.
tokenizer.pad_token = tokenizer.eos_token

# Short demonstration run: 10 optimizer steps, each accumulated over
# 4 micro-batches of size 1, logging the loss at every step.
training_args = transformers.TrainingArguments(
    output_dir="outputs",
    max_steps=10,
    warmup_steps=2,
    learning_rate=2e-4,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    fp16=True,
    optim="paged_adamw_8bit",
    logging_steps=1,
)

# mlm=False selects the non-masked (causal) language-modeling collation.
causal_lm_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=data["train"],
    data_collator=causal_lm_collator,
)

# Disabling the cache silences the warnings during training.
# Please re-enable it for inference!
model.config.use_cache = False
trainer.train()

You're using a GPTNeoXTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
1,2.1312
2,2.4357
3,2.5255
4,3.2576
5,2.5079
6,1.5224
7,2.1849
8,2.5835
9,1.5958
10,1.8796


TrainOutput(global_step=10, training_loss=2.2624030709266663, metrics={'train_runtime': 182.4787, 'train_samples_per_second': 0.219, 'train_steps_per_second': 0.055, 'total_flos': 100490233282560.0, 'train_loss': 2.2624030709266663, 'epoch': 0.02})