In [1]:
model_names = ['mosaicml/mpt-7b-8k-instruct', 'EleutherAI/gpt-neox-20b', 'openlm-research/open_llama_13b', 'meta-llama/Llama-2-13b-hf']

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, BitsAndBytesConfig

model_name = model_names[1]

config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
#config.attn_config['attn_impl'] = 'triton'  # change this to use triton-based FlashAttention
config.init_device = 'cuda:0' # For fast initialization directly on GPU!

print (model_name)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
) 
# ~4GB for MPT-7b
# ~14GB for gpt-neox-20b

#bnb_config = None 
# ~13 GB for MPT-7b
# OOM for gpt-neox-20b

# What happens if I pass bfloat16 and Bitsandbytes config to the model?
model = AutoModelForCausalLM.from_pretrained(
  model_name,
  config=config,
  torch_dtype=torch.bfloat16, # Load model weights in bfloat16
  trust_remote_code=True,
  quantization_config=bnb_config
)

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

EleutherAI/gpt-neox-20b


Loading checkpoint shards:   0%|          | 0/46 [00:00<?, ?it/s]

## Inference

In [3]:
with torch.autocast('cuda', dtype=torch.bfloat16):
    inputs = tokenizer('Here is a 10 day iternary to Machu Pichu:\n', return_tensors="pt").to('cuda')
    outputs = model.generate(**inputs, max_new_tokens=400)
    print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Here is a 10 day iternary to Machu Pichu:

Day 1:

Arrive in Cusco.

Day 2:

Arrive in Cusco.

Day 3:

Cusco to Machu Pichu.

Day 4:

Machu Pichu.

Day 5:

Machu Pichu.

Day 6:

Machu Pichu.

Day 7:

Machu Pichu.

Day 8:

Machu Pichu.

Day 9:

Machu Pichu.

Day 10:

Machu Pichu.

Day 11:

Machu Pichu.

Day 12:

Machu Pichu.

Day 13:

Machu Pichu.

Day 14:

Machu Pichu.

Day 15:

Machu Pichu.

Day 16:

Machu Pichu.

Day 17:

Machu Pichu.

Day 18:

Machu Pichu.

Day 19:

Machu Pichu.

Day 20:

Machu Pichu.

Day 21:

Machu Pichu.

Day 22:

Machu Pichu.

Day 23:

Machu Pichu.

Day 24:

Machu Pichu.

Day 25:

Machu Pichu.

Day 26:

Machu Pichu.

Day 27:

Machu Pichu.

Day 28:

Machu Pichu.

Day 29:



## Training

#### QLORA

In [4]:
from peft import prepare_model_for_kbit_training

model = prepare_model_for_kbit_training(model)

In [5]:
model.modules

<bound method Module.modules of GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50432, 6144)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-43): 44 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((6144,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((6144,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear4bit(in_features=6144, out_features=18432, bias=True)
          (dense): Linear4bit(in_features=6144, out_features=6144, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear4bit(in_features=6144, out_features=24576, bias=True)
          (dense_4h_to_h): Linear4bit(

#### PEFT 

In [6]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=8, 
    lora_alpha=32, 
    lora_dropout=0.05, 
    bias="none", 
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, config)
model.print_trainable_parameters()

trainable params: 8,650,752 || all params: 10,597,552,128 || trainable%: 0.08162971878329976


### Dataset

In [7]:
from datasets import load_dataset

data = load_dataset("Abirate/english_quotes")
data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True)

Found cached dataset json (/home/siddhesh1793/.cache/huggingface/datasets/Abirate___json/Abirate--english_quotes-6e72855d06356857/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached processed dataset at /home/siddhesh1793/.cache/huggingface/datasets/Abirate___json/Abirate--english_quotes-6e72855d06356857/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-8d82074056c19c4a.arrow


### Trainer

In [8]:
import transformers

# needed for gpt-neo-x tokenizer
tokenizer.pad_token = tokenizer.eos_token

trainer = transformers.Trainer(
    model=model,
    train_dataset=data["train"],
    args=transformers.TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_steps=2,
        max_steps=50,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=1,
        output_dir=f"outputs/{model_name}",
        optim="paged_adamw_8bit",
        save_steps=2,
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

  0%|          | 0/50 [00:00<?, ?it/s]

You're using a GPTNeoXTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 2.1369, 'learning_rate': 0.0001, 'epoch': 0.0}
{'loss': 2.4352, 'learning_rate': 0.0002, 'epoch': 0.0}




{'loss': 2.4952, 'learning_rate': 0.00019583333333333334, 'epoch': 0.0}
{'loss': 3.1632, 'learning_rate': 0.00019166666666666667, 'epoch': 0.01}




{'loss': 2.4205, 'learning_rate': 0.0001875, 'epoch': 0.01}
{'loss': 1.5876, 'learning_rate': 0.00018333333333333334, 'epoch': 0.01}




{'loss': 2.1386, 'learning_rate': 0.0001791666666666667, 'epoch': 0.01}
{'loss': 2.6409, 'learning_rate': 0.000175, 'epoch': 0.01}




{'loss': 1.6992, 'learning_rate': 0.00017083333333333333, 'epoch': 0.01}
{'loss': 1.9618, 'learning_rate': 0.0001666666666666667, 'epoch': 0.02}




{'loss': 1.7577, 'learning_rate': 0.00016250000000000002, 'epoch': 0.02}
{'loss': 0.8182, 'learning_rate': 0.00015833333333333332, 'epoch': 0.02}




{'loss': 1.8487, 'learning_rate': 0.00015416666666666668, 'epoch': 0.02}
{'loss': 1.7609, 'learning_rate': 0.00015000000000000001, 'epoch': 0.02}




{'loss': 2.2098, 'learning_rate': 0.00014583333333333335, 'epoch': 0.02}
{'loss': 2.4736, 'learning_rate': 0.00014166666666666668, 'epoch': 0.03}




{'loss': 2.5606, 'learning_rate': 0.0001375, 'epoch': 0.03}
{'loss': 2.4593, 'learning_rate': 0.00013333333333333334, 'epoch': 0.03}




{'loss': 2.324, 'learning_rate': 0.00012916666666666667, 'epoch': 0.03}
{'loss': 2.2145, 'learning_rate': 0.000125, 'epoch': 0.03}




{'loss': 1.9277, 'learning_rate': 0.00012083333333333333, 'epoch': 0.03}
{'loss': 1.5122, 'learning_rate': 0.00011666666666666668, 'epoch': 0.04}




{'loss': 2.3103, 'learning_rate': 0.00011250000000000001, 'epoch': 0.04}
{'loss': 2.1635, 'learning_rate': 0.00010833333333333333, 'epoch': 0.04}




{'loss': 1.9069, 'learning_rate': 0.00010416666666666667, 'epoch': 0.04}
{'loss': 2.2187, 'learning_rate': 0.0001, 'epoch': 0.04}




{'loss': 1.81, 'learning_rate': 9.583333333333334e-05, 'epoch': 0.04}
{'loss': 2.7756, 'learning_rate': 9.166666666666667e-05, 'epoch': 0.04}




{'loss': 2.6671, 'learning_rate': 8.75e-05, 'epoch': 0.05}
{'loss': 2.2795, 'learning_rate': 8.333333333333334e-05, 'epoch': 0.05}




{'loss': 2.8735, 'learning_rate': 7.916666666666666e-05, 'epoch': 0.05}
{'loss': 1.739, 'learning_rate': 7.500000000000001e-05, 'epoch': 0.05}




{'loss': 2.559, 'learning_rate': 7.083333333333334e-05, 'epoch': 0.05}
{'loss': 1.4029, 'learning_rate': 6.666666666666667e-05, 'epoch': 0.05}




{'loss': 2.6741, 'learning_rate': 6.25e-05, 'epoch': 0.06}
{'loss': 3.0124, 'learning_rate': 5.833333333333334e-05, 'epoch': 0.06}




{'loss': 1.4609, 'learning_rate': 5.4166666666666664e-05, 'epoch': 0.06}
{'loss': 0.9397, 'learning_rate': 5e-05, 'epoch': 0.06}




{'loss': 2.8687, 'learning_rate': 4.5833333333333334e-05, 'epoch': 0.06}
{'loss': 1.4135, 'learning_rate': 4.166666666666667e-05, 'epoch': 0.06}




{'loss': 2.3522, 'learning_rate': 3.7500000000000003e-05, 'epoch': 0.07}
{'loss': 2.4627, 'learning_rate': 3.3333333333333335e-05, 'epoch': 0.07}




{'loss': 1.7225, 'learning_rate': 2.916666666666667e-05, 'epoch': 0.07}
{'loss': 1.2857, 'learning_rate': 2.5e-05, 'epoch': 0.07}




{'loss': 2.2648, 'learning_rate': 2.0833333333333336e-05, 'epoch': 0.07}
{'loss': 1.9939, 'learning_rate': 1.6666666666666667e-05, 'epoch': 0.07}




{'loss': 2.5065, 'learning_rate': 1.25e-05, 'epoch': 0.07}
{'loss': 2.0672, 'learning_rate': 8.333333333333334e-06, 'epoch': 0.08}




{'loss': 2.0336, 'learning_rate': 4.166666666666667e-06, 'epoch': 0.08}
{'loss': 1.771, 'learning_rate': 0.0, 'epoch': 0.08}
{'train_runtime': 269.0258, 'train_samples_per_second': 0.743, 'train_steps_per_second': 0.186, 'train_loss': 2.1216299855709075, 'epoch': 0.08}


TrainOutput(global_step=50, training_loss=2.1216299855709075, metrics={'train_runtime': 269.0258, 'train_samples_per_second': 0.743, 'train_steps_per_second': 0.186, 'train_loss': 2.1216299855709075, 'epoch': 0.08})