In [1]:
model_names = ['mosaicml/mpt-7b-8k-instruct', 'EleutherAI/gpt-neox-20b']

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, BitsAndBytesConfig

model_name = model_names[1]

config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
#config.attn_config['attn_impl'] = 'triton'  # change this to use triton-based FlashAttention
config.init_device = 'cuda:0' # For fast initialization directly on GPU!

print (model_name)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
) 
# ~4GB for MPT-7b
# ~14GB for gpt-neox-20b

#bnb_config = None 
# ~13 GB for MPT-7b
# OOM for gpt-neox-20b

# What happens if I pass bfloat16 and Bitsandbytes config to the model?
model = AutoModelForCausalLM.from_pretrained(
  model_name,
  config=config,
  torch_dtype=torch.bfloat16, # Load model weights in bfloat16
  trust_remote_code=True,
  quantization_config=bnb_config
)

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

EleutherAI/gpt-neox-20b


Loading checkpoint shards:   0%|          | 0/46 [00:00<?, ?it/s]

## Inference

In [3]:
from transformers import pipeline

with torch.autocast('cuda', dtype=torch.bfloat16):
    inputs = tokenizer('Here is a 10 day iternary to Machu Pichu:\n', return_tensors="pt").to('cuda')
    outputs = model.generate(**inputs, max_new_tokens=400)
    print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Here is a 10 day iternary to Machu Pichu:

1.  From Cusco, take a bus to Aguas Calientes.

2.  From Aguas Calientes, take a bus to Machu Pichu.

3.  From Machu Pichu, take a bus to Cusco.

4.  From Cusco, take a bus to Aguas Calientes.

5.  From Aguas Calientes, take a bus to Machu Pichu.

6.  From Machu Pichu, take a bus to Cusco.

7.  From Cusco, take a bus to Aguas Calientes.

8.  From Aguas Calientes, take a bus to Machu Pichu.

9.  From Machu Pichu, take a bus to Cusco.

10.  From Cusco, take a bus to Aguas Calientes.

The bus from Machu Pichu to Cusco is a very long and winding road.  The bus from Cusco to Aguas Calientes is a very long and winding road.  The bus from Aguas Calientes to Machu Pichu is a very long and winding road.  The bus from Machu Pichu to Cusco is a very long and winding road.  The bus from Cusco to Aguas Calientes is a very long and winding road.  The bus from Aguas Calientes to Machu Pichu is a very long and winding road.  The bus from Machu Pichu to Cusco 

## Training

#### QLORA

In [4]:
from peft import prepare_model_for_kbit_training

model = prepare_model_for_kbit_training(model)

In [5]:
model.modules

<bound method Module.modules of GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50432, 6144)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-43): 44 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((6144,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((6144,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear4bit(in_features=6144, out_features=18432, bias=True)
          (dense): Linear4bit(in_features=6144, out_features=6144, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear4bit(in_features=6144, out_features=24576, bias=True)
          (dense_4h_to_h): Linear4bit(

#### PEFT 

In [6]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=8, 
    lora_alpha=32, 
    target_modules=["query_key_value"],
    lora_dropout=0.05, 
    bias="none", 
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, config)
model.print_trainable_parameters()

trainable params: 8,650,752 || all params: 10,597,552,128 || trainable%: 0.08162971878329976


### Dataset

In [7]:
from datasets import load_dataset

data = load_dataset("Abirate/english_quotes")
data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True)

Found cached dataset json (/home/siddhesh1793/.cache/huggingface/datasets/Abirate___json/Abirate--english_quotes-6e72855d06356857/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached processed dataset at /home/siddhesh1793/.cache/huggingface/datasets/Abirate___json/Abirate--english_quotes-6e72855d06356857/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-8d82074056c19c4a.arrow


### Trainer

In [8]:
import transformers

# needed for gpt-neo-x tokenizer
tokenizer.pad_token = tokenizer.eos_token

trainer = transformers.Trainer(
    model=model,
    train_dataset=data["train"],
    args=transformers.TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_steps=2,
        max_steps=10,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=1,
        output_dir="outputs",
        optim="paged_adamw_8bit"
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

  0%|          | 0/10 [00:00<?, ?it/s]

You're using a GPTNeoXTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 2.1369, 'learning_rate': 0.0001, 'epoch': 0.0}
{'loss': 2.4352, 'learning_rate': 0.0002, 'epoch': 0.0}
{'loss': 2.4872, 'learning_rate': 0.000175, 'epoch': 0.0}
{'loss': 3.1341, 'learning_rate': 0.00015000000000000001, 'epoch': 0.01}
{'loss': 2.4746, 'learning_rate': 0.000125, 'epoch': 0.01}
{'loss': 1.5631, 'learning_rate': 0.0001, 'epoch': 0.01}
{'loss': 2.1726, 'learning_rate': 7.500000000000001e-05, 'epoch': 0.01}
{'loss': 2.5679, 'learning_rate': 5e-05, 'epoch': 0.01}
{'loss': 1.6739, 'learning_rate': 2.5e-05, 'epoch': 0.01}
{'loss': 1.9144, 'learning_rate': 0.0, 'epoch': 0.02}
{'train_runtime': 42.2753, 'train_samples_per_second': 0.946, 'train_steps_per_second': 0.237, 'train_loss': 2.255991780757904, 'epoch': 0.02}


TrainOutput(global_step=10, training_loss=2.255991780757904, metrics={'train_runtime': 42.2753, 'train_samples_per_second': 0.946, 'train_steps_per_second': 0.237, 'train_loss': 2.255991780757904, 'epoch': 0.02})