In [1]:
import torch, os, multiprocessing
from datasets import load_dataset
from peft import LoraConfig, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    set_seed
)
from trl import SFTTrainer, SFTConfig

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
!huggingface-cli login --token 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

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [3]:
set_seed(1234)

In [6]:
if torch.cuda.is_bf16_supported():
  os.system('pip install flash_attn')
  compute_dtype = torch.bfloat16
  attn_implementation = 'flash_attention_2'
else:
  compute_dtype = torch.float32
  attn_implementation = 'sdpa'

In [7]:
model_name = "meta-llama/Meta-Llama-3.1-8B"

In [8]:
ds = load_dataset("tatsu-lab/alpaca")

Downloading readme: 100%|██████████| 7.47k/7.47k [00:00<00:00, 27.4MB/s]


Downloading and preparing dataset parquet/tatsu-lab--alpaca to /root/.cache/huggingface/datasets/tatsu-lab___parquet/tatsu-lab--alpaca-2b32f0433506ef5f/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data: 100%|██████████| 24.2M/24.2M [00:00<00:00, 76.8MB/s]
Downloading data files: 100%|██████████| 1/1 [00:02<00:00,  2.61s/it]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 1170.29it/s]
                                                                    

Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/tatsu-lab___parquet/tatsu-lab--alpaca-2b32f0433506ef5f/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.


100%|██████████| 1/1 [00:00<00:00, 254.56it/s]


In [12]:
for d in ds["train"]:
    print(d)
    break

{'instruction': 'Give three tips for staying healthy.', 'input': '', 'output': '1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule.', 'text': 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nGive three tips for staying healthy.\n\n### Response:\n1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule.'}


In [16]:
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
tokenizer.pad_token = "<|finetune_right_pad_id|>"
tokenizer.pad_token_id = 128004
tokenizer.padding_side = 'right'

In [8]:
def process(row):
    row["text"] = row["text"]+"<|end_of_text|>"
    return row
ds = ds.map(
    process,
    num_proc= multiprocessing.cpu_count(),
    load_from_cache_file=False,
)

                                                                                   

In [9]:
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_storage=compute_dtype,
)

In [11]:
model = AutoModelForCausalLM.from_pretrained(
          model_name, quantization_config=bnb_config, torch_dtype=compute_dtype
)

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards: 100%|██████████| 4/4 [00:08<00:00,  2.04s/it]


In [17]:
for name, param in model.named_parameters():
    # freeze base model's layers
    param.requires_grad = False

In [18]:
def make_inputs_require_grad(module, input, output):
    output.requires_grad_(True)
model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)

<torch.utils.hooks.RemovableHandle at 0x7fd32a729cd0>

In [19]:
model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={'use_reentrant':True})

In [20]:
peft_config = LoraConfig(
        lora_alpha=16,
        lora_dropout=0.05,
        r=16,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules= ['k_proj', 'q_proj', 'v_proj', 'o_proj', "gate_proj", "down_proj", "up_proj"]
)

In [21]:
output_dir = "./Llama3.1_8b_QLoRA/"

In [30]:
training_arguments = SFTConfig(
        output_dir=output_dir ,
        eval_strategy="steps",
        evaluation_strategy="no",
        do_eval=False,
        optim="adamw_torch",
        per_device_train_batch_size=1,
        gradient_accumulation_steps=16,
        per_device_eval_batch_size=1,
        log_level="debug",
        logging_steps=1,
        learning_rate=1e-4,
        bf16 = False,
        max_steps=50,
        warmup_ratio=0.1,
        lr_scheduler_type="linear",
        dataset_text_field="text",
        max_seq_length=512
        # save_strategy="steps",
        # save_steps=5,  # Save every 5 epochs
        # save_total_limit=3
)
trainer = SFTTrainer(
        model=model,
        train_dataset=ds['train'],
        peft_config=peft_config,
        tokenizer=tokenizer,
        args=training_arguments,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
PyTorch: setting up devices
Loading cached processed dataset at /root/.cache/huggingface/datasets/tatsu-lab___parquet/tatsu-lab--alpaca-2b32f0433506ef5f/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7/cache-7941d3e8084d1433.arrow
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
max_steps is given, it will override any value given in num_train_epochs


In [31]:
trainer.train()
trainer.save_model(output_dir)

Currently training with a batch size of: 2
***** Running training *****
  Num examples = 52,002
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Training with DataParallel so batch size has been adjusted to: 2
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 16
  Total optimization steps = 50
  Number of trainable parameters = 41,943,040
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss
1,13.8125
2,13.7578
3,13.1797
4,12.0312
5,10.3906
6,9.5469
7,9.0938
8,8.9297
9,8.4141
10,8.1562


Saving model checkpoint to ./Llama3.1_8b_QLoRA/checkpoint-50
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3.1-8B/snapshots/48d6d0fc4e02fb1269b36940650a1b7233035cbb/config.json
Model config LlamaConfig {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": 128001,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "factor": 8.0,
    "high_freq_factor": 4.0,
    "low_freq_factor": 1.0,
    "original_max_position_embeddings": 8192,
    "rope_type": "llama3"
  },
  "rope_theta": 500000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfl

In [5]:
# model = BertForMaskedLM.from_pretrained('/path/to/pytorch_model.bin',config='../config.json', local_files_only=True)
model = AutoModelForCausalLM.from_pretrained('/root/data/rrr/usr/finetune_llama/Llama3.1_8b_QLoRA/checkpoint-50/')

Loading checkpoint shards:  75%|███████▌  | 3/4 [00:09<00:03,  3.13s/it]

: 

In [41]:
model = model.to('cpu')

In [42]:
def evaluate_prompts(model, tokenizer, prompts, device):
    model.eval()
    results = []
    for prompt in prompts:
        inputs = tokenizer(prompt, return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = model.generate(**inputs, max_new_tokens=50)
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        results.append({"prompt": prompt, "response": response})
    return results

evaluation_prompts = [
    "Summarize the main ideas of democracy.",
    "Write a haiku about spring.",
    "Explain the concept of gravity to a 5-year-old."
]
prompt_results = evaluate_prompts(model, tokenizer, evaluation_prompts, 'cpu')

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


RuntimeError: "addmm_impl_cpu_" not implemented for 'Half'