In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
!nvidia-smi

Mon Apr 28 16:33:53 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 555.58.02              Driver Version: 555.58.02      CUDA Version: 12.5     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-80GB          On  |   00000000:01:00.0 Off |                    0 |
| N/A   24C    P0             60W /  500W |     138MiB /  81920MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

In [3]:
!pip install --upgrade transformers

Collecting transformers
  Downloading transformers-4.51.3-py3-none-any.whl (10.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.4/10.4 MB[0m [31m84.6 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
Collecting safetensors>=0.4.3
  Downloading safetensors-0.5.3-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (471 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 KB[0m [31m124.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.30.0
  Downloading huggingface_hub-0.30.2-py3-none-any.whl (481 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m481.4/481.4 KB[0m [31m117.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.22,>=0.21
  Downloading tokenizers-0.21.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m156.6 MB/s[0m eta [36m0:00:00[0m
Collecting numpy>=1.17
  Downloading numpy-2.2.5-cp3

In [4]:
!pip install torch tensorboard protobuf peft sentencepiece datasets bitsandbytes

Collecting torch
  Downloading torch-2.7.0-cp310-cp310-manylinux_2_28_x86_64.whl (865.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m865.2/865.2 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting tensorboard
  Downloading tensorboard-2.19.0-py3-none-any.whl (5.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m127.2 MB/s[0m eta [36m0:00:00[0m00:01[0m
Collecting protobuf
  Downloading protobuf-6.30.2-cp39-abi3-manylinux2014_x86_64.whl (316 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.2/316.2 KB[0m [31m106.4 MB/s[0m eta [36m0:00:00[0m
Collecting peft
  Downloading peft-0.15.2-py3-none-any.whl (411 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m411.1/411.1 KB[0m [31m129.7 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece
  Downloading sentencepiece-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━

In [5]:
!git lfs install

Updated git hooks.
Git LFS initialized.


In [6]:
TOKEN = "" # HF token
MODEL_NAME = "laco-layerskip-llama2-7b"
MODEL_SAVE_DIR = "./laco-layerskip-llama2-7b-finetuned"
DEVICE = "cuda:0"
SEED = 123

In [7]:
import torch
import math

from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling

from datasets import load_dataset
from peft import LoraConfig, get_peft_model

from huggingface_hub import login, snapshot_download

login(token=TOKEN)

In [5]:
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.bfloat16, 
    trust_remote_code=True,
    token=TOKEN
)

tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME, 
    token=TOKEN,
    use_fast=True
)

model = model.to(DEVICE)

In [6]:
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.eos_token_id

### Fine-tuning

In [None]:
wikipedia_dataset = load_dataset("wikimedia/wikipedia", "20231101.en", split="train")
train_size = 7000
val_size = 1500
wikipedia_dataset = wikipedia_dataset.shuffle(seed=SEED)
wikipedia_dataset_subset = wikipedia_dataset.select(range(train_size))

def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, max_length=2048)

tokenized_wiki = wikipedia_dataset_subset.map(
    tokenize_function,
    batched=True,
    num_proc=4,
    remove_columns=["text"],
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

In [None]:
val_dataset = wikipedia_dataset.select(range(train_size, train_size + val_size))
tokenized_val = val_dataset.map(
    tokenize_function,
    batched=True,
    num_proc=4,
    remove_columns=["text"],
)

val_dataset_ready = tokenized_val

In [15]:
total_train_tokens = sum(len(example["input_ids"]) for example in tokenized_wiki)
print(f"Total train tokens: {total_train_tokens}")

total_val_tokens = sum(len(example["input_ids"]) for example in val_dataset_ready)
print(f"Total val tokens: {total_val_tokens}")

Total train tokens: 4472103
Total val tokens: 976274


In [16]:
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    bias="none",
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, lora_config)

In [17]:
training_args = TrainingArguments(
    output_dir=MODEL_SAVE_DIR,
    per_device_train_batch_size=2, 
    gradient_accumulation_steps=2,  
    max_steps=3600,
    logging_steps=50,
    save_steps=500,
    eval_strategy="steps",
    eval_steps=250,
    save_total_limit=1,
    bf16=True,  
    optim="paged_adamw_32bit", 
    report_to="tensorboard",
    logging_dir="/workspace/tensorboard_laco_layerskip_llama",
    group_by_length=True,
    learning_rate=2e-4,
    warmup_steps=100,
    weight_decay=0.0,
    lr_scheduler_type="cosine",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_wiki,
    eval_dataset=val_dataset_ready,
    data_collator=data_collator,
)

trainer.train()

# training_args = TrainingArguments(
#     output_dir="./llama2-7b-finetuned",
#     per_device_train_batch_size=2, 
#     gradient_accumulation_steps=8,   
#     num_train_epochs=2,
#     logging_steps=10,
#     save_steps=100,
#     save_total_limit=2,
#     eval_strategy="no",
#     bf16=True,
#     optim="adamw_torch",
#     learning_rate=2e-4,
#     weight_decay=0.01,
#     max_grad_norm=1.0,
#     lr_scheduler_type="cosine",
#     warmup_ratio=0.05,
#     ddp_find_unused_parameters=False,
#     report_to="none",
# )

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss,Validation Loss
250,1.9698,1.935256
500,1.9029,1.905965
750,1.8624,1.890535
1000,1.8729,1.87516
1250,1.8404,1.865805
1500,1.8621,1.856319
1750,1.8891,1.850141
2000,1.8351,1.849035
2250,1.8052,1.845148
2500,1.8297,1.840263


TrainOutput(global_step=3600, training_loss=1.8541072252061632, metrics={'train_runtime': 1835.413, 'train_samples_per_second': 7.846, 'train_steps_per_second': 1.961, 'total_flos': 2.1034469407408128e+17, 'train_loss': 1.8541072252061632, 'epoch': 2.057142857142857})

In [18]:
trainer.save_model(MODEL_SAVE_DIR)
tokenizer.save_pretrained(MODEL_SAVE_DIR)

('./laco-layerskip-llama2-7b-finetuned/tokenizer_config.json',
 './laco-layerskip-llama2-7b-finetuned/special_tokens_map.json',
 './laco-layerskip-llama2-7b-finetuned/tokenizer.model',
 './laco-layerskip-llama2-7b-finetuned/added_tokens.json',
 './laco-layerskip-llama2-7b-finetuned/tokenizer.json')

In [19]:
metrics = trainer.evaluate(eval_dataset=val_dataset_ready)
eval_loss = metrics["eval_loss"]
eval_perplexity = math.exp(eval_loss)

print(f"Validation Loss: {eval_loss:.4f}")
print(f"Validation Perplexity: {eval_perplexity:.2f}")

Validation Loss: 1.8362
Validation Perplexity: 6.27


In [None]:
encoded_inputs = tokenizer(['Meta vs google, the winner is'], return_tensors='pt').to(DEVICE)

output = model.generate(
    **encoded_inputs,
    max_new_tokens=32,
    do_sample=False,     
    temperature=0     
)

output_text = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
print(output_text)

In [None]:
model.push_to_hub("sidsr/laco-llama2-7b-finetuned")
tokenizer.push_to_hub("sidsr/laco-llama2-7b-finetuned")

In [8]:
!pip install lm-eval

Collecting lm-eval
  Downloading lm_eval-0.4.8-py3-none-any.whl (3.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.9/3.9 MB[0m [31m29.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 KB[0m [31m47.2 MB/s[0m eta [36m0:00:00[0m
Collecting numexpr
  Downloading numexpr-2.10.2-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (397 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m397.3/397.3 KB[0m [31m120.8 MB/s[0m eta [36m0:00:00[0m
Collecting pytablewriter
  Downloading pytablewriter-1.2.1-py3-none-any.whl (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.1/91.1 KB[0m [31m54.4 MB/s[0m eta [36m0:00:00[0m
Collecting sacrebleu>=1.5.0
  Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/

In [None]:
snapshot_download("sidsr/laco-layerskip-llama2-7b", local_dir="/workspace/layerskip-modeling/laco-layerskip-llama2-7b")
snapshot_download("sidsr/laco-layerskip-llama2-7b-finetuned", local_dir="/workspace/layerskip-modeling/laco-layerskip-llama2-7b-finetuned")

In [24]:
!ls "/workspace/layerskip-modeling/eval/"

laco-layerskip-llama2-7b	    laco-llama2-7b
laco-layerskip-llama2-7b-finetuned  laco-llama2-7b-finetuned


In [8]:
!mkdir -p "/workspace/layerskip-modeling/eval/laco-llama2-7b"

In [18]:
!ls -al "/workspace/layerskip-modeling/eval/laco-llama2-7b/.__laco-llama2-7b-finetuned"

total 100
drwxrwsr-x+ 2 root user     61 Apr 28 16:53 .
drwxrwsr-x+ 4 root user     78 Apr 28 16:53 ..
-rw-rw-r--+ 1 root user 100578 Apr 28 16:53 results_2025-04-28T16-53-40.913934.json


In [25]:
!lm_eval \
    --model hf \
    --model_args "pretrained=./laco-layerskip-llama2-7b" \
    --tasks mmlu \
    --output_path "/workspace/layerskip-modeling/eval/laco-layerskip-llama2-7b"

2025-04-28:17:52:25,522 INFO     [lm_eval.__main__:379] Selected Tasks: ['mmlu']
2025-04-28:17:52:25,524 INFO     [lm_eval.evaluator:169] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
2025-04-28:17:52:25,524 INFO     [lm_eval.evaluator:206] Initializing hf model, with arguments: {'pretrained': './laco-layerskip-llama2-7b'}
2025-04-28:17:52:25,865 INFO     [lm_eval.models.huggingface:136] Using device 'cuda'
2025-04-28:17:52:25,936 INFO     [lm_eval.models.huggingface:376] Model parallel was set to False, max memory was not set, and device map was set to {'': 'cuda'}
2025-04-28:17:53:15,174 INFO     [lm_eval.api.task:420] Building contexts for mmlu_abstract_algebra on rank 0...
100%|████████████████████████████████████████| 100/100 [00:00<00:00, 650.55it/s]
2025-04-28:17:53:15,332 INFO     [lm_eval.api.task:420] Building contexts for mmlu_anatomy on rank 0...
100%|████████████████████████████████████████| 

In [22]:
!lm_eval \
    --model hf \
    --model_args "pretrained=./laco-llama2-7b" \
    --tasks mmlu \
    --output_path "/workspace/layerskip-modeling/eval/laco-llama2-7b"

2025-04-28:17:41:52,813 INFO     [lm_eval.__main__:379] Selected Tasks: ['mmlu']
2025-04-28:17:41:52,815 INFO     [lm_eval.evaluator:169] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
2025-04-28:17:41:52,816 INFO     [lm_eval.evaluator:206] Initializing hf model, with arguments: {'pretrained': './laco-llama2-7b'}
2025-04-28:17:41:53,133 INFO     [lm_eval.models.huggingface:136] Using device 'cuda'
2025-04-28:17:41:53,231 INFO     [lm_eval.models.huggingface:376] Model parallel was set to False, max memory was not set, and device map was set to {'': 'cuda'}
2025-04-28:17:42:47,577 INFO     [lm_eval.api.task:420] Building contexts for mmlu_abstract_algebra on rank 0...
100%|████████████████████████████████████████| 100/100 [00:00<00:00, 653.69it/s]
2025-04-28:17:42:47,735 INFO     [lm_eval.api.task:420] Building contexts for mmlu_anatomy on rank 0...
100%|████████████████████████████████████████| 135/135 [0

In [26]:
!lm_eval \
    --model hf \
    --model_args "pretrained=./laco-layerskip-llama2-7b,peft=./laco-layerskip-llama2-7b-finetuned" \
    --tasks mmlu \
    --output_path "/workspace/layerskip-modeling/eval/laco-layerskip-llama2-7b-finetuned"

2025-04-28:17:58:18,751 INFO     [lm_eval.__main__:379] Selected Tasks: ['mmlu']
2025-04-28:17:58:18,753 INFO     [lm_eval.evaluator:169] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
2025-04-28:17:58:18,753 INFO     [lm_eval.evaluator:206] Initializing hf model, with arguments: {'pretrained': './laco-layerskip-llama2-7b', 'peft': './laco-layerskip-llama2-7b-finetuned'}
2025-04-28:17:58:19,075 INFO     [lm_eval.models.huggingface:136] Using device 'cuda'
2025-04-28:17:58:19,143 INFO     [lm_eval.models.huggingface:376] Model parallel was set to False, max memory was not set, and device map was set to {'': 'cuda'}
2025-04-28:17:59:09,758 INFO     [lm_eval.api.task:420] Building contexts for mmlu_abstract_algebra on rank 0...
100%|████████████████████████████████████████| 100/100 [00:00<00:00, 660.52it/s]
2025-04-28:17:59:09,915 INFO     [lm_eval.api.task:420] Building contexts for mmlu_anatomy on rank 0...

In [21]:
!lm_eval \
    --model hf \
    --model_args "pretrained=./laco-llama2-7b,peft=./laco-llama2-7b-finetuned" \
    --tasks mmlu \
    --output_path "/workspace/layerskip-modeling/eval/laco-llama2-7b-finetuned"

2025-04-28:17:33:23,628 INFO     [lm_eval.__main__:379] Selected Tasks: ['mmlu']
2025-04-28:17:33:23,630 INFO     [lm_eval.evaluator:169] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
2025-04-28:17:33:23,630 INFO     [lm_eval.evaluator:206] Initializing hf model, with arguments: {'pretrained': './laco-llama2-7b', 'peft': './laco-llama2-7b-finetuned'}
2025-04-28:17:33:23,990 INFO     [lm_eval.models.huggingface:136] Using device 'cuda'
2025-04-28:17:33:24,089 INFO     [lm_eval.models.huggingface:376] Model parallel was set to False, max memory was not set, and device map was set to {'': 'cuda'}
2025-04-28:17:34:15,043 INFO     [lm_eval.api.task:420] Building contexts for mmlu_abstract_algebra on rank 0...
100%|████████████████████████████████████████| 100/100 [00:00<00:00, 643.48it/s]
2025-04-28:17:34:15,204 INFO     [lm_eval.api.task:420] Building contexts for mmlu_anatomy on rank 0...
100%|██████████████