In [None]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
    AdamW,
    get_linear_schedule_with_warmup,
)

# Loading Model

In [2]:
import transformers
import torch

local_model_path = "facebook/layerskip-llama2-7B"
tokenizer = transformers.AutoTokenizer.from_pretrained(local_model_path)
model = transformers.AutoModelForCausalLM.from_pretrained(
    local_model_path,
    use_safetensors=True,
    device_map="auto",
    torch_dtype=torch.float32
)

Loading checkpoint shards: 100%|██████████| 3/3 [00:06<00:00,  2.12s/it]


In [3]:
if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        model.config.pad_token_id = model.config.eos_token_id

# Preprocess Datset

In [5]:
ds = load_dataset("gsm8k", "main")

In [6]:
def preprocess(ex):
    # Format: model sees "Question: ...\nAnswer:" and learns to predict the rest
    inputs = [f"Question: {q}\nAnswer:" for q in ex["question"]]
    tokenized = tokenizer(
        inputs,
        truncation=True,
        max_length=512,
    )
    # Causal LM: labels = input_ids
    tokenized["labels"] = [ids.copy() for ids in tokenized["input_ids"]]
    return tokenized

In [7]:
tokenized = ds.map(
    preprocess,
    batched=True,
    remove_columns=ds["train"].column_names,
)

# Data Collator
| Making batches out of the data

In [8]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Optimizer and Hyperparameters

In [9]:
train_bs = 1
grad_accum = 8
epochs = 3
steps_per_epoch = len(tokenized["train"]) // (train_bs * grad_accum)
total_steps = steps_per_epoch * epochs
warmup_steps = total_steps // 10

In [10]:
optimizer = AdamW(
    model.parameters(),
    lr=2e-5,
    betas=(0.9, 0.999),
    eps=1e-8,
    weight_decay=0.01,
)



In [11]:
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps,
)

# Training Arguments

In [12]:
training_args = TrainingArguments(
        output_dir="./finetuned-layerskip-gsm8k",
        per_device_train_batch_size=train_bs,
        gradient_accumulation_steps=grad_accum,
        per_device_eval_batch_size=1,
        num_train_epochs=epochs,
        fp16=False,
        optim="adamw_torch",
        logging_steps=50,
        evaluation_strategy="steps",
        eval_steps=500,
        save_steps=500,
        save_total_limit=2,
        load_best_model_at_end=True,
        metric_for_best_model="loss",
        # disable HF’s built-in clipping so no unscale error
        max_grad_norm=0.0,
    )



In [13]:
trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized["train"],
        eval_dataset=tokenized["test"],
        data_collator=data_collator,
        tokenizer=tokenizer,
        optimizers=(optimizer, scheduler),
    )

# Train

In [14]:
trainer.train()
trainer.save_model(training_args.output_dir)
tokenizer.save_pretrained(training_args.output_dir)

OutOfMemoryError: CUDA out of memory. Tried to allocate 172.00 MiB. GPU 0 has a total capacity of 79.22 GiB of which 11.62 MiB is free. Including non-PyTorch memory, this process has 79.20 GiB memory in use. Of the allocated memory 77.05 GiB is allocated by PyTorch, and 1.43 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)