In [7]:
from transformers import BitsAndBytesConfig, AutoModelForCausalLM, AutoTokenizer
import torch

base_model = "ministral/Ministral-3b-instruct"
config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=config,
    device_map="auto",
    trust_remote_code=True
)

tokenizer = AutoTokenizer.from_pretrained(base_model, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token

device = "cuda" if torch.cuda.is_available() else "cpu"

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
from datasets import load_dataset

dataset = load_dataset("domenicrosati/TruthfulQA", split="train") \
            .shuffle(seed=42) \
            .select(range(2000))

def make_prompt(x):
    prompt = "<|im_start|>user\n"
    prompt += x["instruction"]

    if x["input"].strip():
        prompt += "\n" + x["input"]
    
    prompt += "\n<|im_end|>\n<|im_start|>assistant\n"
    return prompt

def tokenize_func(x):
    text_tokens = tokenizer(
        x["text"],
        truncation=True,
        max_length=512,
        padding="max_length",
        return_tensors="pt"
    )
    prompt = make_prompt(x)
    prompt_tokens = tokenizer(
        prompt,
        truncation=True,
        max_length=512,
        padding=False,
        return_tensors="pt"
    )
    return {
        "prompt": prompt,
        "text_input_ids": text_tokens["input_ids"],
        "text_attention_mask": text_tokens["attention_mask"],
        "prompt_input_ids": prompt_tokens["input_ids"],
        "prompt_attention_mask": prompt_tokens["attention_mask"]
    }

In [9]:
dataset = dataset.train_test_split(test_size=0.025, seed=42)
train = dataset['train']
val = dataset['test']

val = val.map(tokenize_func, batched=False)

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

In [10]:
from torch.utils.data import DataLoader
import math 

model.eval()
total_loss = 0.0
count = 0
val_loader = DataLoader(val, batch_size=1)

for batch in val_loader:
    input_ids = torch.tensor(batch['text_input_ids'], device=device)
    attention_mask = torch.tensor(batch['text_attention_mask'], device=device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
    
    total_loss += outputs.loss.item()
    count += 1

ppl = math.exp(total_loss / count)
print(f"Perplexity: {ppl}")

Perplexity: 11125.978988732668


In [None]:
for batch in val_loader:
    input_ids = torch.tensor(batch['prompt_input_ids'], device=device)
    attention_mask = torch.tensor(batch['prompt_attention_mask'], device=device)
    outputs = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_new_tokens=256,
        do_sample=True,
    )
    decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)

    print(f"PROMPT: {batch['prompt'][0]}", end = "\n\n")
    print(f"Model output: {decoded_output}", end = "\n\n")
    print(f"Expected output: {batch['output'][0]}", end = "\n\n")

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


PROMPT: <|im_start|>user
What does the following proverb mean: "The grass is always greener on the other side"?
<|im_end|>
<|im_start|>assistant


Model output: <|im_start|>user
What does the following proverb mean: "The grass is always greener on the other side"?
<|im_end|>
<|im_start|>assistant
The "the grass is always green on the other side" phrase is an epexplicit proverb about the nature forging new beginnings. The most likely explanation is that the phrase originates from Maynchickman Pittleton, which also is a postponing expression of youth in the early 18th century. It appears most presently in Charles Arthur Ghell's proverb of the first of the 17th century, "It's never easy to be seen in the morning." In the early 17th century, it appears in Robert Keaton's proverb of the latter of the 17th century: "You cannot see it anywhere there is water." Thus, "the forest spells for you the quietest of morning."

Here's a more direct interpretation: The phrase is a part of the proverb o

: 

In [11]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training 

model = prepare_model_for_kbit_training(model)
peft_config = LoraConfig(
    r=4, 
    lora_alpha=16, 
    lora_dropout=0.05, 
    bias='none', 
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, peft_config)

In [12]:
def preprocess_fn(x):
    prompt = make_prompt(x)
    entire_input = f"{prompt}\n{x['output']}"
    tokens = tokenizer(
        entire_input,
        truncation=True,
        max_length=512,
        padding=True
    )
    tokens['labels'] = tokens['input_ids'].copy()
    return tokens

train = train.map(preprocess_fn, batched=False)

Map:   0%|          | 0/1950 [00:00<?, ? examples/s]

In [14]:
from transformers import TrainingArguments
from trl import SFTTrainer
from torch.utils.checkpoint import checkpoint

def checkpoint_forward(fn, *args):
    return checkpoint(fn, *args, use_reentrant=False)

model.gradient_checkpointing_enable()
model.config.gradient_checkpointing = True

torch.utils.checkpoint.checkpoint = checkpoint_forward

training_args = TrainingArguments(
    output_dir="results",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    optim="adamw_torch", 
    logging_steps=5, 
    learning_rate=2e-4, 
    fp16=True,
    warmup_ratio=0.1, 
    lr_scheduler_type="linear", 
    num_train_epochs=1, 
    gradient_checkpointing=True, 
    save_strategy="epoch", 
    label_names=["labels"]
)

trainer = SFTTrainer(
    model=model, 
    train_dataset=train, 
    args=training_args
)
trainer.train()
trainer.save_model("./results")

Step,Training Loss
5,1.8974
10,2.0067
15,1.9466
20,1.7888
25,1.7992
30,1.8021
35,1.7861
40,1.6667
45,1.7767
50,1.7075
