In [None]:
import wandb
# Log in to Weights & Biases before anything else runs.
# NOTE(review): presumably so the training run further down can report its
# metrics there — confirm the trainer's reporting settings.
wandb.login()

In [None]:
from datasets import load_dataset

# Dataset card: https://huggingface.co/datasets/HuggingFaceTB/smoltalk
# "all" is the dataset configuration name requested from that repository.
dataset = load_dataset(path="HuggingFaceTB/smoltalk", name="all")

In [None]:
# Show the loaded dataset object as this cell's output.
dataset

In [None]:
# Peek at the first record of the training split.
dataset['train'][0]

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# https://huggingface.co/microsoft/DialoGPT-medium
# model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-medium")
# tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")

# https://huggingface.co/Qwen/Qwen2.5-0.5B
# model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B")
# tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B", use_fast=True)

# https://huggingface.co/distilbert/distilgpt2
# model = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2")
# tokenizer = AutoTokenizer.from_pretrained("distilbert/distilgpt2", use_fast=True)

# Model card: https://huggingface.co/microsoft/lts-gpt2-sm
checkpoint = "microsoft/lts-gpt2-sm"
# The repository holds several model variants in subfolders; this one is the
# version with 8.23M params -> 93.88MB RAM.
variant_subfolder = "gpt2_538d4b101df48595a935d90dbf4a7fb2ac09ac01"

# The tokenizer comes from the repository root, the weights from the variant subfolder.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint, subfolder=variant_subfolder)


In [None]:
import multiprocessing
# Number of CPUs reported by the OS; passed as `num_proc` to the dataset
# `.map` calls below so tokenization runs in that many worker processes.
num_proc = multiprocessing.cpu_count()

def chatml_tokenize(batch):
    """Flatten each conversation in ``batch`` into one tagged string and tokenize it.

    Every user turn becomes ``"<|user|> <content> <eos> "`` and every assistant
    turn ``"<|assistant|> <content> <eos> "``; the turns are concatenated in
    order and the result is stripped of surrounding whitespace. Messages with
    any other role (for example ``"system"``) are skipped.

    Args:
        batch: Mapping with a ``"messages"`` entry holding one list of
            ``{"role": ..., "content": ...}`` dicts per conversation.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the list of
        rendered conversations, called without padding and without truncation.
    """
    role_prefix = {"user": "<|user|> ", "assistant": "<|assistant|> "}
    turn_suffix = " " + tokenizer.eos_token + " "

    rendered = []
    for conversation in batch["messages"]:
        turns = [
            role_prefix[turn["role"]] + turn["content"].strip() + turn_suffix
            for turn in conversation
            if turn["role"] in role_prefix
        ]
        rendered.append("".join(turns).strip())
    return tokenizer(rendered, padding=False, truncation=False)

# Register the role markers BEFORE tokenizing. If they are only added to the
# tokenizer afterwards, the training data encodes "<|user|>" / "<|assistant|>"
# as several ordinary sub-word pieces, while any prompt tokenized later maps
# each marker to a single new id that never appeared during training.
# Registering the same tokens again later adds nothing new. The model's
# embedding matrix must still be resized to len(tokenizer) before training.
tokenizer.add_special_tokens({"additional_special_tokens": ["<|user|>", "<|assistant|>"]})

# Shared settings for both splits: batched calls of 1000 conversations, one
# worker per CPU, and the raw "messages" column dropped from the result.
map_kwargs = dict(
    batched=True,
    batch_size=1000,
    num_proc=num_proc,
    remove_columns=["messages"],
)
tokenized_train = dataset["train"].map(chatml_tokenize, **map_kwargs)
tokenized_test = dataset["test"].map(chatml_tokenize, **map_kwargs)

In [None]:
# Sanity check: decode the first tokenized training example back to text.
# Expected shape of the output: "<|user|> ... <|assistant|> ..."
print("Sample training example:")
first_example_ids = tokenized_train[0]["input_ids"]
print(tokenizer.decode(first_example_ids))

In [None]:
import torch

# Pick the accelerator: Apple MPS first, then CUDA, otherwise CPU.
# `torch.backends.mps.is_available()` is the documented availability check;
# `torch.mps.is_available()` does not exist in older PyTorch releases and
# raises AttributeError there.
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# nn.Module.to moves the parameters in place, so `model` itself now lives on `device`.
model.to(device)

print(device)

In [None]:
# Register the chat role markers as special tokens so each maps to a single id.
# Registration is idempotent: tokens the tokenizer already knows are not added again.
special_tokens = ["<|user|>", "<|assistant|>"]
new_special_tokens = {"additional_special_tokens": special_tokens}

# Give padding its own token instead of letting it alias EOS later on.
# DataCollatorForLanguageModeling excludes every position whose id equals
# pad_token_id from the loss; with pad == EOS that would also exclude the EOS
# that terminates each turn, so the model would never learn to stop.
if tokenizer.pad_token is None:
    new_special_tokens["pad_token"] = "<|pad|>"

tokenizer.add_special_tokens(new_special_tokens)

# Resize the embedding matrix on CPU: move the whole model there, grow the
# embeddings to the new vocabulary size, then move back to the working device.
model = model.to("cpu")
model.resize_token_embeddings(len(tokenizer))
model = model.to(device)

# Verify
print(f"Embeddings device: {model.get_input_embeddings().weight.device}")
print(f"New vocab size: {len(tokenizer)}")

In [None]:
# Evaluate WITHOUT ChatML formatting
def base_model_eval(question):
    """Generate a continuation of the raw ``question`` text, without chat markers.

    Args:
        question: Prompt text handed to the tokenizer unchanged.

    Returns:
        The first generated sequence (prompt plus up to 20 new tokens) decoded
        to a string with special tokens removed.
    """
    inputs = tokenizer(question, return_tensors="pt").to(device)
    output_ids = model.generate(max_new_tokens=20, **inputs)
    first_sequence = output_ids[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

# Baseline: one completion-style prompt and one question-style prompt.
print("BEFORE TRAINING (Raw model):")
for baseline_prompt in ("The capital of France is", "What is the capital of France?"):
    print(base_model_eval(baseline_prompt))

In [None]:
import random

# Size of the evaluation subset and the seed that fixes which rows are drawn.
EVAL_SAMPLE_SIZE = 50
EVAL_SAMPLE_SEED = 42

# A dedicated, seeded generator makes the subset identical on every
# Restart & Run All without touching the global `random` state.
eval_rng = random.Random(EVAL_SAMPLE_SEED)

# random.sample raises ValueError when asked for more items than exist, so cap
# the request at the size of the test split.
eval_sample_size = min(EVAL_SAMPLE_SIZE, len(tokenized_test))
random_indices = eval_rng.sample(range(len(tokenized_test)), eval_sample_size)

# create a new Dataset with only those rows
sampled_eval_dataset = tokenized_test.select(random_indices)

In [None]:
import os

# "0.0" removes the MPS allocator's upper memory limit (see the PyTorch MPS
# environment-variable documentation).
# NOTE(review): at this point torch is already imported and the model has
# already been moved to the device; if the allocator reads this variable only
# when it is first initialised, setting it here is too late — confirm, and
# consider moving this to the first cell.
os.environ.update({"PYTORCH_MPS_HIGH_WATERMARK_RATIO": "0.0"})

In [None]:
from transformers import DataCollatorForLanguageModeling
from trl import SFTConfig, SFTTrainer

# Memory savers: recompute activations during the backward pass and turn off
# the generation key/value cache while training.
model.gradient_checkpointing_enable()
model.config.use_cache = False

# Fall back to EOS for padding when the tokenizer defines no pad token.
# NOTE(review): when this fallback fires, pad and EOS share one id; the LM
# collator ignores pad positions in the loss — confirm the EOS turn terminators
# are still being trained on.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Causal-LM collation (mlm=False): no masked-language-model objective.
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

# One optimizer step per 100 training examples.
total_steps = len(tokenized_train) // 100

training_args = SFTConfig(
    # --- run identity / output ---
    run_name="m2-sm",
    output_dir="./trainer_output",
    # --- length of the run ---
    # NOTE(review): both max_steps and num_train_epochs are set; a positive
    # max_steps is documented to take precedence — confirm this is intended.
    max_steps=total_steps,
    num_train_epochs=1,
    # --- batching ---
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    dataloader_num_workers=1,
    # --- optimisation ---
    optim="adafactor",
    learning_rate=1e-5,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    # --- precision / memory ---
    bf16=True,
    gradient_checkpointing=True,
    # --- logging, evaluation and checkpoints ---
    logging_steps=10,
    eval_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    save_steps=50,  # write a checkpoint every 50 steps
    save_total_limit=2,  # keep only the last 2 checkpoints
)

trainer = SFTTrainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=sampled_eval_dataset,
)

trainer.train()

In [None]:
# Evaluate WITH ChatML formatting
def chatml_eval(question):
    """Generate a reply to ``question`` using the chat template from training.

    The prompt is built as ``"<|user|> <question> <eos> <|assistant|>"``: the
    user turn is stripped and closed with the tokenizer's EOS token, exactly as
    the training template renders user turns, and the assistant marker is left
    open for the model to continue.

    Side effect: switches the module-level ``model`` to evaluation mode.

    Args:
        question: The user message; converted to ``str`` and stripped.

    Returns:
        The first generated sequence (prompt plus up to 100 new tokens) decoded
        to a string with special tokens removed.
    """
    user_text = str(question).strip()
    formatted_prompt = f"<|user|> {user_text} {tokenizer.eos_token} <|assistant|>"
    encoded = tokenizer(formatted_prompt, return_tensors="pt").to(device)

    # After training the model is still in train mode; generate() does not
    # change that, so dropout would otherwise perturb every decoding step.
    model.eval()

    # The training setup disabled the key/value cache; ask for it explicitly so
    # generation does not recompute the whole prefix for every new token.
    generated = model.generate(**encoded, max_new_tokens=100, use_cache=True)
    return tokenizer.decode(generated[0], skip_special_tokens=True)

# Same two prompts as the baseline, now sent through the chat template.
print("\nAFTER TRAINING (ChatML-formatted):")
after_training_prompts = ("The capital of France is", "What is the capital of France?")
for position, chat_prompt in enumerate(after_training_prompts):
    if position:
        # visual separator between consecutive answers
        print('...')
    print(chatml_eval(chat_prompt))