In [None]:
import wandb
wandb.login()



True

In [1]:
from datasets import load_dataset

# https://huggingface.co/datasets/HuggingFaceTB/smoltalk
dataset = load_dataset("HuggingFaceTB/smoltalk", 'all')

In [2]:
dataset

DatasetDict({
    train: Dataset({
        features: ['messages', 'source'],
        num_rows: 1043917
    })
    test: Dataset({
        features: ['messages', 'source'],
        num_rows: 54948
    })
})

In [3]:
dataset['train'][0]

{'messages': [{'content': 'The function \\( g(x) \\) satisfies the functional equation\n\\[ g(x + y) = g(x) + g(y) \\]\nfor all real numbers \\( x \\) and \\( y \\), and it is given that \\( g(3) = 4 \\). Find \\( g(10) \\).',
   'role': 'user'},
  {'content': 'Given the functional equation and the specific value \\( g(3) = 4 \\), we can find \\( g(1) \\) by using the equation multiple times:\n\\[\ng(3) = g(2) + g(1)\n\\]\n\\[\ng(2) = g(1) + g(1) = 2g(1)\n\\]\nThus,\n\\[\n4 = 2g(1) + g(1) = 3g(1)\n\\]\n\\[\ng(1) = \\frac{4}{3}\n\\]\nNow we can find \\( g(10) \\) using \\( g(1) \\):\n\\[\ng(10) = 10g(1) = 10 \\times \\frac{4}{3} = \\frac{40}{3}\n\\]\nHence, the value of \\( g(10) \\) is \\(\\boxed{\\frac{40}{3}}\\).',
   'role': 'assistant'}],
 'source': 'numina-cot-100k'}

In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# https://huggingface.co/microsoft/DialoGPT-medium
# model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-medium")
# tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")

# https://huggingface.co/Qwen/Qwen2.5-0.5B
# model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B")
# tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B", use_fast=True)

# https://huggingface.co/distilbert/distilgpt2
model = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2")
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilgpt2", use_fast=True)

In [5]:
import multiprocessing
num_proc = multiprocessing.cpu_count()

def chatml_tokenize(batch):
    texts = []
    for messages in batch["messages"]:
        chat = ""
        for msg in messages:
            if msg["role"] == "user":
                chat += "<|user|> " + msg["content"].strip() + " " + tokenizer.eos_token + " "
            elif msg["role"] == "assistant":
                chat += "<|assistant|> " + msg["content"].strip() + " " + tokenizer.eos_token + " "
        texts.append(chat.strip())
    return tokenizer(texts, padding=False, truncation=False)

tokenized_train = dataset["train"].map(
    chatml_tokenize, batched=True, batch_size=1000, num_proc=num_proc, remove_columns=["messages"]
)
tokenized_test = dataset["test"].map(
    chatml_tokenize, batched=True, batch_size=1000, num_proc=num_proc, remove_columns=["messages"]
)

In [6]:
# Should show ChatML-formatted text
print("Sample training example:")
print(tokenizer.decode(tokenized_train[0]["input_ids"])) 
# Expected: "<|user|> ... <|assistant|> ..."

Sample training example:
<|user|> The function \( g(x) \) satisfies the functional equation
\[ g(x + y) = g(x) + g(y) \]
for all real numbers \( x \) and \( y \), and it is given that \( g(3) = 4 \). Find \( g(10) \). <|endoftext|> <|assistant|> Given the functional equation and the specific value \( g(3) = 4 \), we can find \( g(1) \) by using the equation multiple times:
\[
g(3) = g(2) + g(1)
\]
\[
g(2) = g(1) + g(1) = 2g(1)
\]
Thus,
\[
4 = 2g(1) + g(1) = 3g(1)
\]
\[
g(1) = \frac{4}{3}
\]
Now we can find \( g(10) \) using \( g(1) \):
\[
g(10) = 10g(1) = 10 \times \frac{4}{3} = \frac{40}{3}
\]
Hence, the value of \( g(10) \) is \(\boxed{\frac{40}{3}}\). <|endoftext|>


In [7]:
import torch

device = "mps" if torch.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

print(device)

mps


In [8]:
# # Add special tokens to tokenizer
# special_tokens = ["<|user|>", "<|assistant|>"]
# tokenizer.add_special_tokens({"additional_special_tokens": special_tokens})
# model.resize_token_embeddings(len(tokenizer))

# Add special tokens
special_tokens = ["<|user|>", "<|assistant|>"]
tokenizer.add_special_tokens({"additional_special_tokens": special_tokens})

# Full device migration for resize operation
model = model.to("cpu")  # Move entire model to CPU

# Perform resize on CPU
model.resize_token_embeddings(len(tokenizer))

# Move back to original device
model = model.to(device)

# Verify
print(f"Embeddings device: {model.get_input_embeddings().weight.device}")
print(f"New vocab size: {len(tokenizer)}")

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Embeddings device: mps:0
New vocab size: 50259


In [9]:
# Evaluate WITHOUT ChatML formatting
def base_model_eval(question):
    encoded = tokenizer(question, return_tensors="pt").to(device)
    generated = model.generate(**encoded, max_new_tokens=20)
    return tokenizer.decode(generated[0], skip_special_tokens=True)

print("BEFORE TRAINING (Raw model):")
print(base_model_eval("The capital of France is"))
print(base_model_eval("What is the capital of France?"))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


BEFORE TRAINING (Raw model):


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


The capital of France is the capital of the French Republic. The capital of France is the capital of the French Republic. The
What is the capital of France?






















In [10]:
import random

# sample random indices from the test set
random_indices = random.sample(range(len(tokenized_test)), 50)

# create a new Dataset with only those
sampled_eval_dataset = tokenized_test.select(random_indices)

In [11]:
import os

os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"

In [12]:
from transformers import DataCollatorForLanguageModeling
from trl import SFTConfig, SFTTrainer

# Memory optimization setup
model.gradient_checkpointing_enable()
model.config.use_cache = False

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

training_args = SFTConfig(
    output_dir="./trainer_output",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    max_steps=50,
    learning_rate=1e-5,
    bf16=True,
    logging_steps=10,
    save_total_limit=2,  # Keep last 2 checkpoints
    save_strategy="steps",
    save_steps=50,  # Save every 50 steps
    eval_strategy="steps",
    eval_steps=50,
    dataloader_num_workers=1,
    gradient_checkpointing=True,
    optim="adafactor",
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    num_train_epochs=1,
    run_name="m2"
)

trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=sampled_eval_dataset,
    data_collator=data_collator,
)

trainer.train()

Converting eval dataset to ChatML:   0%|          | 0/50 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/50 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/50 [00:00<?, ? examples/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33membereagle[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss,Validation Loss
50,2.9252,2.761764


TrainOutput(global_step=50, training_loss=3.282400588989258, metrics={'train_runtime': 49.1378, 'train_samples_per_second': 1.018, 'train_steps_per_second': 1.018, 'total_flos': 6218046111744.0, 'train_loss': 3.282400588989258})

---

In [None]:
# Evaluate WITH ChatML formatting
def chatml_eval(question):
    formatted_prompt = f"<|user|> {question} <|assistant|>"
    encoded = tokenizer(formatted_prompt, return_tensors="pt").to(device)
    generated = model.generate(**encoded, max_new_tokens=100)
    return tokenizer.decode(generated[0], skip_special_tokens=True)

print("\nAFTER TRAINING (ChatML-formatted):")
print(chatml_eval("The capital of France is"))
print('...')
print(chatml_eval("What is the capital of France?"))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



AFTER TRAINING (ChatML-formatted):


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


, a city of Â Â Â Â Â Â Â Â
is the capital of France.


In [14]:
assert model.get_output_embeddings().weight.data_ptr() == model.get_input_embeddings().weight.data_ptr()

In [15]:
# Continue training, use last checkpoint
trainer.args.max_steps += 50
trainer.train(resume_from_checkpoint=True)

There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


Step,Training Loss,Validation Loss


RuntimeError: MPS backend out of memory (MPS allocated: 5.13 GB, other allocations: 12.67 GB, max allowed: 18.13 GB). Tried to allocate 1.53 GB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [None]:
# Evaluate WITH ChatML formatting
def chatml_eval(question):
    formatted_prompt = f"<|user|> {question} <|assistant|>"
    encoded = tokenizer(formatted_prompt, return_tensors="pt").to(device)
    generated = model.generate(**encoded, max_new_tokens=100)
    return tokenizer.decode(generated[0], skip_special_tokens=True)

print("\nAFTER TRAINING (ChatML-formatted):")
print(chatml_eval("The capital of France is"))
print('...')
print(chatml_eval("What is the capital of France?"))