# Finetune Pretrained Language Models into a Chat Model

In [None]:
# Install the requirements in Google Colab
# !pip install transformers datasets trl huggingface_hub

# from huggingface_hub import login
# login("")

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from trl import SFTConfig, SFTTrainer, setup_chat_format
import torch


device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available() else "cpu"
)

# Load the model and tokenizer
model_name = "HuggingFaceTB/SmolLM2-360M"
model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=model_name
).to(device)

tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)

# Set up the chat format
model, tokenizer = setup_chat_format(model=model, tokenizer=tokenizer)

# Set our name for the finetune to be saved &/ uploaded to
finetune_name = "SmolLM2-360M-Arch-Talk-Chat-Model"
finetune_tags = ["arch-talk", "smoltalk", "everyday-conversations"]


  from .autonotebook import tqdm as notebook_tqdm


# Generate with the base model

In [2]:
# Let's test the base model before training
prompt = "What is the meaning of life?"

# Format with template
messages = [{"role": "user", "content": prompt}]
formatted_prompt = tokenizer.apply_chat_template(messages, tokenize=False)

# Generate response
inputs = tokenizer(formatted_prompt, return_tensors="pt").to(device)
outputs = model.generate(**inputs, max_new_tokens=500)
print(tokenizer.decode(outputs[0], skip_special_tokens=False))


# It is responding... but not with the chat format we wanted.

<|im_start|>user
What is the meaning of life?<|im_end|>
What is the meaning of life?

What is the meaning of life?

What is the meaning of life?

What is the meaning of life?

What is the meaning of life?

What is the meaning of life?

What is the meaning of life?

What is the meaning of life?

What is the meaning of life?

What is the meaning of life?

What is the meaning of life?

What is the meaning of life?

What is the meaning of life?

What is the meaning of life?

What is the meaning of life?

What is the meaning of life?

What is the meaning of life?

What is the meaning of life?

What is the meaning of life?

What is the meaning of life?

What is the meaning of life?

What is the meaning of life?

What is the meaning of life?

What is the meaning of life?

What is the meaning of life?

What is the meaning of life?

What is the meaning of life?

What is the meaning of life?

What is the meaning of life?

What is the meaning of life?

What is the meaning of life?

What is the me

## Dataset Preparation

We will load a sample dataset and format it for training. The dataset should be structured with input-output pairs (Supervised Training), where each input is a prompt and the output is the expected response from the model.

**Transformers Reinforcement Learning(TRL)** - Library by Hugging Face that provides tools for training language models, particularly for fine-tuning and alignment tasks.

**TRL will format input messages based on the model's chat templates.** They need to be represented as a list of dictionaries with the keys: `role` and `content`,.

## Dataset

https://huggingface.co/datasets/HuggingFaceTB/smoltalk/viewer/everyday-conversations?views%5B%5D=everyday_conversations_train

# Split Dataset

In [14]:

# Load the dataset first - using 'everyday-conversations' config for chat training
# Could also not split the data, but for this one we'll get the train set only
ds = load_dataset("HuggingFaceTB/smoltalk", "everyday-conversations", split="train")
# Split the dataset for training and evaluation
ds = ds.train_test_split(test_size=0.5, seed=42)

print(f"Training samples: {len(ds['train'])}")
print(f"Test samples: {len(ds['test'])}")
print(f"Example conversation: {ds['train'][0]['messages']}")

Training samples: 1130
Test samples: 1130
Example conversation: [{'content': 'Hi there', 'role': 'user'}, {'content': 'Hello! How can I help you today?', 'role': 'assistant'}, {'content': "I'm having trouble with my work schedule. I have a lot of night shifts this week.", 'role': 'user'}, {'content': 'Night shifts can be challenging. Are you finding it hard to adjust to the new sleep schedule?', 'role': 'assistant'}, {'content': 'Yes, I am. Do you have any tips for staying awake during my shift?', 'role': 'user'}, {'content': 'Try to get some fresh air during your breaks, and avoid heavy meals before your shift starts. Also, stay hydrated by drinking plenty of water throughout the night.', 'role': 'assistant'}, {'content': 'That sounds helpful, thanks.', 'role': 'user'}]


## Configuring the  Supervised Fine-Tuning Trainer (SFTTrainer)

The `SFTTrainer` is configured with various parameters that control the training process. These include the number of training steps, batch size, learning rate, and evaluation strategy. Adjust these parameters based on your specific requirements and computational resources.

In [15]:

# Configure the SFTTrainer
sft_config = SFTConfig(
    output_dir="./sft_output",
    max_steps=1000,  # Adjust based on dataset size and desired training duration
    per_device_train_batch_size=8,  # Set according to your GPU memory capacity
    learning_rate=5e-5,  # Common starting point for fine-tuning
    logging_steps=10,  # Frequency of logging training metrics
    save_steps=100,  # Frequency of saving model checkpoints
    eval_steps=50,  # Frequency of evaluation
    use_mps_device=(
        True if device == "mps" else False
    ),  # Use MPS for mixed precision training on Apple Silicon
    hub_model_id=finetune_name,  # Set a unique name for your model
    push_to_hub=False,  # Push the model to Hugging Face Hub?
    hub_private_repo=False,  # Make the model public
    report_to=None,  # Disable wandb/tensorboard logging for simplicity
)

# Initialize the SFTTrainer
trainer = SFTTrainer(
    model=model,
    args=sft_config,
    train_dataset=ds["train"],
    eval_dataset=ds["test"],
    processing_class=tokenizer,  # Use processing_class instead of tokenizer
)

## Training the Model

With the trainer configured, we can now proceed to train the model. The training process will involve iterating over the dataset, computing the loss, and updating the model's parameters to minimize this loss.

# DO NOT RUN THIS IN THE DEMO!!!!!!!!!!!!!!!!!!!!!!!!!

In [None]:
# Train the model
trainer.train()

# Save the model
trainer.save_model(f"./{finetune_name}")

# Training loss represents a measure of how well the model's predicted output aligns with the actual target output in the training data. 

# Decreasing trend: The loss value consistently goes down over time (epochs or iterations), showing that the model is improving its predictions. 
# Low and stable value: After a period of decrease, the loss should settle at a low, stable value, indicating the model has learned the underlying patterns in the data. 

Step,Training Loss
10,1.821
20,1.082
30,0.9462
40,0.944
50,0.8726
60,0.8759
70,0.8838
80,0.9
90,0.8769
100,0.8654


model.safetensors: 100%|██████████| 1.45G/1.45G [03:27<00:00, 6.98MB/s]



# DO NOT RUN THIS IN THE DEMO

In [None]:
trainer.push_to_hub(tags=finetune_tags)

CommitInfo(commit_url='https://huggingface.co/salhernandez/SmolLM2-360M-Arch-Talk-Chat-Model/commit/a7e5f284a9a65fab324d01a3e3a0680ccd26eeca', commit_message='End of training', commit_description='', oid='a7e5f284a9a65fab324d01a3e3a0680ccd26eeca', pr_url=None, repo_url=RepoUrl('https://huggingface.co/salhernandez/SmolLM2-360M-Arch-Talk-Chat-Model', endpoint='https://huggingface.co', repo_type='model', repo_id='salhernandez/SmolLM2-360M-Arch-Talk-Chat-Model'), pr_revision=None, pr_num=None)

# Generate with fine-tuned model

In [62]:

model_name = "salhernandez/SmolLM2-360M-Arch-Talk-Chat-Model"
model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=model_name).to(device)

prompt = "What is the meaning of life?"

# Format with template
messages = [{"role": "user", "content": prompt}]
formatted_prompt = tokenizer.apply_chat_template(messages, tokenize=False)

# Generate response
inputs = tokenizer(formatted_prompt, return_tensors="pt").to(device)

outputs = model.generate(**inputs, max_new_tokens=500)
print(tokenizer.decode(outputs[0], skip_special_tokens=False))

<|im_start|>user
What is the meaning of life?<|im_end|>
User: I don't know, but I think it's to enjoy the present moment.<|im_end|>
