## Model definition, data preparation

In [None]:
from enum import Enum
from transformers import set_seed
from datasets import load_dataset
import os


# Fix the random seed through transformers' `set_seed` so runs can be repeated.
seed = 42
set_seed(seed)


# Hugging Face credentials (the token should have write access).
# `setdefault` keeps an HF_TOKEN that is already exported in the environment
# instead of clobbering it with the placeholder; if none is exported, replace
# the placeholder below with your own token.
os.environ.setdefault("HF_TOKEN", "hf_TOKEN_HERE")
model_name = "meta-llama/Llama-3.2-3B-Instruct"

### Load the model and prepare the dataset as shown in the original notebook

In [29]:
from mlx_lm import load

# Base model and tokenizer, plus the name of the function-calling dataset.
model, tokenizer = load(model_name)
dataset_name = "Jofthomas/hermes-function-calling-thinking-V1"

# Turn-based chat template, written as adjacent literals (one Jinja fragment
# per line) that concatenate into a single template string.
tokenizer.chat_template = (
    "{{ bos_token }}"
    "{% if messages[0]['role'] == 'system' %}"
    "{{ raise_exception('System role not supported') }}"
    "{% endif %}"
    "{% for message in messages %}"
    "{{ '<start_of_turn>' + message['role'] + '\n' + message['content'] | trim + '<end_of_turn><eos>\n' }}"
    "{% endfor %}"
    "{% if add_generation_prompt %}"
    "{{'<start_of_turn>model\n'}}"
    "{% endif %}"
)


def preprocess(sample):
    """Render one conversation into a single training string.

    If the conversation opens with a ``system`` message, its content (followed
    by an instruction to reason inside ``<think>`` tags) is prepended to the
    next message and the system message is dropped, because the chat template
    rejects the system role. The result is then rendered with the module-level
    ``tokenizer``'s chat template.

    The input ``sample`` is left untouched, so calling this function twice on
    the same sample yields the same text.

    Args:
        sample: Mapping with a ``"messages"`` list of ``{"role", "content"}``
            dicts.

    Returns:
        ``{"text": <rendered conversation>}``.

    Raises:
        IndexError: If ``messages`` is empty, or holds only a system message.
    """
    # Work on shallow copies of each message so the caller's data is not mutated.
    messages = [dict(message) for message in sample["messages"]]
    first_message = messages[0]

    # Instead of adding a system message, we merge the content into the first user message
    if first_message["role"] == "system":
        system_message_content = first_message["content"]
        # Merge system content with the message that follows it
        messages[1]["content"] = (
            system_message_content
            + "Also, before making a call to a function take the time to plan the function to take. Make that thinking process between <think>{your thoughts}</think>\n\n"
            + messages[1]["content"]
        )
        # Remove the system message from the (copied) conversation
        messages = messages[1:]

    return {"text": tokenizer.apply_chat_template(messages, tokenize=False)}


# Fetch the dataset and expose its "conversations" column under the name
# "messages", which is the key `preprocess` reads.
dataset = load_dataset(dataset_name).rename_column("conversations", "messages")

Using the latest cached version of the dataset since Jofthomas/hermes-function-calling-thinking-V1 couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /Users/shamikbose/.cache/huggingface/datasets/Jofthomas___hermes-function-calling-thinking-v1/default/0.0.0/ee5bf6e5737351f5d444b72689f7ab0ad37fc75f (last modified on Fri Feb 28 17:58:41 2025).


In [30]:
# Render every conversation to text; only the new "text" column is kept.
dataset = dataset.map(preprocess, remove_columns="messages")
# Hold out 10% of the rows as the test set. Passing `seed` pins the shuffle so
# the partition is the same on every run.
dataset = dataset["train"].train_test_split(test_size=0.1, seed=seed)
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 3213
    })
    test: Dataset({
        features: ['text'],
        num_rows: 357
    })
})


### Split the dataset into the train, valid and test splits required by `mlx_lm`

In [None]:
# Carve a validation set (10%) out of the training portion. The split is
# seeded so the same rows land in each file on every run.
train_valid_split = dataset["train"].train_test_split(test_size=0.1, seed=seed)
dataset["train"] = train_valid_split["train"]
dataset["valid"] = train_valid_split["test"]

# Write one JSON Lines file per split into data/, creating the folder first
# so the export does not depend on it already existing.
os.makedirs("data", exist_ok=True)
for split_name, split_data in dataset.items():
    split_data.to_json(f"data/{split_name}.jsonl")

## Modified Tokenizer

In [None]:
class ChatmlSpecialTokens(str, Enum):
    """Markup tags used in the function-calling conversations.

    Each member is also a ``str`` (the enum mixes in ``str``), and its value
    is the literal tag text. Opening tags are paired with an ``eo``-prefixed
    closing tag; ``pad_token`` and ``eos_token`` hold the padding and
    end-of-sequence markers.
    """

    tools = "<tools>"
    eotools = "</tools>"
    think = "<think>"
    eothink = "</think>"
    tool_call = "<tool_call>"
    eotool_call = "</tool_call>"
    # Spelled to match the member names (the values previously read "reponse").
    tool_response = "<tool_response>"
    eotool_response = "</tool_response>"
    pad_token = "<pad>"
    eos_token = "<eos>"

    @classmethod
    def list(cls):
        """Return the string value of every member, in declaration order."""
        return [member.value for member in cls]


# Reload the model together with a tokenizer that is given the padding token
# and the list of markup tags through `tokenizer_config`.
# NOTE(review): confirm that the loader actually honours the `special_tokens`
# key; it is passed through unchanged here.
model, tokenizer = load(
    model_name,
    tokenizer_config=dict(
        pad_token=ChatmlSpecialTokens.pad_token.value,
        special_tokens=ChatmlSpecialTokens.list(),
    ),
    model_config=dict(attn_implementation="eager"),
)

# Turn-based chat template, written as adjacent literals (one Jinja fragment
# per line) that concatenate into a single template string.
tokenizer.chat_template = (
    "{{ bos_token }}"
    "{% if messages[0]['role'] == 'system' %}"
    "{{ raise_exception('System role not supported') }}"
    "{% endif %}"
    "{% for message in messages %}"
    "{{ '<start_of_turn>' + message['role'] + '\n' + message['content'] | trim + '<end_of_turn><eos>\n' }}"
    "{% endfor %}"
    "{% if add_generation_prompt %}"
    "{{'<start_of_turn>model\n'}}"
    "{% endif %}"
)

In [None]:
# Sanity check for the model architecture: a bare `model` expression makes the
# notebook display the module tree (embedding, attention and MLP layer sizes).
model

Model(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 3072)
    (layers.0): TransformerBlock(
      (self_attn): Attention(
        (q_proj): Linear(input_dims=3072, output_dims=3072, bias=False)
        (k_proj): Linear(input_dims=3072, output_dims=1024, bias=False)
        (v_proj): Linear(input_dims=3072, output_dims=1024, bias=False)
        (o_proj): Linear(input_dims=3072, output_dims=3072, bias=False)
        (rope): Llama3RoPE()
      )
      (mlp): MLP(
        (gate_proj): Linear(input_dims=3072, output_dims=8192, bias=False)
        (down_proj): Linear(input_dims=8192, output_dims=3072, bias=False)
        (up_proj): Linear(input_dims=3072, output_dims=8192, bias=False)
      )
      (input_layernorm): RMSNorm(3072, eps=1e-05)
      (post_attention_layernorm): RMSNorm(3072, eps=1e-05)
    )
    (layers.1): TransformerBlock(
      (self_attn): Attention(
        (q_proj): Linear(input_dims=3072, output_dims=3072, bias=False)
        (k_proj): Linear(input_dims=3

In [15]:
# HF Hub arguments: account name and the output name for the fine-tuned model
# (base model name plus the "_mlx_fcall" suffix).
username = "shamikbose89"
output_dir = "Llama-3.2-3B-Instruct_mlx_fcall"

Edit `lora_config.yaml` as needed. More information is available [here][def] 

[def]: https://github.com/ml-explore/mlx-examples/blob/main/llms/mlx_lm/LORA.md

In [None]:
# Run the `mlx_lm.lora` command-line tool with the settings in `lora_config.yaml`.
# This step will take some time. Hit run and wait for the model to be trained.
# Or if you're like me, run `asitop` in your terminal and watch the GPU usage.
!mlx_lm.lora -c lora_config.yaml

Load the model with the recently created adapters

In [None]:
from mlx_lm import load, generate

# Don't overwrite the tokenizer here: the existing `tokenizer` (loaded with the
# special-token configuration) must stay in use, so the tokenizer returned by
# this call is discarded into `_`.
# The model is loaded with the adapter weights found under "adapters/".
model, _ = load(model_name, adapter_path="adapters/")

In [35]:
# Generate a completion from a hand-written prompt: a single `human` turn that
# holds the tool instructions and the user's request, followed by an opened
# `model` turn that already starts with `<think>`, so the completion continues
# from inside the thinking section. Generation is capped at 500 tokens.
output = generate(
    model=model,
    tokenizer=tokenizer,
    prompt="""<bos><start_of_turn>human
You are a function calling AI model. You are provided with function signatures within <tools></tools> XML tags.You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions.Here are the available tools:<tools> [{'type': 'function', 'function': {'name': 'convert_currency', 'description': 'Convert from one currency to another', 'parameters': {'type': 'object', 'properties': {'amount': {'type': 'number', 'description': 'The amount to convert'}, 'from_currency': {'type': 'string', 'description': 'The currency to convert from'}, 'to_currency': {'type': 'string', 'description': 'The currency to convert to'}}, 'required': ['amount', 'from_currency', 'to_currency']}}}, {'type': 'function', 'function': {'name': 'calculate_distance', 'description': 'Calculate the distance between two locations', 'parameters': {'type': 'object', 'properties': {'start_location': {'type': 'string', 'description': 'The starting location'}, 'end_location': {'type': 'string', 'description': 'The ending location'}}, 'required': ['start_location', 'end_location']}}}] </tools>Use the following pydantic model json schema for each tool call you will make: {'title': 'FunctionCall', 'type': 'object', 'properties': {'arguments': {'title': 'Arguments', 'type': 'object'}, 'name': {'title': 'Name', 'type': 'string'}}, 'required': ['arguments', 'name']}For each function call return a json object with function name and arguments within <tool_call></tool_call> XML tags as follows:
<tool_call>
{tool_call}
</tool_call>Also, before making a call to a function take the time to plan the function to take. Make that thinking process between <think>{your thoughts}</think>

Hi, I need to find the distance between Paris and London<end_of_turn><eos>
<start_of_turn>model
<think>""",
    max_tokens=500,
)

In [36]:
# Show the generated text as plain text.
print(output)

Okay, so the user just said they need to find the distance between Paris and London. I need to figure out how to respond. Let me think about this for a moment.

First, I should check the available tools. The tools are functions that can be called to perform specific tasks. There are two functions: convert_currency and calculate_distance. The user's request is to find the distance, so I should use the calculate_distance function.

The function requires two parameters: start_location and end_location. The user provided Paris and London, so I can use those as the start and end locations. I should make sure to include the correct currency, but since the user didn't mention anything about currency, I can assume it's not needed here.

I should structure the function call with the function name and the arguments. The arguments are the start_location and end_location, which are Paris and London respectively. I'll make sure to format it correctly according to the JSON schema.

So, the function 