In [None]:
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM
from transformers import TextIteratorStreamer

import torch

Set up the tokenizer and model. device_map="auto" will automatically distribute the model across available GPUs.

In [None]:
# Single identifier shared by the tokenizer and the model so the two always match.
model_id = "meta-llama/Llama-3.1-8B-Instruct"

# Load the weights in bfloat16; device_map="auto" leaves device placement to the library.
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.bfloat16)

tokenizer = AutoTokenizer.from_pretrained(model_id)

# Streamer configured to leave the prompt and special tokens out of the streamed text.
streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True, skip_prompt=True)

Create a function to generate text using the model. The function takes an array of chat messages and returns the response from the model.

In [None]:
def generate(messages, tools=None, **generate_kwargs) -> str:
    """
    Generate the model's next response for a chat conversation.

    Args:
        messages: The conversation so far, as a list of chat message dicts.
        tools: Optional tools forwarded to the tokenizer's chat template.
        **generate_kwargs: Extra keyword arguments for ``model.generate``. They
            override the defaults used here (``max_new_tokens=1024``,
            ``do_sample=True``, ``temperature=0.6``, ``top_p=0.9``).

    Returns:
        The newly generated text only (the prompt is sliced off), decoded with
        special tokens skipped.
    """
    input_tokens = tokenizer.apply_chat_template(
        messages, tools=tools, add_generation_prompt=True, return_dict=True, return_tensors="pt"
    ).to(model.device)

    # Defaults reproduce the previously hard-coded settings; callers may override any of them.
    options = {"max_new_tokens": 1024, "do_sample": True, "temperature": 0.6, "top_p": 0.9}
    options.update(generate_kwargs)

    output_tokens = model.generate(**input_tokens, **options)

    # The output starts with the prompt tokens; keep only what was generated after them.
    prompt_length = input_tokens["input_ids"].shape[-1]
    response_tokens = output_tokens[0][prompt_length:]

    return tokenizer.decode(response_tokens, skip_special_tokens=True)

Send the chat messages to the model. For our first tests, we won't be using any tools.

In [None]:
# Start the conversation with only a system prompt; no tools are offered yet.
messages = [
    dict(
        role="system",
        content="You are a friendly chatbot that assists the user with research in infectious diseases and infectious disease modeling.",
    ),
]

response = generate(messages)
response

We can now continue the conversation by capturing the response from the model in the chat messages array and adding a new user message.

In [None]:
# Record the model's previous reply, then ask the follow-up question.
messages.extend(
    [
        dict(role="assistant", content=response),
        dict(role="user", content="What is the mechanism of action of penicillin?"),
    ]
)

response = generate(messages)

# Keep the new reply in the history so later turns include it.
messages.append(dict(role="assistant", content=response))

response

We'll now define a function that the model can use as a tool.

In [None]:
# NOTE: per the transformers tool-use docs, the signature and docstring of a tool
# function are used to describe the tool to the model, so the docstring must
# match what the function really accepts and returns.
def get_current_temperature(location: str) -> float:
    """
    Get the current temperature at a location.

    Args:
        location: The location to get the temperature for, in the format "City, Country"
    Returns:
        The current temperature at the specified location, as a float.
    """

    # Placeholder implementation: the same value is returned for every location.
    return 22.0


# Functions offered to the model; passed to generate() through its `tools` argument.
tools = [get_current_temperature]

We can now call generate with the tools, and the model will identify that it needs to use a tool to answer the question.

In [None]:
# Ask a question the model can only answer by calling the temperature tool.
messages.append(dict(role="user", content="What is the current temperature in Paris, France?"))

response = generate(messages, tools=tools)
response

We will add the tool call as well as the tool response to the chat messages array. The model will then be able to continue the conversation with the tool response.

In [None]:
import json

# The model is expected to answer with a JSON tool call. Fail with a readable
# message if it replied in plain text instead.
try:
    tool_call = json.loads(response)
except json.JSONDecodeError as err:
    raise ValueError(f"Expected a JSON tool call from the model, got: {response!r}") from err

if not isinstance(tool_call, dict) or "name" not in tool_call:
    raise ValueError(f"Model output is not a tool call: {response!r}")

# Currently an issue with the tokenizer: the model returns 'parameters', but the
# tokenizer expects 'arguments'. Rename only when needed so output that already
# uses 'arguments' does not raise a KeyError.
if "parameters" in tool_call:
    tool_call["arguments"] = tool_call.pop("parameters")

# Run the requested tool before touching the history, so a failed lookup or call
# does not leave a dangling tool call in `messages`. Only functions registered
# in `tools` can be invoked.
available_tools = {tool.__name__: tool for tool in tools}
tool_name = tool_call["name"]
if tool_name not in available_tools:
    raise ValueError(f"Model requested an unknown tool: {tool_name!r}")
tool_result = available_tools[tool_name](**tool_call.get("arguments", {}))

messages.append({"role": "assistant", "tool_calls": [{"type": "function", "function": tool_call}]})
messages.append({"role": "tool", "name": tool_name, "content": str(tool_result)})

response = generate(messages, tools=tools)
response