In [None]:
# --- Model selection --------------------------------------------------------
# Exactly one `model_id` assignment should be left uncommented.

# model_id = "mistralai/Mistral-Nemo-Instruct-2407"

## Quantized Gemma model, stored locally
# model_id = "models/google__gemma-2-27b-it"

model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"

## Quantized Llama model, stored locally
# model_id = "models/meta-llama__Meta-Llama-3-70B-Instruct"

# --- Loading options ----------------------------------------------------------
# Passed to the pipeline as `model_kwargs` when the model is loaded.
model_kwargs = dict(
    low_cpu_mem_usage=True,
    # load the model into GPUs sequentially, to avoid memory allocation issues with balancing
    device_map="sequential",
    torch_dtype="auto",
)

# --- Sampling options ---------------------------------------------------------
# Baseline sampling settings for text generation.
generate_kwargs = dict(
    max_new_tokens=1024,
    do_sample=True,
    temperature=0.7,
    top_k=50,
    top_p=0.95,
)

In [None]:
from transformers import AutoTokenizer
from transformers import TextStreamer
from transformers import pipeline

import torch

# Tokenizer for the selected checkpoint; shared by the streamer and the pipeline.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Streamer handed to the pipeline so generated text is emitted while it is
# being produced instead of only after generation finishes.
streamer = TextStreamer(tokenizer)

# Text-generation pipeline. The model weights are loaded here, using the
# loading options collected in `model_kwargs`.
pipe = pipeline(
    task="text-generation",
    model=model_id,
    tokenizer=tokenizer,
    model_kwargs=model_kwargs,
    streamer=streamer,
)

In [None]:
# Diagnostics after loading. `utils` is a project-local module that is not
# visible from this notebook, so the notes below are assumptions to confirm.
import utils
# Presumably prints a summary of the loaded model (e.g. size / dtype) -- TODO confirm in utils.
utils.print_model_info(pipe.model)
# Presumably prints the available compute devices and their memory -- TODO confirm in utils.
utils.print_device_info()

In [None]:
# Generation settings passed to every `pipe(...)` call.
# The special-token ids are taken from the loaded tokenizer; EOS doubles as the
# padding token.
generate_kwargs = {
    "max_new_tokens": 1024,
    "do_sample": True,
    "temperature": 0.7,
    "top_k": 50,
    "top_p": 0.95,
    "bos_token_id": tokenizer.bos_token_id,
    "pad_token_id": tokenizer.eos_token_id,
    "eos_token_id": tokenizer.eos_token_id
}

# Llama 3 style chat checkpoints close each turn with "<|eot_id|>", so it is
# added as an additional stop token. The substring test on `model_id` also
# matches Llama checkpoints whose vocabulary has no such token; for those,
# `convert_tokens_to_ids` yields the unknown-token id (or None), which must not
# be installed as a stop token. The id is also skipped when it already is the
# tokenizer's EOS, to avoid a redundant [eos, eos] list.
if 'llama' in model_id.lower():
    eot_token_id = tokenizer.convert_tokens_to_ids("<|eot_id|>")
    eot_is_known = (
        eot_token_id is not None
        and eot_token_id != tokenizer.unk_token_id
    )
    if eot_is_known and eot_token_id != tokenizer.eos_token_id:
        generate_kwargs["eos_token_id"] = [
            tokenizer.eos_token_id,
            eot_token_id,
        ]

## Mistral states that lower temperatures should be used with Nemo
if 'nemo' in model_id.lower():
    generate_kwargs["temperature"] = 0.3

In [None]:
# Persona / instructions given to the model for the whole conversation.
system_prompt = """
You are the user's friend and you care about their well being.
You will converse with the user and attempt to get to know them better.
You will only ask one question in each of your responses.
"""

# (Re)start the conversation with only the system message, so re-running this
# cell always begins from a clean history.
messages = [
    {
        "role": "system",
        "content": system_prompt,
    }
]

# Let the model open the conversation. For chat input the pipeline returns
# [{"generated_text": <input messages + new assistant message>}]; the last
# entry is the freshly generated assistant turn. It is stored in `messages`
# so that later turns are generated with the full dialogue as context.
response = pipe(messages, **generate_kwargs)
messages.append(response[0]["generated_text"][-1])

In [None]:
# User turn: add the user's message to the running conversation.
messages.append(
    {
        "role": "user",
        "content": "Everything is good! I'm excited to be developing AI technology lately."
    }
)

# Generate the reply. For chat input the pipeline returns
# [{"generated_text": <input messages + new assistant message>}]; the last
# entry is the assistant's answer. It is appended to `messages` so the next
# user turn follows an assistant turn and the model can see what it said.
response = pipe(messages, **generate_kwargs)
messages.append(response[0]["generated_text"][-1])

In [None]:
# User turn: add the user's message to the running conversation.
messages.append(
    {
        "role": "user",
        "content": "I want to develop advanced tools for personalizing AI interactions."
    }
)

# Generate the reply. For chat input the pipeline returns
# [{"generated_text": <input messages + new assistant message>}]; the last
# entry is the assistant's answer. It is appended to `messages` so any further
# turn continues from a complete history.
response = pipe(messages, **generate_kwargs)
messages.append(response[0]["generated_text"][-1])