In [None]:
# Checkpoint selection: keep exactly one `model_id` assignment active.
#
# Mistral-style models
# model_id = "GritLM/GritLM-7B"
# model_id = "mistralai/Mistral-7B-Instruct-v0.3"
# model_id = "google/gemma-1.1-7b-it"
#
# Llama-style models
# model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
# model_id = "nvidia/Llama3-ChatQA-1.5-8B"
# model_id = "meta-llama/Meta-Llama-3-70B-Instruct"
#
# The active value is a relative path rather than a hub repo id
# (presumably a local copy of the 70B checkpoint -- confirm it exists).
model_id = "models/meta-llama__Meta-Llama-3-70B-Instruct"

# Model-loading options, handed to `pipeline(..., model_kwargs=...)`.
# NOTE(review): no dtype is specified here, so the library default applies.
model_kwargs = dict(
    low_cpu_mem_usage=True,
    device_map="sequential",
)

# Sampling options handed to the pipeline on every conversation turn.
generate_kwargs = dict(
    max_new_tokens=1024,
    do_sample=True,
    temperature=0.7,
    top_k=50,
    top_p=0.95,
)

In [None]:
from transformers import AutoTokenizer
from transformers import TextStreamer
from transformers import pipeline

import torch

# One tokenizer instance is shared by the streamer and the pipeline.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Prints decoded text as tokens are produced, so replies show up live.
streamer = TextStreamer(tokenizer)

# Build the chat pipeline. `model_kwargs` carries the loading options and
# `streamer` is supplied once here so every later call streams its output.
# NOTE(review): the "conversational" task is deprecated in recent
# transformers releases -- confirm the installed version still provides it.
pipe = pipeline(
    task="conversational",
    model=model_id,
    tokenizer=tokenizer,
    streamer=streamer,
    model_kwargs=model_kwargs,
)

In [None]:
# Project-local helper module (not shown in this notebook).
import utils
# Diagnostics after loading: report on the model held by the pipeline ...
utils.print_model_info(pipe.model)
# ... and on the available devices. Presumably both only print -- confirm in utils.
utils.print_device_info()

In [None]:
# Role under which the instructions are sent. The commented alternative is
# presumably for models whose chat template rejects a "system" role.
system_role = "system"
# system_role = "user"

# Persona / behaviour instructions that open the conversation.
system_prompt = """
You are the user's friend and you care about their well being.
You will converse with the user and attempt to get to know them better.
You will only ask one question in each of your responses.
"""

# Running chat history; later cells append user turns to this same list.
messages = [
    {
        "role": system_role,
        "content": system_prompt,
    }
]

# Unpack the sampling settings so each one arrives as its own keyword
# argument. Passing `kwargs=generate_kwargs` would send a single argument
# named "kwargs", and the sampling settings would not be applied.
# The return value is discarded; the reply is streamed by the pipeline's
# streamer. NOTE(review): this relies on the pipeline appending the assistant
# reply to `messages` in place -- confirm for the installed version.
_ = pipe(messages, **generate_kwargs)

In [None]:
# Next user turn, added to the shared chat history.
messages.append(
    {
        "role": "user",
        "content": "Everything is good! I'm excited to be developing AI technology lately."
    }
)

# Unpack the sampling settings so each one arrives as its own keyword
# argument; `kwargs=generate_kwargs` would pass one argument named "kwargs"
# and the settings would not be applied.
_ = pipe(messages, **generate_kwargs)

In [None]:
# Follow-up user turn, added to the shared chat history.
messages.append(
    {
        "role": "user",
        "content": "I want to develop advanced tools for personalizing AI interactions."
    }
)

# Unpack the sampling settings so each one arrives as its own keyword
# argument; `kwargs=generate_kwargs` would pass one argument named "kwargs"
# and the settings would not be applied.
_ = pipe(messages, **generate_kwargs)