Mistral models

In [1]:
from transformers import BitsAndBytesConfig

import torch

# Checkpoint loaded by the pipeline cell further down.
# Alternative checkpoints (swap in by uncommenting one):
# model_name = "mistralai/Mistral-7B-Instruct-v0.2"
# model_name = "google/gemma-7b-it"
model_name = "GritLM/GritLM-7B"

# Token id passed as `pad_token_id` when the pipeline is called.
pad_token_id = 1

# The instruction is sent as the first *user* turn, followed by an empty
# assistant turn, and only then the actual question (no "system" role here).
messages = [
    dict(
        role="user",
        content="You are a friendly chatbot that always answers accurately with short and accurate answers.",
    ),
    dict(role="assistant", content=""),
    dict(role="user", content="What is electromicroscopy?"),
]

# 4-bit NF4 quantization with double quantization and a bfloat16 compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

Llama models

In [1]:
from transformers import BitsAndBytesConfig

# Checkpoint loaded by the pipeline cell further down.
model_name = "meta-llama/Llama-2-7b-chat-hf"

# Token id passed as `pad_token_id` when the pipeline is called.
pad_token_id = 2

# Load the weights with 8-bit quantization.
bnb_config = BitsAndBytesConfig(load_in_8bit=True)

# This conversation uses a dedicated "system" turn for the instruction,
# followed by the user's question.
messages = [
    dict(
        role="system",
        content="You are a friendly chatbot that always answers accurately with short and accurate answers.",
    ),
    dict(role="user", content="What is electromicroscopy?"),
]

In [2]:
# Switch to the small TinyLlama chat checkpoint and disable quantization.
# `pad_token_id` and `messages` are NOT redefined here; they keep whatever
# values an earlier config cell assigned.
model_name, bnb_config = "TinyLlama/TinyLlama-1.1B-Chat-v1.0", None

In [2]:
# Options handed to the pipeline as `model_kwargs` (model-loading settings).
# `quantization_config` is whatever `bnb_config` the active config cell set,
# which may be None.
model_kwargs = dict(
    torch_dtype="auto",
    quantization_config=bnb_config,
    attn_implementation="flash_attention_2",
)

# Sampling settings intended for text generation.
generate_kwargs = dict(
    max_new_tokens=1024,
    do_sample=True,
    temperature=0.7,
    top_k=50,
    top_p=0.95,
)

In [3]:
from transformers import AutoTokenizer
from transformers import TextStreamer
from transformers import pipeline

import torch

# Tokenizer for the selected checkpoint; shared by the streamer and the pipeline.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Streamer built on the same tokenizer; it is handed to the pipeline so that
# generated text is emitted while generation is running.
streamer = TextStreamer(tokenizer)

# NOTE(review): the "conversational" task is believed to be deprecated in newer
# transformers releases in favour of "text-generation" with chat messages --
# confirm against the installed version before upgrading.
pipe = pipeline(
    task="conversational",
    model=model_name,
    tokenizer=tokenizer,
    model_kwargs=model_kwargs,
    device_map="auto",
    streamer=streamer,
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
# Grab the underlying model from the pipeline and print a few diagnostics.
model = pipe.model

# Memory footprint, integer-divided by 1024**2 before printing.
print(f'Model size : {model.get_memory_footprint() // 1024 ** 2} MB')

# The quantization attributes below may be absent on a given model, so each
# group is reported on a best-effort basis. Only AttributeError is suppressed;
# anything else (including KeyboardInterrupt) now propagates instead of being
# silently swallowed by a bare `except`.
try:
    print(f'Model is quantized : {model.is_quantized}')
    print(f'Model quantization method : {model.quantization_method}')
except AttributeError:
    pass

try:
    print(f'Model 8-bit quantized : {model.is_loaded_in_8bit}')
except AttributeError:
    pass

try:
    print(f'Model 4-bit quantized : {model.is_loaded_in_4bit}')
except AttributeError:
    pass

# Device check based on the first parameter tensor only.
print(f'Model on GPU : {next(model.parameters()).is_cuda}')

Model size : 4340 MB
Model is quantized : True
Model quantization method : bitsandbytes
Model 4-bit quantized : True
Model on GPU : True


In [5]:
# Run the conversation through the pipeline. The sampling settings must be
# unpacked with `**`: writing `kwargs=generate_kwargs` passes a single keyword
# argument literally named "kwargs", so max_new_tokens / do_sample /
# temperature / top_k / top_p never reach generation as individual options.
# The result is bound to `_` to suppress the cell's output repr; the text
# itself is emitted through the streamer attached to the pipeline.
_ = pipe(messages, pad_token_id=pad_token_id, **generate_kwargs)

<s> <|user|>
You are a friendly chatbot that always answers accurately with short and accurate answers.
<|assistant|>
</s> 
<|user|>
What is electromicroscopy?
<|assistant|>
Electromicroscopy is a type of microscopy that uses electrical signals to create images of samples. It is used to study the electrical properties of materials and can provide information about the distribution of charges within a sample. There are several types of electromicroscopy, including scanning probe microscopy (SPM), scanning electron microscopy (SEM), and transmission electron microscopy (TEM).</s>
