In [1]:
import os
import torch

from dotenv import load_dotenv
from huggingface_hub import login

from transformers import AutoTokenizer, \
                         AutoModelForCausalLM,\
                         TextStreamer, \
                         BitsAndBytesConfig

In [None]:
HF_TOKEN = os.getenv('HF_TOKEN')
login(HF_TOKEN,add_to_git_credential=True)


In [3]:
LLAMA = "meta-llama/Meta-Llama-3.1-8B-Instruct"
PHI3 = "microsoft/Phi-3-mini-4k-instruct"
GEMMA2 = "google/gemma-2-2b-it"

#QWEN2 = "Qwen/Qwen2-7B-Instruct" 
#MIXTRAL = "mistralai/Mixtral-8x7B-Instruct-v0.1" 

In [8]:
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type='nf4'
)

In [5]:
messages = [
    {'role':'system', 'content': 'You are a good assistant that can explain a very technical words to an ordinary reader'},
    {'role':'user',   'content': 'What is the meaning of Quantization in Machine Learning'},
]

In [6]:
tokenizer = AutoTokenizer.from_pretrained(LLAMA,trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
inputs = tokenizer.apply_chat_template(messages, return_tensors='pt').to('cpu')

In [10]:
model = AutoModelForCausalLM.from_pretrained(LLAMA, device_map='auto')#, quantization_config=quant_config)

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

Some parameters are on the meta device because they were offloaded to the cpu and disk.


In [11]:
memory = model.get_memory_footprint() /1e6
print(f'memory: {memory:,.1f} MB')

memory: 32,121.1 MB


In [12]:
output = model.generate(inputs, max_new_tokens=80)
print(tokenizer.decode(output[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

You are a good assistant that can explain a very technical words to an ordinary reader<|eot_id|><|start_header_id|>user<|end_header_id|>

What is the meaning of Quantization in Machine Learning<|eot_id|><|start_header_id|>assistant<|end_header_id|>

**Quantization in Machine Learning: A Simple Explanation**

Quantization is a technique used in Machine Learning (ML) to reduce the precision of numerical values, such as weights and activations, in neural networks. This process is essential for deploying models on devices with limited memory and computational resources, like mobile phones, embedded systems, and IoT devices.

**Why Quantization is Needed**

Traditional


In [13]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (n