<a href="https://colab.research.google.com/github/preetham-p-m/llm/blob/main/google_colab/quantizing_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the notebook's dependencies quietly (-q) via a shell escape.
# NOTE(review): no versions are pinned, so a fresh runtime may pull different releases — consider pinning.
!pip install -q requests torch bitsandbytes transformers sentencepiece accelerate

In [None]:
import gc

import torch
from google.colab import userdata
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig

In [None]:
# Fetch the Hugging Face token from Colab's user secrets and log in with it,
# also storing it as a git credential.
hf_token = userdata.get("HUGGINGFACE_API_KEY")
login(
    token=hf_token,
    add_to_git_credential=True,
)

In [None]:
# Hugging Face Hub model ids used by the cells in this notebook.
LLAMA = "meta-llama/Meta-Llama-3.1-8B-Instruct"
PHI3 = "microsoft/Phi-3-mini-4k-instruct"
GEMMA2 = "google/gemma-2-2b-it"
QWEN2 = "Qwen/Qwen2-7B-Instruct" # exercise for you
MIXTRAL = "mistralai/Mixtral-8x7B-Instruct-v0.1" # If this doesn't fit in your GPU memory, try other models from the hub

In [None]:
# Choose the model to run and build the chat prompt: one system turn, one user turn.
ACTIVE_MODEL = LLAMA
system_message = {"role": "system", "content": "You are a helpful assistant."}
user_message = {"role": "user", "content": "Tell a light hearted joke for room of Data Scientists"}
messages = [system_message, user_message]

In [None]:
# bitsandbytes settings shared by every model load in this notebook:
# 4-bit loading, "nf4" quant type, double quantization on, bfloat16 compute dtype.
bnb_options = {
    "load_in_4bit": True,
    "bnb_4bit_use_double_quant": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_compute_dtype": torch.bfloat16,
}
quantization_config = BitsAndBytesConfig(**bnb_options)

In [None]:
# Load the tokenizer for the active model and turn the chat messages into token ids.
tokenizer = AutoTokenizer.from_pretrained(ACTIVE_MODEL)
# Reuse EOS as the padding token; pad_token_id follows from pad_token.
# pad_token_type_id is deliberately not touched: it is a token-*type* (segment) id,
# not the id of the padding token, so assigning eos_token_id to it would be wrong.
tokenizer.pad_token = tokenizer.eos_token
# add_generation_prompt=True ends the prompt with the assistant-turn header so the
# model writes a reply rather than continuing the conversation text.
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
).to("cuda")

In [None]:
# Load the active model with the shared quantization settings; device placement
# is left to device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    ACTIVE_MODEL,
    device_map="auto",
    quantization_config=quantization_config,
)

In [None]:
# Report the loaded model's memory footprint, converted to megabytes by dividing by 1e6.
memory_mb = model.get_memory_footprint() / 1e6
print(f"Memory footprint: {memory_mb:,.1f} MB")

In [None]:
# Show the module tree. print() is used instead of a bare `model` expression so that
# IPython's output cache (`Out[...]`, `_`) does not keep a reference to the model,
# which would prevent the later `del model` from releasing its GPU memory.
print(model)

In [None]:
# Generate up to 80 new tokens for the prepared prompt, then decode and print
# the first (only) returned sequence.
outputs = model.generate(inputs, max_new_tokens=80)
generated_text = tokenizer.decode(outputs[0])
print(generated_text)

In [None]:
# Cleanup: drop the references, collect garbage, then release cached GPU memory.

del inputs, outputs, model
# Objects caught in reference cycles are only reclaimed by the cycle collector,
# so run it before asking PyTorch to hand back its cached, now-unused CUDA blocks.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Everything in a single function

def generate(model, messages):
  """Load a quantized chat model, stream its reply to `messages`, then free it.

  Args:
    model: Model id or path passed to `from_pretrained` for both the tokenizer
      and the causal LM.
    messages: Chat turns as a list of {"role": ..., "content": ...} dicts, in
      the form accepted by `tokenizer.apply_chat_template`.

  Returns:
    None. The text is written to stdout by `TextStreamer` while it is generated.

  Uses the module-level `quantization_config` when loading the model.
  """
  tokenizer = AutoTokenizer.from_pretrained(model)
  tokenizer.pad_token = tokenizer.eos_token
  streamer = TextStreamer(tokenizer)
  # Bound to a new name so the `model` argument (an id string) is never rebound
  # to a different kind of object.
  llm = AutoModelForCausalLM.from_pretrained(model, quantization_config=quantization_config, device_map="auto")
  inputs = None
  try:
    # add_generation_prompt=True ends the prompt with the assistant-turn header;
    # return_dict=True also yields the attention mask, which is passed to generate().
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to(llm.device)  # follow wherever device_map="auto" put the model
    llm.generate(**inputs, max_new_tokens=80, streamer=streamer)
  finally:
    # Release GPU memory even when generation fails, so the next model can load.
    del inputs, llm
    gc.collect()
    torch.cuda.empty_cache()


In [None]:
# Run the same prompt through Llama and then Qwen2, one model at a time.
for model_id in (LLAMA, QWEN2):
    generate(model_id, messages)

In [None]:
# Same user prompt without the system turn, sent to Gemma 2.
# NOTE(review): presumably the system turn is dropped because Gemma's chat template
# does not accept a system role — confirm.
user_only_turn = {"role": "user", "content": "Tell a light hearted joke for room of Data Scientists"}
messages = [user_only_turn]
generate(GEMMA2, messages)