Install `ipywidgets`, which the packages below need for progress bars and other notebook widgets.

In [None]:
!pip install --quiet --upgrade ipywidgets

In [None]:
!pip install --quiet --upgrade huggingface_hub

Log in to Hugging Face to access gated models (e.g., Meta's Llama models).

In [None]:
from huggingface_hub import login

# Authenticate with the Hugging Face Hub so that gated models (e.g. Meta's Llama
# models) can be accessed by the cells that follow.
login()

For models that we want to use in a quantized state (e.g., Llama 3 70B), compute and store a quantized version of each model to reduce load times.

In [None]:
!pip install --quiet --upgrade transformers

In [None]:
!pip install --quiet --upgrade bitsandbytes accelerate

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Hub repository of the model to quantize.
model_id = "meta-llama/Meta-Llama-3-70B-Instruct"

# 4-bit NF4 quantization with double quantization and a bfloat16 compute dtype.
# For 8-bit quantization use instead: BitsAndBytesConfig(load_in_8bit=True)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="bfloat16",
)

# Load the checkpoint with the quantization config applied.
# trust_remote_code is deliberately NOT passed: the Llama architecture ships with
# transformers itself, so there is no reason to opt in to executing Python code
# downloaded from the Hub repository.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    torch_dtype="auto",
    low_cpu_mem_usage=True,
    # "sequential" instead of "auto"/"balanced", since otherwise lm_head gets put on CPU.
    device_map="sequential",
)

# Local directory for the quantized copy; "/" in the repo id is replaced with "__"
# so the whole id becomes a single directory name under models/.
quantized_model_id = "models/" + model_id.replace("/", "__")

model.save_pretrained(quantized_model_id)

# Store the tokenizer next to the weights so the directory can be loaded on its own.
tokenizer = AutoTokenizer.from_pretrained(model_id)

tokenizer.save_pretrained(quantized_model_id)

In [None]:
# NOTE(review): `utils` is not defined in this notebook — presumably a helper
# module that lives next to it; confirm what the two helpers actually print.
import utils
# Report on the model that was just quantized and saved.
utils.print_model_info(model)
# Report on the available devices while the model is still loaded.
utils.print_device_info()

In [None]:
# Show which device each part of the model was placed on (e.g. to check that
# lm_head did not end up on the CPU). Left as the cell's last expression rather
# than wrapped in print() so Jupyter pretty-prints the mapping instead of
# emitting it as one long line.
model.hf_device_map

In [None]:
import gc
import torch

# Drop the notebook's reference to the model. Guarded so that re-running this
# cell (or running it after a failed load) does not stop with a NameError
# before the cleanup below has happened.
try:
    del model
except NameError:
    pass  # no model bound in this kernel; nothing to release

# Collect the now-unreachable objects, then ask PyTorch to release its cached
# CUDA memory.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# NOTE(review): `utils` is not defined in this notebook — presumably a helper
# module that lives next to it; confirm what print_device_info actually prints.
import utils
# Report on the devices again, now that the model reference has been deleted.
utils.print_device_info()

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Directory written by the quantization cell. Spelled out literally so this
# cell does not depend on variables from earlier cells.
quantized_model_id = "models/meta-llama__Meta-Llama-3-70B-Instruct"

# local_files_only=True: read exclusively from the local directory.
tokenizer = AutoTokenizer.from_pretrained(quantized_model_id, local_files_only=True)

# Same "sequential" device placement as was used when quantizing.
model = AutoModelForCausalLM.from_pretrained(
    quantized_model_id, local_files_only=True, device_map="sequential"
)

In [None]:
# NOTE(review): `utils` is not defined in this notebook — presumably a helper
# module that lives next to it; confirm what print_model_info actually prints.
import utils
# Report on the model reloaded from the local quantized checkpoint.
utils.print_model_info(model)