In [1]:
import getpass
import os

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from transformers.models.llama import LlamaConfig

# Helper that totals the GPU memory usage reported by PyTorch.
def get_total_gpu_memory_used():
    """Return the summed ``torch.cuda.memory_allocated`` value of all devices, in GiB.

    Every device index in ``range(torch.cuda.device_count())`` is queried and
    the byte counts are added together before converting to GiB. When no
    device is reported the result is ``0.0``.
    """
    bytes_per_gib = 1024 ** 3
    device_indices = range(torch.cuda.device_count())
    total_bytes = sum(torch.cuda.memory_allocated(idx) for idx in device_indices)
    return total_bytes / bytes_per_gib  # convert bytes to GiB

# Model name and on-disk cache location.
base_model_name = "meta-llama/Llama-3.1-70B-Instruct"
# $USER is not guaranteed to be set; os.environ.get would then return None and
# the cache would silently land in "/raid/None/...". Fall back to
# getpass.getuser(), which raises instead of returning None if no user name
# can be determined.
user_dir = os.environ.get("USER") or getpass.getuser()
# The "/" in the repo id is replaced so the model gets a single directory level.
save_directory = f"/raid/{user_dir}/{base_model_name.replace('/', '_')}"
os.makedirs(save_directory, exist_ok=True)

# Quantization settings (4-bit loading, "nf4" quant type, bfloat16 compute
# dtype, double quantization enabled) handed to the model loader as
# `quantization_config`.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True
)

# Record GPU usage before loading the model.
torch.cuda.empty_cache()
memory_used_before = get_total_gpu_memory_used()
print(f"GPU memory used before loading model: {memory_used_before:.2f} GiB")

# Read the Hub access token once so that the config download and the weight
# download authenticate identically. Previously only the weight download
# received the token. If the variable is unset this is None, exactly as before.
hf_token = os.environ.get("HUGGINGFACE_TOKEN")

# Model configuration.
config = LlamaConfig.from_pretrained(
    base_model_name,
    cache_dir=save_directory,
    token=hf_token,
)

# Load the quantized model.
# NOTE(review): with device_map="auto" some layers may presumably be placed
# off-GPU when memory is tight; in that case the delta printed below would
# understate the model's footprint — confirm via model_quantized.hf_device_map.
model_quantized = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    config=config,
    device_map="auto",
    quantization_config=bnb_config,
    cache_dir=save_directory,
    token=hf_token,
)

# Record GPU usage after loading the model.
memory_used_after = get_total_gpu_memory_used()
print(f"GPU memory used after loading model: {memory_used_after:.2f} GiB")

# Report how much GPU usage grew as a result of the load.
memory_used_by_model = memory_used_after - memory_used_before
print(f"GPU memory used by the quantized model: {memory_used_by_model:.2f} GiB")


GPU memory used before loading base model: 0.00 GiB


Loading checkpoint shards:   0%|          | 0/30 [00:00<?, ?it/s]

GPU memory used after loading base model: 525.67 GiB
GPU memory used by the base model: 525.67 GiB
