In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2,3'
os.environ['HF_HUB_CACHE'] = '/next_share/hf_cache/hub'

import torch
from transformers import (
    AutoTokenizer, AutoModelForCausalLM, PreTrainedModel, AutoModelForSeq2SeqLM, 
    AutoModelForSequenceClassification, AutoConfig, AutoModel, BitsAndBytesConfig
)

from peft import get_peft_model, LoraConfig

In [2]:
def build_gen_model(
    model_name, 
    lora = False, 
    dtype = torch.bfloat16, 
    device_map = None,
    quantization = False
):
    """
    Build generation model, support quantization and lora
    """
    # Determin model auto class by is_encoder_decoder
    config = AutoConfig.from_pretrained(model_name)
    is_seq2seq = getattr(config, 'is_encoder_decoder', False)
    mod_cls = AutoModelForSeq2SeqLM if is_seq2seq  else AutoModelForCausalLM

    # Determin the keyword args of from_pretrained
    ## Determin device_map. Default to the first GPU
    if device_map is None:
        device_map = 0
    
    ## Quantization config for qlora
    if quantization:
        quant_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
    else:
        quant_config = None

    kws = dict(trust_remote_code = True,
               torch_dtype = dtype,
               device_map = device_map,
               quantization_config = quant_config)

    # Build hf model
    model = mod_cls.from_pretrained(model_name, **kws)
    
    # 3. Add lora adapter
    if lora:
        peft_config = LoraConfig(
            r = 16, lora_alpha = 16,
            target_modules = 'all-linear',
            lora_dropout= 0.1,
            bias = "none"
        )
        # determin task_type
        task_type = "SEQ_2_SEQ_LM" if is_seq2seq else "CAUSAL_LM"
        peft_config.task_type = task_type

        model = get_peft_model(model, peft_config)
        model.print_trainable_parameters()
    return model

In [3]:
model = build_gen_model('gpt2', quantization = True)

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

You are calling `save_pretrained` to a 4-bit converted model, but your `bitsandbytes` version doesn't support it. If you want to save 4-bit models, make sure to have `bitsandbytes>=0.41.3` installed.


In [4]:
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Linear4bit(in_features=768, out_features=2304, bias=True)
          (c_proj): Linear4bit(in_features=768, out_features=768, bias=True)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Linear4bit(in_features=768, out_features=3072, bias=True)
          (c_proj): Linear4bit(in_features=3072, out_features=768, bias=True)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affin

In [5]:
model.device

device(type='cuda', index=0)

In [6]:
x = torch.randint(0, 200, (1,100))
out = model(input_ids = x)

In [9]:
torch.utils.cpp_backtrace.CUDA_HOME

AttributeError: module 'torch.utils.cpp_backtrace' has no attribute 'CUDA_HOME'

In [10]:
torch.version.cuda

'12.1'

In [12]:
print("CUDA_PATH:", os.environ.get('CUDA_PATH'))
print("CUDA_HOME:", os.environ.get('CUDA_HOME'))

CUDA_PATH: None
CUDA_HOME: /usr/local/cuda


In [13]:
os.environ.get('LD_LIBRARY_PATH')

In [14]:
torch._C._cuda_getDriverVersion()

AttributeError: module 'torch._C' has no attribute '_cuda_getDriverVersion'

In [15]:
torch.version.cuda

'12.1'