In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification
from huggingface_hub import login
from peft import PeftModel
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, matthews_corrcoef
from tqdm import tqdm

2025-04-26 22:14:08.721962: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-26 22:14:08.729242: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745723648.737258 1396278 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745723648.739617 1396278 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-26 22:14:08.748733: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [2]:
print(torch.cuda.is_available())  # Should return True if CUDA is available
print(torch.cuda.device_count())  # Number of GPUs detected
print(torch.cuda.get_device_name(0))

True
1
NVIDIA GeForce RTX 4090


In [3]:
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()

## Configurations

In [4]:
device = "cuda" if torch.cuda.is_available() else "cpu"
if device == "cuda":
    compute_dtype = torch.bfloat16 # Or torch.float16 depending on your GPU
else:
    compute_dtype = torch.float32

In [5]:
model_id = "mistralai/Mistral-7B-v0.1"

base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=compute_dtype,
    # device_map={"":"cuda"}, # Automatically distributes across GPUs if available/needed
    device_map='auto',
    # offload_folder='offload/'
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token 

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
# add adapter, if not then base model
# model = base_model

# peft_model_hellaswag_id = "predibase/hellaswag"
# adapter_name_hellaswag = "hellaswag"

# model = PeftModel.from_pretrained( # not loading from local
#     base_model,
#     peft_model_hellaswag_id,
#     adapter_name=adapter_name_hellaswag, # You can name the first adapter here
#     device_map="auto", # Apply device mapping here if needed
#     # low_cpu_mem_usage=True,
#     # offload_folder='offload/'
# )
model = PeftModel.from_pretrained(
    base_model,
    "./weights/element_add/gsm8k_magicoder_hellaswag/gsm8k_magicoder_hellaswag",
    device_map='auto'
    )
# model = model.half()        # convert to half precision (if your GPU supports it)
# model = torch.compile(model) 
model.eval()

This can be used to load a bitsandbytes version that is different from the PyTorch CUDA version.
If this was unintended set the BNB_CUDA_VERSION variable to an empty string: export BNB_CUDA_VERSION=
If you use the manual override make sure the right libcudart.so is in your LD_LIBRARY_PATH
For example by adding the following to your .bashrc: export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:<path_to_cuda_dir/lib64



PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MistralForCausalLM(
      (model): MistralModel(
        (embed_tokens): Embedding(32000, 4096)
        (layers): ModuleList(
          (0-31): 32 x MistralDecoderLayer(
            (self_attn): MistralAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): Linear

In [7]:
dataset = load_dataset("Rowan/hellaswag")
test_dataset = dataset["validation"]  # or "test" if it includes labels

In [8]:
test_dataset

Dataset({
    features: ['ind', 'activity_label', 'ctx_a', 'ctx_b', 'ctx', 'endings', 'source_id', 'split', 'split_type', 'label'],
    num_rows: 10042
})

In [None]:
correct = 0
total = 0

for i in tqdm(range(len(test_dataset))):
    example = dataset['validation'][i]  # ← Access the i-th example correctly
    ctx = example["ctx"]
    endings = example["endings"]
    label = example["label"]
    losses = []

    for ending in endings:
        prompt = ctx.strip() + " " + ending.strip()
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        input_ids = inputs["input_ids"]

        # GPT-style scoring: compute loss of predicting each token (shifted)
        with torch.no_grad():
            outputs = model(**inputs, labels=input_ids)
            loss = outputs.loss.item()
            losses.append(loss)
    
    pred = losses.index(min(losses))
    # print(str(pred) + " == " + str(label) + "? " + str(int(pred) == int(label)))
    
    if int(pred) == int(label):
        correct += 1
    total += 1
    # if i == 10:
    #     break

accuracy = correct / total
print(f"HellaSwag Accuracy (causal LLM): {accuracy:.4f}")


100%|██████████| 10042/10042 [17:05<00:00,  9.79it/s]

HellaSwag Accuracy (causal LLM): 0.2582





: 