In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from huggingface_hub import login
from peft import PeftModel
import torch

2025-04-06 17:14:49.840368: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-06 17:14:49.842637: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-04-06 17:14:49.846587: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1743977689.854075 2936357 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1743977689.856285 2936357 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been regist

## Configurations

In [3]:
# Base checkpoint that both adapters are applied on top of.
model_id = "mistralai/Mistral-7B-v0.1"

# Hub repositories holding the two task-specific adapters.
peft_model_gsm8k_id, peft_model_magicoder_id = "predibase/gsm8k", "predibase/magicoder"

# Names under which the adapters are registered on the PEFT model.
adapter_name_gsm8k, adapter_name_magicoder = "gsm8k", "magicoder"

# Name given to the blended adapter (also reused for the save directory).
merged_adapter_name = "gsm8k_magicoder_avg"

In [4]:
# Choose the execution device and a compute dtype that the device can handle.
device = "cuda" if torch.cuda.is_available() else "cpu"
if device == "cuda":
    # Not every GPU supports bfloat16, so query the runtime and fall back to
    # float16 automatically instead of requiring a manual edit per machine.
    compute_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
else:
    # No GPU available: keep full precision on CPU.
    compute_dtype = torch.float32

## Load model and Adapters

In [5]:
# The tokenizer and the base weights are both fetched from the same checkpoint id.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# device_map="auto" leaves device placement to the loader (spreading across
# GPUs when available/needed); weights are loaded in the dtype chosen above.
base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=compute_dtype,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
# Wrap the base model with the first adapter, registering it under an explicit
# name so it can be referenced later when the adapters are combined.
model = PeftModel.from_pretrained(
    model=base_model,
    model_id=peft_model_gsm8k_id,
    adapter_name=adapter_name_gsm8k,
    device_map="auto",
)

# Attach the second adapter to the same wrapped model under its own name.
# Kept as the final expression so the cell still displays the load result.
model.load_adapter(
    model_id=peft_model_magicoder_id,
    adapter_name=adapter_name_magicoder,
)

This can be used to load a bitsandbytes version that is different from the PyTorch CUDA version.
If this was unintended set the BNB_CUDA_VERSION variable to an empty string: export BNB_CUDA_VERSION=
If you use the manual override make sure the right libcudart.so is in your LD_LIBRARY_PATH
For example by adding the following to your .bashrc: export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:<path_to_cuda_dir/lib64



<All keys matched successfully>

In [7]:
# Register a new adapter that is an equal-weight (0.5 / 0.5) blend of the two
# adapters loaded above.
#
# combination_type is passed explicitly on purpose: PEFT's default for
# add_weighted_adapter is "svd", not "linear", so omitting it would produce a
# different merge.
# NOTE(review): the "linear" mode is documented as requiring all source
# adapters to share the same rank — confirm both adapters satisfy this.
model.add_weighted_adapter(
    adapters=[adapter_name_gsm8k, adapter_name_magicoder],
    weights=[0.5, 0.5],
    adapter_name=merged_adapter_name,
    combination_type="linear",
)

In [8]:
# Target folder for the blended adapter and the tokenizer files.
save_directory = f"weights/{merged_adapter_name}"

# Make the blended adapter the active one before persisting it.
model.set_adapter(merged_adapter_name)

print(f"\nSaving the merged adapter '{merged_adapter_name}' to {save_directory}...")

# Only the blended adapter is written; the two source adapters are skipped.
# NOTE(review): PEFT may place a non-default adapter in a subfolder named
# after the adapter — confirm the on-disk layout before reloading.
model.save_pretrained(
    save_directory,
    selected_adapters=[merged_adapter_name],
)

# Left as the last expression so the notebook shows the written tokenizer files.
tokenizer.save_pretrained(save_directory)


Saving the merged adapter 'gsm8k_magicoder_avg' to weights/gsm8k_magicoder_avg...


('weights/gsm8k_magicoder_avg/tokenizer_config.json',
 'weights/gsm8k_magicoder_avg/special_tokens_map.json',
 'weights/gsm8k_magicoder_avg/tokenizer.model',
 'weights/gsm8k_magicoder_avg/added_tokens.json',
 'weights/gsm8k_magicoder_avg/tokenizer.json')

## Testing

In [None]:
# Smoke test: generate a short completion for one math-style and one code-style
# prompt using the merged adapter.
print("\n--- Testing the merged model ---")
# The merged adapter was activated with model.set_adapter() before saving;
# print the active adapter to verify it is still the merged one.
print(f"Current active adapter: {model.active_adapter}")

prompt_gsm8k = "What is 5 * 8 + 3?" # Example GSM8K style
prompt_magicoder = "def fibonacci(n):" # Example Magicoder style

# Fall back to EOS as the pad token when the tokenizer defines none, so that
# padding=True and pad_token_id below have a valid value.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

for prompt in [prompt_gsm8k, prompt_magicoder]:
    print(f"\nPrompt: {prompt}")
    # Move the encoded prompt to the same device as the model.
    inputs = tokenizer(prompt, return_tensors="pt", padding=True).to(model.device)
    with torch.no_grad(): # No gradients are needed during inference
        outputs = model.generate(
            **inputs,
            max_new_tokens=50,
            # Greedy decoding, stated explicitly. The previous temperature=0.7
            # had no effect because sampling was never enabled (temperature is
            # only used when do_sample=True) and merely triggered a warning.
            do_sample=False,
            pad_token_id=tokenizer.pad_token_id, # Important for generation
        )
    # outputs[0] holds the prompt tokens followed by the generated continuation.
    decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"Generated Output:\n{decoded_output}")


--- Testing the merged model ---
Current active adapter: gsm8k_magicoder_avg

Prompt: What is 5 * 8 + 3?




Generated Output:
What is 5 * 8 + 3?

##### Answer: 41
##### 5 * 8 = 40
##### 40 + 3 = 43
##### 43 - 3 = 40


Prompt: def fibonacci(n):
Generated Output:
def fibonacci(n):
    if n == 0:
        return 0
    if n == 1:
        return 1
    return fibonacci(n-1) + fibonacci(n-2)

n =

--- Example of loading the saved merged adapter later ---
