## Evaluation

In [1]:
from transformers import (
	AutoModelForCausalLM, 
	AutoTokenizer, 
	AutoConfig
)
from datasets import load_dataset
import torch
import os
import json
from peft import PeftModel

torch_dtype = torch.bfloat16
device_map = {"": 0}

# model_id = "../model_hub/Qwen2.5-3B-Instruct/"
model_id = "../models/merged-test2/"
# model_id = "../models/Llama-3.1-8B-Stheno-v3.4"
# model_id = "../models/Meta-Llama-3.1-8B-Instruct/"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    # config=AutoConfig.from_pretrained(model_id),
    device_map=device_map,
    torch_dtype=torch_dtype,
    # load_in_8bit=True
)
# model = PeftModel.from_pretrained(model, peft_dolphin)
# model = model.merge_and_unload()

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

In [2]:
from transformers import GenerationConfig, TextStreamer
def generate(prompt, max_new_tokens=1024):
    input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"].to(model.device)
    model.eval()
    with torch.no_grad():
        generation_config = GenerationConfig(
            repetition_penalty=1.13,
            max_new_tokens=max_new_tokens,
            temperature=0.4,
            top_p=0.95,
            # top_k=20,
            # bos_token_id=tokenizer.bos_token_id,
            # eos_token_id=tokenizer.eos_token_id,
            # eos_token_id=0, # for open-end generation.
            pad_token_id=tokenizer.pad_token_id,
            do_sample=True,
            use_cache=True,
            return_dict_in_generate=True,
            output_attentions=False,
            output_hidden_states=False,
            output_scores=False,
        )
        streamer = TextStreamer(tokenizer, skip_prompt=True)
        generated = model.generate(
            inputs=input_ids,
            generation_config=generation_config,
            streamer=streamer,
        )
    gen_tokens = generated["sequences"].cpu()[:, len(input_ids[0]):]
    output = tokenizer.batch_decode(gen_tokens)[0]
    output = output.split(tokenizer.eos_token)[0]
    return output.strip()

In [20]:
prompt = "Who created you?"
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)

In [24]:
answer = generate(text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


I was created by Meta AI, which is a part of Meta Platforms, Inc. My primary function is to assist users with information and tasks to the best of my abilities based on my training data. I'm constantly learning and improving through machine learning algorithms.<|eot_id|>


In [6]:
import numpy as np
# np.degrees(0.00004)
np.arccos(1)

np.float64(0.0)

## Mergekit

In [1]:
# actually do merge
import torch
import yaml

from mergekit.config import MergeConfiguration
from mergekit.merge import MergeOptions, run_merge

In [5]:
OUTPUT_PATH = "../models/merged-test"  # folder to store the result in
LORA_MERGE_CACHE = "/tmp"  # change if you want to keep these for some reason
CONFIG_YML = "slerp-config.yaml"  # merge configuration file
COPY_TOKENIZER = True  # you want a tokenizer? yeah, that's what i thought
LAZY_UNPICKLE = False  # experimental low-memory model loader
LOW_CPU_MEMORY = False  # enable if you somehow have more VRAM than RAM+swap

In [6]:
with open(CONFIG_YML, "r", encoding="utf-8") as fp:
    merge_config = MergeConfiguration.model_validate(yaml.safe_load(fp))

options=MergeOptions(
	lora_merge_cache=LORA_MERGE_CACHE,
	# cuda=torch.cuda.is_available(),
	copy_tokenizer=COPY_TOKENIZER,
	lazy_unpickle=LAZY_UNPICKLE,
	low_cpu_memory=LOW_CPU_MEMORY,
)

In [7]:
run_merge(
    merge_config,
    out_path=OUTPUT_PATH,
    options=MergeOptions(
        lora_merge_cache=LORA_MERGE_CACHE,
        # cuda=torch.cuda.is_available(),
        copy_tokenizer=COPY_TOKENIZER,
        lazy_unpickle=LAZY_UNPICKLE,
        low_cpu_memory=LOW_CPU_MEMORY,
    ),
)
print("Done!")

Warmup loader cache: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 728.49it/s]
Executing graph: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1457/1457 [03:49<00:00,  6.35it/s]

Done!





## From scratch

In [1]:
from typing import Any, Dict, List, Optional, Union

import numpy as np
import torch
import os
import shutil
import json
import math
from tqdm import tqdm
from safetensors import safe_open
from safetensors.torch import save_file

In [2]:
def lerp(
    t: float, v0: Union[np.ndarray, torch.Tensor], v1: Union[np.ndarray, torch.Tensor]
) -> Union[np.ndarray, torch.Tensor]:
    return (1 - t) * v0 + t * v1


def slerp(
    t: Union[float, np.ndarray],
    v0: Union[np.ndarray, torch.Tensor],
    v1: Union[np.ndarray, torch.Tensor],
    DOT_THRESHOLD: float = 0.9995,
    eps: float = 1e-8,
):
    """
    Spherical linear interpolation

    From: https://gist.github.com/dvschultz/3af50c40df002da3b751efab1daddf2c
    Args:
        t (float/np.ndarray): Float value between 0.0 and 1.0
        v0 (np.ndarray): Starting vector
        v1 (np.ndarray): Final vector
        DOT_THRESHOLD (float): Threshold for considering the two vectors as
                               colinear. Not recommended to alter this.
    Returns:
        v2 (np.ndarray): Interpolation vector between v0 and v1
    """
    is_torch = False
    if not isinstance(v0, np.ndarray):
        is_torch = True
        v0 = v0.detach().cpu().float().numpy()
    if not isinstance(v1, np.ndarray):
        is_torch = True
        v1 = v1.detach().cpu().float().numpy()

    # Copy the vectors to reuse them later
    v0_copy = np.copy(v0)
    v1_copy = np.copy(v1)

    # Normalize the vectors to get the directions and angles
    v0 = normalize(v0, eps)
    v1 = normalize(v1, eps)
    # import ipdb; ipdb.set_trace()

    # Dot product with the normalized vectors (can't use np.dot in W)
    dot = np.sum(v0 * v1)

    # If absolute value of dot product is almost 1, vectors are ~colinear, so use lerp
    if np.abs(dot) > DOT_THRESHOLD:
        res = lerp(t, v0_copy, v1_copy)
        return maybe_torch(res, is_torch)

    # Calculate initial angle between v0 and v1
    theta_0 = np.arccos(dot)
    sin_theta_0 = np.sin(theta_0)

    # Angle at timestep t
    theta_t = theta_0 * t
    sin_theta_t = np.sin(theta_t)

    # Finish the slerp algorithm
    s0 = np.sin(theta_0 - theta_t) / sin_theta_0
    s1 = sin_theta_t / sin_theta_0
    res = s0 * v0_copy + s1 * v1_copy

    return maybe_torch(res, is_torch)


def maybe_torch(v: np.ndarray, is_torch: bool):
    if is_torch:
        return torch.from_numpy(v)
    return v


def normalize(v: np.ndarray, eps: float):
    norm_v = np.linalg.norm(v)
    if norm_v > eps:
        v = v / norm_v
    return v

In [3]:
def load_tensors(path, signature):
    state_dict = {}
    shard_paths = [f for f in os.listdir(path) if f.endswith('.safetensors')]
    for shard_path in sorted(shard_paths, key=lambda x: int(x.split('-')[1])):
        apath = os.path.join(path, shard_path)
        with safe_open(apath, framework="pt", device="cpu") as f:
            for key in f.keys():
                if signature in key:
                    state_dict[key] = f.get_tensor(key)
    return state_dict

In [4]:
import yaml
CONFIG_FILE = "slerp-config-custom.yaml"

with open(CONFIG_FILE, "r", encoding="utf-8") as f:
    merge_config = yaml.safe_load(f)
merge_config

{'sources': [{'model': '/workspace/models/Meta-Llama-3.1-8B-Instruct'},
  {'model': '/workspace/models/Llama-3.1-8B-Stheno-v3.4'}],
 'layer_range': [0, 32],
 'merge_method': 'slerp',
 'base_model': '/workspace/models/Meta-Llama-3.1-8B-Instruct',
 'output_dir': '/workspace/models/merged-test2',
 'parameters': {'t': [{'filter': 'self_attn', 'value': [0, 0.5, 0.3, 0.7, 1]},
   {'filter': 'mlp', 'value': [1, 0.5, 0.7, 0.3, 0]},
   {'filter': 'fallback_value', 'value': 0.5}]},
 'dtype': 'bfloat16'}

In [5]:
import os
import re
from safetensors import safe_open

def get_layer_signatures(path):
    """
    Extracts and organizes layer signatures from safetensors files in the specified directory.

    Args:
        path (str): Path to the directory containing safetensors files.

    Returns:
        list: A combined list of non-layer and sorted layer signatures.
    """
    def extract_layer_signature(key):
        key = key.strip(".weight")
        match = re.search(r"layers\.[^\.]*\.", key)
        return match.group(0) if match else key

    # Collect all keys from safetensors files
    shard_paths = sorted(
        (f for f in os.listdir(path) if f.endswith('.safetensors')),
        key=lambda x: int(x.split('-')[1])
    )
    keys = []
    for shard_path in shard_paths:
        full_path = os.path.join(path, shard_path)
        with safe_open(full_path, framework="pt", device="cpu") as f:
            keys.extend(f.keys())

    # Separate and process keys
    block_signatures = sorted(
        {extract_layer_signature(key) for key in keys if "layers" in key},
        key=lambda x: int(x.split(".")[-2])
    )
    other_signatures = [extract_layer_signature(key) for 
                        key in keys if "layers" not in key]

    return other_signatures + block_signatures


layer_signatures = get_layer_signatures('/workspace/models/Meta-Llama-3.1-8B-Instruct')
layer_signatures

['model.embed_tokens',
 'lm_head',
 'model.norm',
 'layers.0.',
 'layers.1.',
 'layers.2.',
 'layers.3.',
 'layers.4.',
 'layers.5.',
 'layers.6.',
 'layers.7.',
 'layers.8.',
 'layers.9.',
 'layers.10.',
 'layers.11.',
 'layers.12.',
 'layers.13.',
 'layers.14.',
 'layers.15.',
 'layers.16.',
 'layers.17.',
 'layers.18.',
 'layers.19.',
 'layers.20.',
 'layers.21.',
 'layers.22.',
 'layers.23.',
 'layers.24.',
 'layers.25.',
 'layers.26.',
 'layers.27.',
 'layers.28.',
 'layers.29.',
 'layers.30.',
 'layers.31.']

In [6]:
merge_config["parameters"]['t']

[{'filter': 'self_attn', 'value': [0, 0.5, 0.3, 0.7, 1]},
 {'filter': 'mlp', 'value': [1, 0.5, 0.7, 0.3, 0]},
 {'filter': 'fallback_value', 'value': 0.5}]

In [7]:
def calculate_model_size(model_path):
    files = [os.path.join(model_path, filename)
            for filename in os.listdir(model_path)
            if filename.endswith(".safetensors")]
    
    total_size = sum(
        f.get_tensor(key).nbytes
        for file in files
        for f in [safe_open(file, framework="pt", device="cpu")]
        for key in f.keys()
    )
    return total_size

def make_weightmap(directory):
    files = [f for f in os.listdir(directory) 
             if f.startswith("shard_") and f.endswith(".safetensors")]
    num_shards = len(files)
    files.sort()  # Ensure proper ordering of shard files
    model_index = {
        "metadata": {
            "total_size": calculate_model_size(directory)
        },
        "weight_map": {}
    }
    for index, file in enumerate(files, start=1):
        old_path = os.path.join(directory, file)
        new_name = f"model-{index:05d}-of-{num_shards:05d}.safetensors"
        new_path = os.path.join(directory, new_name)
        os.rename(old_path, new_path)
        
        with safe_open(new_path, framework="pt", device="cpu") as f:
            for key in f.keys():
                model_index["weight_map"][key] = new_name
                
    outfile = os.path.join(directory, "model.safetensors.index.json")
    with open(outfile, "w") as f:
        json.dump(model_index, f, indent=4)
        
    print(f"Saved model weight map to {outfile}.")
    
def shard_model(tmp_dir, output_dir, size_limit=2*1024*1024*1024):
    files = [os.path.join(tmp_dir, filename)
            for filename in sorted(os.listdir(tmp_dir))
            if filename.endswith(".safetensors")]
    
    shard, shard_size, idx = {}, 0, 1
    for file in files:
        with safe_open(file, framework="pt", device="cpu") as f:
            for key in f.keys():
                tensor = f.get_tensor(key)
                tensor_size = tensor.nbytes
                if shard_size + tensor_size > size_limit:
                    output_file = os.path.join(output_dir, f"shard_{idx}.safetensors")
                    save_file(shard, output_file, metadata={"format":"pt"})
                    print(f"Saved {output_file} with size {shard_size / 1024**3:.2f} GB")

                    # start a new shard
                    shard, shard_size = {}, 0
                    idx += 1
                    
                # add new tensor to the current shard
                shard[key] = tensor
                shard_size += tensor_size
                
    # save the last shard
    if shard:
        output_file = os.path.join(output_dir, f"shard_{idx}.safetensors")
        save_file(shard, output_file, metadata={"format":"pt"})
        print(f"Saved {output_file} with size {shard_size / 1024**3:.2f} GB")

    # rename and add model.safetensors.index.json
    make_weightmap(output_dir)

def copy_small_files(base_path, output_dir):
    print(f"Copying files to {output_dir}")
    files_to_copy = [
        "config.json",
        "generation_config.json",
        "special_tokens_map.json",
        "tokenizer_config.json",
        "tokenizer.json"
    ]
    for filename in files_to_copy:
        src_path = os.path.join(base_path, filename)
        dst_path = os.path.join(output_dir, filename)
        try:
            shutil.copy2(src_path, dst_path)
        except FileNotFoundError:
            print(f"File {filename} not found in {base_path}. Skipping.")    
    
def save_checkpoint(config):
    base_path = config['sources'][0]['model']
    output_dir = config["output_dir"]
    tmp_dir = os.path.join(output_dir, "tmp_dir")

    ## copying files
    copy_small_files(base_path, output_dir)

    ## sharding model weights
    shard_model(tmp_dir, output_dir)

In [8]:
def run_merge(
    merge_config
):
    ## Read configs
    model_paths = [x['model'] for x in merge_config['sources']]
    layer_signatures = get_layer_signatures(model_paths[0])
    output_dir = merge_config["output_dir"]
    tmp_dir = os.path.join(output_dir, "tmp_dir")
        
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)

    ## Merge models
    for signature in (
        pbar := tqdm(
            layer_signatures,
            desc="Merging ...",
            ncols=150
        )
    ):
        pbar.set_description(f"Merging {signature}")
        models_tensors = [load_tensors(path, signature) for path in model_paths]
        # result = pseudo_merge_layer(models_tensors, merge_config)
        # for k, v in result.items():
        #     print(f"{k}: {v}")
        # print("---" * 20)
        
        merged_tensors = merge_layer(models_tensors, merge_config)
        
        # label = load_tensors("../models/merged-test/", signature)
        # for key in label:
        #     try:
        #         torch.testing.assert_close(
        #             merged_tensors[key],
        #             label[key]
        #         )
        #     except:
        #         print(key)
        outfile = os.path.join(tmp_dir, f"{signature.strip('.')}.safetensors")
        save_file(merged_tensors, outfile)

    ## Save models
    save_checkpoint(merge_config)


def pseudo_merge_layer(tensors, merge_config):
    """
    Currently merge layer using SLERP.
    """
    assert len(tensors) == 2 
    conditions = merge_config["parameters"]["t"]
    conditions = {c['filter']: c['value'] for c in conditions}

    layer_begin, layer_end = merge_config["layer_range"]
    num_layers = layer_end - layer_begin
    
    weight_names = [key for key in tensors[0].keys()]
    result = {}
    for weight_name in weight_names:
        t = compute_t(weight_name, conditions, num_layers)
        result.update({weight_name: t})
    return result

In [9]:
def merge_layer(tensors, merge_config):
    """
    Currently merge layer using SLERP.
    """
    assert len(tensors) == 2
    conditions = merge_config["parameters"]["t"]
    conditions = {c['filter']: c['value'] for c in conditions}

    layer_begin, layer_end = merge_config["layer_range"]
    num_layers = layer_end - layer_begin
    
    weight_names = [key for key in tensors[0].keys()]
    
    for weight_name in weight_names:
        t = compute_t(weight_name, conditions, num_layers)
        tensor_a = tensors[0][weight_name]
        tensor_b = tensors[1][weight_name]
        
        # tensor_merged = tensors_merged[name]
        tensor_computed = (
            slerp(
                t,
                tensor_a,
                tensor_b,
            )
            .to(tensor_a.dtype)
            .to(tensor_a.device)
        )
        tensors[0][weight_name] = tensor_computed
        # torch.testing.assert_close(tensor_merged, tensor_computed)
    return tensors[0]

In [10]:
def compute_t(weight_name, conditions, num_layers):
    """
    gradient
    """
    anchors = conditions.get("fallback_value")
    if not isinstance(anchors, list):
        anchors = [anchors]

    for filter_name in conditions.keys():
        if filter_name in weight_name:
            anchors = conditions.get(filter_name)
            break
            
    match = re.search(r"layers\.([^\.]*)\.", weight_name)
    if match:
        layer_idx = int(match.group(1))
        layer_t = layer_idx / (num_layers - 1)
        scaled = layer_t * (len(anchors) - 1)
        i0 = math.floor(scaled)
        i1 = min(len(anchors) - 1, i0 + 1)
        frac = scaled - i0
        
        blend_value = (1 - frac) * anchors[i0] + frac * anchors[i1]
    else:
        blend_value = anchors[0]
        
    return blend_value

In [11]:
run_merge(merge_config)

Merging layers.31.: 100%|█████████████████████████████████████████████████████████████████████████████████████████████| 35/35 [01:52<00:00,  3.20s/it]


Copying files to /workspace/models/merged-test2
Saved /workspace/models/merged-test2/shard_1.safetensors with size 1.99 GB
Saved /workspace/models/merged-test2/shard_2.safetensors with size 1.99 GB
Saved /workspace/models/merged-test2/shard_3.safetensors with size 1.92 GB
Saved /workspace/models/merged-test2/shard_4.safetensors with size 1.92 GB
Saved /workspace/models/merged-test2/shard_5.safetensors with size 1.92 GB
Saved /workspace/models/merged-test2/shard_6.safetensors with size 1.99 GB
Saved /workspace/models/merged-test2/shard_7.safetensors with size 1.26 GB
Saved /workspace/models/merged-test2/shard_8.safetensors with size 1.96 GB
Saved model weight map to /workspace/models/merged-test2/model.safetensors.index.json.


In [69]:
save_checkpoint(merge_config)

Copying files to /workspace/models/merged-test2
Saved /workspace/models/merged-test2/shard_1.safetensors with size 1.99 GB
Saved /workspace/models/merged-test2/shard_2.safetensors with size 1.99 GB
Saved /workspace/models/merged-test2/shard_3.safetensors with size 1.92 GB
Saved /workspace/models/merged-test2/shard_4.safetensors with size 1.92 GB
Saved /workspace/models/merged-test2/shard_5.safetensors with size 1.92 GB
Saved /workspace/models/merged-test2/shard_6.safetensors with size 1.99 GB
Saved /workspace/models/merged-test2/shard_7.safetensors with size 1.26 GB
Saved /workspace/models/merged-test2/shard_8.safetensors with size 1.96 GB
Saved model weight map to /workspace/models/merged-test2/model.safetensors.index.json.


In [120]:
model_paths = [x['model'] for x in merge_config['sources']]
signature = "lm_head"
models_tensors = [load_tensors(path, signature) for path in model_paths]

In [123]:
tensor_result = merge_layer(models_tensors, merge_config)

In [129]:
tensor_result

{'lm_head.weight': tensor([[ 0.0082,  0.0072,  0.0125,  ...,  0.0095, -0.0215, -0.0138],
         [-0.0070,  0.0041,  0.0190,  ..., -0.0055,  0.0072, -0.0044],
         [ 0.0135,  0.0043,  0.0143,  ...,  0.0006, -0.0087, -0.0150],
         ...,
         [-0.0038,  0.0020,  0.0034,  ...,  0.0003,  0.0081,  0.0083],
         [-0.0038,  0.0020,  0.0034,  ...,  0.0003,  0.0081,  0.0083],
         [-0.0038,  0.0020,  0.0034,  ...,  0.0003,  0.0081,  0.0083]],
        dtype=torch.bfloat16)}

In [130]:
tensor_label = load_tensors("../models/merged-test/", signature)