In [1]:
from typing import Any, Dict, List, Optional, Union

import numpy as np
import torch
import os
import re
import math
import shutil

from safetensors import safe_open
from safetensors.torch import save_file
from tqdm.notebook import tqdm

def matching(pattern: str, text: str) -> bool:
   # Convert wildcard * to regex .*
   regex = pattern.replace('.', '\.').replace('*', '[^.]*')
   return bool(re.search(regex, text))

def _load_bin_tensors(path, signature):
    state_dict = {}
    for shard_path in sorted(
        f for f in os.listdir(path) if f.endswith('.bin')
    ):
        checkpoint = torch.load(
            os.path.join(path, shard_path), map_location="cpu", weights_only=True)
        state_dict.update({
            k: v for k, v in checkpoint.items() if matching(signature, k)
        })
    return state_dict

def _load_safe_tensors(path, signature): 
    state_dict = {}
    for shard_path in sorted(
        f for f in os.listdir(path) if f.endswith('.safetensors')
    ):
        with safe_open(
            os.path.join(path, shard_path), framework="pt", device="cpu"
        ) as f:
            state_dict.update({
                k: f.get_tensor(k) for k in f.keys() if matching(signature, k)
            })
    return state_dict

def load_tensors(path, signature="", tensor_format="safe"):
    loaders = {
        "torch": _load_bin_tensors,
        "safe": _load_safe_tensors
    }
    if tensor_format not in loaders:
        raise ValueError(f"Unsupported tensor format: {tensor_format}")
    return loaders[tensor_format](path, signature)

In [2]:
def lerp(
    t: float, v0: Union[np.ndarray, torch.Tensor], v1: Union[np.ndarray, torch.Tensor]
) -> Union[np.ndarray, torch.Tensor]:
    return (1 - t) * v0 + t * v1


def slerp(
    t: Union[float, np.ndarray],
    v0: Union[np.ndarray, torch.Tensor],
    v1: Union[np.ndarray, torch.Tensor],
    DOT_THRESHOLD: float = 0.9995,
    eps: float = 1e-8,
):
    """
    Spherical linear interpolation

    From: https://gist.github.com/dvschultz/3af50c40df002da3b751efab1daddf2c
    Args:
        t (float/np.ndarray): Float value between 0.0 and 1.0
        v0 (np.ndarray): Starting vector
        v1 (np.ndarray): Final vector
        DOT_THRESHOLD (float): Threshold for considering the two vectors as
                               colinear. Not recommended to alter this.
    Returns:
        v2 (np.ndarray): Interpolation vector between v0 and v1
    """
    is_torch = False
    if not isinstance(v0, np.ndarray):
        is_torch = True
        v0 = v0.detach().cpu().float().numpy()
    if not isinstance(v1, np.ndarray):
        is_torch = True
        v1 = v1.detach().cpu().float().numpy()

    # Copy the vectors to reuse them later
    v0_copy = np.copy(v0)
    v1_copy = np.copy(v1)

    # Normalize the vectors to get the directions and angles
    v0 = normalize(v0, eps)
    v1 = normalize(v1, eps)
    # import ipdb; ipdb.set_trace()

    # Dot product with the normalized vectors (can't use np.dot in W)
    dot = np.sum(v0 * v1)

    # If absolute value of dot product is almost 1, vectors are ~colinear, so use lerp
    if np.abs(dot) > DOT_THRESHOLD:
        # print("tensors vewi similar!")
        res = lerp(t, v0_copy, v1_copy)
        return maybe_torch(res, is_torch)

    # Calculate initial angle between v0 and v1
    theta_0 = np.arccos(dot)
    sin_theta_0 = np.sin(theta_0)

    # Angle at timestep t
    theta_t = theta_0 * t
    sin_theta_t = np.sin(theta_t)

    # Finish the slerp algorithm
    s0 = np.sin(theta_0 - theta_t) / sin_theta_0
    s1 = sin_theta_t / sin_theta_0
    res = s0 * v0_copy + s1 * v1_copy
    # print("tensors vewi distant...")
    return maybe_torch(res, is_torch)


def maybe_torch(v: np.ndarray, is_torch: bool):
    if is_torch:
        return torch.from_numpy(v)
    return v


def normalize(v: np.ndarray, eps: float):
    norm_v = np.linalg.norm(v)
    if norm_v > eps:
        v = v / norm_v
    return v

In [3]:
def blend(
    weight_name: str, 
    parameters: dict, 
    layer_idx: int, 
    num_layers: int
):
    assert isinstance(layer_idx, int) or layer_idx is None, (
        f"If the weight {weight_name} belongs to an i-th layer, "
        f"the argument `layer_idx` should be an integer. Otherwise "
        f"it should be a NoneType object. Found `layer_idx` = "
        f"{layer_idx}."
    )
    assert isinstance(num_layers, int), (
        f"You must specify proper argument `num_layers` "
        f"of type `int`. Found `num_layers` = {num_layers}."
    )
    if isinstance(layer_idx, int):
        assert layer_idx <= num_layers - 1, (
            f"The argument `layer_idx` must have lower value than "
            f"the argument `num_layers`. Found "
            f"`layer_idx` = {layer_idx}, `num_layers` = {num_layers}."
        )
    
    matching_filter = next(
        (f for f in parameters if matching(f, weight_name)),
        'default'
    )
    anchors = parameters[matching_filter]
    if not isinstance(anchors, list):
        anchors = [anchors]

    if layer_idx is None:
        return anchors[0]
        
    # Calculate interpolation for layer-specific weights
    layer_fraction = layer_idx / (num_layers - 1)
    anchor_position = layer_fraction * (len(anchors) - 1)
    
    lower_idx = math.floor(anchor_position)
    upper_idx = min(len(anchors) - 1, lower_idx + 1)
    fraction = anchor_position - lower_idx

    interpolated = (
        (1 - fraction) * anchors[lower_idx]
        + fraction * anchors[upper_idx]
    )
    return interpolated

In [4]:
def copy_small_files(base_path, output_dir):
    print(f"Copying files to {output_dir}")
    files_to_copy = [
        "config.json",
        "generation_config.json",
        "special_tokens_map.json",
        "tokenizer_config.json",
        "tokenizer.json",
        "sentencepiece.bpe.model",
        "sparse_linear.pt",
        "colbert_linear.pt",
        "config_sentence_transformers.json",
        "sentence_bert_config.json",
        "modules.json"
    ]
    for filename in files_to_copy:
        src_path = os.path.join(base_path, filename)
        dst_path = os.path.join(output_dir, filename)
        if os.path.exists(src_path):
            shutil.copy2(src_path, dst_path) 

In [5]:
def run_merge(merger_config):
    sd0 = load_tensors(merger_config["models"][0], tensor_format="torch")
    sd1 = load_tensors(merger_config["models"][1], tensor_format="safe")

    parameters = merger_config["parameters"]
    num_layers = merger_config["num_layers"]
    output_dir = merger_config["output_dir"]
    
    keys = list(sd0.keys())
    merged_dict = {}
    for weight_name in tqdm(keys, desc=f"Merging"):
        find_layer = re.search(r"layer\.([^\.]*)\.", weight_name)
        layer_idx = int(find_layer.group(1)) if find_layer else None
        t = blend(weight_name, parameters, layer_idx, num_layers)
        v0, v1 = sd0[weight_name], sd1[weight_name]
        merged_weight = slerp(t, v0, v1)
        merged_dict[weight_name] = merged_weight

    
    os.makedirs(output_dir, exist_ok=True)
    copy_small_files(
        base_path=merger_config["models"][0],
        output_dir=output_dir
    )
    model_outfile = os.path.join(output_dir, "model.safetensors")
    metadata = {"format": "pt"}
    save_file(merged_dict, model_outfile, metadata=metadata)

In [6]:
parameters = {
    "layer.*.attention": [0, 0.3, 0.5, 0.7, 1],
    "layer.*.intermediate": [1, 0.7, 0.5, 0.3, 0],
    "layer.*.output": [1, 0.7, 0.5, 0.3, 0],
    "default": 0.5
}

config_a = {
    "models": [
        "/workspace/models/bge-m3/",
        "/workspace/models/chieunq-embedding/checkpoint-10000/"
    ],
    "num_layers": 24,
    "parameters": parameters,
    "torch_dtype": "float32",
    "output_dir": "/workspace/models/chieunq/checkpoint-10002"
}

config_b = {
    "models": [
        "/workspace/models/bge-m3/",
        "/workspace/models/chieunq-embedding/checkpoint-60000/"
    ],
    "num_layers": 24,
    "parameters": parameters,
    "torch_dtype": "float32",
    "output_dir": "/workspace/models/chieunq/checkpoint-60002/"
}

In [7]:
run_merge(config_a)

Merging:   0%|          | 0/391 [00:00<?, ?it/s]

Copying files to /workspace/models/chieunq/checkpoint-10002


In [8]:
run_merge(config_b)

Merging:   0%|          | 0/391 [00:00<?, ?it/s]

Copying files to /workspace/models/chieunq/checkpoint-60002/


In [1]:
import argparse
from huggingface_hub import HfApi, create_repo
token = "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" ## DO
# token = "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" ## DAO
token = "HF_TOKEN"

def upload(repo_id, local_dir, target_dir=""):
    create_repo(
        repo_id=repo_id,
        private=False,
        repo_type="model",
        exist_ok=True,
        token=token
    )
    api = HfApi()
    api.upload_folder(
        folder_path=local_dir,
        repo_id=repo_id,
        repo_type="model",
        path_in_repo=target_dir,  # Specify target directory
        token=token
    )

def delete(repo_id, target_dir):
    api = HfApi()
    api.delete_folder(
        repo_id=repo_id,
        path_in_repo=target_dir,
        repo_type="model",
        token=token
    )

SyntaxError: invalid syntax (148060486.py, line 5)

In [16]:
# Example usage
upload(
    repo_id="nguyenthanhdo/merged-embedding",
    local_dir="/workspace/models/chieunq/checkpoint-60002",
    target_dir="checkpoint-60002"
)

model.safetensors:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

In [11]:
from transformers import AutoModel
model = AutoModel.from_pretrained(
    "/workspace/models/chieunq/checkpoint-10002"
)