In [34]:
from typing import Any, Dict, List, Optional, Union

import numpy as np
import torch
import os
import shutil
import json
import math
import yaml
import re
from tqdm import tqdm
from safetensors import safe_open
from safetensors.torch import save_file
from IO import (
    load_tensors, 
    get_layer_signatures, 
    save_checkpoint
)

def sparse(
    v0: Union[np.ndarray, torch.Tensor],
    v1: Union[np.ndarray, torch.Tensor],
    eps: float = 1e-8,
    device: str = "cpu"
):

    # Convert inputs to PyTorch tensors and move to the specified device
    if not isinstance(v0, torch.Tensor):
        v0 = torch.from_numpy(v0)
    if not isinstance(v1, torch.Tensor):
        v1 = torch.from_numpy(v1)


    v0 = v0.to(device)
    v1 = v1.to(device)

    v0_copy = v0.clone()
    v1_copy = v1.clone()
    
    diff = torch.abs(v0 - v1)

    diff_indices = diff < eps
    v1[diff_indices] = v0[diff_indices]

    numel = torch.numel(diff)
    num_diff = torch.numel(diff[diff_indices]) / numel
    # print(f"Changing {num_diff:.3} of {numel} params.")
    if not torch.allclose(v1, v1_copy, atol=0.0, rtol=0.0):
        print("what")
    return v1


def merge_layer(tensors, merge_config):
    """
    Merges corresponding layers of two models using SLERP.
    
    Args:
        tensors (List[Dict[str, torch.Tensor]]): List of model weight dictionaries.
        merge_config (dict): Configuration dictionary containing merging parameters.
        
    Returns:
        Dict[str, torch.Tensor]: Merged model weights.
    """
    assert len(tensors) == 2    
    weight_names = [key for key in tensors[0].keys()]
    
    for weight_name in weight_names:
        tensor_a = tensors[0][weight_name]
        tensor_b = tensors[1][weight_name]
        
        tensor_computed = (
            sparse(
                tensor_a,
                tensor_b,
            )
            .to(tensor_a.dtype)
            .to(tensor_a.device)
        )
        # print(np.linalg.norm(tensor_computed.float()))
        tensors[0][weight_name] = tensor_computed
    return tensors[0]

def run_merge(
    merge_config
):
    ## Read configs
    model_paths = [x['model'] for x in merge_config['sources']]
    layer_signatures = get_layer_signatures(model_paths[0])
    output_dir = merge_config["output_dir"]
    tmp_dir = os.path.join(output_dir, "tmp_dir")
        
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)

    ## Merge models
    for signature in (
        pbar := tqdm(
            layer_signatures,
            desc="Merging ...",
            ncols=150
        )
    ):
        pbar.set_description(f"Merging {signature}")
        models_tensors = [load_tensors(path, signature) for path in model_paths]
        merged_tensors = merge_layer(models_tensors, merge_config)
        outfile = os.path.join(tmp_dir, f"{signature.strip('.')}.safetensors")
        save_file(merged_tensors, outfile)

    ## Save models
    save_checkpoint(merge_config)

In [35]:
if __name__ == "__main__":
    CONFIG_FILE = "config.yaml"
    with open(CONFIG_FILE, "r", encoding="utf-8") as f:
        merge_config = yaml.safe_load(f)
        
    run_merge(merge_config)

Merging model.embed_tokens:   0%|                                                                                              | 0/30 [00:00<?, ?it/s]

what


Merging layers.19.:  70%|█████████████████████████████████████████████████████████████████                            | 21/30 [00:12<00:04,  1.89it/s]

what


Merging layers.27.: 100%|█████████████████████████████████████████████████████████████████████████████████████████████| 30/30 [00:17<00:00,  1.70it/s]


Copying files to /workspace/dont15/models/sparse-merged
Saved shard 1 with size 1.97 GB
Saved shard 2 with size 1.97 GB
Saved shard 3 with size 1.31 GB
Saved shard 4 with size 0.73 GB
Saved model weight map to /workspace/dont15/models/sparse-merged/model.safetensors.index.json.


In [None]:
def forward():
    ...
    return dict(
        outputs_components=[output_A, output_B, ..., output_D],
        output_merged=output_merged
    )


In [12]:
import math
from typing import List, Optional, Tuple, Union
from abc import ABC, abstractmethod

import datasets
import torch
import numpy as np
import torch.nn as nn
import logging
import copy
import gc

from datasets import load_dataset
from tqdm import tqdm
from transformers import (
    PreTrainedModel,
    PretrainedConfig,
    AutoConfig,
    AutoModelForCausalLM,
    LlamaForCausalLM,
    LlamaConfig,
    Trainer,
    TrainingArguments,
    AutoTokenizer,
    HfArgumentParser
)

from modeling_qwen2 import (
    Qwen2RMSNorm, 
    Qwen2RotaryEmbedding, 
    Qwen2MLP, 
    Qwen2Attention, 
    Qwen2FlashAttention2, 
    Qwen2SdpaAttention, 
    Qwen2DecoderLayer, 
    Qwen2PreTrainedModel, 
    Qwen2Model, 
    Qwen2ForCausalLM,
)

from configuration_qwen2 import Qwen2Config

from transformers.modeling_outputs import (
    BaseModelOutputWithPast,
)

# Configure logger
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

In [13]:
from transformers import GenerationConfig, TextStreamer
def generate(prompt, model, tokenizer, max_new_tokens=1024):
    input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"].to(model.device)
    model.eval()
    with torch.no_grad():
        generation_config = GenerationConfig(
            repetition_penalty=1.13,
            max_new_tokens=max_new_tokens,
            temperature=0.4,
            top_p=0.95,
            # top_k=20,
            # bos_token_id=tokenizer.bos_token_id,
            # eos_token_id=tokenizer.eos_token_id,
            # eos_token_id=0, # for open-end generation.
            pad_token_id=tokenizer.pad_token_id,
            do_sample=False,
            use_cache=True,
            return_dict_in_generate=True,
            output_attentions=False,
            output_hidden_states=False,
            output_scores=False,
        )
        streamer = TextStreamer(tokenizer, skip_prompt=True)
        generated = model.generate(
            inputs=input_ids,
            generation_config=generation_config,
            streamer=streamer,
        )
    gen_tokens = generated["sequences"].cpu()[:, len(input_ids[0]):]
    output = tokenizer.batch_decode(gen_tokens)[0]
    output = output.split(tokenizer.eos_token)[0]
    return output.strip()

def get_logits(text, model, tokenizer):
    input_ids = tokenizer(text, return_tensors="pt").to(model.device)
    model.eval()
    with torch.no_grad():
        logits = model(**input_ids).logits
    return logits

def get_hidden_states(text, model, tokenizer):
    input_ids = tokenizer(text, return_tensors="pt").to(model.device)
    model.eval()
    with torch.no_grad():
        outputs = model(**input_ids, output_hidden_states=True, use_cache=False)
    return outputs

In [25]:
model_id = "/workspace/dont15/models/sparse-merged/"
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map={"":7},
    torch_dtype=torch.bfloat16
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [26]:
system = "Provide a concise, objective summary of the input text in up to three sentences, focusing on key actions and intentions without using second or third person pronouns."
prompt = "Emily,\n\n\"English Sounds\" is a great recommendation - I'll definitely check it out. And I love the idea of having students record themselves. That kind of self-analysis can be so powerful.\n\nA virtual coffee chat next Wednesday sounds perfect. I'm free anytime after 2 pm my time (Eastern). Just let me know what time works for you and I'll send over a Zoom link.\n\nI'm really excited about the idea of proposing a workshop for the conference. Our combined insights could make for a really engaging and practical session. We could each present some strategies that have worked well for us and then facilitate a group discussion where attendees can share their own experiences and ideas.\n\nLet me know if you want to brainstorm a title or outline before our chat.\n\nLooking forward to catching up more next week!\n\nBest,\nJordan"
messages = [
    {"role": "system", "content": system},
    {"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
text

'<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nProvide a concise, objective summary of the input text in up to three sentences, focusing on key actions and intentions without using second or third person pronouns.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nEmily,\n\n"English Sounds" is a great recommendation - I\'ll definitely check it out. And I love the idea of having students record themselves. That kind of self-analysis can be so powerful.\n\nA virtual coffee chat next Wednesday sounds perfect. I\'m free anytime after 2 pm my time (Eastern). Just let me know what time works for you and I\'ll send over a Zoom link.\n\nI\'m really excited about the idea of proposing a workshop for the conference. Our combined insights could make for a really engaging and practical session. We could each present some strategies that have worked well for us and then facilitate a group discussion where attendees can share their own experiences and ideas.\n\nLet me know if you 

In [27]:
answer = generate(text, model, tokenizer, max_new_tokens=128)

Jordan agrees with Emily's suggestion to use "English Sounds" and supports the idea of students recording themselves. Jordan confirms availability for a virtual coffee chat next Wednesday after 2 pm Eastern and expresses enthusiasm for collaborating on a conference workshop. Jordan suggests presenting strategies and facilitating a group discussion, and invites Emily to brainstorm a title or outline before the chat. Jordan looks forward to catching up more next week.<|end_of_text|>


In [28]:
model_id = "/workspace/dont15/models/llama32_smol_summarize_20k/"
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map={"":7},
    torch_dtype=torch.bfloat16
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [29]:
answer = generate(text, model, tokenizer, max_new_tokens=128)



Jordan agrees with Emily's suggestion to use "English Sounds" and supports the idea of students recording themselves. Jordan confirms availability for a virtual coffee chat next Wednesday after 2 pm Eastern and expresses enthusiasm for collaborating on a conference workshop. Jordan suggests presenting strategies and facilitating a group discussion, and invites Emily to brainstorm a title or outline before the chat. Jordan looks forward to catching up more next week.<|end_of_text|>


## Sparse neuron training

In [None]:
torch.manual_seed(42)
dim = 1024
w = torch.rand(dim, dim) ## 1_000_000
mask = torch.rand(dim, dim) > 0.9 ## compressable. 1_000_000 / 4
p = nn.Parameter(torch.rand(1, torch.sum(mask).item())) ## 100_000

sparse = torch.zeros_like(w)
sparse[mask] = p

x = torch.rand(1, dim)
label = torch.ones(1, dim)
y = torch.matmul(x, w + sparse)

In [None]:
import torch
import torch.nn as nn

torch.manual_seed(42)
dim = 1024
w = torch.rand(dim, dim)
mask = torch.rand(dim, dim) > 0.9
p = nn.Parameter(torch.rand(1, torch.sum(mask).item()))

# Create a sparse tensor directly
indices = torch.nonzero(mask).t()  # Get indices of non-zero elements
values = p.squeeze()  # Use p for the non-zero values
sparse = torch.sparse_coo_tensor(indices, values, size=(dim, dim))

x = torch.rand(1, dim)
label = torch.ones(1, dim)
y = torch.matmul(x, w + sparse.to_dense()) # Convert to dense for matrix multiplication. Alternatively, you can use torch.sparse.mm.
diff = y - label
loss = torch.mean(diff ** 2)

In [1]:
from IO import load_tensors
import torch

def normalize(v: torch.Tensor, eps: float = 1e-8):
    norm_v = torch.norm(v)
    if norm_v > eps:
        v = v / norm_v
    return v

In [19]:
model_paths = [
    # "/workspace/dont15/models/llama32_smol_rewrite_50k/",
    # "/workspace/dont15/models/llama32_smol_summarize_20k/",
    "/workspace/HUB_LLM/Llama-3.2-3B",
    "/workspace/HUB_LLM/Llama-3.2-3B-Instruct"
]

In [20]:
signature = "layers.15"
weight_name = f'model.{signature}.mlp.up_proj.weight'
tensors_1 = load_tensors(model_paths[0], signature=signature)[weight_name]
tensors_2 = load_tensors(model_paths[1], signature=signature)[weight_name]

In [21]:
diff_indices = torch.abs(tensors_1 - tensors_2) > 1e-8

In [22]:
torch.sum(diff_indices) / torch.numel(diff_indices)

tensor(0.9881)

In [23]:
norm1 = normalize(tensors_1[diff_indices].float())
norm2 = normalize(tensors_2[diff_indices].float())
torch.sum(norm1 * norm2)

tensor(0.9901)

In [88]:
tensors_1 - tensors_2

tensor([[-0.0007, -0.0036,  0.0055,  ..., -0.0026,  0.0042,  0.0072],
        [ 0.0001,  0.0038, -0.0002,  ..., -0.0013, -0.0048,  0.0027],
        [ 0.0002,  0.0059,  0.0009,  ...,  0.0018, -0.0016, -0.0029],
        ...,
        [-0.0010,  0.0031, -0.0014,  ..., -0.0011,  0.0042,  0.0041],
        [ 0.0007, -0.0007, -0.0010,  ..., -0.0028, -0.0020, -0.0002],
        [-0.0005,  0.0048, -0.0015,  ...,  0.0026,  0.0031,  0.0006]],
       dtype=torch.bfloat16)

In [103]:
torch.norm(tensors_1) == torch.norm(tensors_2)

tensor(False)

In [104]:
torch.max(torch.abs(tensors_1)), torch.max(torch.abs(tensors_2))

(tensor(0.5078, dtype=torch.bfloat16), tensor(0.4941, dtype=torch.bfloat16))

In [105]:
torch.norm(tensors_1 - tensors_2)

tensor(13.8750, dtype=torch.bfloat16)

In [106]:
torch.norm(tensors_1.float())

tensor(89.9470)

In [93]:
torch.norm(tensors_2.float())

tensor(88.8236)

In [94]:
torch.sum(tensors_1 != tensors_2) / torch.numel(tensors_1)

tensor(0.9882)

In [95]:
diff_indices = tensors_1 != tensors_2

In [24]:
norm1 = normalize(tensors_1.float())
norm2 = normalize(tensors_2.float())
torch.sum(norm1 * norm2)

tensor(0.9904)

In [17]:
norm1 = normalize(tensors_1[diff_indices].float())
norm2 = normalize(tensors_2[diff_indices].float())
torch.sum(norm1 * norm2)

tensor(0.9900)

In [18]:
torch.sum(tensors_1.float() * tensors_2.float()) / \
(torch.norm(tensors_1.float()) * torch.norm(tensors_2.float()))

tensor(0.9903)