In [22]:
from io import BytesIO
from urllib.request import urlopen
import librosa
from transformers import AutoProcessor, Qwen2AudioForConditionalGeneration, AutoModelForCausalLM
from transformers.models.qwen2_audio.modeling_qwen2_audio import Qwen2AudioMultiModalProjector

In [23]:
model = Qwen2AudioForConditionalGeneration.from_pretrained("Qwen/Qwen2-Audio-7B" ,trust_remote_code=True)

audio_tower = model.audio_tower
multi_modal_projector = model.multi_modal_projector
language_model = model.language_model


The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
Loading checkpoint shards: 100%|██████████| 5/5 [00:03<00:00,  1.45it/s]


In [29]:
config = model.config

audio_tower_config = audio_tower.config
# multi_modal_projector_config = multi_modal_projector.config
language_model_config = language_model.config

# print how many billions of parameters in each model 
# Function to count parameters in billions
def count_parameters_in_billions(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad) / 1e9

# Count and print parameters for each component
print(f"Audio Tower: {count_parameters_in_billions(audio_tower):.2f} billion parameters")
print(f"Multi-Modal Projector: {count_parameters_in_billions(multi_modal_projector):.2f} billion parameters")
print(f"Language Model: {count_parameters_in_billions(language_model):.2f} billion parameters")
print(f"Total Model: {count_parameters_in_billions(model):.2f} billion parameters")

 
print(config)
print(config.text_config.hidden_size)
print("______________________")
print(audio_tower_config)
print("______________________")
print(language_model_config)

# get the model weights, and print all layer names
# weights = language_model.state_dict()
# for key in weights:
#     print(key, weights[key].shape)

Audio Tower: 0.64 billion parameters
Multi-Modal Projector: 0.01 billion parameters
Language Model: 7.75 billion parameters
Total Model: 1.39 billion parameters
Qwen2AudioConfig {
  "_name_or_path": "Qwen/Qwen2-Audio-7B",
  "architectures": [
    "Qwen2AudioForConditionalGeneration"
  ],
  "audio_config": {
    "model_type": "qwen2_audio_encoder"
  },
  "audio_token_index": 151646,
  "ignore_index": -100,
  "model_type": "qwen2_audio",
  "text_config": {
    "bos_token_id": 151643,
    "eos_token_id": 151645,
    "hidden_size": 2048,
    "intermediate_size": 2048,
    "max_position_embeddings": 8192,
    "model_type": "qwen2",
    "num_hidden_layers": 4,
    "rms_norm_eps": 1e-05,
    "torch_dtype": "bfloat16",
    "use_mrope": false,
    "vocab_size": 156032
  },
  "torch_dtype": "float32",
  "transformers_version": "4.45.0.dev0",
  "vocab_size": 156032
}

2048
______________________
Qwen2AudioEncoderConfig {
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "attention_d

In [25]:
import torch


def uniform_element_selection(wt, s_shape):
    assert wt.dim() == len(s_shape), "Tensors have different number of dimensions"
    ws = wt.clone()
    for dim in range(wt.dim()):
        assert wt.shape[dim] >= s_shape[dim], "Teacher's dimension should not be smaller than student's dimension"  # determine whether teacher is larger than student on this dimension
        if wt.shape[dim] % s_shape[dim] == 0:
            step = wt.shape[dim] // s_shape[dim]
            indices = torch.arange(s_shape[dim]) * step
        else:
            indices = torch.round(torch.linspace(0, wt.shape[dim]-1, s_shape[dim])).long()
        
        #print(indices)
        ws = torch.index_select(ws, dim, indices)
    assert ws.shape == s_shape
    return ws

In [26]:
import copy
#   "hidden_size": 4096,
#   "intermediate_size": 11008,
#   "num_attention_heads": 32,
#   "num_hidden_layers": 32,
#   "num_key_value_heads": 32,

def make_small_qwen_audio(large_qwenaudio, hidden_size=2048, num_hidden_layers=4, intermediate_size=2048):

    teacher_model = large_qwenaudio.language_model
    student_config = copy.deepcopy(teacher_model.config)

    student_config.hidden_size = hidden_size
    student_config.num_hidden_layers = num_hidden_layers
    student_config.intermediate_size = intermediate_size

    student_model = AutoModelForCausalLM.from_config(student_config, attn_implementation=config._attn_implementation)

    print(student_model.num_parameters() / 1e9)

    teacher_weights = teacher_model.state_dict()
    student_weights = student_model.state_dict()
    weight_selection = {}
    for key in student_weights.keys():
        # We don't perform weight selection on classification head by default. Remove this constraint if target dataset is the same as teacher's.
        # if "head" in key:
        #     continue
        print(key, teacher_weights[key].shape, student_weights[key].shape)
        s_shape = student_weights[key].shape
        weight_selection[key] = uniform_element_selection(teacher_weights[key], s_shape)
        
    student_weights.update(weight_selection)
    student_model.load_state_dict(student_weights)

    # 修改 multi_modal_projector 的 linear 层
    large_qwenaudio.config.text_config.hidden_size = hidden_size
    large_qwenaudio.config.text_config.num_hidden_layers = num_hidden_layers
    large_qwenaudio.config.text_config.intermediate_size = intermediate_size
    student_multi_modal_projector = Qwen2AudioMultiModalProjector(large_qwenaudio.config)
    
    large_qwenaudio.multi_modal_projector = student_multi_modal_projector
    large_qwenaudio.language_model = student_model

    return large_qwenaudio


small_model = make_small_qwen_audio(model)


0.756590592
model.embed_tokens.weight torch.Size([156032, 4096]) torch.Size([156032, 2048])
model.layers.0.self_attn.q_proj.weight torch.Size([4096, 4096]) torch.Size([2048, 2048])
model.layers.0.self_attn.q_proj.bias torch.Size([4096]) torch.Size([2048])
model.layers.0.self_attn.k_proj.weight torch.Size([4096, 4096]) torch.Size([2048, 2048])
model.layers.0.self_attn.k_proj.bias torch.Size([4096]) torch.Size([2048])
model.layers.0.self_attn.v_proj.weight torch.Size([4096, 4096]) torch.Size([2048, 2048])
model.layers.0.self_attn.v_proj.bias torch.Size([4096]) torch.Size([2048])
model.layers.0.self_attn.o_proj.weight torch.Size([4096, 4096]) torch.Size([2048, 2048])
model.layers.0.mlp.gate_proj.weight torch.Size([11008, 4096]) torch.Size([2048, 2048])
model.layers.0.mlp.up_proj.weight torch.Size([11008, 4096]) torch.Size([2048, 2048])
model.layers.0.mlp.down_proj.weight torch.Size([4096, 11008]) torch.Size([2048, 2048])
model.layers.0.input_layernorm.weight torch.Size([4096]) torch.Size(

In [27]:

 
small_model.eval()
# model.language_model = language_model
# test a bit
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-Audio-7B" ,trust_remote_code=True)

prompt = "<|audio_bos|><|AUDIO|><|audio_eos|>Generate the caption in English:"
url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Audio/glass-breaking-151256.mp3"
audio, sr = librosa.load(BytesIO(urlopen(url).read()), sr=processor.feature_extractor.sampling_rate)
inputs = processor(text=prompt, audios=audio, return_tensors="pt")

for k, v in inputs.items():
    print(k, v.shape)

generated_ids = small_model.generate(**inputs, max_length=256)
generated_ids = generated_ids[:, inputs.input_ids.size(1):]
print(generated_ids)
response = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
print("______________________")
print(response)

It is strongly recommended to pass the `sampling_rate` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


input_ids torch.Size([1, 9])
attention_mask torch.Size([1, 9])
input_features torch.Size([1, 128, 3000])
feature_attention_mask torch.Size([1, 3000])


We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


Audio features shape: torch.Size([1, 750, 2048])
Input embeddings shape: torch.Size([1, 9, 2048])
Input IDs shape: torch.Size([1, 9])
Attention mask shape: torch.Size([1, 9])
Position IDs shape: torch.Size([1, 9])
Audio output lengths shape: torch.Size([1])
num_audios:  1  max_audio_tokens:  750  embed_dim:  2048
tensor([[ 86958, 151651,  23978,  58684, 101830,  87331,  85059, 151952, 101930,
          95011,  39426,  70226, 100446, 100562, 154925, 132258,  83050,  87331,
         138139,  48402,  42180,  57647, 134631,  95773,  94366, 109245, 101830,
         132258,   1680, 151651, 120172,  47499,  49646, 154922, 132258,  37968,
         144160,  87331,  61047,  87331,  38063, 121688, 112688,  39366, 102928,
          60399, 135873,  89039,  42339,  77698,  99636, 119706,  89195, 151994,
          80042,  89039,  92188,  47247,  90732,  80042,  98082,  68028, 141014,
         121000, 136432,  36316,  55356,  54803, 110973,  99846,  88291, 121688,
         114184,  50990,  22741,  243

In [28]:
# save the small model
save_name = "qwen2_audio_0.7b"
save_path = f"/home/AI_repo/sean/vt2/models/{save_name}"
small_model.save_pretrained(save_path)

In [None]:
from transformers import AutoModel
import torch

def compare_models(model1, model2):

    print("model1 training: ", model1.training, " model2 training: ", model2.training)

    # 先比较config
    # Compare the configurations
    config1 = model1.config.to_dict()
    config2 = model2.config.to_dict()

    print(config1)
    print("______________________")
    print(config2)

    for key in config1:
        if key in config2:
            if config1[key] != config2[key]:
                print(f"Different in {key}: Model1 - {config1[key]}, Model2 - {config2[key]}")
        else:
            print(f"Key {key} not found in Model2's config")

    for key in config2:
        if key not in config1:
            print(f"Key {key} not found in Model1's config")
   

    # Retrieve the state dictionaries
    state_dict1 = model1.state_dict()
    state_dict2 = model2.state_dict()

    # For each parameter in the state dictionaries
    for param1, param2 in zip(state_dict1.items(), state_dict2.items()):
        # Unpack the parameter names and tensors
        name1, tensor1 = param1
        name2, tensor2 = param2

        # Check the names are the same (they should be, if the models are of the same architecture)
        if name1 == name2:
            try:
                # Calculate the difference between the two tensors
                difference = torch.nn.functional.mse_loss(tensor1, tensor2)
                
                print(f"Difference in {name1}: {difference}")
            except:
                print(f"Cannot compare {name1} and {name2}")
        else:
            print(f"Parameter names do not match: {name1}, {name2}")



compare_models(language_model, small_model)