In [2]:
from datasets import Dataset
import random
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer, BitsAndBytesConfig

In [3]:
model = AutoModelForCausalLM.from_pretrained(
    'autodl-tmp/qwen/Qwen2-7B-Instruct',
    device_map='auto',
    torch_dtype=torch.bfloat16,
    # cache_dir=cache_dir,
    # quantization_config=nf4_config,
    # low_cpu_mem_usage = True
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [5]:
from peft import PeftModel
peft_model = PeftModel.from_pretrained(model, model_id='autodl-tmp/peft/lora_mix1',torch_type=torch.float16,is_trainable=True)
#peft_model

In [6]:
merge_model = peft_model.merge_and_unload()
merge_model

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(152064, 3584)
    (layers): ModuleList(
      (0-27): 28 x Qwen2DecoderLayer(
        (self_attn): Qwen2SdpaAttention(
          (q_proj): Linear(in_features=3584, out_features=3584, bias=True)
          (k_proj): Linear(in_features=3584, out_features=512, bias=True)
          (v_proj): Linear(in_features=3584, out_features=512, bias=True)
          (o_proj): Linear(in_features=3584, out_features=3584, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
          (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
          (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm()
        (post_attention_layernorm): Qwen2RMSNorm()
      )
    )
    (norm): Qwen2RMSNorm()
  )
  (lm_head): Lin

In [7]:
merge_model.save_pretrained('autodl-tmp/merge_model')

In [9]:
tokenizer = AutoTokenizer.from_pretrained('autodl-tmp/peft/lora_mix1', trust_remote_code=True)
tokenizer

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Qwen2TokenizerFast(name_or_path='autodl-tmp/peft/lora_mix1', vocab_size=151643, model_max_length=131072, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<|im_end|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>']}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	151643: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151644: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151645: AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [10]:
tokenizer.save_pretrained('autodl-tmp/merge_model')

('autodl-tmp/merge_model/tokenizer_config.json',
 'autodl-tmp/merge_model/special_tokens_map.json',
 'autodl-tmp/merge_model/vocab.json',
 'autodl-tmp/merge_model/merges.txt',
 'autodl-tmp/merge_model/added_tokens.json',
 'autodl-tmp/merge_model/tokenizer.json')

In [4]:
model.base_model.layers[27].self_attn.q_proj.weight

Parameter containing:
tensor([[-0.0074,  0.0027, -0.0109,  ...,  0.0009, -0.0078,  0.0051],
        [-0.0093,  0.0206,  0.0021,  ...,  0.0044, -0.0091,  0.0051],
        [-0.0114,  0.0050,  0.0084,  ...,  0.0208,  0.0005,  0.0167],
        ...,
        [ 0.0001,  0.0162,  0.0110,  ..., -0.0244, -0.0183, -0.0173],
        [-0.0211,  0.0089, -0.0339,  ...,  0.0466,  0.0023,  0.0249],
        [-0.0148, -0.0029, -0.0159,  ...,  0.0172,  0.0077, -0.0063]],
       device='cuda:0', dtype=torch.bfloat16, requires_grad=True)

In [27]:
for param in merge_model.parameters():
    # param.requires_grad = True
    print(param)

Parameter containing:
tensor([[-3.0640e-02, -7.7515e-03,  2.7771e-03,  ...,  1.0925e-02,
          2.2339e-02, -8.9722e-03],
        [ 3.7079e-03,  4.1199e-03,  1.3367e-02,  ..., -4.0894e-03,
          1.5198e-02,  7.2632e-03],
        [-1.4526e-02, -8.3008e-03,  1.5259e-02,  ..., -6.0120e-03,
         -2.0630e-02,  3.5248e-03],
        ...,
        [-1.1755e-37,  1.1755e-37,  1.1755e-37,  ..., -1.1755e-37,
          1.1755e-37,  1.1755e-37],
        [ 1.1755e-37, -1.1755e-37,  1.1755e-37,  ..., -1.1755e-37,
          1.1755e-37, -1.1755e-37],
        [ 1.1755e-37, -1.1755e-37, -1.1755e-37,  ...,  1.1755e-37,
          1.1755e-37, -1.1755e-37]], device='cuda:0', dtype=torch.bfloat16,
       requires_grad=True)
Parameter containing:
tensor([[ 0.0045, -0.0039, -0.0107,  ...,  0.0042, -0.0062, -0.0039],
        [-0.0028, -0.0047, -0.0292,  ..., -0.0081,  0.0005, -0.0122],
        [-0.0064,  0.0036,  0.0086,  ...,  0.0043,  0.0047,  0.0135],
        ...,
        [ 0.0106,  0.0066,  0.0194,