In [1]:
from transformers import AutoModelForCausalLM
import torch

# Checkpoint identifier; reused further down when the tokenizer is loaded.
target_model = "mistralai/Mistral-7B-v0.1"

# Load the causal-LM weights with dtype bfloat16 and device_map "cuda".
model = AutoModelForCausalLM.from_pretrained(
    target_model,
    device_map = "cuda",
    torch_dtype = torch.bfloat16
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [2]:
# Display the first decoder layer to see its sub-modules and projection shapes.
model.model.layers[0]

MistralDecoderLayer(
  (self_attn): MistralSdpaAttention(
    (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
    (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
    (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
    (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
    (rotary_emb): MistralRotaryEmbedding()
  )
  (mlp): MistralMLP(
    (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
    (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
    (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
    (act_fn): SiLU()
  )
  (input_layernorm): MistralRMSNorm()
  (post_attention_layernorm): MistralRMSNorm()
)

### Analysing Model Layers

In [3]:
from tqdm import tqdm
from sentence_transformers.util import pytorch_cos_sim

def calculate_similarities(
    module_name_a: str = "model.layers.{idx}.self_attn.q_proj.weight",
    module_name_b: str = "model.layers.{idx}.mlp.up_proj.weight",
    num_layers = 32
):
    """Rank decoder layers by the peak cosine similarity between two of their weights.

    For every layer index in ``range(num_layers)`` the two name templates are
    formatted with ``idx=<layer index>``, the matching tensors are looked up in
    the state dict of the notebook-level ``model``, and ``pytorch_cos_sim`` is
    evaluated on the pair. Only the largest entry of that result is kept per
    layer.

    Args:
        module_name_a: State-dict key template for the first weight.
        module_name_b: State-dict key template for the second weight.
        num_layers: Number of layers to scan, starting at index 0.

    Returns:
        A list of ``{"layer_n": int, "sim": 0-dim CPU tensor}`` dicts sorted by
        ``sim`` in descending order.

    Raises:
        KeyError: If a formatted name is not a key of the state dict.
    """
    # state_dict() builds a fresh mapping over every parameter on each call and
    # does not change between iterations, so build it once.
    weights = model.state_dict()

    similarities = []
    for i in tqdm(range(0, num_layers)):
        sim = pytorch_cos_sim(
            a = weights[module_name_a.format(idx = i)],
            b = weights[module_name_b.format(idx = i)],
        )
        similarities.append({
            "layer_n": i,
            # torch.max returns a new 0-dim tensor. Indexing `sim[i, j]` instead
            # would return a view that keeps the whole pairwise matrix alive in
            # the result list. Reducing before `.to("cpu")` also means only one
            # scalar is copied off the device.
            "sim": torch.max(sim).to("cpu"),
        })
        del sim
    return sorted(similarities, key = lambda item: item["sim"], reverse=True)

In [4]:
# Run the scan with the default q_proj / up_proj name templates and show the ranking.
similarities = calculate_similarities()
similarities

100%|██████████| 32/32 [00:02<00:00, 13.77it/s]


[{'layer_n': 29, 'sim': tensor(0.8438, dtype=torch.bfloat16)},
 {'layer_n': 30, 'sim': tensor(0.8320, dtype=torch.bfloat16)},
 {'layer_n': 28, 'sim': tensor(0.7461, dtype=torch.bfloat16)},
 {'layer_n': 12, 'sim': tensor(0.6562, dtype=torch.bfloat16)},
 {'layer_n': 9, 'sim': tensor(0.6406, dtype=torch.bfloat16)},
 {'layer_n': 16, 'sim': tensor(0.6367, dtype=torch.bfloat16)},
 {'layer_n': 31, 'sim': tensor(0.6328, dtype=torch.bfloat16)},
 {'layer_n': 20, 'sim': tensor(0.6211, dtype=torch.bfloat16)},
 {'layer_n': 11, 'sim': tensor(0.5820, dtype=torch.bfloat16)},
 {'layer_n': 14, 'sim': tensor(0.5781, dtype=torch.bfloat16)},
 {'layer_n': 1, 'sim': tensor(0.5625, dtype=torch.bfloat16)},
 {'layer_n': 7, 'sim': tensor(0.5586, dtype=torch.bfloat16)},
 {'layer_n': 5, 'sim': tensor(0.5312, dtype=torch.bfloat16)},
 {'layer_n': 27, 'sim': tensor(0.5273, dtype=torch.bfloat16)},
 {'layer_n': 26, 'sim': tensor(0.5117, dtype=torch.bfloat16)},
 {'layer_n': 24, 'sim': tensor(0.5039, dtype=torch.bfloat16

## Testing

In [3]:
from transformers import pipeline, AutoTokenizer

# Wrap the already-loaded model in a text-generation pipeline; the tokenizer is
# loaded from the same checkpoint identifier.
pipe = pipeline(
    "text-generation",
    model = model,
    tokenizer = AutoTokenizer.from_pretrained(target_model)
)

# `do_sample = True` is required for top_p / top_k / temperature to have any
# effect; without it decoding is greedy and those arguments are ignored.
# Sampling makes the output vary between runs unless the RNG is seeded.
pipe("What's ML?", do_sample = True, top_p = 0.95, top_k = 10, temperature = 0.1, max_new_tokens = 100)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[{'generated_text': "What's ML?\n\nMachine learning is a subset of artificial intelligence that focuses on the development of computer programs that can access data and use it to learn for themselves.\n\nMachine learning is a subset of artificial intelligence that focuses on the development of computer programs that can access data and use it to learn for themselves.\n\nMachine learning is a subset of artificial intelligence that focuses on the development of computer programs that can access data and use it to learn for themselves.\n\nMachine learning is a subset of artificial intelligence"}]

## Reducing Model & Serializing

In [4]:
import torch
import copy

# Module.to() moves the parameters in place and returns the same module, so the
# moved model can be rebound directly. A deepcopy here would hold two full
# copies of the weights in host memory for an original that is dropped on the
# next line anyway.
model_reduced = model.to("cpu")
del model

In [5]:
# Ask the CUDA caching allocator to release unused cached memory after the move to CPU.
torch.cuda.empty_cache()

In [6]:
# Drop the decoder layer at index 31, then append the layer at index 30 in its
# place. No copy is made: the appended entry is the same module object as
# layers[30], so both positions share one set of parameters.
# NOTE(review): any per-layer state kept on the module (for example an index
# used for key/value caching) is shared as well — confirm that cached
# generation still behaves as intended.
del model_reduced.model.layers[31]
model_reduced.model.layers.append(model_reduced.model.layers[30])

ModuleList(
  (0-31): 32 x MistralDecoderLayer(
    (self_attn): MistralSdpaAttention(
      (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
      (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
      (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
      (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
      (rotary_emb): MistralRotaryEmbedding()
    )
    (mlp): MistralMLP(
      (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
      (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
      (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
      (act_fn): SiLU()
    )
    (input_layernorm): MistralRMSNorm()
    (post_attention_layernorm): MistralRMSNorm()
  )
)

In [7]:
# Move the modified model back to the CUDA device; the returned module is shown as cell output.
model_reduced.to("cuda")

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm()
      )
    )
    (norm): MistralRMSNorm(

In [8]:
# Release unused cached CUDA memory before running generation.
torch.cuda.empty_cache()

In [16]:
from transformers import pipeline, AutoTokenizer

# Text-generation pipeline around the modified model; the tokenizer is loaded
# from the original checkpoint identifier.
pipe = pipeline(
    "text-generation",
    model = model_reduced,
    tokenizer = AutoTokenizer.from_pretrained(target_model)
)

# `do_sample = True` is required for top_p / temperature to have any effect;
# without it decoding is greedy and those arguments are ignored.
# Sampling makes the output vary between runs unless the RNG is seeded.
pipe(
    "You are a helpful assistant. Answer the question below in an informative and truthful way: \nWhat's Machine Learning?",
    do_sample = True,
    top_p = 0.99,
    temperature = 0.1,
    max_new_tokens = 100,
    num_return_sequences = 1,
    return_full_text = False
)


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[{'generated_text': "\nUser 0: It's a way to make computers do things that they don't know how to do yet.\nUser 1: That's a good start. But you can do better.\nUser 0: I can do better? I don't know how to do better.\nUser 1: You can do better. You can do better than that.\nUser 0: I can do better than that? I don't know how to"}]

In [36]:
# Display the structure of the modified model.
# NOTE(review): the saved output for this cell lists 31 decoder layers, while
# the earlier cells show 32, and the execution count is out of sequence. The
# output appears to come from a different kernel state — re-run the notebook
# top to bottom to confirm.
model_reduced

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-30): 31 x MistralDecoderLayer(
        (self_attn): MistralSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm()
      )
    )
    (norm): MistralRMSNorm(

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# 4-bit NF4 quantization settings with double quantization enabled; the compute dtype is bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# NOTE(review): `base_model_id` is not defined in any cell visible above (the
# checkpoint identifier used earlier is `target_model`) — confirm where it is
# set before running this cell.
model = AutoModelForCausalLM.from_pretrained(base_model_id, quantization_config=bnb_config)