In [None]:
from transformers import AutoModelForCausalLM
import torch

# Hugging Face Hub id of the checkpoint under study; later cells reuse it to
# fetch the matching tokenizer.
target_model = "mistralai/Mistral-7B-Instruct-v0.1"

# Load the weights in bfloat16 and place the whole model on a single device.
# NOTE(review): "cuda:1" assumes a second GPU is visible on this host — confirm
# or adjust before running elsewhere.
model = AutoModelForCausalLM.from_pretrained(
    target_model, device_map="cuda:1", torch_dtype=torch.bfloat16
)

In [None]:
# Display the first entry of the model's layer stack; the analysis below reads
# the `self_attn` and `mlp` weights of these layers from the state dict.
model.model.layers[0]

## Analysing Model Layers

In [None]:
from tqdm import tqdm
from sentence_transformers.util import pytorch_cos_sim

def calculate_similarities(
    module_name_a: str = "model.layers.{idx}.self_attn.q_proj.weight",
    module_name_b: str = "model.layers.{idx}.mlp.down_proj.weight",
    num_layers = 32
):
    """Rank layers by the peak cosine similarity between two of their weights.

    For every layer index in ``range(num_layers)`` the two state-dict entries
    named by the templates are compared with ``pytorch_cos_sim`` and the
    largest value of the resulting similarity matrix is recorded.

    Reads the notebook-global ``model``.

    NOTE(review): ``pytorch_cos_sim`` compares rows of ``a`` against rows of
    ``b``, so both tensors presumably need the same number of columns —
    confirm that the default pair of weights satisfies this.

    Args:
        module_name_a: State-dict key template of the first tensor; ``{idx}``
            is replaced by the layer index.
        module_name_b: State-dict key template of the second tensor; ``{idx}``
            is replaced by the layer index.
        num_layers: Number of layers to scan, starting at index 0.

    Returns:
        A list of ``{"layer_n": int, "sim": 0-dim CPU tensor}`` dicts, sorted
        by ``"sim"`` in descending order.
    """
    # state_dict() rebuilds the whole mapping on every call; build it once
    # instead of twice per layer.
    state = model.state_dict()

    similarities = []
    for i in tqdm(range(0, num_layers)):
        sim = pytorch_cos_sim(
            a = state[module_name_a.format(idx = i)],
            b = state[module_name_b.format(idx = i)],
        )
        similarities.append({
            "layer_n": i,
            # torch.max returns a fresh 0-dim tensor, so the full similarity
            # matrix is not kept alive by the stored value (indexing `sim`
            # would return a view onto it). Only that scalar is moved to the
            # CPU, not the whole matrix.
            "sim": torch.max(sim).to("cpu"),
        })
        del sim
    return sorted(similarities, key = lambda item: item["sim"], reverse=True)

In [None]:
# Run the analysis with the default weight pair and layer count, then display
# the per-layer results (highest similarity first).
similarities = calculate_similarities()
similarities

## Testing the Original Model

In [None]:
from transformers import pipeline, AutoTokenizer

# Wrap the already-loaded model in a text-generation pipeline, pairing it with
# the tokenizer published under the same checkpoint id.
pipe = pipeline(
    "text-generation",
    model = model,
    tokenizer = AutoTokenizer.from_pretrained(target_model)
)

# top_p / top_k / temperature are sampling parameters: they only take effect
# when do_sample is enabled, otherwise generation is greedy and they are
# ignored. Output is therefore stochastic from run to run.
pipe(
    "What's ML?",
    do_sample = True,
    top_p = 0.95,
    top_k = 10,
    temperature = 0.1,
    max_new_tokens = 100
)

## Reducing Model & Serializing

In [None]:
import torch
import copy

# nn.Module.to() moves the module in place and returns it, so `model` itself
# ends up on the CPU here; the deep copy gives an independent set of weights
# to modify without touching `model`.
model_reduced = copy.deepcopy(model.to("cpu"))

In [None]:
# Release cached GPU memory that PyTorch no longer uses, now that the model
# has been moved to the CPU.
torch.cuda.empty_cache()

In [None]:
from peft.tuners.tuners_utils import replicate_layers

# Always rebuild `model_reduced` from the untouched `model`, so re-running this
# cell does not stack a second replication on top of an already modified model
# (and never overwrites the pristine `model`).
model_reduced = copy.deepcopy(model)

# Each pair is a half-open [start, end) range of source layer indices; per the
# PEFT documentation the ranges are concatenated in order to form the new layer
# stack, and repeated layers share their weights with the source layer.
# With this map the new stack has 8+1+2+3+1+14+1+1+2 = 33 entries.
layer_map = [
    [0, 8],
    [13, 14],
    [10, 12],
    [13, 16],
    [13, 14],
    [14, 28],
    [13, 14],
    [13, 14],
    [30, 32]
]

# Modifies `model_reduced` in place.
replicate_layers(model = model_reduced, layer_map = layer_map)
model_reduced

In [None]:
from peft.tuners.tuners_utils import replicate_layers

def compress(target_model, base_model_num_layers = 16):
    """Rebuild a model's layer stack from its first layers plus repeats of one layer.

    The layer map passed to ``replicate_layers`` is ``[0, n]`` followed by
    ``n`` copies of ``[n - 1, n]``, where ``n`` is ``base_model_num_layers``.
    Assuming PEFT's half-open range semantics, that keeps layers ``0..n-1``
    and then appends layer ``n - 1`` another ``n`` times.

    Args:
        target_model: Model object handed to ``replicate_layers``; it is
            returned after the call.
        base_model_num_layers: The ``n`` used to build the layer map.

    Returns:
        The same ``target_model`` object that was passed in.
    """
    last_kept = base_model_num_layers - 1

    # First the leading range, then one single-layer range per repetition.
    layer_map = [[0, base_model_num_layers]]
    layer_map += [
        [last_kept, base_model_num_layers]
        for _ in range(base_model_num_layers)
    ]

    replicate_layers(model = target_model, layer_map = layer_map)
    return target_model

In [None]:
# Move the modified model to the GPU for generation.
# NOTE(review): plain "cuda" selects the current default CUDA device, whereas
# the original model was loaded on "cuda:1" — confirm which device is intended.
model_reduced.to("cuda")

In [None]:
# Release any cached, unused GPU memory before running generation.
torch.cuda.empty_cache()

In [None]:
from transformers import pipeline, AutoTokenizer

# Text-generation pipeline around the modified model; the tokenizer still comes
# from the original checkpoint id.
pipe = pipeline(
    "text-generation",
    tokenizer=AutoTokenizer.from_pretrained(target_model),
    model=model_reduced,
)

# Greedy decoding (no sampling) so the output is deterministic; only the newly
# generated continuation is returned, not the prompt.
pipe(
    "You are a helpful assistant. Answer the question below in an informative and truthful way: \nWhat's Machine Learning?",
    max_new_tokens=100,
    return_full_text=False,
    do_sample=False,
)


In [None]:
# Serialize the modified model (weights and config) to a local directory.
# NOTE(review): replicated layers presumably share weights with their source
# layers — confirm that every layer's tensors are written and restored
# correctly when the checkpoint is reloaded.
model_reduced.save_pretrained("models/reduced")

In [None]:
# Drop the in-memory model so the next cell demonstrates a load from disk.
del model_reduced

In [None]:
# Reload the serialized model from disk. Pass the dtype explicitly: without
# torch_dtype, from_pretrained (transformers 4.x) loads weights as float32
# regardless of the dtype they were saved in, doubling memory compared with the
# bfloat16 model that was saved.
model_reduced = AutoModelForCausalLM.from_pretrained(
    "models/reduced",
    torch_dtype = torch.bfloat16
)