In [None]:
import gc
import json
import os
import random
import time
from random import shuffle

import datasets
import numpy as np
import torch
from peft import PeftModel

# from lib.utils import gptq_data_utils
from tqdm import tqdm
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, M2M100ForConditionalGeneration, PreTrainedModel

In [None]:
class ModelModifier:
    """Score and rank-reduce weight matrices of a seq2seq model via their singular values.

    Modules are selected by substring: a module is a candidate when both the layer-type
    string and ``str(layer_number)`` occur in its dotted name. Callers should therefore
    pass layer numbers that are unambiguous as substrings (e.g. ``".1."`` rather than
    ``1``, which would also match ``".10."`` and ``".11."``).

    Attributes:
        model_name: identifier passed to ``from_pretrained``.
        model: the loaded model (float16).
        tokenizer: the tokenizer loaded for ``model_name``.
        original_weights: module name -> clone of the weight taken before reduction.
        modified_layers: ``"{layer_type}_{layer_number}"`` ids that have been reduced.
        failed_attempts: ``(layer_type, layer_number)`` pairs; only used by the
            commented-out search routine below.
        layer_snr: layer type -> list of ``(module name, snr ratio)`` tuples.
    """

    def __init__(self, model_name):
        self.model_name = model_name
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.float16)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.original_weights = {}
        self.modified_layers = set()
        self.failed_attempts = set()
        self.layer_snr = {}

    def calculate_snr_for_layer(self, name):
        """Return the signal-to-noise ratio of one module's weight, divided by its top singular value.

        Singular values above the threshold from ``marchenko_pastur_threshold`` count as
        signal, the rest as noise; each group is summed.

        Args:
            name: dotted submodule name; the module must expose a 2-D ``weight``.

        Returns:
            A 0-dim tensor, or a Python float (``inf`` scaled by the top singular value)
            when the noise sum is zero.
        """
        module = self.model.get_submodule(name)
        # Pure analysis: no autograd graph should be recorded through the float64 SVD.
        with torch.no_grad():
            weights = module.weight.double()
            n, m = weights.shape
            S = torch.linalg.svdvals(weights)
            max_singular_value = S[0].item()  # svdvals returns values in descending order
            S = S.detach().cpu()
            sigma_estimated = self.estimate_sigma_with_full_iqr(S)
            mp_threshold = self.marchenko_pastur_threshold(sigma_estimated, n, m)

            signal = S[S > mp_threshold].sum()
            noise = S[S <= mp_threshold].sum()
            snr = signal / noise if noise != 0 else float("inf")
            snr_ratio = snr / max_singular_value  # ratio of SNR to the largest singular value
        del S, weights
        torch.cuda.empty_cache()  # Clear PyTorch's CUDA memory cache
        gc.collect()
        return snr_ratio

    def assess_layers_snr(self, layer_types, layer_numbers):
        """Compute the SNR ratio for every module matching a (layer type, layer number) pair.

        Resets ``self.layer_snr`` and fills it with ``(module name, ratio)`` tuples per layer
        type. A module that matches several pairs is scored and recorded once per match.
        """
        self.layer_snr = {layer_type: [] for layer_type in layer_types}

        for name, _ in self.model.named_modules():
            for layer_number in layer_numbers:
                for layer_type in layer_types:
                    if layer_type in name and str(layer_number) in name:
                        print("*" * 50, flush=True)
                        print(f"Calculating Signal to Noise Ratio at layer {name}", flush=True)
                        snr_ratio = self.calculate_snr_for_layer(name)
                        # Store plain floats so the result is JSON-serialisable.
                        self.layer_snr[layer_type].append(
                            (str(name), snr_ratio.item() if isinstance(snr_ratio, torch.Tensor) else snr_ratio)
                        )
                        print(f"Signal to Noise Ratio at layer {name} = {snr_ratio}", flush=True)
                        print("*" * 50, flush=True)

    def update_model_reduce_layer(self, layer_type, layer_number):
        """Replace the first matching module's weight with its thresholded low-rank reconstruction.

        Only the first module (in ``named_modules`` order) whose name contains both
        ``layer_type`` and ``str(layer_number)`` is modified. The pre-reduction weight is
        cloned into ``self.original_weights`` so it can be restored later. The new
        parameter keeps the original dtype and ``requires_grad`` flag.

        Returns:
            True if a module was reduced; False if this (type, number) pair was already
            reduced or no module matched.
        """
        layer_id = f"{layer_type}_{layer_number}"
        if layer_id in self.modified_layers:
            print(f"Layer {layer_id} has already been modified. Skipping.")
            return False

        layer_token = str(layer_number)
        for name, module in self.model.named_modules():
            if layer_type in name and layer_token in name:
                print(f"Reconstructing layer: {name}")
                original_param = module.weight
                original_dtype = original_param.dtype
                self.original_weights[name] = original_param.detach().clone()

                # The decomposition is not part of any training step; skip autograd bookkeeping.
                with torch.no_grad():
                    weights = original_param.double()
                    # torch.linalg.svd returns V already transposed (Vh).
                    U, S, Vh = torch.linalg.svd(weights, full_matrices=False)

                    # Estimate sigma using the full IQR method
                    sigma_estimated_full_iqr = self.estimate_sigma_with_full_iqr(S)

                    # Calculate Marchenko-Pastur threshold
                    n, m = weights.shape
                    mp_threshold_full_iqr = self.marchenko_pastur_threshold(sigma_estimated_full_iqr, n, m)

                    # Retain only the singular values above the MP threshold
                    S_reduced = torch.zeros_like(S)
                    k = (S > mp_threshold_full_iqr).sum().item()
                    S_reduced[:k] = S[:k]
                    print(f"Reduced from {S.shape} to {k}")

                    # Reconstruct the matrix using the thresholded singular values
                    reconstructed_weights = U @ torch.diag(S_reduced) @ Vh
                    reconstructed_weights = reconstructed_weights.to(original_dtype)

                # Keep the layer frozen/trainable exactly as it was before the reduction.
                module.weight = torch.nn.Parameter(
                    reconstructed_weights, requires_grad=original_param.requires_grad
                )
                self.modified_layers.add(layer_id)
                return True

        return False

    @staticmethod
    def marchenko_pastur_threshold(sigma, n, m):
        """Return ``sigma * (1 + sqrt(beta))`` where ``beta`` is the aspect ratio ``min(n, m) / max(n, m)``."""
        beta = n / m if n < m else m / n
        threshold = sigma * np.sqrt((1 + np.sqrt(beta)) ** 2)
        return threshold

    # Calculate an estimate of the standard deviation of the singular values based on Inter Quantile Range
    @staticmethod
    def estimate_sigma_with_full_iqr(S):
        """Estimate a standard deviation from the interquartile range of ``S``."""
        q75 = torch.quantile(S, 0.75)
        q25 = torch.quantile(S, 0.25)
        iqr = q75 - q25
        # For a normal distribution each quartile lies ~0.6745 * sigma from the mean,
        # so the full interquartile range is ~1.349 * sigma.
        sigma_estimated = iqr / 1.349
        return sigma_estimated

    def restore_model_original_layer(self, layer_type, layer_number):
        """Put back the saved pre-reduction weight for every matching module that has one.

        ``layer_number`` is converted with ``str()`` before matching, consistent with the
        other methods, so integer layer numbers are accepted.
        """
        layer_id = f"{layer_type}_{layer_number}"
        layer_token = str(layer_number)
        for name, module in self.model.named_modules():
            if layer_type in name and layer_token in name:
                if name in self.original_weights:
                    module.weight = torch.nn.Parameter(
                        self.original_weights[name], requires_grad=module.weight.requires_grad
                    )
                    print(f"Restored original weights for layer: {name}")
                    self.modified_layers.discard(layer_id)
                else:
                    print(f"No original weights saved for layer: {name}")

    # def calculate_model_perplexity(self, datasets=['wikitext2', 'c4', 'ptb'], seqlen=384, use_cuda_graph=False, use_flash_attn=False):
    #     model = self.model
    #     model_str = self.model_name
    #     acc_loss = 0.0
    #     total_samples = 0

    #     for dataset in datasets:
    #         input_tok = gptq_data_utils.get_test_tokens(dataset, seed=0, seqlen=seqlen, model=model_str)
    #         nsamples = input_tok.numel() // seqlen
    #         input_tok = input_tok[0, :(seqlen * nsamples)].view(nsamples, seqlen)
    #         total_samples += nsamples

    #         #if not use_cuda_graph:
    #         #    model.reset()

    #         loss_fct = torch.nn.CrossEntropyLoss().cuda()
    #         progress = tqdm(range(nsamples))
    #         for ii in progress:
    #             input = input_tok[ii, :].cuda().view(1, -1)
    #             output = model(input, use_cache=False, output_hidden_states=False, output_attentions=False)[0]
    #             shift_logits = output[:, :-1, :].contiguous()
    #             shift_labels = input[:, 1:]
    #             loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
    #             acc_loss += loss.item()
    #             progress.set_description(f"avg_loss = {acc_loss/(ii+1)}")

    #     avg_loss = acc_loss / total_samples
    #     ppl = torch.exp(torch.tensor(avg_loss)).item()
    #     return ppl

    # ### Implement a Backward Search
    # # Search for the optimal lower ranking approximations from the top layers downwards
    # # Also, we try doing a greedy approach, in order to maximize the rank reduction.
    # # We tune the compression rate based on Marchenko-Pastur Random Matrix Theory
    # ######################################################################################

    # def search_optimal_layer_modification(self, layer_types, layer_numbers, max_mod=5):
    #     # Calculate initial perplexity with original model weights
    #     initial_perplexity = self.calculate_model_perplexity()
    #     print("="*50)
    #     print(f"The initial perplexity of the model is {initial_perplexity}")
    #     print("="*50)
    #     min_loss = initial_perplexity
    #     optimal_params = (None, None)
    #     mods = 0

    #     for layer_number in layer_numbers:
    #         for layer_type in layer_types:
    #             if mods >= max_mod and max_mod != -1:
    #                 return optimal_params, min_loss
    #             attempt = (layer_type, layer_number)
    #             if attempt in self.failed_attempts:
    #                 continue  # Skip this attempt if it has failed before

    #             try_update = self.update_model_reduce_layer(layer_type, layer_number)

    #             if not try_update:
    #                 continue  # Skip this attempt if it has already been modified before

    #             try:
    #                 loss = self.calculate_model_perplexity()
    #                 if loss < min_loss:
    #                     min_loss = loss
    #                     optimal_params = (layer_type, layer_number)
    #                     mods = mods + 1
    #                     # Break out of the loop as soon as a better configuration is found
    #                     print("*"*50)
    #                     print(f"Improved perplexity found: {min_loss} for layer {layer_type} {layer_number}. Total modifications is {mods}")
    #                     print("*"*50)
    #                 else:
    #                     self.restore_model_original_layer(layer_type, layer_number)
    #                     self.failed_attempts.add(attempt)  # Record the failed attempt

    #             except NotImplementedError:
    #                 print("Perplexity calculation method is not implemented yet.")
    #                 return False, min_loss

    #     return optimal_params, min_loss

    def get_top_snr_ratios(self, top_n=16):
        """Return, per layer type, the names of the ``top_n`` modules with the highest recorded SNR ratio."""
        top_snr_layers = {}
        for layer_type, snr_ratios in self.layer_snr.items():
            sorted_layers = sorted(snr_ratios, key=lambda x: x[1], reverse=True)  # Sort by SNR value
            top_snr_layers[layer_type] = [layer[0] for layer in sorted_layers[:top_n]]  # Saving the layer names

        return top_snr_layers

    def get_random_layers(self, layer_types, n=16):
        """Return up to ``n`` randomly chosen module names per layer type, as one flat list.

        Uses ``random.shuffle``; seed the ``random`` module beforehand for a reproducible pick.
        """
        random_layers = []

        for layer_type in layer_types:
            layers = [name for name, _ in self.model.named_modules() if layer_type in name]
            shuffle(layers)
            random_layers += layers[:n]

        return random_layers

    @staticmethod
    def _ensure_parent_dir(filename):
        """Create the directory that will contain ``filename`` if it does not exist yet."""
        parent = os.path.dirname(filename)
        if parent:
            os.makedirs(parent, exist_ok=True)

    def save_layers_to_json(self, filename="zzz_rmt_laser/layer_snr_info.json"):
        """Write ``self.layer_snr`` to ``filename`` as JSON, creating the parent directory if needed."""
        self._ensure_parent_dir(filename)
        with open(filename, "w") as file:
            json.dump(self.layer_snr, file, indent=4)

    def save_top_snr_ratios_to_json(self, top_snr_layers, filename="zzz_rmt_laser/top_snr_ratios.json"):
        """Write the ``get_top_snr_ratios`` mapping to ``filename`` as JSON."""
        self._ensure_parent_dir(filename)
        with open(filename, "w") as file:
            json.dump(top_snr_layers, file, indent=4)

    def save_top_snr_ratios_to_txt(self, top_snr_layers, filename="zzz_rmt_laser/top_snr_ratios.txt"):
        """Write every module name in ``top_snr_layers`` to ``filename``, one per line."""
        names = []
        for layers in top_snr_layers.values():
            names += layers
        self._ensure_parent_dir(filename)
        with open(filename, "w") as f:
            f.writelines([name + "\n" for name in names])

    def save_model(self, save_dir):
        """Save the (possibly modified) model and its tokenizer to ``save_dir``."""
        self.model.save_pretrained(save_dir)
        self.tokenizer.save_pretrained(save_dir)

In [None]:
# only setup for nllb
def create_tied_embedding_weights(model: PreTrainedModel) -> PreTrainedModel:
    """Give the encoder and decoder fresh embedding modules, then re-tie the model's weights.

    Each stack's ``embed_tokens`` is replaced by a new ``torch.nn.Embedding`` sized from
    ``model.config`` (``vocab_size`` x ``d_model``, padding index ``pad_token_id``), after
    which ``model.tie_weights()`` is called. The same model object is returned.
    """
    cfg = model.config
    backbone = model.base_model
    # Encoder first, then decoder, so the random initialisation order is stable.
    for stack in (backbone.encoder, backbone.decoder):
        stack.embed_tokens = torch.nn.Embedding(cfg.vocab_size, cfg.d_model, cfg.pad_token_id)
    model.tie_weights()
    return model


# Checkpoint whose layers are scored below.
model_name = "facebook/nllb-200-distilled-600M"
# Layer indices 11 down to 0, each wrapped in dots (".11.", ..., ".0."). ModelModifier selects
# modules by substring, so a bare "1" would also match "10" and "11"; the dots prevent that.
# NOTE(review): assumes the checkpoint has 12 layers per stack — confirm for other model sizes.
layer_numbers = [f".{layer}." for layer in range(11, -1, -1)]
# Name fragments of the modules to score (presumably the attention projections and feed-forward layers).
layer_types = ["q_proj", "k_proj", "v_proj", "out_proj", "fc1", "fc2"]

# Loads the model in float16 together with its tokenizer.
modifier = ModelModifier(model_name)

# NOTE: not sure what this code was for
# # load normal fine tuned model
# modifier.model = AutoModelForSeq2SeqLM.from_pretrained(
#     "test_S/MT/experiments/NLLB.1.3B.id-XriAlasSplit_8001.btx-XriAlasSplit_8001/run/checkpoint-2000",
#     torch_dtype=torch.float16,
# )
# modifier.tokenizer = AutoTokenizer.from_pretrained(
#     "test_S/MT/experiments/NLLB.1.3B.id-XriAlasSplit_8001.btx-XriAlasSplit_8001"
# )

# # load and merge lora model
# tokenizer = AutoTokenizer.from_pretrained("test_S/MT/experiments/lora_32_all", use_fast=True)
# base_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.float16)
# base_model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=8)
# base_model = create_tied_embedding_weights(base_model)
# model = PeftModel.from_pretrained(base_model, "test_S/MT/experiments/lora_32_all/run/checkpoint-14000")
# model.merge_and_unload()
# modifier.model = model
# modifier.tokenizer = tokenizer

# normal process
# Score every module matching a (layer type, layer number) pair, then keep the 16
# highest-scoring module names per layer type.
modifier.assess_layers_snr(layer_types, layer_numbers)
top_snr_ratios = modifier.get_top_snr_ratios(16)

# Persist the full score table and the selected names to the methods' default paths
# (under zzz_rmt_laser/). The .txt file is the one-name-per-line list read back later
# in this notebook as target_layers.txt.
modifier.save_layers_to_json()
modifier.save_top_snr_ratios_to_json(top_snr_ratios)
modifier.save_top_snr_ratios_to_txt(top_snr_ratios)

# # random layers
# n = 4
# random_layers = modifier.get_random_layers(layer_types, n)
# with open(f"random_layers_{n}_nllb.txt", "w") as f:
#     f.writelines([name + "\n" for name in random_layers])

### Code snippets that I inserted into HuggingFaceNMTModel.train in order to run the experiments
In my experiment directories, I had a `target_layers.txt` file that was the output of the `save_top_snr_ratios_to_txt` function above, which is just a list of layer names. The candidate layers are all of the "Linear" layers of the model. In (I believe) all of the experiments, the embedding layers were fully trained (unfrozen, no reduced rank).

**Targeting layers**

This method was used in combination with LoRA. The target modules were trained with reduced rank and the embeddings were trained with full rank.

In [None]:
# target layers
# target_layers.txt holds one module name per line. Strip the line terminator rather than
# slicing off the last character (which would truncate the final name when the file has no
# trailing newline), and drop blank lines so no empty module name is produced.
target_layers_path = self._config.exp_dir / "target_layers.txt"
with target_layers_path.open() as f:
    target_modules = [line.strip() for line in f if line.strip()]

**Freezing layers**

This method just uses basic model functionality to freeze the layers.

In [None]:
# freeze layers
# example layer name: model.encoder.layers.0.self_attn.q_proj
modules_to_train = ["model.shared"]  # embedding layers
target_layers_path = self._config.exp_dir / "target_layers.txt"
with target_layers_path.open() as f:
    # Strip terminators and skip blank lines: an empty module name would be a prefix of
    # every parameter name and silently unfreeze the whole model.
    modules_to_train += [line.strip() for line in f if line.strip()]

# A parameter stays trainable only if it belongs to one of the listed modules. The match is
# anchored on the "." separator so that a module name cannot match a longer sibling name
# that merely shares its prefix.
for name, param in model.named_parameters():
    param.requires_grad = any(
        name == module or name.startswith(module + ".") for module in modules_to_train
    )

**Model reduction**

This is the method proposed in the laserRMT paper/code. From what I remember, this is very CPU-intensive and ran extremely slowly on the GPUs. I would recommend creating and saving the desired reduced model with a CPU (since the base model is what's being reduced) before fine-tuning it on a GPU. There is a model with all linear layers reduced located at `M/MT/experiments/Demo_Isaac/nllb_full_reduced` that was used for the 'full_reduced' experiments.

In [None]:
def update_model_reduce_layer(model: PreTrainedModel, layer_name: str) -> PreTrainedModel:
    """Replace one module's weight with a low-rank reconstruction and return the model.

    The weight is decomposed with an SVD in float64. Singular values at or below
    ``sigma * (1 + sqrt(beta))`` are zeroed, where ``sigma`` is the interquartile range of
    the singular values divided by 1.349 and ``beta`` is the matrix aspect ratio
    (``min(n, m) / max(n, m)``). The matrix is then rebuilt and cast back to the original dtype.

    Args:
        model: model containing the target module; modified in place.
        layer_name: dotted submodule name; the module must expose a 2-D ``weight``.

    Returns:
        The same ``model`` object. The new parameter keeps the original weight's
        ``requires_grad`` flag, so a frozen layer stays frozen.
    """
    module = model.get_submodule(layer_name)
    original_param = module.weight
    original_dtype = original_param.dtype

    # The decomposition is a one-off transformation, not a training step: do not record
    # an autograd graph through the float64 SVD.
    with torch.no_grad():
        weights = original_param.double()
        # torch.linalg.svd returns V already transposed (Vh).
        U, S, Vh = torch.linalg.svd(weights, full_matrices=False)

        # Estimate sigma using the full IQR method (IQR of a normal distribution ~ 1.349 * sigma)
        q75 = torch.quantile(S, 0.75)
        q25 = torch.quantile(S, 0.25)
        iqr = q75 - q25
        sigma_estimated = iqr / 1.349

        # Calculate Marchenko-Pastur threshold
        n, m = weights.shape
        beta = n / m if n < m else m / n
        mp_threshold_full_iqr = sigma_estimated * np.sqrt((1 + np.sqrt(beta)) ** 2)

        # Retain only the singular values above the MP threshold
        S_reduced = torch.zeros_like(S)
        k = (S > mp_threshold_full_iqr).sum().item()
        S_reduced[:k] = S[:k]
        print(f"Reduced from {S.shape} to {k}")

        # Reconstruct the matrix using the thresholded singular values
        reconstructed_weights = U @ torch.diag(S_reduced) @ Vh
        reconstructed_weights = reconstructed_weights.to(original_dtype)

    module.weight = torch.nn.Parameter(reconstructed_weights, requires_grad=original_param.requires_grad)

    return model

# reduce model
# NOTE(review): this snippet reads `self.config` while the snippets above read `self._config` —
# confirm which attribute the enclosing class actually exposes.
target_layers_path = self.config.exp_dir / "target_layers.txt"
with target_layers_path.open() as f:
    # Strip terminators instead of slicing off the last character (which would truncate the
    # final name when the file has no trailing newline) and skip blank lines.
    modules_to_train = [line.strip() for line in f if line.strip()]
# NOTE(review): device_map="auto" may place the model on a GPU when one is available; the
# accompanying notes recommend running the reduction on CPU — confirm the intended device.
model = AutoModelForSeq2SeqLM.from_pretrained(self.config.model, device_map="auto")
for layer in modules_to_train:
    model = update_model_reduce_layer(model, layer)
model.save_pretrained(self.config.exp_dir / "reduced")
