In [1]:
from transformers import AutoTokenizer, BloomForCausalLM
import torch
from tqdm import tqdm
import matplotlib.pyplot as plt
import json

  from .autonotebook import tqdm as notebook_tqdm


[2024-01-12 13:23:41,779] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)


2024-01-12 13:23:53.239472: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-01-12 13:23:53.286609: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-12 13:23:53.286647: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-12 13:23:53.286673: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-01-12 13:23:53.296403: I tensorflow/core/platform/cpu_feature_g

In [2]:
folder_path = "/data/lypan/llm_interpre/neuron_info/bloom-560m/"

with open(folder_path + 'lang_agnos.json', 'r') as json_file:
    lang_agnos = json.load(json_file)

with open(folder_path + 'lang_speci_by_lang.json', 'r') as json_file:
    lang_speci = json.load(json_file)

In [3]:
model_path = "/data/lypan/llms/bloom-560m"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = BloomForCausalLM.from_pretrained(model_path).to("cuda:2")

In [4]:
def find_specific_param(model, module_name):
    weight_name = module_name + ".weight"
    bias_name = module_name + ".bias"
    param_dict = {"weight": None, "bias": None}
    for name, param in model.named_parameters():
        if name == weight_name:
            param_dict["weight"] = param
        elif name == bias_name:
            param_dict["bias"] = param

    return param_dict

def get_mask_grad(model, module_name):
    """返回单一指定结构的mask梯度

    Args:
        model (_type_): _description_
        module_name (_type_): _description_

    Returns:
        mask_grad: _description_
    """
    # 寻找对应结构的weight和bias参数
    param_dict = find_specific_param(model, module_name)
    weight_grad, bias_grad = None, None
    # 计算对应参数的梯度矩阵
    if param_dict["weight"] is not None:
        weight_grad = param_dict["weight"].grad
    if param_dict["bias"] is not None:
        bias_grad = param_dict["bias"].grad

    # 计算当前结构神经元数量
    # neuron_num = weight_grad.shape[0] if weight_grad is not None else bias_grad.shape[0]
    # 初始化weight和bias的mask矩阵
    mask_weight_matrix = torch.zeros_like(weight_grad) if weight_grad is not None else None
    mask_bias_matrix = torch.zeros_like(bias_grad) if bias_grad is not None else None

    lang_agnos_neuron_index = torch.tensor(lang_agnos[module_name])
    
    # TODO: 后续要加上语言特定神经元也需要保留
    retain_index = lang_agnos_neuron_index

    # 构造weight和bias的mask矩阵
    if mask_weight_matrix is not None:
        mask_weight_matrix[retain_index] = 1
    if mask_bias_matrix is not None:
        mask_bias_matrix[retain_index] = 1

    # 进行mask操作
    if weight_grad is not None:
        weight_grad = weight_grad * mask_weight_matrix
    if bias_grad is not None:
        bias_grad = bias_grad * mask_bias_matrix

    mask_grad = {"mask_weight_grad": weight_grad, "mask_bias_grad": bias_grad}

    return mask_grad

def get_all_mask_grad(model, module_name_list):
    """返回指定结构列表的mask梯度

    Args:
        model (_type_): _description_
        module_name_list (_type_): _description_

    Returns:
        all_mask_grad: _description_
    """
    all_mask_grad = {}
    for i in range(len(module_name_list)):
        module_name = module_name_list[i]
        mask_grad = get_mask_grad(model, module_name)
        all_mask_grad[module_name + ".weight"] = mask_grad["mask_weight_grad"]
        all_mask_grad[module_name + ".bias"] = mask_grad["mask_bias_grad"]
    return all_mask_grad

In [7]:
module_name_list = ['transformer.h.0.mlp.dense_4h_to_h', 'transformer.h.1.mlp.dense_4h_to_h']

input_text = ["Your input text here."] * 50
input_ids = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=512).input_ids.to("cuda:2")

inputs = input_ids[:, :-1]
labels = input_ids[:, 1:]

learning_rate = 5e-5
batch_size = 8
num_epochs = 3

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
criterion = torch.nn.CrossEntropyLoss()

for epoch in range(num_epochs):
    for i in tqdm(range(0, len(inputs), batch_size), desc="Epoch "+ str(epoch)):
        batch_inputs = inputs[i:i+batch_size]
        batch_labels = labels[i:i+batch_size]

        # 正向传播
        outputs = model(batch_inputs, labels=batch_labels)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()

        # 对梯度进行修改
        all_mask_grad = get_all_mask_grad(model, module_name_list)

        weight_name_list = [module_name + ".weight" for module_name in module_name_list]
        bias_name_list = [module_name + ".bias" for module_name in module_name_list]

        for name, param in model.named_parameters():
            if name in weight_name_list:
                param.grad = all_mask_grad[name]
            elif name in bias_name_list:
                param.grad = all_mask_grad[name]

        optimizer.step()

    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {loss.item()}")

Epoch 0: 100%|██████████| 7/7 [03:25<00:00, 29.33s/it] 


Epoch 1/3, Loss: 1.589456815054291e-07


Epoch 1: 100%|██████████| 7/7 [00:01<00:00,  5.59it/s]


Epoch 2/3, Loss: 0.0


Epoch 2: 100%|██████████| 7/7 [00:01<00:00,  5.70it/s]

Epoch 3/3, Loss: 0.0





: 