In [7]:
import numpy as np
import os
dtype = np.float32

import torch
import torch.nn as nn
import torchvision
from tqdm import tqdm
import os

from transformers import CLIPVisionModelWithProjection, AutoModelForCausalLM, LlavaForConditionalGeneration
from transformers import AutoModel, AutoTokenizer, OPTForCausalLM, BloomForCausalLM
import numpy

from huggingface_hub import try_to_load_from_cache, _CACHED_NO_EXIST
from huggingface_hub import scan_cache_dir

import glob
import random
import json
import os

# Prefer the first CUDA GPU, but fall back to the CPU so that cells using
# `device` still work on machines without a usable GPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

def get_named_linears(module):
    """Collect every ``nn.Linear`` found by ``module.named_modules()``.

    Args:
        module: the ``nn.Module`` to walk.

    Returns:
        dict mapping each submodule name (as yielded by ``named_modules()``)
        to its ``nn.Linear`` instance, in iteration order.
    """
    linears = {}
    for qualified_name, submodule in module.named_modules():
        if not isinstance(submodule, nn.Linear):
            continue
        linears[qualified_name] = submodule
    return linears

In [6]:
# Display the name -> nn.Linear mapping for the loaded model.
# NOTE(review): `model` is not defined above this cell; in the visible notebook
# it is only assigned inside later dataset-building cells, so this cell depends
# on one of them having been run first.
get_named_linears(model)

{'vision_tower.vision_model.encoder.layers.0.self_attn.k_proj': Linear(in_features=1024, out_features=1024, bias=True),
 'vision_tower.vision_model.encoder.layers.0.self_attn.v_proj': Linear(in_features=1024, out_features=1024, bias=True),
 'vision_tower.vision_model.encoder.layers.0.self_attn.q_proj': Linear(in_features=1024, out_features=1024, bias=True),
 'vision_tower.vision_model.encoder.layers.0.self_attn.out_proj': Linear(in_features=1024, out_features=1024, bias=True),
 'vision_tower.vision_model.encoder.layers.0.mlp.fc1': Linear(in_features=1024, out_features=4096, bias=True),
 'vision_tower.vision_model.encoder.layers.0.mlp.fc2': Linear(in_features=4096, out_features=1024, bias=True),
 'vision_tower.vision_model.encoder.layers.1.self_attn.k_proj': Linear(in_features=1024, out_features=1024, bias=True),
 'vision_tower.vision_model.encoder.layers.1.self_attn.v_proj': Linear(in_features=1024, out_features=1024, bias=True),
 'vision_tower.vision_model.encoder.layers.1.self_attn.q

In [8]:
# Build a block dataset from every nn.Linear weight of each listed model:
# weights are cut into rows of length `size`, shuffled, split into train/val,
# and saved together with the global mean/std of each split.
model_list = [
    # '/workspace/Weight_compression/Wparam_dataset/hf_model/llava-hf--llava-1.5-7b-hf',
    'llava-hf--llava-1.5-7b-hf',
]

# Block length for the model at the same position in `model_list`
# (the two lists are paired with zip below).
size_list = [
    1024,
]

direction = 'col'
# NOTE(review): the three flags below are not read anywhere in this cell.
shuffle = False
drop_last = False
modelwise_norm = False

for model_name, size in zip(model_list, size_list):
    model_id = model_name
    # '/' is replaced so the name can be used as a single directory component.
    model_name = model_name.replace('/', '--')
    print('model_name: ', model_name)
    
    model_path = f"./hf_model/{model_name}"

    model = LlavaForConditionalGeneration.from_pretrained(model_path)

    datas = []
    # Covers every nn.Linear reachable from the model root.
    named_linears = get_named_linears(model)
    for n, m in named_linears.items():
        w = m.weight.data.detach()
        
        # With 'col', transpose first so the reshape below cuts blocks along
        # the columns of the original weight matrix instead of its rows.
        if direction == 'col':
            w = w.T
        # Raises if the number of elements is not a multiple of `size`.
        w = w.reshape(-1, size)

        datas.append(w)
        
    datas = torch.cat(datas, dim = 0)

    print('total dataset shape: ', datas.shape)

    # Random split: the last 1000 shuffled rows become the validation set.
    # NOTE(review): no seed is set in the visible cells, so the split differs
    # between runs.
    indices = torch.randperm(len(datas))
    split_index = int(len(datas) - 1000)
    train_indices = indices[:split_index]
    val_indices = indices[split_index:]

    dataset = {}
    dataset['train'] = datas[train_indices]
    dataset['val'] = datas[val_indices]
    print('train: ', dataset['train'].shape, 'val: ', dataset['val'].shape)

    # Global (all-element) mean and std per split, stored next to the tensors.
    dataset_stats = {}
    for split in ['train', 'val']:
        data = dataset[split]
        
        dataset_stats[split] = {
            'mean': data.mean().item(),
            'std': data.std().item(),
            # 'mean_channel': data.mean(dim=0).tolist(),
            # 'std_channel': data.std(dim=0).tolist(),
        }

    os.makedirs(f'./block_pt/{model_name}', exist_ok = True)
    torch.save(dataset, f'./block_pt/{model_name}/{direction}_{size}.pt')
    json_path = f'./block_pt/{model_name}/{direction}_{size}_dataset_stats.json'
    with open(json_path, 'w') as f:
        json.dump(dataset_stats, f)

model_name:  llava-hf--llava-1.5-7b-hf


Loading checkpoint shards: 100%|██████████| 6/6 [00:00<00:00,  6.89it/s]


total dataset shape:  torch.Size([6767872, 1024])
train:  torch.Size([6766872, 1024]) val:  torch.Size([1000, 1024])


## gaussian block

In [None]:
# Load a previously saved block dataset; in the visible cells only the shapes
# of its 'train' and 'val' tensors are used afterwards.
# NOTE(review): absolute, machine-specific path — will not resolve elsewhere.
# NOTE(review): torch.load unpickles the file; only load files you trust.
d = torch.load('/workspace/Weight_compression/Wparam_dataset/block_pt/meta-llama--Meta-Llama-3-8B/col_1024_gaussian_padding.pt')
print(d['train'].shape)
print(d['val'].shape)

In [None]:
# Synthetic dataset of independent standard-normal samples (mean 0, std 1)
# with the same 'train'/'val' shapes as the reference dataset `d`.
# NOTE(review): no seed is set in the visible cells, so the samples differ
# between runs.
dataset = {}
dataset['train'] = torch.normal(mean=0.0, std=1.0, size=d['train'].shape)
dataset['val'] = torch.normal(mean=0.0, std=1.0, size=d['val'].shape)


In [None]:
# Record the global mean/std of each split of `dataset` and save both the
# tensors and the stats under ./block_pt/gaussian.
print('train: ', dataset['train'].shape, 'val: ', dataset['val'].shape)

dataset_stats = {}
for split in ['train', 'val']:
    data = dataset[split]
    
    # Statistics over all elements of the split (no per-channel breakdown).
    mean_all = data.mean()
    std_all = data.std()
    
    dataset_stats[split] = {
        'mean': mean_all.item(),
        'std': std_all.item(),
    }

# `sub` is a file-name prefix; together with `direction` and `size` it only
# affects the output file names below, not the data itself.
sub = 'llama8b_'
direction = 'col'
size = 1024
os.makedirs(f'./block_pt/gaussian', exist_ok = True)
torch.save(dataset, f'./block_pt/gaussian/{sub}{direction}_{size}.pt')
json_path = f'./block_pt/gaussian/{sub}{direction}_{size}_dataset_stats.json'
with open(json_path, 'w') as f:
    json.dump(dataset_stats, f)

# layerwise, channelwise normalized Block

In [None]:
# Build a block dataset in which every linear weight is standardised per
# column (statistics taken along dim 0) before being cut into rows of length
# `size`, shuffled, split into train/val and saved.
model_list = [
    'meta-llama/Meta-Llama-3-8B',
    # 'meta-llama--Llama-2-7b-hf',
    # 'meta-llama--Llama-2-13b-hf',
    # 'openai--clip-vit-large-patch14',
]

# Block length for the model at the same position in `model_list`.
size_list = [
    1024,
    # 4096,
    # 4096,
    # 256,
]

# direction = 'adapt'
direction = 'col'

for model_name, size in zip(model_list, size_list):

    # '/' is replaced so the name can be used as a single directory component.
    model_name = model_name.replace('/', '--')
    print('model_name: ', model_name)

    model_path = f"./hf_model/{model_name}"

    model = AutoModelForCausalLM.from_pretrained(model_path, local_files_only=True)
    # model = AutoModel.from_pretrained(model_path, local_files_only=True)
    # NOTE(review): get_blocks is not defined in the visible cells above; it
    # presumably comes from the `from utils import *` cell further down, which
    # must have been run before this one — TODO confirm.
    layers = get_blocks(model)

    datas = []
    for i in tqdm(range(len(layers))):
        named_linears = get_named_linears(layers[i])
        for n, m in named_linears.items():
            # Use the notebook-wide `device` instead of a hard-coded .cuda().
            W = m.weight.data.detach().to(device)

            col_mean = W.mean(dim=0, keepdim=True)
            col_std = W.std(dim=0, keepdim=True)
            # A constant column has std 0 (and a single-row weight has NaN
            # std); dividing by it would write NaN/inf into the dataset.
            # Substituting 1 maps such columns to 0 and leaves every other
            # column unchanged.
            safe_std = torch.where(col_std > 0, col_std, torch.ones_like(col_std))
            W = (W - col_mean) / safe_std
            # Layer-wise alternative: W = (W - W.mean()) / W.std()

            # With 'col', transpose first so the reshape cuts blocks along the
            # columns of the original weight matrix.
            if direction == 'col':
                W = W.T
            # Move back to the CPU so the accumulated dataset lives in host memory.
            W = W.reshape(-1, size).cpu()

            datas.append(W)

    datas = torch.cat(datas, dim = 0)
    print('total dataset shape: ', datas.shape)

    # Random split: the last 1000 shuffled rows become the validation set.
    indices = torch.randperm(len(datas))
    split_index = int(len(datas) - 1000)
    train_indices = indices[:split_index]
    val_indices = indices[split_index:]

    dataset = {}
    dataset['train'] = datas[train_indices]
    dataset['val'] = datas[val_indices]
    print('train: ', dataset['train'].shape, 'val: ', dataset['val'].shape)

    # The data was already standardised above, so nominal statistics are
    # recorded instead of measured ones and no per-channel values are stored.
    dataset_stats = {}
    for split in ['train', 'val']:
        dataset_stats[split] = {
            'mean': 0,
            'std': 1,
            'mean_channel': None,
            'std_channel': None,
        }

    os.makedirs(f'./block_pt/{model_name}', exist_ok = True)
    torch.save(dataset, f'./block_pt/{model_name}/{direction}_{size}_colwise_normed.pt')
    json_path = f'./block_pt/{model_name}/{direction}_{size}_colwise_normed_dataset_stats.json'
    # torch.save(dataset, f'./block_pt/{model_name}/{direction}_{size}_layerwise_normed.pt')
    # json_path = f'./block_pt/{model_name}/{direction}_{size}_layerwise_normed_dataset_stats.json'
    with open(json_path, 'w') as f:
        json.dump(dataset_stats, f)

In [None]:
# Display `layers`, which is assigned by the dataset-building cells
# (`layers = get_blocks(model)`); one of them must have been run first.
layers

# gaussian padding

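weight의 크기가 `size`의 배수가 아니면, 같은 row나 col의 mean, std를 갖는 가우시안 샘플로 padding한다. (If a weight dimension is not a multiple of `size`, it is padded with Gaussian samples that share the mean and std of the same row or column.)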

In [None]:
# Build a block dataset in which weights whose row count is not a multiple of
# `size` are first padded with Gaussian rows matching each column's mean/std,
# then cut into rows of length `size`, shuffled, split and saved.
model_list = [
    # 'meta-llama/Meta-Llama-3-8B',
    'meta-llama--Llama-2-7b-hf',
    'meta-llama--Llama-2-13b-hf',
]

# Block length for the model at the same position in `model_list`.
size_list = [
    # 1024,
    1024,
    1280,
]

# direction = 'adapt'
direction = 'col'

for model_name, size in zip(model_list, size_list):
    
    model_name = model_name.replace('/', '--')
    print('model_name: ', model_name)
    
    model_path = f"./hf_model/{model_name}"

    model = AutoModelForCausalLM.from_pretrained(model_path, local_files_only=True)
    # NOTE(review): get_blocks is not defined in the visible cells above; it
    # presumably comes from the `from utils import *` cell further down —
    # TODO confirm.
    layers = get_blocks(model)
    
    datas = []
    
    for i in tqdm(range(len(layers))):
        named_linears = get_named_linears(layers[i])
        for n, m in named_linears.items():
            W = m.weight.data.detach()

            r, c = W.shape

            if direction == 'col':
                if r % size != 0:
                    # Number of extra rows needed to reach the next multiple of `size`.
                    padding_size = size - r % size
                    # Per-column statistics (reduced over the row dimension).
                    mean_c = W.mean(0)
                    std_c = W.std(0)

                    # Each padded entry is drawn from a normal distribution
                    # with the mean/std of the column it is appended to.
                    g = torch.normal(mean_c.expand(padding_size, c), std_c.expand(padding_size, c))
                    W = torch.cat([W, g], dim=0)
            elif direction =='row':
                raise NotImplementedError
            else:
                raise KeyError
            
            # After the transpose each row holds one (padded) column of the
            # original weight, so its length is the padded row count.
            if direction == 'col':
                W = W.T
            
            # Sanity checks: every column now splits into whole blocks.
            assert W.shape[1] % size == 0
            assert W.shape[1] >= size

            W = W.reshape(-1, size)
                
            datas.append(W)
    
    datas = torch.cat(datas, dim = 0)
    print('total dataset shape: ', datas.shape)
    
    # Random split: the last 1000 shuffled rows become the validation set.
    # NOTE(review): no seed is set in the visible cells, so both the padding
    # samples and the split differ between runs.
    indices = torch.randperm(len(datas))
    split_index = int(len(datas) - 1000)
    train_indices = indices[:split_index]
    val_indices = indices[split_index:]

    dataset = {}
    dataset['train'] = datas[train_indices]
    dataset['val'] = datas[val_indices]
    print('train: ', dataset['train'].shape, 'val: ', dataset['val'].shape)

    # Global statistics plus per-position statistics within a block (dim 0
    # reduces over the blocks, leaving one value per block position).
    dataset_stats = {}
    for split in ['train', 'val']:
        data = dataset[split]
        
        mean_dim0 = data.mean(dim=0)
        std_dim0 = data.std(dim=0)
        
        mean_all = data.mean()
        std_all = data.std()
        
        dataset_stats[split] = {
            'mean': mean_all.item(),
            'std': std_all.item(),
            'mean_channel': mean_dim0.tolist(),
            'std_channel': std_dim0.tolist(),
        }

    os.makedirs(f'./block_pt/{model_name}', exist_ok = True)
    torch.save(dataset, f'./block_pt/{model_name}/{direction}_{size}_gaussian_padding.pt')
    json_path = f'./block_pt/{model_name}/{direction}_{size}_gaussian_padding_dataset_stats.json'
    with open(json_path, 'w') as f:
        json.dump(dataset_stats, f)

# RHT smoothed Weight Block

In [None]:
import sys
# NOTE(review): absolute, machine-specific path added to the import search path.
sys.path.append('/workspace/Weight_compression/Wparam_dataset')
# Star import: presumably the source of `matmul_hadUt` (used below) and of
# `get_blocks` (used by the dataset-building cells) — TODO confirm. This form
# of import does not bind the name `utils` itself.
from utils import *

def RHT_H(H, SU):
    """Apply the sign vector ``SU`` and ``matmul_hadUt`` to both sides of ``H``.

    ``H`` is scaled by ``SU`` and passed through ``matmul_hadUt``; the result
    is transposed, scaled by ``SU`` again and passed through ``matmul_hadUt``
    a second time. ``matmul_hadUt`` is defined outside this cell (presumably a
    Hadamard-transform helper — TODO confirm).
    """
    first_pass = matmul_hadUt(H * SU)
    second_input = first_pass.T * SU
    return matmul_hadUt(second_input)


def RHT_W(W, SU, SV):
    """Apply sign vectors and ``matmul_hadUt`` along both axes of ``W``.

    ``W.T`` is scaled by ``SV`` and passed through ``matmul_hadUt``; the
    result is transposed back, scaled by ``SU`` and passed through
    ``matmul_hadUt`` again. ``matmul_hadUt`` is defined outside this cell
    (presumably a Hadamard-transform helper — TODO confirm).
    """
    transformed_t = matmul_hadUt(W.T * SV)
    rescaled = transformed_t.T * SU
    return matmul_hadUt(rescaled)


def incoherence_preprocess(H, W, args):
    """Transform ``W`` with random sign vectors via ``RHT_W``.

    As written, only the randomized-Hadamard path runs: the diagonal
    rescaling branch is disabled with ``if False`` and the ``kron`` branch is
    unreachable behind ``if True``. ``H`` is passed through untouched and
    ``args`` is never dereferenced on the live path (the visible caller passes
    ``None`` for both).

    Args:
        H: returned unchanged as ``Hr``; may be ``None``.
        W: 2-D tensor of shape ``(m, n)``; the random sign vectors are created
            on ``W.device``.
        args: only read by the nested ``_dump`` helper and by the unreachable
            ``kron`` branch.

    Returns:
        Tuple ``(Lhr, Hr, Wr, SU, SV, scaleWH)`` where ``Lhr`` and ``scaleWH``
        are ``None``, ``Hr`` is ``H``, ``Wr`` is ``RHT_W(W, SU, SV)``, and
        ``SU`` / ``SV`` are float32 vectors of +/-1 on the CPU with lengths
        ``n`` and ``m`` respectively.
    """
    # dtype_ = torch.float64 if args.use_fp64 else torch.float32
    dtype_ = torch.float32
    device = W.device
    # device = torch.device('cpu')
    (m, n) = W.shape

    # Debug helper that saves its inputs and raises; it is not called anywhere
    # in this function as written.
    def _dump(Hr, Lhr, msg=''):
        torch.save(Hr, f"{args.save_pfx}/Hr_debug_fft.pt")
        torch.save(Lhr, f"{args.save_pfx}/Lhr_debug_fft.pt")
        raise Exception(msg)

    # diagonally rescale W,H to minimize proxy loss
    scaleWH = None
    Wr = W
    Hr = H
    # if args.rescale_WH:
    # Disabled: the rescaling below never runs, so scaleWH stays None.
    if False:
        Hr = H / H.abs().max()
        diagH = torch.diag(Hr)
        diagW2 = torch.diag(W.T @ W)
        diagH = torch.clamp(diagH, min=1e-8)
        diagW2 = torch.clamp(diagW2, min=1e-8)
        scaleWH = (diagH / diagW2).sqrt().sqrt().to(torch.float32)
        scaleWH = scaleWH.clamp(min=1e-8)
        Wr = Wr * scaleWH[None, :]
        Hr = Hr / scaleWH[None, :]
        Hr = Hr / scaleWH[:, None]
        scaleWH = scaleWH.cpu()

    # randomized hadamard transformation on H, W
    # Always taken; H is left untransformed because the RHT_H call is
    # commented out.
    if True:
        # Random +/-1 vectors; adding 1e-5 before the second sign() turns an
        # exact 0 from the first sign() into +1.
        SU = (torch.randn(n, device=device).sign() + 1e-5).sign().to(dtype_)
        SV = (torch.randn(m, device=device).sign() + 1e-5).sign().to(dtype_)
        # Hr = RHT_H(Hr, SU)
        Wr = RHT_W(Wr, SU, SV)
    # randomized kronecker product on H, W
    # NOTE(review): unreachable because of `if True` above; it also reads
    # `args.incoh_mode` and a `utils` name that this cell does not bind.
    elif args.incoh_mode == "kron":
        SU = utils.rand_ortho_butterfly_noblock(n).to(dtype_).to(device)
        SV = utils.rand_ortho_butterfly_noblock(m).to(dtype_).to(device)
        Hr = SU @ Hr @ SU.T
        Wr = SV @ Wr @ SU.T
    else:
        raise NotImplementedError
    # The sign vectors are returned on the CPU.
    SV = SV.cpu()
    SU = SU.cpu()

    # Lhr = torch.linalg.cholesky(Hr)
    # The Cholesky factor is not computed; None is returned in its place.
    Lhr = None
    # if not torch.all(torch.isfinite(Lhr)):
    #     return None

    Wr = Wr.to(device)

    return Lhr, Hr, Wr, SU, SV, scaleWH

In [None]:
# Build a block dataset from weights transformed by incoherence_preprocess
# (random sign flips + RHT_W), cut into rows of length `size`, shuffled, split
# into train/val and saved together with global and per-position statistics.
model_list = [
    'meta-llama/Meta-Llama-3-8B',
    # This entry used to be active without a matching size_list entry, so
    # zip() silently skipped it. Re-enable it together with a block size below.
    # 'meta-llama--Llama-2-7b-hf',
    # 'meta-llama--Llama-2-13b-hf',
]

# Block length for the model at the same position in `model_list`.
size_list = [
    4096,
    # 4096,
    # 4096,
]

# zip() stops at the shorter list, so a length mismatch would drop models
# without any warning; fail loudly instead.
if len(model_list) != len(size_list):
    raise ValueError(
        f'model_list has {len(model_list)} entries but size_list has {len(size_list)}'
    )

# direction = 'adapt'
direction = 'col'

for model_name, size in zip(model_list, size_list):

    # '/' is replaced so the name can be used as a single directory component.
    model_name = model_name.replace('/', '--')
    print('model_name: ', model_name)

    model_path = f"./hf_model/{model_name}"

    model = AutoModelForCausalLM.from_pretrained(model_path, local_files_only=True)
    # get_blocks is not defined in this cell; it presumably comes from the
    # `from utils import *` cell above — TODO confirm.
    layers = get_blocks(model)

    datas = []

    for i in tqdm(range(len(layers))):
        named_linears = get_named_linears(layers[i])
        for n, m in named_linears.items():
            w = m.weight.data.detach()

            # H and args are passed as None; only the transformed weight is
            # used afterwards.
            Lhr, H, w, SU, SV, scaleWH = incoherence_preprocess(None, w, None)

            # With 'col', transpose first so the reshape cuts blocks along the
            # columns of the transformed weight.
            if direction == 'col':
                w = w.T

            # NOTE(review): when the row length after the transpose is not a
            # multiple of `size`, blocks straddle column boundaries; the
            # gaussian-padding cell pads such weights instead.
            w = w.reshape(-1, size)

            datas.append(w)

    datas = torch.cat(datas, dim = 0)
    print('total dataset shape: ', datas.shape)

    # Random split: the last 1000 shuffled rows become the validation set.
    indices = torch.randperm(len(datas))
    split_index = int(len(datas) - 1000)
    train_indices = indices[:split_index]
    val_indices = indices[split_index:]

    dataset = {}
    dataset['train'] = datas[train_indices]
    dataset['val'] = datas[val_indices]
    print('train: ', dataset['train'].shape, 'val: ', dataset['val'].shape)

    # Global statistics plus per-position statistics within a block (dim 0
    # reduces over the blocks, leaving one value per block position).
    dataset_stats = {}
    for split in ['train', 'val']:
        data = dataset[split]

        mean_dim0 = data.mean(dim=0)
        std_dim0 = data.std(dim=0)

        mean_all = data.mean()
        std_all = data.std()

        dataset_stats[split] = {
            'mean': mean_all.item(),
            'std': std_all.item(),
            'mean_channel': mean_dim0.tolist(),
            'std_channel': std_dim0.tolist(),
        }

    os.makedirs(f'./block_pt/{model_name}', exist_ok = True)
    torch.save(dataset, f'./block_pt/{model_name}/{direction}_{size}_RHT.pt')
    json_path = f'./block_pt/{model_name}/{direction}_{size}_RHT_dataset_stats.json'
    with open(json_path, 'w') as f:
        json.dump(dataset_stats, f)