In [1]:
import torch
import torchvision
import tqdm
import os

from transformers import CLIPVisionModelWithProjection, ViTForImageClassification, AutoModelForCausalLM
from transformers import AutoModel, AutoTokenizer
import numpy as np

from huggingface_hub import try_to_load_from_cache, _CACHED_NO_EXIST
from huggingface_hub import scan_cache_dir

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def latest_version_path(cache_dir, model_name, branch = 'main'):
    model_name_dir =  "models--" + model_name.replace('/', '--')
    path = os.path.join(cache_dir, model_name_dir)

    if not os.path.isdir(os.path.join(path, 'snapshots')):
        return None
    
    branch_file =  os.path.join(path, 'refs', branch)

    with open(branch_file, 'r', encoding='utf-8') as file:
        revision = file.read()

    return os.path.join(path, 'snapshots', revision)

In [3]:
import os

import os

def count_files_with_name(directory, target_filename, keyword):
    count = 0
    for root, _, files in os.walk(directory):
        for file in files:
            if file == target_filename and keyword in os.path.join(root, file).lower():
                count += 1
    return count

# 사용 예시
directory_path = "/home/jgryu/Weight_compression/Wparam_dataset/model_zoo/huggingface"  # 대상 디렉토리 경로로 변경하세요.
target_file = "model.safetensors.index.json"
keyword = "phi"

file_count = count_files_with_name(directory_path, target_file, keyword)
print(f"Found {file_count} '{target_file}' files with '{keyword}' in the path.")


Found 0 'model.safetensors.index.json' files with 'phi' in the path.


In [None]:
import torch
from transformers import AwqConfig, AutoModelForCausalLM

# model_id = "meta-llama/Llama-Guard-3-8B"

# quantization_config = AwqConfig(
#     bits=4,
#     fuse_max_seq_len=512,
#     do_fuse=True,
# )

model_id = 'Efficient-ML/LLaMA-3-8B-AWQ-4bit-b128'
# model = AutoModelForCausalLM.from_pretrained('Efficient-ML/LLaMA-3-8B-AWQ-4bit-b128', trust_remote)
model_awq = AutoModelForCausalLM.from_pretrained(model_id, token="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", trust_remote_code=True)

Downloading shards: 100%|██████████| 4/4 [17:42<00:00, 265.67s/it]
  return self.fget.__get__(instance, owner)()
Loading checkpoint shards: 100%|██████████| 4/4 [00:12<00:00,  3.05s/it]


In [None]:
sd = model.state_dict()
for k, v in sd.items():
    print(k)
    print(v.dtype)

In [7]:
def get_ckpt_path(path, branch = 'main'):
    if not os.path.isdir(os.path.join(path, 'snapshots')):
        return None
    branch_file =  os.path.join(path, 'refs', branch)
    with open(branch_file, 'r', encoding='utf-8') as file:
        revision = file.read()
    return os.path.join(path, 'snapshots', revision)

model_list = [
              '/home/jgryu/Weight_compression/Wparam_dataset/model_zoo/huggingface/models--meta-llama--Meta-Llama-3-8B',
              ]
for model_path in model_list:
    ckpt_path = get_ckpt_path(model_path)
    model = AutoModelForCausalLM.from_pretrained(ckpt_path, local_files_only=True, trust_remote_code=True)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Loading checkpoint shards: 100%|██████████| 4/4 [00:09<00:00,  2.30s/it]


In [None]:
def quantize_weights(state_dict, bits=4):
    """
    Perform simple weight quantization on a model's state_dict.
    
    Args:
        state_dict (dict): The state_dict of the model.
        bits (int): Number of bits for quantization (default: 8).
    
    Returns:
        dict: Quantized state_dict.
    """
    quantized_state_dict = {}
    scale = 2 ** (bits - 1) - 1  # Scale factor for quantization
    
    for name, param in state_dict.items():
        if param.dtype in [torch.float32, torch.float64]:  # Quantize only float weights
            max_val = param.abs().max()
            scale_factor = scale / max_val
            quantized = (param * scale_factor).round().clamp(-scale, scale)  # Quantization
            quantized_state_dict[name] = quantized / scale_factor  # Dequantization for storage
        else:
            quantized_state_dict[name] = param  # Keep non-float parameters as is
    
    return quantized_state_dict

model_q = quantize_weights(fp_model.state_dict())

# # Example usage
# if __name__ == "__main__":
#     # Assume state_dict is available
#     model_state_dict = {
#         "linear.weight": torch.randn(4, 4),
#         "linear.bias": torch.randn(4),
#     }
    
#     quantized_state_dict = quantize_weights(model_state_dict)
#     for k, v in quantized_state_dict.items():
#         print(f"{k}: {v}")

In [None]:
import torch

def quantize_weights_with_zero_point(state_dict, bits=4):
    """
    Perform asymmetric weight quantization on a model's state_dict.

    Args:
        state_dict (dict): The state_dict of the model.
        bits (int): Number of bits for quantization (default: 4).

    Returns:
        dict: Quantized state_dict with scale and zero-point.
    """
    quantized_state_dict = {}
    scale_and_zero_point = {}  # To store scale and zero-point for each parameter

    qmin = 0
    qmax = 2 ** bits - 1  # Range for quantized values

    for name, param in state_dict.items():
        if param.dtype in [torch.float32, torch.float64]:  # Quantize only float weights
            min_val = param.min()
            max_val = param.max()

            # Calculate scale and zero-point
            scale = (max_val - min_val) / (qmax - qmin)
            zero_point = torch.round(qmin - min_val / scale)

            # Quantize the parameter
            quantized = torch.round(param / scale + zero_point).clamp(qmin, qmax)

            # Dequantization for storage
            dequantized = scale * (quantized - zero_point)
            quantized_state_dict[name] = dequantized

            # Save scale and zero-point for reference
            scale_and_zero_point[name] = {
                "scale": scale,
                "zero_point": zero_point
            }
        else:
            quantized_state_dict[name] = param  # Keep non-float parameters as is

    return quantized_state_dict, scale_and_zero_point

# Example usage
# if __name__ == "__main__":
#     # Example state_dict with random weights
#     model_state_dict = {
#         "linear.weight": torch.randn(4, 4),
#         "linear.bias": torch.randn(4),
#     }

#     quantized_state_dict, scale_and_zero_point = quantize_weights_with_zero_point(model_state_dict)

#     # Print results
#     for name, quantized_param in quantized_state_dict.items():
#         print(f"{name}: {quantized_param}")

#     print("\nScale and Zero-Point:")
#     for name, values in scale_and_zero_point.items():
#         print(f"{name}: Scale={values['scale']}, Zero-Point={values['zero_point']}")



In [18]:
import torch

# 두 모델의 state_dict 가져오기
# model1_state_dict = model.state_dict()
model1_state_dict = model.state_dict()

bits_list = [1, 2, 3, 4, 5, 6, 7, 8]
mse_list = []
for bits in [1, 2, 3, 4, 5, 6, 7, 8]:
    quantized_state_dict, scale_and_zero_point = quantize_weights_with_zero_point(model.state_dict(), bits = bits)
    model2_state_dict = quantized_state_dict


    total_squared_error = 0.0
    total_elements = 0

    for key in model1_state_dict.keys():
        if 'attn' not in key: continue
        # print(key)
        if key not in model2_state_dict:
            print(f"Key '{key}' is missing in model2.")
        else:
            # Shape 비교
            if model1_state_dict[key].shape != model2_state_dict[key].shape:
                print(f"Shape mismatch for key '{key}': model1={model1_state_dict[key].shape}, model2={model2_state_dict[key].shape}")
            else:
                # 전체 MSE를 위해 값 차이 누적
                diff = model1_state_dict[key] - model2_state_dict[key]
                squared_error = torch.sum(diff ** 2).item()  # 각 요소의 제곱 오차 합
                total_squared_error += squared_error
                total_elements += diff.numel()  # 전체 요소 수 추가

    # 전체 요소 MSE 계산 및 출력
    if total_elements > 0:
        overall_mse = total_squared_error / total_elements
        # print(f"Total MSE over all elements: {overall_mse:.6f}")
    else:
        pass
        # print("No elements to compare. Models may be incompatible.")

    for d in [16]:
        std = np.load(f'/home/jgryu/Weight_compression/Wparam_dataset/TFRecord/meta-llama--Meta-Llama-3-8B/mlp/d{d}/mlp_d{d}_train_std.npy')
        # print(std)
    overall_mse = overall_mse / std**2
    mse_list.append(overall_mse)
    print(bits, overall_mse)
print('mse: ', mse_list)
print('bits: ', bits_list)

1 1.6436466818640247
2 1.6236762742152016
3 1.2921279783070763
4 0.6426828348204263
5 0.22821856959557643
6 0.0611583548993061
7 0.01520839248038775
8 0.003791649803076995
mse:  [1.6436466818640247, 1.6236762742152016, 1.2921279783070763, 0.6426828348204263, 0.22821856959557643, 0.0611583548993061, 0.01520839248038775, 0.003791649803076995]
bits:  [1, 2, 3, 4, 5, 6, 7, 8]


### per tensor quantization MLP
* asymetric (4, 0.7952778297410023)
mse:  [1.0000164869236037, 0.9993737145706393, 0.9887189679460173, 0.7952778297410023, 0.37116162976546385, 0.10550744880290548, 0.02616760658136518, 0.006492661767538381]
bits:  [1, 2, 3, 4, 5, 6, 7, 8]
### per tensor quantization Attn
* asymetric (4, 0.6426828348204263) (3, 1.2921279783070763)


In [49]:
path = '/home/jgryu/Weight_compression/awq_cache/llama3-8b-w4-g128.pt'
try:
    model_weights = torch.load(path, map_location='cpu')  # CPU에 로드 (필요시 GPU로 변경)
    print("파일 로드 성공!")
    print(type(model_weights))  # 데이터 구조 확인 (예: dict, Tensor 등)
except Exception as e:
    print(f"파일 로드 실패: {e}")

파일 로드 성공!
<class 'dict'>


In [54]:
if isinstance(model_weights, dict):
    # state_dict 확인
    for key, value in model_weights['clip'].items():
        print(f"{key}: {value.shape if isinstance(value, torch.Tensor) else type(value)}")


AttributeError: 'list' object has no attribute 'items'

In [59]:
model_weights['clip'][0][1]

tensor([[[0.1338],
         [0.1393],
         [0.1440],
         ...,
         [0.2927],
         [0.2462],
         [0.2717]],

        [[0.2030],
         [0.1896],
         [0.1676],
         ...,
         [0.2498],
         [0.1958],
         [0.2107]],

        [[0.1211],
         [0.1073],
         [0.1519],
         ...,
         [0.1411],
         [0.1211],
         [0.1555]],

        ...,

        [[0.2306],
         [0.2010],
         [0.1537],
         ...,
         [0.2822],
         [0.1294],
         [0.1471]],

        [[0.2267],
         [0.1895],
         [0.1153],
         ...,
         [0.2335],
         [0.2355],
         [0.2172]],

        [[0.1915],
         [0.1089],
         [0.1740],
         ...,
         [0.1744],
         [0.1779],
         [0.2307]]], dtype=torch.float16)

In [1]:
import torch
from transformers import CLIPVisionModelWithProjection, ViTForImageClassification, AutoModelForCausalLM
from transformers import AutoModel, AutoTokenizer


# cache_directory = "/home/jgryu/Weight_compression/llm-awq/model_cache" 
# ver = "meta-llama/Meta-Llama-3-8B"

# net = AutoModelForCausalLM.from_pretrained(ver, cache_dir = cache_directory, token="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", trust_remote_code=True)
# tok = AutoTokenizer.from_pretrained(ver, cache_dir = cache_directory, token="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", trust_remote_code=True)

ckpt_path = '/home/jgryu/Weight_compression/llm-awq/model_cache/models--meta-llama--Meta-Llama-3-8B/snapshots/8cde5ca8380496c9a6cc7ef3a8b46a0372a1d920'
net = AutoModelForCausalLM.from_pretrained(ckpt_path, local_files_only=True)

meam = np.load(f'/home/jgryu/Weight_compression/Wparam_dataset/TFRecord/meta-llama--Meta-Llama-3-8B/mlp/d16/mlp_d16_train_mean.npy')
std = np.load(f'/home/jgryu/Weight_compression/Wparam_dataset/TFRecord/meta-llama--Meta-Llama-3-8B/mlp/d16/mlp_d16_train_std.npy')
size = 256
weight_condition = 'mlp'

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 4/4 [00:06<00:00,  1.53s/it]


NameError: name 'np' is not defined

In [2]:
!pip list | grep transformers

transformers              4.44.1


In [20]:
for d in [16, 32, 64, 128, 256, 1024, 4096]:
    path  = f'/home/jgryu/Weight_compression/Wparam_dataset/TFRecord/meta-llama--Meta-Llama-3-8B/attn/d{d}/attn_d{d}_train_std_vector.npy'
    std = np.load(path)
    print(f'## {d} ##')
    print(std.mean(), std.max(), std.min(), std.std())

## 16 ##
0.013787343 0.013866982 0.013697659 4.6187477e-05
## 32 ##
0.014501593 0.014622698 0.014366101 6.392919e-05
## 64 ##
0.014888803 0.015070886 0.014723518 8.179265e-05
## 128 ##
0.015046192 0.015470855 0.014769139 0.00013238109
## 256 ##
0.015112722 0.015859565 0.014786505 0.00019158793
## 1024 ##
0.015144215 0.01755336 0.014522944 0.00037536578
## 4096 ##
0.015128609 0.021753341 0.013272276 0.0007092323


In [24]:
for d in [16, 32, 64, 128, 256, 1024, 4096]:
    try:
        try:
            path  = f'/home/jgryu/Weight_compression/Wparam_dataset/TFRecord/meta-llama--Meta-Llama-3-8B/mlp/d{d}/mlp_d{d}_train_std_channel.npy'
            std = np.load(path)
        except:
            try:
                path  = f'/home/jgryu/Weight_compression/Wparam_dataset/TFRecord/meta-llama--Meta-Llama-3-8B/mlp/d{d}/mlp_d{d}_train_std_vector.npy'
                std = np.load(path)
            except:
                try:
                    path  = f'/home/jgryu/Weight_compression/Wparam_dataset/TFRecord/meta-llama--Meta-Llama-3-8B/mlp/d{d}/mlp_d{d}_val_std_channel.npy'
                    std = np.load(path)
                except:
                    path  = f'/home/jgryu/Weight_compression/Wparam_dataset/TFRecord/meta-llama--Meta-Llama-3-8B/mlp/d{d}/mlp_d{d}_val_std_vector.npy'
                    std = np.load(path) 
        print(f'## {d} ##')
    except:
        continue
    print(std.mean(), std.max(), std.min(), std.std())

## 64 ##
0.010743368 0.010794666 0.010696257 1.7599377e-05
## 128 ##
0.011308035 0.011375209 0.0112355985 2.1920741e-05
## 256 ##
0.011604545 0.011706344 0.011459176 3.3088007e-05
## 1024 ##
0.011814561 0.012050536 0.011308623 7.070596e-05
## 4096 ##
0.011811152 0.012583601 0.0095969755 0.00014643215
