In [1]:
import argparse
import os
import time

import glog, json

os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:512'

import torch
import torch.multiprocessing as mp
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.modeling_attn_mask_utils import \
    _prepare_4d_causal_attention_mask

from lib import utils
from lib.algo import finetune
from lib.codebook import bitshift
from operator import attrgetter

import sys
# "__file__" is a string literal here (a notebook has no __file__), so
# abspath() resolves it against the current working directory and dirname()
# yields that directory.
notebook_dir = os.path.dirname(os.path.abspath("__file__"))
project_root = os.path.abspath(os.path.join(notebook_dir, ".."))
# Register the parent directory for imports. It is appended (not prepended)
# so every existing sys.path entry keeps priority, and guarded so re-running
# the cell does not add duplicates.
# NOTE(review): presumably this is what lets the NWC package import below
# resolve -- confirm that NWC lives in the parent directory.
if project_root not in sys.path:
    sys.path.append(project_root)

from NWC.models import get_model

  from .autonotebook import tqdm as notebook_tqdm

I0723 16:13:24.247952 3504666 utils.py:146] Note: detected 128 virtual cores but NumExpr set to maximum of 64, check "NUMEXPR_MAX_THREADS" environment variable.
I0723 16:13:24.249204 3504666 utils.py:149] Note: NumExpr detected 128 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 16.
I0723 16:13:24.249735 3504666 utils.py:162] NumExpr defaulting to 16 threads.
I0723 16:13:24.385351 3504666 config.py:54] PyTorch version 2.6.0 available.
  @amp.autocast(enabled=False)



In [2]:
class Config:
    """Attribute-style container built from keyword arguments.

    Every keyword passed to the constructor becomes an instance attribute of
    the same name, e.g. ``Config(a=1).a == 1``.
    """

    def __init__(self, **entries):
        # Copy all keywords straight into the instance namespace.
        vars(self).update(entries)

# Checkpoint of the compression model. Its `config.json` is read from the
# same directory as the checkpoint file.
comp_model_path = '/workspace/Weight_compression/NWC/checkpoint/nwc_ql/block_seq_ql_random_scaler_meta-llama--Meta-Llama-3-8B__col_1024_gaussian_padding.pt/M16/lmbda50_rdloss_ql_size16_encdim512_M16_Q4_R0_m0_batch_size2048_total_iter200000_lr0.0001_seed100/best_loss_model_loss_3.87239_bpp_4.65884_MSE_0.0162_total_iter_95000.pth.tar'
# comp_model_path = '/workspace/Weight_compression/NWC/checkpoint/nwc_scale_cond/block_seq_scale_cond_scaler_meta-llama--Meta-Llama-3-8B__scaleH_sig0.0001_std_rnormed_with_col_std_lidx_row_1024.pt/rdloss_size128_encdim1024_M256_Q0_R0_m0_batch_size2048_total_iter200000_lr0.0001_seed100/lmbda50_/best_loss_model_loss_3.94749_bpp_3.26997_MSE_4.91093_total_iter_192500.pth.tar'

# Keep the path in its own name so `config` only ever refers to the parsed
# configuration object.
config_path = os.path.join(os.path.dirname(comp_model_path), 'config.json')
with open(config_path, 'r', encoding='utf-8') as file:
    config = Config(**json.load(file))

# Fill in options that the loaded config may not define.
if config.architecture == 'nwc_ql' and not hasattr(config, "Q"):
    config.Q = 4
if not hasattr(config, "no_layernorm"):
    config.no_layernorm = False

# The model is constructed without scale/shift; both are attached after the
# weights have been loaded (see below).
shift, scale = None, None
comp_model = get_model(config.architecture, config, scale=scale, shift=shift)
comp_model.config = config

# map_location='cpu' keeps loading independent of the device the checkpoint
# was saved from; the model is moved to the benchmark device later.
# SECURITY: weights_only=False unpickles arbitrary Python objects -- only
# load checkpoints from a trusted source.
ckpt = torch.load(comp_model_path, map_location='cpu', weights_only=False)
scale, shift = torch.zeros(1), torch.zeros(1)

# strict=False: keys that are missing from / unexpected in the checkpoint are
# ignored without raising.
comp_model.load_state_dict(ckpt["state_dict"], strict=False)
comp_model.scale = scale
comp_model.shift = shift
comp_model.eval()
comp_model.update()

# Author's note (translated): CompressAI -- freeze the CDFs and register the
# buffers; the quantized CDF is then cached and not recomputed afterwards.
# NOTE(review): the preceding update() without force looks redundant given
# this forced update -- confirm against the model's update() implementation.
comp_model.update(force=True)
comp_model.entropy_bottleneck._quantized_cdf

tensor([[    0,     1,     2,  ..., 65534, 65535, 65536],
        [    0,     1,     2,  ...,     0,     0,     0],
        [    0,     1,     2,  ...,     0,     0,     0],
        ...,
        [    0,     1,     2,  ...,     0,     0,     0],
        [    0,     1,     2,  ...,     0,     0,     0],
        [    0,     1,     2,  ...,     0,     0,     0]], dtype=torch.int32)

In [3]:
import torch

device = torch.device('cuda:4')

# Moving the model does not depend on the loop variable, so do it once.
comp_model.to(device)

# Wall-clock latency of each fast_decompress call, in milliseconds.
tt = []
with torch.no_grad():
    for i in range(10):
        # All-zero 256x256 input, regrouped into blocks of 16 values.
        # Alternative block size used with other checkpoints:
        #   T = T.reshape(1, -1, 128).to(device)
        T = torch.zeros(256, 256)
        T = T.reshape(1, -1, 16).to(device)
        data = {
            'weight_block': T,
            'q_level': torch.zeros(1, T.shape[1]).to(torch.int).to(device),
            # 'scale_cond': torch.zeros_like(T).to(device),
        }

        out_enc = comp_model.compress(data)

        # CUDA work is queued asynchronously: drain whatever compress() left
        # pending before starting the clock, and wait for decompress to
        # actually finish before stopping it. Otherwise the measurement mixes
        # in unrelated work or stops too early.
        if device.type == 'cuda':
            torch.cuda.synchronize(device)
        start = time.perf_counter()  # monotonic, high-resolution timer
        out_dec, timings = comp_model.fast_decompress(out_enc)
        # out_dec = comp_model.decompress(out_enc)
        if device.type == 'cuda':
            torch.cuda.synchronize(device)
        end = time.perf_counter()

        elapsed_ms = (end - start) * 1000
        tt.append(elapsed_ms)

avg = sum(tt) / len(tt)
print(f"Average: {avg:.3f} ms")

Average: 20.220 ms


In [4]:
import torch
import time
from collections import defaultdict

# Relies on `comp_model` and `device` defined by earlier cells.
# device = torch.device('cuda:5')

# Maps each timing key reported by fast_decompress to the list of values
# collected across runs.
all_timings = defaultdict(list)

with torch.no_grad():
    for i in range(10):
        # --- Input preparation (same as the previous cell) ---
        T = torch.zeros(256, 256).reshape(1, -1, 16).to(device)
        data = {
            'weight_block': T,
            'q_level': torch.zeros(1, T.shape[1], dtype=torch.int).to(device),
        }

        comp_model.to(device)
        out_enc = comp_model.compress(data)

        # The return value is unpacked as (result, timings); `timings` is
        # consumed below as a mapping of step name -> duration.
        out_dec, timings = comp_model.fast_decompress(out_enc)

        # Record this run's duration under each step name.
        for step, duration in timings.items():
            all_timings[step].append(duration)

# --- Mean duration per step ---
print("\n--- 각 단계별 평균 소요 시간 (10회 실행) ---")
avg_timings = {
    step: sum(samples) / len(samples)
    for step, samples in all_timings.items()
}

# Slowest step first.
sorted_avg_timings = sorted(
    avg_timings.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

for step, avg_duration in sorted_avg_timings:
    print(f"{step:<25}: {avg_duration:.4f} ms")


--- 각 단계별 평균 소요 시간 (10회 실행) ---
total_decompress_ms      : 17.9213 ms
entropy_decompress_ms    : 17.1925 ms
synthesis_g_s_ms         : 0.5853 ms
quality_embedding_ms     : 0.0557 ms
permute_ms               : 0.0363 ms
rescale_shift_ms         : 0.0339 ms
parse_input_ms           : 0.0014 ms
