In [None]:
# Show which pip is active before installing anything.
!pip --version
# ensurepip's option is spelled `--upgrade`; the single-dash form is not a valid flag.
!python -m ensurepip --upgrade
# %pip installs into the environment of the running kernel.
%pip install transformers autoawq datasets accelerate torch
# Restart the session after running this cell

pip 24.1.2 from /usr/local/lib/python3.10/dist-packages/pip (python 3.10)
/usr/bin/python3: No module named ensurepip
Collecting autoawq
  Downloading autoawq-0.2.7.post3-py3-none-any.whl.metadata (18 kB)
Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting triton (from autoawq)
  Downloading triton-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.3 kB)
Collecting zstandard (from autoawq)
  Downloading zstandard-0.23.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>

In [None]:
import math
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from awq import AutoAWQForCausalLM
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers.utils import logging
import huggingface_hub
import numpy as np
from torch import nn

# 1. Configuration
# Hugging Face model repository that is evaluated and quantized.
model_id = "meta-llama/Llama-3.2-1B"
# Dataset name and configuration name handed to `load_dataset`.
dataset_name = "wikitext"
dataset_detail = "wikitext-2-raw-v1"
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Only show transformers log records at ERROR level.
logging.get_logger("transformers").setLevel(logging.ERROR)
# Access token forwarded to the `from_pretrained` calls; intentionally left empty here.
# NOTE(review): never commit a real token in this cell.
hf_token = ""

# Batch size of the DataLoader built in `preparation`.
batch_size = 32
# Number of token windows scored by `evaluate_perplexity`.
# NOTE(review): the same value is also passed to `preparation` as `max_length`.
n_samples = 100

In [None]:
# Authenticate with the Hugging Face Hub; in a notebook this renders an interactive login widget.
huggingface_hub.login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
def calculate_model_size(model):
    """Print the size of a model's weights in MiB.

    Both parameters and registered buffers are counted. Counting parameters
    alone misses any weight a layer keeps in a buffer (for example layers that
    store packed quantized weights as buffers), which would make such a model
    look much smaller than it really is.

    Args:
        model: A ``torch.nn.Module`` (anything exposing ``named_parameters()``
            and ``named_buffers()``).

    Returns:
        None. The result is printed as ``model size : <value> MB``.
    """
    total_size_in_bytes = 0
    for _, param in model.named_parameters():
        total_size_in_bytes += param.numel() * param.element_size()
    # Buffers are not yielded by named_parameters(), so add them separately.
    for _, buf in model.named_buffers():
        total_size_in_bytes += buf.numel() * buf.element_size()

    size_in_mb = total_size_in_bytes / (1024 ** 2)
    print(f'model size : {size_in_mb} MB')

def preprocess_data(example, tokenizer, max_length):
    """Tokenize the ``'text'`` field of ``example`` at a fixed length.

    The tokenizer is asked to truncate to ``max_length`` and to pad with the
    ``"max_length"`` strategy; whatever it returns is passed back unchanged.
    """
    tokenize_options = {
        "truncation": True,
        "max_length": max_length,
        "padding": "max_length",
    }
    return tokenizer(example['text'], **tokenize_options)

def evaluate_perplexity(model, tokenizer, dataset, nsamples, seq_len=2048) :
    """Compute perplexity over consecutive, non-overlapping token windows.

    All rows of ``dataset['text']`` are joined with blank lines and tokenized
    as one sequence. The first ``nsamples`` windows of ``seq_len`` tokens are
    fed to the model one at a time; the perplexity is the exponential of the
    summed next-token negative log-likelihood divided by the number of
    predicted tokens.

    Args:
        model: Causal LM whose ``model(input_ids).logits`` holds next-token logits.
        tokenizer: Callable returning an object with ``input_ids`` when called
            with ``return_tensors='pt'``.
        dataset: Object whose ``'text'`` entry is an iterable of strings.
        nsamples: Maximum number of windows to score; clamped to the number of
            windows the tokenized text actually provides.
        seq_len: Window length in tokens (default 2048).

    Returns:
        A scalar tensor holding the perplexity.

    Raises:
        ValueError: If ``seq_len`` is below 2, or if no window with at least
            two tokens can be scored.
    """
    if seq_len < 2:
        raise ValueError("seq_len must be at least 2")

    testenc = tokenizer("\n\n".join(dataset['text']), return_tensors='pt')
    # `device` is the module-level device selected in the configuration cell.
    testenc = testenc.input_ids.to(device)
    total_len = testenc.size(1)

    # A window needs at least two tokens to form one (input, target) pair.
    # Without this clamp, windows past the end of the text would be empty and
    # would add negative lengths to the token count.
    available = (total_len - 2) // seq_len + 1 if total_len >= 2 else 0
    nsamples = min(nsamples, available)
    if nsamples <= 0:
        raise ValueError("no evaluation window available (nsamples or text too small)")

    model.eval()
    # Summed rather than averaged, so each window is weighted by its own token count.
    loss_fct = nn.CrossEntropyLoss(reduction="sum")
    nlls = []
    predicted_tokens = 0

    for i in tqdm(range(nsamples), desc="evaluating..."):
        start_idx = i * seq_len
        end_idx = min(start_idx + seq_len, total_len)
        batch = testenc[:, start_idx:end_idx]

        with torch.no_grad():
            lm_logits = model(batch).logits
        # Position t predicts token t + 1: drop the last logit and the first label.
        shift_logits = lm_logits[:, :-1, :].contiguous().float()
        shift_labels = batch[:, 1:].to(shift_logits.device)
        window_nll = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.reshape(-1))
        nlls.append(window_nll)
        predicted_tokens += shift_labels.numel()

    perplexity = torch.exp(torch.stack(nlls).sum() / predicted_tokens)
    return perplexity


def preparation(model_id, token, dataset_name, dataset_detail, max_length, batch_size) :
    """Load the tokenizer, tokenize the test split and wrap it in a DataLoader.

    Args:
        model_id: Model identifier passed to ``AutoTokenizer.from_pretrained``.
        token: Hugging Face access token. An empty string is forwarded as
            ``None`` so that a blank token is never sent explicitly.
        dataset_name: Dataset name passed to ``load_dataset``.
        dataset_detail: Dataset configuration name passed to ``load_dataset``.
        max_length: Length each example is truncated/padded to by ``preprocess_data``.
        batch_size: Batch size of the returned DataLoader.

    Returns:
        Tuple ``(tokenizer, tokenized_datasets, dataloader)``.
    """
    # Honour the caller-supplied token instead of reading a module-level global.
    auth_token = token if token != "" else None
    tokenizer = AutoTokenizer.from_pretrained(model_id, token=auth_token, use_fast=True)
    # Reuse the EOS token as pad token so the padding requested in preprocess_data has one to use.
    tokenizer.pad_token = tokenizer.eos_token

    raw_datasets = load_dataset(dataset_name, dataset_detail, split="test")
    tokenized_datasets = raw_datasets.map(lambda x: preprocess_data(x, tokenizer, max_length), batched=True)
    # Expose only the tensor columns when rows/batches are read from the dataset.
    tokenized_datasets.set_format(type='torch', columns=['input_ids', 'attention_mask'])

    dataloader = DataLoader(tokenized_datasets, batch_size=batch_size)

    return tokenizer, tokenized_datasets, dataloader

def evaluation(model, tokenizer, dataset, dataloader, nsamples) :
    """Compute the model's perplexity with ``evaluate_perplexity`` and print it.

    ``dataloader`` is accepted for interface compatibility but is not used here.
    """
    ppl_tensor = evaluate_perplexity(model, tokenizer, dataset, nsamples)
    perplexity = ppl_tensor.item()
    print(f"perplexity : {perplexity}")

def quantize_model(model, tokenizer, quant_config, quant_name) :
    """Quantize ``model``, save it under ``quant_name`` and return a freshly loaded copy.

    ``model.quantize`` is invoked on the object passed in, so the caller's
    reference is affected too (NOTE(review): it appears to be converted in place).
    The returned object is a separate model loaded back from ``quant_name``.
    """
    model.quantize(tokenizer, quant_config=quant_config)
    model.save_quantized(quant_name)

    return AutoAWQForCausalLM.from_quantized(quant_name, device_map="auto")

# baseline_float16

In [None]:
# Load the checkpoint in float16 through the AutoAWQ wrapper class; this is the unquantized baseline.
model_awq_fp16 = AutoAWQForCausalLM.from_pretrained(
    model_id,
    token=hf_token,
    torch_dtype=torch.float16,
    device_map="auto"
)

config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Fetching 12 files:   0%|          | 0/12 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/2.18M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

LICENSE.txt:   0%|          | 0.00/7.71k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/41.2k [00:00<?, ?B/s]

original/params.json:   0%|          | 0.00/220 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

USE_POLICY.md:   0%|          | 0.00/6.02k [00:00<?, ?B/s]

In [None]:
# NOTE(review): `n_samples` is reused here as the tokenizer `max_length`.
tokenizer, dataset, dataloader = preparation(
    model_id=model_id,
    token=hf_token,
    dataset_name=dataset_name,
    dataset_detail=dataset_detail,
    max_length=n_samples,
    batch_size=batch_size,
)

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/733k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/6.36M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

Map:   0%|          | 0/4358 [00:00<?, ? examples/s]

In [None]:
# Show the module tree of the FP16 baseline and report the size of its weights.
print(model_awq_fp16)
calculate_model_size(model_awq_fp16)

LlamaAWQForCausalLM(
  (model): LlamaForCausalLM(
    (model): LlamaModel(
      (embed_tokens): Embedding(128256, 2048)
      (layers): ModuleList(
        (0-15): 16 x LlamaDecoderLayer(
          (self_attn): LlamaSdpaAttention(
            (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
            (k_proj): Linear(in_features=2048, out_features=512, bias=False)
            (v_proj): Linear(in_features=2048, out_features=512, bias=False)
            (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
            (rotary_emb): LlamaRotaryEmbedding()
          )
          (mlp): LlamaMLP(
            (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
            (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
            (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
          (post_attention_

In [None]:
# Perplexity of the FP16 baseline.
evaluation(
    model=model_awq_fp16,
    tokenizer=tokenizer,
    dataset=dataset,
    dataloader=dataloader,
    nsamples=n_samples,
)

evaluating...: 100%|██████████| 100/100 [00:28<00:00,  3.45it/s]


perplexity : 9.782742500305176


# quantize_PTQ(AWQ)_int4
    세션 다시 시작 (restart the session before running this section)


In [None]:
import math
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from awq import AutoAWQForCausalLM
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers.utils import logging
import huggingface_hub
import numpy as np
from torch import nn

# 1. Configuration
# Hugging Face model repository that is evaluated and quantized.
model_id = "meta-llama/Llama-3.2-1B"
# Dataset name and configuration name handed to `load_dataset`.
dataset_name = "wikitext"
dataset_detail = "wikitext-2-raw-v1"
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Only show transformers log records at ERROR level.
logging.get_logger("transformers").setLevel(logging.ERROR)
# Access token forwarded to the `from_pretrained` calls; intentionally left empty here.
# NOTE(review): never commit a real token in this cell.
hf_token = ""

# Batch size of the DataLoader built in `preparation`.
batch_size = 32
# Number of token windows scored by `evaluate_perplexity`.
# NOTE(review): the same value is also passed to `preparation` as `max_length`.
n_samples = 100

In [None]:
# Authenticate with the Hugging Face Hub again after the session restart (interactive widget in a notebook).
huggingface_hub.login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
def calculate_model_size(model):
    """Print the size of a model's weights in MiB.

    Both parameters and registered buffers are counted. Counting parameters
    alone misses any weight a layer keeps in a buffer (for example layers that
    store packed quantized weights as buffers), which would make such a model
    look much smaller than it really is.

    Args:
        model: A ``torch.nn.Module`` (anything exposing ``named_parameters()``
            and ``named_buffers()``).

    Returns:
        None. The result is printed as ``model size : <value> MB``.
    """
    total_size_in_bytes = 0
    for _, param in model.named_parameters():
        total_size_in_bytes += param.numel() * param.element_size()
    # Buffers are not yielded by named_parameters(), so add them separately.
    for _, buf in model.named_buffers():
        total_size_in_bytes += buf.numel() * buf.element_size()

    size_in_mb = total_size_in_bytes / (1024 ** 2)
    print(f'model size : {size_in_mb} MB')

def preprocess_data(example, tokenizer, max_length):
    """Tokenize the ``'text'`` field of ``example`` at a fixed length.

    The tokenizer is asked to truncate to ``max_length`` and to pad with the
    ``"max_length"`` strategy; whatever it returns is passed back unchanged.
    """
    tokenize_options = {
        "truncation": True,
        "max_length": max_length,
        "padding": "max_length",
    }
    return tokenizer(example['text'], **tokenize_options)

def evaluate_perplexity(model, tokenizer, dataset, nsamples, seq_len=2048) :
    """Compute perplexity over consecutive, non-overlapping token windows.

    All rows of ``dataset['text']`` are joined with blank lines and tokenized
    as one sequence. The first ``nsamples`` windows of ``seq_len`` tokens are
    fed to the model one at a time; the perplexity is the exponential of the
    summed next-token negative log-likelihood divided by the number of
    predicted tokens.

    Args:
        model: Causal LM whose ``model(input_ids).logits`` holds next-token logits.
        tokenizer: Callable returning an object with ``input_ids`` when called
            with ``return_tensors='pt'``.
        dataset: Object whose ``'text'`` entry is an iterable of strings.
        nsamples: Maximum number of windows to score; clamped to the number of
            windows the tokenized text actually provides.
        seq_len: Window length in tokens (default 2048).

    Returns:
        A scalar tensor holding the perplexity.

    Raises:
        ValueError: If ``seq_len`` is below 2, or if no window with at least
            two tokens can be scored.
    """
    if seq_len < 2:
        raise ValueError("seq_len must be at least 2")

    testenc = tokenizer("\n\n".join(dataset['text']), return_tensors='pt')
    # `device` is the module-level device selected in the configuration cell.
    testenc = testenc.input_ids.to(device)
    total_len = testenc.size(1)

    # A window needs at least two tokens to form one (input, target) pair.
    # Without this clamp, windows past the end of the text would be empty and
    # would add negative lengths to the token count.
    available = (total_len - 2) // seq_len + 1 if total_len >= 2 else 0
    nsamples = min(nsamples, available)
    if nsamples <= 0:
        raise ValueError("no evaluation window available (nsamples or text too small)")

    model.eval()
    # Summed rather than averaged, so each window is weighted by its own token count.
    loss_fct = nn.CrossEntropyLoss(reduction="sum")
    nlls = []
    predicted_tokens = 0

    for i in tqdm(range(nsamples), desc="evaluating..."):
        start_idx = i * seq_len
        end_idx = min(start_idx + seq_len, total_len)
        batch = testenc[:, start_idx:end_idx]

        with torch.no_grad():
            lm_logits = model(batch).logits
        # Position t predicts token t + 1: drop the last logit and the first label.
        shift_logits = lm_logits[:, :-1, :].contiguous().float()
        shift_labels = batch[:, 1:].to(shift_logits.device)
        window_nll = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.reshape(-1))
        nlls.append(window_nll)
        predicted_tokens += shift_labels.numel()

    perplexity = torch.exp(torch.stack(nlls).sum() / predicted_tokens)
    return perplexity


def preparation(model_id, token, dataset_name, dataset_detail, max_length, batch_size) :
    """Load the tokenizer, tokenize the test split and wrap it in a DataLoader.

    Args:
        model_id: Model identifier passed to ``AutoTokenizer.from_pretrained``.
        token: Hugging Face access token. An empty string is forwarded as
            ``None`` so that a blank token is never sent explicitly.
        dataset_name: Dataset name passed to ``load_dataset``.
        dataset_detail: Dataset configuration name passed to ``load_dataset``.
        max_length: Length each example is truncated/padded to by ``preprocess_data``.
        batch_size: Batch size of the returned DataLoader.

    Returns:
        Tuple ``(tokenizer, tokenized_datasets, dataloader)``.
    """
    # Honour the caller-supplied token instead of reading a module-level global.
    auth_token = token if token != "" else None
    tokenizer = AutoTokenizer.from_pretrained(model_id, token=auth_token, use_fast=True)
    # Reuse the EOS token as pad token so the padding requested in preprocess_data has one to use.
    tokenizer.pad_token = tokenizer.eos_token

    raw_datasets = load_dataset(dataset_name, dataset_detail, split="test")
    tokenized_datasets = raw_datasets.map(lambda x: preprocess_data(x, tokenizer, max_length), batched=True)
    # Expose only the tensor columns when rows/batches are read from the dataset.
    tokenized_datasets.set_format(type='torch', columns=['input_ids', 'attention_mask'])

    dataloader = DataLoader(tokenized_datasets, batch_size=batch_size)

    return tokenizer, tokenized_datasets, dataloader

def evaluation(model, tokenizer, dataset, dataloader, nsamples) :
    """Compute the model's perplexity with ``evaluate_perplexity`` and print it.

    ``dataloader`` is accepted for interface compatibility but is not used here.
    """
    ppl_tensor = evaluate_perplexity(model, tokenizer, dataset, nsamples)
    perplexity = ppl_tensor.item()
    print(f"perplexity : {perplexity}")

def quantize_model(model, tokenizer, quant_config, quant_name) :
    """Quantize ``model``, save it under ``quant_name`` and return a freshly loaded copy.

    ``model.quantize`` is invoked on the object passed in, so the caller's
    reference is affected too (NOTE(review): it appears to be converted in place).
    The returned object is a separate model loaded back from ``quant_name``.
    """
    model.quantize(tokenizer, quant_config=quant_config)
    model.save_quantized(quant_name)

    return AutoAWQForCausalLM.from_quantized(quant_name, device_map="auto")

In [None]:
# Load the float16 checkpoint again; it is the input to `quantize_model` in the next cell.
model_awq_fp16 = AutoAWQForCausalLM.from_pretrained(
    model_id,
    token=hf_token,
    torch_dtype=torch.float16,
    device_map="auto"
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Fetching 12 files:   0%|          | 0/12 [00:00<?, ?it/s]

In [None]:
# NOTE(review): `n_samples` is reused here as the tokenizer `max_length`.
tokenizer, dataset, dataloader = preparation(
    model_id=model_id,
    token=hf_token,
    dataset_name=dataset_name,
    dataset_detail=dataset_detail,
    max_length=n_samples,
    batch_size=batch_size,
)
# Quantize to 4-bit weights, save the result, and keep the reloaded model for evaluation.
model_awq_int4 = quantize_model(
    model=model_awq_fp16,
    tokenizer=tokenizer,
    quant_config={"w_bit": 4},
    quant_name='awq_llama3_int4_perplexity',
)

README.md:   0%|          | 0.00/167 [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


val.jsonl.zst:   0%|          | 0.00/471M [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/214670 [00:00<?, ? examples/s]

AWQ: 100%|██████████| 16/16 [08:40<00:00, 32.52s/it]
Replacing layers...: 100%|██████████| 16/16 [00:04<00:00,  3.52it/s]


In [None]:
# Inspect the object that was handed to `quantize_model`.
# NOTE(review): despite its name this is no longer the FP16 model at this point;
# the perplexity cell evaluates `model_awq_int4` instead. Confirm which one should be shown.
print(model_awq_fp16)
calculate_model_size(model_awq_fp16)

LlamaAWQForCausalLM(
  (model): LlamaForCausalLM(
    (model): LlamaModel(
      (embed_tokens): Embedding(128256, 2048)
      (layers): ModuleList(
        (0-15): 16 x LlamaDecoderLayer(
          (self_attn): LlamaSdpaAttention(
            (q_proj): WQLinear_GEMM(in_features=2048, out_features=2048, bias=False, w_bit=4, group_size=128)
            (k_proj): WQLinear_GEMM(in_features=2048, out_features=512, bias=False, w_bit=4, group_size=128)
            (v_proj): WQLinear_GEMM(in_features=2048, out_features=512, bias=False, w_bit=4, group_size=128)
            (o_proj): WQLinear_GEMM(in_features=2048, out_features=2048, bias=False, w_bit=4, group_size=128)
            (rotary_emb): LlamaRotaryEmbedding()
          )
          (mlp): LlamaMLP(
            (gate_proj): WQLinear_GEMM(in_features=2048, out_features=8192, bias=False, w_bit=4, group_size=128)
            (up_proj): WQLinear_GEMM(in_features=2048, out_features=8192, bias=False, w_bit=4, group_size=128)
            (down_

In [None]:
# Perplexity of the reloaded 4-bit AWQ model.
evaluation(
    model=model_awq_int4,
    tokenizer=tokenizer,
    dataset=dataset,
    dataloader=dataloader,
    nsamples=n_samples,
)

evaluating...: 100%|██████████| 100/100 [00:36<00:00,  2.74it/s]


perplexity : 10.853047370910645
