In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig
import os
from tqdm import tqdm
import glog
import json

os.environ["CUDA_VISIBLE_DEVICES"] = "2,3"


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import numpy as np
import torch


def set_seed(seed):
    np.random.seed(seed)
    torch.random.manual_seed(seed)


def get_wikitext2(nsamples, seed, seqlen, model):
    from datasets import load_dataset
    traindata = load_dataset('wikitext', 'wikitext-2-raw-v1', split='train')
    testdata = load_dataset('wikitext', 'wikitext-2-raw-v1', split='test')

    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained(model, use_fast=False)
    trainenc = tokenizer("\n\n".join(traindata['text']), return_tensors='pt')
    testenc = tokenizer("\n\n".join(testdata['text']), return_tensors='pt')

    import random
    random.seed(seed)
    trainloader = []
    for _ in range(nsamples):
        i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1)
        j = i + seqlen
        inp = trainenc.input_ids[:, i:j]
        tar = inp.clone()
        tar[:, :-1] = -100
        trainloader.append((inp, tar))
    return trainloader, testenc


def get_ptb(nsamples, seed, seqlen, model):
    from datasets import load_dataset
    traindata = load_dataset('ptb_text_only', 'penn_treebank', split='train')
    valdata = load_dataset('ptb_text_only',
                           'penn_treebank',
                           split='validation')

    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained(model, use_fast=False)
    trainenc = tokenizer("\n\n".join(traindata['sentence']),
                         return_tensors='pt')
    testenc = tokenizer("\n\n".join(valdata['sentence']), return_tensors='pt')

    import random
    random.seed(seed)
    trainloader = []
    for _ in range(nsamples):
        i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1)
        j = i + seqlen
        inp = trainenc.input_ids[:, i:j]
        tar = inp.clone()
        tar[:, :-1] = -100
        trainloader.append((inp, tar))
    return trainloader, testenc


def get_c4(nsamples, seed, seqlen, model):
    from datasets import load_dataset
    traindata = load_dataset(
        'allenai/c4',
        data_files={'train': 'en/c4-train.00000-of-01024.json.gz'},
        split='train')
    valdata = load_dataset(
        'allenai/c4',
        data_files={'validation': 'en/c4-validation.00000-of-00008.json.gz'},
        split='validation')
    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained(model, use_fast=False)

    import random
    random.seed(seed)
    trainloader = []
    for _ in range(nsamples):
        while True:
            i = random.randint(0, len(traindata) - 1)
            trainenc = tokenizer(traindata[i]['text'], return_tensors='pt')
            if trainenc.input_ids.shape[1] >= seqlen:
                break
        i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1)
        j = i + seqlen
        inp = trainenc.input_ids[:, i:j]
        tar = inp.clone()
        tar[:, :-1] = -100
        trainloader.append((inp, tar))

    import random
    random.seed(0)
    valenc = []
    for _ in range(256):
        while True:
            i = random.randint(0, len(valdata) - 1)
            tmp = tokenizer(valdata[i]['text'], return_tensors='pt')
            if tmp.input_ids.shape[1] >= seqlen:
                break
        i = random.randint(0, tmp.input_ids.shape[1] - seqlen - 1)
        j = i + seqlen
        valenc.append(tmp.input_ids[:, i:j])
    valenc = torch.hstack(valenc)

    class TokenizerWrapper:

        def __init__(self, input_ids):
            self.input_ids = input_ids

    valenc = TokenizerWrapper(valenc)

    return trainloader, valenc


def get_ptb_new(nsamples, seed, seqlen, model):
    from datasets import load_dataset
    traindata = load_dataset('ptb_text_only', 'penn_treebank', split='train')
    testdata = load_dataset('ptb_text_only', 'penn_treebank', split='test')

    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained(model, use_fast=False)
    trainenc = tokenizer(" ".join(traindata['sentence']), return_tensors='pt')
    testenc = tokenizer(" ".join(testdata['sentence']), return_tensors='pt')

    import random
    random.seed(seed)
    trainloader = []
    for _ in range(nsamples):
        i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1)
        j = i + seqlen
        inp = trainenc.input_ids[:, i:j]
        tar = inp.clone()
        tar[:, :-1] = -100
        trainloader.append((inp, tar))
    return trainloader, testenc


def get_c4_new(nsamples, seed, seqlen, model):
    from datasets import load_dataset
    traindata = load_dataset(
        'allenai/c4',
        data_files={'train': 'en/c4-train.00000-of-01024.json.gz'},
        split='train')
    valdata = load_dataset(
        'allenai/c4',
        data_files={'validation': 'en/c4-validation.00000-of-00008.json.gz'},
        split='validation')

    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained(model, use_fast=False)

    import random
    random.seed(seed)
    trainloader = []
    for _ in range(nsamples):
        while True:
            i = random.randint(0, len(traindata) - 1)
            trainenc = tokenizer(traindata[i]['text'], return_tensors='pt')
            if trainenc.input_ids.shape[1] >= seqlen:
                break
        i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1)
        j = i + seqlen
        inp = trainenc.input_ids[:, i:j]
        tar = inp.clone()
        tar[:, :-1] = -100
        trainloader.append((inp, tar))

    valenc = tokenizer(' '.join(valdata[:1100]['text']), return_tensors='pt')
    valenc = valenc.input_ids[:, :(256 * seqlen)]

    class TokenizerWrapper:

        def __init__(self, input_ids):
            self.input_ids = input_ids

    valenc = TokenizerWrapper(valenc)

    return trainloader, valenc


def get_loaders(name, nsamples=128, seed=0, seqlen=2048, model=''):
    if 'wikitext2' in name:
        return get_wikitext2(nsamples, seed, seqlen, model)
    if 'ptb' in name:
        if 'new' in name:
            return get_ptb_new(nsamples, seed, seqlen, model)
        return get_ptb(nsamples, seed, seqlen, model)
    if 'c4' in name:
        if 'new' in name:
            return get_c4_new(nsamples, seed, seqlen, model)
        return get_c4(nsamples, seed, seqlen, model)


def get_test_tokens(name, seed=0, seqlen=2048, model=''):
    train_samples = 0
    if name == 'wikitext2':
        return get_wikitext2(train_samples, seed, seqlen,
                             model)[1]['input_ids']
    elif name == 'c4':
        return get_c4(train_samples, seed, seqlen, model)[1].input_ids
    elif name == 'c4_new':
        return get_c4_new(train_samples, seed, seqlen, model)[1].input_ids
    else:
        raise Exception


In [None]:
model_ids = [
    # "../Wparam_dataset/hf_model/meta-llama--Llama-2-7b-hf",
    "../Wparam_dataset/hf_model/meta-llama--Llama-2-13b-hf",
    # "../Wparam_dataset/hf_model/meta-llama--Meta-Llama-3-8B",
]
for model_id in model_ids:
    datasets = ['wikitext2']
    seqlen = 2048
    seed = 0

    for bit in [8]:
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        gptq_config = GPTQConfig(bits=bit, dataset="c4", tokenizer=tokenizer)
        quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", quantization_config=gptq_config)
        
        if "2-7b" in model_id.lower():
            hf_path = f"./hf/meta-llama--Llama-2-7b-hf/{bit}bit"
        elif "2-13b" in model_id.lower():
            hf_path = f"./hf/meta-llama--Llama-2-13b-hf/{bit}bit"
        elif "3-8b" in model_id.lower():
            hf_path = f"./hf/meta-llama--Meta-Llama-3-8B/{bit}bit"
        
        quantized_model.save_pretrained(hf_path)
        
        # for dataset in datasets:
        #     input_tok = get_test_tokens(dataset,
        #                                                 seed=seed,
        #                                                 seqlen=seqlen,
        #                                                 model=model_str)
        #     nsamples = input_tok.numel() // seqlen
        #     input_tok = input_tok[0, :(seqlen * nsamples)].view(
        #         nsamples, seqlen)

        #     loss_fct = torch.nn.CrossEntropyLoss().cuda()
        #     acc_loss = 0.0
        #     progress = tqdm(range(nsamples))
        #     for ii in progress:
        #         input = input_tok[ii, :].cuda().view(1, -1)
        #         output = quantized_model(input,
        #                         use_cache=False,
        #                         output_hidden_states=False,
        #                         output_attentions=False)[0]
        #         shift_logits = output[:, :-1, :].contiguous()
        #         shift_labels = input[:, 1:]
        #         loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
        #                         shift_labels.view(-1))
        #         acc_loss += loss.item()
        #         progress.set_description(f"avg_loss = {acc_loss/(ii+1)}")

        #     avg_loss = acc_loss / nsamples

        #     ppl = torch.exp(torch.tensor(avg_loss)).item()
        #     glog.info(f'{dataset} perplexity: {ppl}')
        #     print(f'{dataset} perplexity: {ppl:.3f}')
            
        #     try:
        #         with open(f'{hf_path}_result.json', 'r') as f:
        #             comp_result= json.load(f)
        #     except:
        #         comp_result = {}
        #     comp_result['ppl'] = {dataset: ppl}
        #     with open(f'./{hf_path}_result.json', 'w') as f:
        #         json.dump(comp_result, f, indent=4)

In [None]:
model_ids = [
    # "../Wparam_dataset/hf_model/meta-llama--Llama-2-7b-hf",
    "../Wparam_dataset/hf_model/meta-llama--Llama-2-13b-hf",
    # "../Wparam_dataset/hf_model/meta-llama--Meta-Llama-3-8B",
]
for model_id in model_ids:
    datasets = ['wikitext2']
    seqlen = 2048
    seed = 0

    for bit in [2, 3, 4, 8]:
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        if "2-7b" in model_id.lower():
            hf_path = f"./hf/meta-llama--Llama-2-7b-hf/{bit}bit"
        elif "2-13b" in model_id.lower():
            hf_path = f"./hf/meta-llama--Llama-2-13b-hf/{bit}bit"
        elif "3-8b" in model_id.lower():
            hf_path = f"./hf/meta-llama--Meta-Llama-3-8B/{bit}bit"
            
        quantized_model = AutoModelForCausalLM.from_pretrained(hf_path, device_map="auto")
        for dataset in datasets:
            input_tok = get_test_tokens(dataset,seed=seed,seqlen=seqlen,model=model_id)
            nsamples = input_tok.numel() // seqlen
            input_tok = input_tok[0, :(seqlen * nsamples)].view(
                nsamples, seqlen)

            loss_fct = torch.nn.CrossEntropyLoss().cuda()
            acc_loss = 0.0
            progress = tqdm(range(nsamples))
            for ii in progress:
                input = input_tok[ii, :].cuda().view(1, -1)
                output = quantized_model(input,
                                use_cache=False,
                                output_hidden_states=False,
                                output_attentions=False)[0]
                shift_logits = output[:, :-1, :].contiguous()
                shift_labels = input[:, 1:]
                loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
                                shift_labels.view(-1))
                acc_loss += loss.item()
                progress.set_description(f"avg_loss = {acc_loss/(ii+1)}")

            avg_loss = acc_loss / nsamples

            ppl = torch.exp(torch.tensor(avg_loss)).item()
            glog.info(f'{dataset} perplexity: {ppl}')
            print(f'{dataset} perplexity: {ppl:.3f}')
            
            try:
                with open(f'{hf_path}_result.json', 'r') as f:
                    comp_result= json.load(f)
            except:
                comp_result = {}
            comp_result['ppl'] = {dataset: ppl}
            with open(f'./{hf_path}_result.json', 'w') as f:
                json.dump(comp_result, f, indent=4)

  warn(

I0325 16:06:17.348574 394704 config.py:54] PyTorch version 2.6.0 available.



[32mINFO[0m  ENV: Auto setting PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' for memory saving.
[32mINFO[0m  ENV: Auto setting CUDA_DEVICE_ORDER=PCI_BUS_ID for correctness.          
[32mINFO[0m   Kernel: Auto-selection: adding candidate `TritonV2QuantLinear`          


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


[32mINFO[0m  Optimize: `TritonV2QuantLinear` compilation triggered.                   


avg_loss = nan: 100%|██████████| 166/166 [04:51<00:00,  1.76s/it]  
I0325 16:12:08.743986 394704 413164442.py:46] wikitext2 perplexity: nan


wikitext2 perplexity: nan
[32mINFO[0m   Kernel: Auto-selection: adding candidate `TorchQuantLinear`             


Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.28it/s]


[32mINFO[0m  Optimize: `TorchQuantLinear` compilation triggered.                      


avg_loss = 1.4453125:   1%|          | 1/166 [00:05<14:27,  5.26s/it]


OutOfMemoryError: CUDA out of memory. Tried to allocate 38.00 MiB. GPU 1 has a total capacity of 47.51 GiB of which 18.50 MiB is free. Process 1225153 has 47.48 GiB memory in use. Of the allocated memory 45.16 GiB is allocated by PyTorch, and 1.82 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

: 