## Data dependencies

In [1]:
!sha1sum ../data/cwn_semrel_dataset.json # 14.22
!sha1sum ../data/cwn-prompt-symtable.json # 22.05

e17555f7349313ce3e70ae764925618cf5b886e1  ../data/cwn_semrel_dataset.json
3ef55f3f5ed97f86f8e3be0274e1721a01782f98  ../data/cwn-prompt-symtable.json


## Parameters

In [2]:
use_model = "cwnLLaMA"

In [3]:
if use_model == "cwnLLaMA":
  model_name = "cwnLlama"
  repo_name = "/mnt/md0/models/LoLLaMA/cwn-Taiwan-LLaMa/"
elif use_model == "twLLaMA":
  model_name = "twLlama"
  repo_name = "yentinglin/Taiwan-LLaMa-v1.0"
else:
  raise ValueError("Invalid model name")
print(model_name)

## Load Data

In [4]:
import os
import sys
os.environ["BNB_CUDA_VERSION"] = "115"
if "../src/llm_compressor/src" not in sys.path:
  sys.path.append("../src/llm_compressor/src")

In [5]:
from pathlib import Path
import json
import pickle
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
from transformers import AutoModelForCausalLM, GPTNeoXForCausalLM, AutoTokenizer
from huggingface_hub import list_repo_refs 

from llm_compressor import AECompressorLLM


BNB_CUDA_VERSION=XXX can be used to load a bitsandbytes version that is different from the PyTorch CUDA version.
If this was unintended set the BNB_CUDA_VERSION variable to an empty string: export BNB_CUDA_VERSION=
If you use the manual override make sure the right libcudart.so is in your LD_LIBRARY_PATH
For example by adding the following to your .bashrc: export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:<path_to_cuda_dir/lib64
Loading CUDA version: BNB_CUDA_VERSION=115


  warn((f'\n\n{"="*80}\n'


In [6]:
data = json.loads(Path("../data/cwn_semrel_dataset.json").read_text())
symtable = json.loads(Path("../data/cwn-prompt-symtable.json").read_text())
rev_symtable = {v: int(k) for k, v in symtable.items()}
symlist = list(symtable.values())
n_symbol = len(symlist)
symbol_bits = np.log2(n_symbol)

## Inference

### Setup

In [7]:
# reformat the rev_list in reverse order
if use_model == "twLLaMA":
  cache_dir = Path("/home/seantyh/hdd/hf_cache/twLLaMA/main")
else:
  cache_dir = None

In [8]:
tokenizer = AutoTokenizer.from_pretrained(repo_name)

model = AutoModelForCausalLM.from_pretrained(
  repo_name,
  cache_dir=cache_dir,
  load_in_8bit=True
)


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [9]:
from collections import namedtuple
VarLoc = namedtuple("VarLoc", ("name", "value", "start", "end"))

def make_locmap(inst):
  return {
    x[0]: VarLoc(*x)
    for x in inst["var_loc"]
  }

def find_token_locs(batch, char_locs):
  return [
    batch.char_to_token(0, x)
    for x in char_locs
  ]

### Decoding

In [10]:
proc_data = []
for inst_x in tqdm(data):
    batch = tokenizer(inst_x["prompt"], 
                    return_tensors="pt").to("cuda")
    with torch.no_grad():
        out = model(**batch, output_hidden_states=True)

    var_loc_x = make_locmap(inst_x)

    a_loc = var_loc_x["a_lemma"]
    b_loc = var_loc_x["b_lemma"]
    a_toklocs = find_token_locs(batch, (a_loc.start, a_loc.end))
    b_toklocs = find_token_locs(batch, (b_loc.start, b_loc.end))

    # check the tokens and strings match
    lemma_a = var_loc_x["a_lemma"].value
    lemma_b = var_loc_x["b_lemma"].value
    recon_a = tokenizer.decode(batch.input_ids[0, a_toklocs[0]:a_toklocs[1]]).strip()
    recon_b = tokenizer.decode(batch.input_ids[0, b_toklocs[0]:b_toklocs[1]]).strip()
    ## assertion
    assert lemma_a == recon_a
    assert lemma_b == recon_b

    last_hidden = out.hidden_states[-1]
    vec_a = last_hidden[0, a_toklocs[0]:a_toklocs[1]].mean(0)
    vec_b = last_hidden[0, b_toklocs[0]:b_toklocs[1]].mean(0)
    logits = out.logits
    data_ids = batch.input_ids.squeeze().to("cpu").tolist()

    ab_logits = logits[:, :, torch.tensor(symlist)]
    ab_data_ids = [rev_symtable[x] for x in data_ids]
    ab_input_ids = torch.tensor(ab_data_ids).view(1, -1).to("cuda")

    probs = torch.softmax(ab_logits, dim=1).squeeze().to("cpu")
    uniform_prob = (torch.ones(probs.shape[1]) / probs.shape[1]).to("cpu")
    next_token_probs = torch.concat([uniform_prob.unsqueeze(0), probs[:-1, :]], dim=0).cpu()     
    tok_logits = ab_logits.gather(2, ab_input_ids[:, 1:].unsqueeze(2)).squeeze().to("cpu")
    tok_logits = torch.cat([torch.zeros(1,), tok_logits], dim=0)
    logit_a = tok_logits[a_toklocs[0]:a_toklocs[1]].mean(0).item()
    logit_b = tok_logits[b_toklocs[0]:b_toklocs[1]].mean(0).item()
    tok_probs = next_token_probs.gather(1, ab_input_ids.transpose(1,0).cpu()).squeeze().to("cpu")
    prob_a = tok_probs[a_toklocs[0]:a_toklocs[1]].mean(0).item()
    prob_b = tok_probs[b_toklocs[0]:b_toklocs[1]].mean(0).item()    

    # do compression
    compressor = AECompressorLLM()

    # only compress the different part of emp/perm
    ab_data_ids = ab_data_ids[b_toklocs[1]:]
    next_token_probs = next_token_probs[b_toklocs[1]:]
    try:
        msg = compressor.compress(ab_data_ids, next_token_probs)
        recon = compressor.decompress(msg, len(ab_data_ids), next_token_probs)
        compress_bits = len(msg)
        compress_ratio = compress_bits / (len(ab_data_ids) * symbol_bits)

        assert all(a==b for a, b in zip(recon, ab_data_ids))
    except Exception as ex:
        compress_bits = float('nan')
        compress_ratio = float('nan')
        print(type(ex).__name__, str(ex))        
        
    # put vectors, logits, input_ids into inst_x
    inst_x.update({
        "vec_a": vec_a.cpu().numpy(),
        "vec_b": vec_b.cpu().numpy(),
        "logit_a": logit_a,
        "logit_b": logit_b,
        "prob_a": prob_a,
        "prob_b": prob_b,
        "tok_logits": tok_logits.numpy(),
        "tok_probs": tok_probs.numpy(),        
        "compress_bits": compress_bits,
        "compress_ratio": compress_ratio,
        "input_ids": batch.input_ids.cpu().numpy(),
        "ab_data_ids": ab_data_ids,
        "n_ab_data_ids": len(ab_data_ids),
    })
    proc_data.append(inst_x)
    

  0%|          | 0/1252 [00:00<?, ?it/s]

In [11]:
emp_cr = np.array([x["compress_ratio"] for x in proc_data if x["type"] == "emp"])
perm_cr = np.array([x["compress_ratio"] for x in proc_data if x["type"] == "perm"])
emp_cr.mean(), perm_cr.mean()

(0.46858111108394895, 0.5293021242552889)

In [16]:
.529-.469, .598-.553

(0.06000000000000005, 0.04499999999999993)

## Export artefacts

In [13]:
out_dir = Path("../data/minrep/")
out_dir.mkdir(exist_ok=True, parents=True)
out_path = out_dir / f"minrep-{model_name}-cwn-semrel.pkl"
fsize = out_path.write_bytes(pickle.dumps(proc_data))
print("artefact size (MB):", fsize//(1024**2))

artefact size (MB): 27


In [14]:
!sha1sum {str(out_path)}

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
8c5c8a7c28536307a7dfebb2ddc786aea7023739  ../data/minrep/minrep-cwnLlama-cwn-semrel.pkl
