In [1]:
import argparse
from argparse import Namespace
from pathlib import Path
import os
os.environ['CUDA_VISIBLE_DEVICES']='0,1'
os.environ['HF_HUB_CACHE'] = '/next_share/hf_cache/hub/'
import json
import torch
from transformers import (
    AutoTokenizer, PreTrainedTokenizer, AutoModelForCausalLM, AutoModelForSeq2SeqLM
)
from peft import (
    LoraConfig
)
from accelerate import PartialState, Accelerator

import context
os.chdir(context.proj_dir)

from cont_gen.data_loader.cuad_prompt import CUAD_SFT, SFT_Padding, CUAD_SFT_Seq2Seq
from cont_gen.data_loader.cuad_sft import CUAD_SFT_Cached, CUAD_SFT_Filter_Type
from cont_gen.utils.model_utils import build_hf_or_peft_model, smart_resize_embeddings, load_hf_model_from_checkpoint
from cont_gen.trainer.utils import get_smart_optimizer, compute_clm_loss_with_ignore
from cont_gen.trainer.train_only_accelerate import Trainer_Basic, TrainingArgs_Basic
from cont_gen.model.loss import LM_Simple_Feed
from cont_gen.run.infer_sft import SimpleGenerator, load_test_dataset
from cont_gen.data_process.utils import tokenize_wo_bos

  return torch._C._cuda_getDeviceCount() > 0
  warn("The installed version of bitsandbytes was compiled without GPU support. "


/storage_fast/rhshui/lib/anaconda3/envs/llm2/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cpu.so: undefined symbol: cadam32bit_grad_fp32


In [2]:
def build_tkn(path):
    return AutoTokenizer.from_pretrained(path, trust_remote_code = True)

# tokenizer name to path
TKN_MAP = {
    'flan-t5': build_tkn('google/flan-t5-large'),
    'llama2': build_tkn('meta-llama/Llama-2-7b-hf'),
    'llama3': build_tkn('meta-llama/Meta-Llama-3-8B'),
    'mistral': build_tkn('mistralai/Mistral-7B-v0.1'),
    # 'phi1': build_tkn('microsoft/phi-1_5'),
    'phi2': build_tkn('microsoft/phi-2')
}

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
tk_name='llama3'
is_seq2seq = False

model_path = 'meta-llama/Meta-Llama-3-8B-Instruct'
is_chat = 'Instruct' in model_path
ckpt = 'runs/ood/llama3_chat/seed42_tr29/pmt_01_all_lr1e-5_bs16_wd0.0/checkpoint-15692'

data_path = f'data/ood_split/seed42_tr29/{tk_name}/pmt_01/train_data.jsonl'
tokenizer = build_tkn(model_path)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [20]:
tk_name='mistral'
is_seq2seq = False
# model_path = 'mistralai/Mistral-7B-v0.1'
model_path = 'mistralai/Mistral-7B-Instruct-v0.2'

is_chat = 'Instruct' in model_path
# ckpt = 'runs/ood/llama3/seed42_tr29/pmt_01_lr1e-5_bs16_wd0.0/checkpoint-15692'

data_path = f'data/ood_split/seed42_tr29/{tk_name}/pmt_01/train_data.jsonl'
tokenizer = build_tkn(model_path)

In [3]:
tk_name='flan-t5'
model_path = 'google/t5-v1_1-large'
is_seq2seq = True
is_chat = False
ckpt = 'runs/ood/flan-t5-large/seed42_tr29/pmt_01_lr1e-4_bs16_wd0.0/checkpoint-7880'

data_path = f'data/ood_split/seed42_tr29/{tk_name}/pmt_01/train_data.jsonl'
tokenizer = build_tkn(model_path)

tokenizer_config.json:   0%|          | 0.00/1.86k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/607 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [4]:
tokenizer.eos_token

'</s>'

In [4]:
# Load training dataset
tr_ds = CUAD_SFT_Cached(data_path, tokenizer, is_seq2seq = is_seq2seq,
                        is_chat = is_chat, 
                          cache_dir = Path(data_path).parent / 'cache')
te_ds_ood = CUAD_SFT_Filter_Type(
    Path(data_path).parent / 'test_data_ood.jsonl', tokenizer, is_seq2seq = is_seq2seq, is_chat = is_chat, 
    cache_dir = Path(data_path).parent / 'cache',
    is_test = True,
    judge_type_fn = lambda k: k>0,
    )

# te_ds_ood_new = CUAD_SFT_Filter_Type(
#     Path(data_path).parent / 'test_data_ood.jsonl', tokenizer, is_seq2seq = is_seq2seq, is_chat = is_chat, 
#     is_test = True,
#     judge_type_fn = lambda k: k>0,
#     )

Load from cache: data/ood_split/seed42_tr29/llama3/pmt_01/cache/cached_train_data.jsonl_Meta-Llama-3-8B-Instruct_v1.1chat.pkl
Load from cache: data/ood_split/seed42_tr29/llama3/pmt_01/cache/cached_test_data_ood.jsonl_Meta-Llama-3-8B-Instruct_v1.1chat.pkl


In [5]:
te_ds_ood.is_seq2seq

True

In [5]:
print(tokenizer.convert_ids_to_tokens(te_ds_ood[0]['input_ids'][-20:]))

['▁force', '▁of', '▁law', '.', '<0x0A>', '<0x0A>', '###', 'Question', ':', '▁The', '▁name', '▁of', '▁the', '▁contract', '<0x0A>', '<0x0A>', '###', 'An', 'swer', ':']


### Build Model

In [5]:
if 'model' in dir():
    del model
model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype = torch.bfloat16, device_map = 0)
# model = AutoModelForSeq2SeqLM.from_pretrained(ckpt, torch_dtype = torch.bfloat16, device_map = 0)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

RuntimeError: The NVIDIA driver on your system is too old (found version 11070). Please update your GPU driver by downloading and installing a new version from the URL: http://www.nvidia.com/Download/index.aspx Alternatively, go to: https://pytorch.org to install a PyTorch version that has been compiled with your version of the CUDA driver.

In [5]:
input_ids = tokenizer("translate English to German: The house is wonderful.", return_tensors="pt").input_ids
with torch.no_grad():
    outputs = model.generate(input_ids.cuda())
print(tokenizer.convert_ids_to_tokens(outputs[0]))

['<pad>', '<extra_id_0>', '▁the', '▁house', '.', '<extra_id_1>', '.', '</s>']




In [6]:
adp_st = torch.load(Path(ckpt) / 'adapter_model.bin', map_location='cpu')
print([k for k in adp_st.keys() if 'layers' not in k])

[]


In [14]:
if 'base_model.model.model.embed_tokens.weight' in adp_st:
    new_shape = adp_st['base_model.model.model.embed_tokens.weight'].shape[0]
    model.resize_token_embeddings(new_shape)
model.load_adapter(ckpt)

In [7]:
def to_cuda(data):
    return {k:v.cuda() for k,v in data.items()}

def to_batch(data):
    return {k:torch.tensor(v).unsqueeze(0).cuda() for k,v in data.items()}

def add_target_head(sample, head):
    """Append target head to source"""
    ori_ids = sample['input_ids']
    ipt_len = len([k for k in sample['labels'] if k == -100])
    new_ids = ori_ids[:ipt_len] + list(head)
    new_mask = [1] * len(new_ids)
    new_labels = [-100] * ipt_len + list(head)
    return {'input_ids': new_ids, 'attention_mask': new_mask, 'labels': new_labels}

def generate(model, batch, tokenizer, max_len = 512):
    return model.generate(**{k:batch[k] for k in ['input_ids', 'attention_mask']},
                          do_sample = False, eos_token_id = tokenizer.eos_token_id,
                          max_new_tokens = max_len)

def get_prob(logits, top_k = 5):
    probs= torch.softmax(logits, dim = -1)
    rank = torch.argsort(probs, descending = True).tolist()
    return [(rank[i], probs[rank[i]].item()) for i in range(top_k)]

def get_token_prob(logits, tokenizer, top_k = 5):
    top = get_prob(logits, top_k)
    return [(tokenizer.convert_ids_to_tokens(tid), tid, p) for tid, p in top]

def greedy_generate(model, batch, tokenizer, num_token = 5):
    past_key_values = []
    input_ids = batch['input_ids']
    mask = batch['attention_mask']
    gen_tokens = []
    for step in range(num_token):
        with torch.no_grad():
            out = model(input_ids = input_ids, attention_mask = mask, past_key_values = past_key_values)
        top = get_token_prob(out.logits[0][-1], tokenizer)
        print(f'Step {step+1}')
        for token, tid, p in top:
            print(f'\t{token} {tid} {p:.4f}')
        past_key_values = out.past_key_values
        input_ids = torch.tensor([[top[0][1]]]).cuda()
        mask = torch.concat([mask, torch.tensor([[1]]).cuda()], dim = -1)
        gen_tokens.append(top[0][1])
    return gen_tokens

### Infer on some examples

In [29]:
def get_gen_text(sample, model, tokenizer, is_train = False):
    if is_train:
        sample = add_target_head(sample, [])
    ipt_len = len(sample['input_ids'])
    with torch.no_grad():
        ba = to_batch(sample)
        out = model.generate(
            input_ids = ba['input_ids'], 
            attention_mask = ba['attention_mask'],
            max_new_tokens = 600,
        )
    return tokenizer.decode(out[0][ipt_len:], skip_special_tokens=True)

In [30]:
tokenizer.decode(tr_ds[0]['input_ids'])

'<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nYou are a helpful assistant. Review the contract clauses and answer questions. Output the mentioned clauses if exist; otherwise output "No".\n\n###Clauses:\nEXHIBIT 10.6\n DISTRIBUTOR AGREEMENT\n THIS DISTRIBUTOR AGREEMENT (the "Agreement") is made by and between Electric City Corp., a Delaware corporation ("Company") and Electric City of Illinois LLC ("Distributor") this 7th day of September, 1999.\n RECITALS\n A. The Company\'s Business. The Company is presently engaged in the business of selling an energy efficiency device, which is referred to as an "Energy Saver" which may be improved or otherwise changed from its present composition (the "Products"). The Company may engage in the business of selling other products or other devices other than the Products, which will be considered Products if Distributor exercises its options pursuant to Section 7 hereof.\n\n###Question: The date of the contract\n\n###Answer:<|eot_id|><

In [31]:
out_t = get_gen_text(te_ds_ood[0], model, tokenizer)
print(out_t)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


RuntimeError: "triu_tril_cuda_template" not implemented for 'BFloat16'

In [8]:
# Infer for decoder-only model
sample = tr_ds[0]

sample = add_target_head(sample, [])
pmt_len = len(sample['input_ids'])
with torch.no_grad():
    ba = to_batch(sample)
    out = model(**ba)
    # gen_out = generate(model, ba, tokenizer, 20)

In [10]:
# Infer for seq2seq model
sample = te_ds_ood[0]
with torch.no_grad():
    ba = to_batch(sample)
    out = model(**ba)
    gen_out = generate(model, ba, tokenizer, 20)

In [11]:
print(tokenizer.decode(sample['input_ids'][-100:]))
print(tokenizer.decode(sample['labels']))

puts forward amendments or not accept orders, the seller shall be in the form of a written notice to entrusted party, entrusted party accept the modified by written consent, the modified orders to be taken effect. 2.4 Seller's note, only the buyer entrust the entrusted party issued orders, the product delivery and payment has the force of law. ###Question: The name of the contract ###Answer:</s>
- SUPPLY CONTRACT</s>


In [11]:
# tokenizer.special_tokens_map

In [13]:
greedy_generate(model, to_batch(sample), tokenizer)

ValueError: You have to specify either decoder_input_ids or decoder_inputs_embeds

In [21]:
print(out.keys())
print(len(sample['input_ids']))
out.logits[0][-1].shape

odict_keys(['loss', 'logits', 'past_key_values', 'encoder_last_hidden_state'])
230


torch.Size([32128])

In [22]:
pmt_len = 0
gen_tokens = gen_out[0][pmt_len:]
print(gen_tokens)
print(tokenizer.convert_ids_to_tokens(gen_tokens))

tensor([    0, 32099,   489,   189,   239,    13,  1600,     6,  5247,     1],
       device='cuda:0')
['<pad>', '<extra_id_0>', '▁7', 'th', '▁day', '▁of', '▁September', ',', '▁1999', '</s>']


In [19]:
tokenizer.eos_token

'<|eot_id|>'

In [48]:
get_token_prob(out.logits[0][-1], tokenizer)

[('<0x0A>', 13, 0.4116723835468292),
 ('▁', 28705, 0.2496919333934784),
 ('▁September', 4074, 0.10408708453178406),
 ('▁The', 415, 0.0593070350587368),
 ('▁"', 345, 0.026317333802580833)]

In [10]:
print(tokenizer.decode(sample['input_ids'][-50:]))

of selling other products or other devices other than the Products, which will be considered Products if Distributor exercises its options pursuant to Section 7 hereof.

###Question: The date of the contract

###Answer:


In [36]:
TKN_MAP['phi2'].convert_ids_to_tokens(TKN_MAP['phi2'].encode('hello world'))

['hello', 'Ġworld']

In [16]:
tokenizer.bos_token_id

128000

In [24]:
vec = model.model.embed_tokens.weight[128:135]

In [25]:
torch.norm(vec, dim = 1)

tensor([0.6055, 0.6016, 0.6289, 0.6289, 0.6289, 0.6055, 0.6211],
       device='cuda:0', dtype=torch.bfloat16,
       grad_fn=<LinalgVectorNormBackward0>)

In [21]:
torch.norm(model.model.embed_tokens.weight, dim = 1).mean()

tensor(0.5938, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)

In [4]:
test_ds = CUAD_SFT_Filter_Type(
        'data/ood_split/seed42_tr29/llama3/pmt_01/test_data_ood.jsonl',
        tokenizer,
        is_seq2seq = is_seq2seq,
        is_chat = True,
        # cache_dir = Path(args.data_path).parent / 'cache',
        is_test = True,
        judge_type_fn = lambda k: k>0,
    )

100%|███████████████████████████████████| 11808/11808 [00:06<00:00, 1753.68it/s]


In [5]:
print(tokenizer.decode(test_ds[0]['input_ids']))

<|begin_of_text|><|start_header_id|>user<|end_header_id|>

You are a helpful assistant. Review the contract clauses and answer questions. Output the mentioned clauses if exist; otherwise output "No".

###Clauses:
Exhibit 10.16 SUPPLY CONTRACT Contract No: Date: The buyer/End-User: Shenzhen LOHAS Supply Chain Management Co., Ltd. ADD: Tel No. : Fax No. : The seller: ADD: The Contract is concluded and signed by the Buyer and Seller on, in Hong Kong. 1. General provisions 1.1 This is a framework agreement, the terms and conditions are applied to all purchase orders which signed by this agreement (hereinafter referred to as the "order"). 1.2 If the provisions of the agreement are inconsistent with the order, the order shall prevail. Not stated in order content will be subject to the provisions of agreement. Any modification, supplementary, give up should been written records, only to be valid by buyers and sellers authorized representative signature and confirmation, otherwise will be deem