In [1]:
import argparse
from argparse import Namespace
from pathlib import Path
import os
os.environ['CUDA_VISIBLE_DEVICES']='0'
os.environ['HF_HUB_CACHE'] = '/next_share/hf_cache/hub/'
import json
import torch
from transformers import (
    AutoTokenizer, PreTrainedTokenizer, AutoModelForCausalLM
)
from peft import (
    LoraConfig
)
from accelerate import PartialState, Accelerator

import context
os.chdir(context.proj_dir)

from cont_gen.data_loader.cuad_prompt import CUAD_SFT, SFT_Padding, CUAD_SFT_Seq2Seq
from cont_gen.data_loader.cuad_sft import CUAD_SFT_Cached, CUAD_SFT_Filter_Type
from cont_gen.utils.model_utils import build_hf_or_peft_model, smart_resize_embeddings, load_hf_model_from_checkpoint
from cont_gen.trainer.utils import get_smart_optimizer, compute_clm_loss_with_ignore
from cont_gen.trainer.train_only_accelerate import Trainer_Basic, TrainingArgs_Basic
from cont_gen.model.loss import LM_Simple_Feed
from cont_gen.run.infer_sft import SimpleGenerator, load_test_dataset

In [2]:
def build_tkn(path):
    """Load and return the tokenizer identified by ``path``.

    ``path`` is handed unchanged to ``AutoTokenizer.from_pretrained``.
    """
    # NOTE(review): trust_remote_code is enabled for every tokenizer loaded
    # through this helper -- confirm the checkpoints used here need it.
    load_options = {'trust_remote_code': True}
    tkn = AutoTokenizer.from_pretrained(path, **load_options)
    return tkn

# Short tokenizer name -> loaded tokenizer object (not a path): every entry
# calls build_tkn() right here, so all listed tokenizers are loaded as soon as
# this cell runs.
TKN_MAP = {
    'flan-t5': build_tkn('google/flan-t5-large'),
    'llama2': build_tkn('meta-llama/Llama-2-7b-hf'),
    'llama3': build_tkn('meta-llama/Meta-Llama-3-8B'),
    'mistral': build_tkn('mistralai/Mistral-7B-v0.1'),
    # 'phi1': build_tkn('microsoft/phi-1_5'),
    'phi2': build_tkn('microsoft/phi-2')
}

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
# Llama-3 configuration.
# NOTE: the cell recorded as In [7] below assigns tk_name, model_path,
# is_seq2seq, data_path and tokenizer again, so whichever of the two cells ran
# last decides what the later cells see.
tk_name='llama3'
model_path = 'meta-llama/Meta-Llama-3-8B-Instruct'
is_seq2seq = False
# Fine-tuned checkpoint directory; this is the only active assignment of
# `ckpt` in the visible cells.
ckpt = 'runs/ood/llama3/seed42_tr29/pmt_01_lr1e-5_bs16_wd0.0/checkpoint-15692'

# Training data path, selected by tk_name.
data_path = f'data/ood_split/seed42_tr29/{tk_name}/pmt_01/train_data.jsonl'
# tokenizer = TKN_MAP[tk_name]
# Built from model_path (the Instruct repo) instead of TKN_MAP, whose 'llama3'
# entry is built from 'meta-llama/Meta-Llama-3-8B'.
tokenizer = build_tkn(model_path)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
# Mistral configuration; overwrites the names set by the Llama-3 config cell.
tk_name='mistral'
model_path = 'mistralai/Mistral-7B-v0.1'
is_seq2seq = False
# `ckpt` is intentionally left untouched here (line below is commented out), so
# any later use of `ckpt` still refers to the llama3 checkpoint path.
# ckpt = 'runs/ood/llama3/seed42_tr29/pmt_01_lr1e-5_bs16_wd0.0/checkpoint-15692'

# Training data path, selected by tk_name.
data_path = f'data/ood_split/seed42_tr29/{tk_name}/pmt_01/train_data.jsonl'
# Reuse the tokenizer already loaded in TKN_MAP.
tokenizer = TKN_MAP[tk_name]

In [8]:
# Load the training dataset from data_path with the currently active tokenizer.
# cache_dir is a 'cache' folder next to the data file.
# NOTE(review): presumably CUAD_SFT_Cached stores the processed samples there
# (the recorded output reports a .pkl written under that folder) -- confirm in
# cont_gen.data_loader.cuad_sft.
tr_ds = CUAD_SFT_Cached(data_path, tokenizer, is_seq2seq = is_seq2seq,
                        is_chat = True, 
                          cache_dir = Path(data_path).parent / 'cache')

100%|███████████████████████████████████| 15812/15812 [00:11<00:00, 1423.51it/s]


Write to cache: data/ood_split/seed42_tr29/mistral/pmt_01/cache/cached_train_data.jsonl_Mistral-7B-v0.1_v1.1.pkl


In [5]:
# Release the reference to a model left over from a previous run of this cell
# (if there is one) before loading a fresh copy.
try:
    del model
except NameError:
    pass

# Load the causal LM in bfloat16 with device_map = 0.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype = torch.bfloat16,
    device_map = 0,
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [28]:
# Load the saved adapter state dict from the checkpoint directory onto the CPU.
# `ckpt` is only assigned in the Llama-3 config cell.
# NOTE(review): torch.load unpickles the file; acceptable for a locally
# produced checkpoint, but consider weights_only=True if the installed torch
# version supports it -- confirm.
adp_st = torch.load(Path(ckpt) / 'adapter_model.bin', map_location='cpu')
# List the saved entries whose name does not contain 'layers'.
print([k for k in adp_st.keys() if 'layers' not in k])

['base_model.model.model.embed_tokens.weight', 'base_model.model.lm_head.weight']


In [30]:
# Despite its name, new_shape is a single integer: the size of the first
# dimension (number of rows) of the saved embedding matrix.
new_shape = adp_st['base_model.model.model.embed_tokens.weight'].shape[0]
# Resize the model's token embeddings to that row count before loading the
# adapter. NOTE(review): presumably required so the saved embed_tokens /
# lm_head weights fit the model -- confirm.
model.resize_token_embeddings(new_shape)
model.load_adapter(ckpt)

In [10]:
def to_cuda(data):
    """Return a new dict in which every value of ``data`` was moved with ``.cuda()``.

    Keys and their order are preserved; ``data`` itself is not modified.
    """
    moved = {}
    for name, value in data.items():
        moved[name] = value.cuda()
    return moved

def to_batch(data, device = 'cuda'):
    """Turn one un-batched sample into a batch of size 1 on ``device``.

    Args:
        data: mapping from field name to a sequence of numbers
            (e.g. ``input_ids``, ``attention_mask``, ``labels``).
        device: target device for the tensors. Defaults to ``'cuda'``, which
            matches the previous hard-coded ``.cuda()`` behaviour; pass
            ``'cpu'`` (or ``model.device``) to run without a GPU.

    Returns:
        A new dict with the same keys; each value is a tensor with a leading
        batch dimension of size 1, placed on ``device``.
    """
    return {
        name: torch.tensor(values).unsqueeze(0).to(device)
        for name, values in data.items()
    }

def add_target_head(sample, head):
    """Append target head to source.

    Builds a new sample made of the source part of ``sample`` followed by the
    token ids in ``head``. The source part is the leading run of positions
    whose label is -100.

    Args:
        sample: dict with ``input_ids`` and ``labels`` sequences of equal
            length, where the source positions are labelled -100.
        head: iterable of token ids to place right after the source. It may be
            empty, and it may be a one-shot iterator.

    Returns:
        A new dict with ``input_ids`` (source + head), an all-ones
        ``attention_mask`` of the same length, and ``labels`` that are -100 on
        the source and equal to ``head`` afterwards. ``sample`` is not modified.
    """
    # Only the leading -100 labels belong to the source. Counting every -100
    # would over-count when ignored labels also occur after the target starts,
    # which would pull target tokens into the prompt.
    ipt_len = 0
    for label in sample['labels']:
        if label != -100:
            break
        ipt_len += 1

    # Materialise once so that iterators are not exhausted before the labels
    # are built.
    head_ids = list(head)

    new_ids = list(sample['input_ids'][:ipt_len]) + head_ids
    return {
        'input_ids': new_ids,
        'attention_mask': [1] * len(new_ids),
        'labels': [-100] * ipt_len + head_ids,
    }

def generate(model, batch, tokenizer, max_len = 512):
    """Run ``model.generate`` on ``batch`` with sampling disabled.

    Only ``input_ids`` and ``attention_mask`` are taken from ``batch``.
    Generation uses ``tokenizer.eos_token_id`` as the end-of-sequence id and
    produces at most ``max_len`` new tokens. Returns whatever
    ``model.generate`` returns.
    """
    model_inputs = {
        'input_ids': batch['input_ids'],
        'attention_mask': batch['attention_mask'],
    }
    decoding_options = dict(
        do_sample = False,
        eos_token_id = tokenizer.eos_token_id,
        max_new_tokens = max_len,
    )
    return model.generate(**model_inputs, **decoding_options)

def get_prob(logits, top_k = 5):
    """Return the ``top_k`` most probable ids together with their probabilities.

    Args:
        logits: 1-D tensor of unnormalised scores (one entry per vocabulary id).
        top_k: number of entries wanted. Values larger than the number of
            logits are clamped instead of raising; values <= 0 give ``[]``.

    Returns:
        List of ``(token_id, probability)`` tuples, most probable first, where
        ``token_id`` is an ``int`` and ``probability`` a ``float``.
    """
    probs = torch.softmax(logits, dim = -1)
    # Clamp so that a small vocabulary (or an over-large request) cannot index
    # past the end.
    k = max(0, min(top_k, probs.shape[-1]))
    # topk selects just the k best entries instead of sorting the whole
    # vocabulary and converting it to a Python list.
    top_probs, top_ids = torch.topk(probs, k)
    return list(zip(top_ids.tolist(), top_probs.tolist()))

def get_token_prob(logits, tokenizer, top_k = 5):
    """Like ``get_prob`` but also resolves each id to its token string.

    Returns a list of ``(token, token_id, probability)`` tuples in the order
    produced by ``get_prob(logits, top_k)``.
    """
    rows = []
    for token_id, prob in get_prob(logits, top_k):
        token = tokenizer.convert_ids_to_tokens(token_id)
        rows.append((token, token_id, prob))
    return rows

def greedy_generate(model, batch, tokenizer, num_token = 5):
    """Decode ``num_token`` tokens greedily, printing the top candidates per step.

    The first forward pass consumes the whole prompt; every later pass feeds
    only the token chosen in the previous step together with the cached
    key/values returned by the model.

    Args:
        model: causal LM called as ``model(input_ids=..., attention_mask=...,
            past_key_values=..., use_cache=True)`` whose output exposes
            ``logits`` and ``past_key_values``.
        batch: dict with ``input_ids`` and ``attention_mask`` tensors for a
            single sequence (only batch index 0 is read).
        tokenizer: used to turn candidate ids into token strings for printing.
        num_token: number of decoding steps. The loop always runs this many
            steps; it does not stop when an end-of-sequence token is chosen.

    Returns:
        List with the id chosen at each step.
    """
    input_ids = batch['input_ids']
    mask = batch['attention_mask']
    # Create follow-up tensors on the device of the inputs instead of assuming
    # CUDA.
    device = input_ids.device
    # No cache exists before the first forward pass; None is the documented
    # "no cache" value (an empty list is not accepted by every version of the
    # model code).
    past_key_values = None
    gen_tokens = []
    for step in range(num_token):
        with torch.no_grad():
            # use_cache is requested explicitly because the next step feeds
            # only one token and therefore depends on the returned cache.
            out = model(input_ids = input_ids, attention_mask = mask,
                        past_key_values = past_key_values, use_cache = True)
        # Candidates for the next token come from the last position of the
        # single sequence in the batch.
        top = get_token_prob(out.logits[0][-1], tokenizer)
        print(f'Step {step+1}')
        for token, tid, p in top:
            print(f'\t{token} {tid} {p:.4f}')
        best_id = top[0][1]
        gen_tokens.append(best_id)
        past_key_values = out.past_key_values
        # Next step: feed only the chosen token and extend the mask by one
        # position (same dtype and device as the existing mask).
        input_ids = torch.tensor([[best_id]], device = device)
        mask = torch.cat([mask, mask.new_ones((1, 1))], dim = -1)
    return gen_tokens

In [11]:
# Take the first training sample and keep only its source part (empty target
# head), so the model has to produce the answer itself.
sample = tr_ds[0]

prompt = add_target_head(sample, [])
# Prompt length in tokens; used later to cut the prompt off generated ids.
pmt_len = len(prompt['input_ids'])
# NOTE: the commented-out block below is the only place where `out` and
# `gen_out` are assigned, yet later cells read both names. Those cells only
# work if this block was run earlier in the same kernel.
# with torch.no_grad():
#     ba = to_batch(prompt)
#     out = model(**ba)
#     gen_out = generate(model, ba, tokenizer, 5)

In [31]:
# Step-by-step greedy decoding from the prompt; prints the top candidates for
# each step and returns the chosen token ids.
greedy_generate(model, to_batch(prompt), tokenizer)

Step 1
	<|begin_of_text|> 128000 0.9891
	Ġ 220 0.0005
	ĠNo 2360 0.0004
	ĠDonovan 71800 0.0002
	="" 8573 0.0002
Step 2
	No 2822 0.9608
	Ġ 220 0.0031
	The 791 0.0024
	Ċ 198 0.0015
	Question 14924 0.0014
Step 3
	<|end_of_text|> 128001 0.9990
	Ċ 198 0.0003
	. 13 0.0002
	ĊĊ 271 0.0001
	<|begin_of_text|> 128000 0.0001
Step 4
	<|begin_of_text|> 128000 1.0000
	<|end_of_text|> 128001 0.0000
	Ġon 389 0.0000
	ĠMe 2206 0.0000
	{ 90 0.0000
Step 5
	No 2822 0.9657
	Ġ 220 0.0033
	The 791 0.0022
	Ċ 198 0.0015
	- 12 0.0012


[128000, 2822, 128001, 128000, 2822]

In [33]:
# Inspect the fields of a forward-pass output, the prompt length, and the size
# of the logits vector at the last position.
# NOTE: `out` is not assigned by any active cell in this notebook (only inside
# commented-out code), so this cell depends on leftover kernel state.
print(out.keys())
print(len(prompt['input_ids']))
out.logits[0][-1].shape

odict_keys(['loss', 'logits', 'past_key_values'])
196


torch.Size([128257])

In [34]:
# Drop the first pmt_len ids of the first generated sequence, then show the
# remaining ids and their token strings.
# NOTE: `gen_out` is not assigned by any active cell in this notebook (only
# inside commented-out code), so this cell depends on leftover kernel state.
gen_tokens = gen_out[0][pmt_len:]
print(gen_tokens)
print(tokenizer.convert_ids_to_tokens(gen_tokens))

tensor([128000,   2822, 128001], device='cuda:0')
['<|begin_of_text|>', 'No', '<|end_of_text|>']


In [48]:
# Top candidates (token, id, probability) at the last position of `out`.
# NOTE: `out` is not assigned by any active cell (only inside commented-out
# code), so the result depends on which model/tokenizer was live in the kernel.
get_token_prob(out.logits[0][-1], tokenizer)

[('<0x0A>', 13, 0.4116723835468292),
 ('▁', 28705, 0.2496919333934784),
 ('▁September', 4074, 0.10408708453178406),
 ('▁The', 415, 0.0593070350587368),
 ('▁"', 345, 0.026317333802580833)]

In [12]:
# Decode the last 50 token ids of the training sample to inspect how the
# sample ends.
print(tokenizer.decode(sample['input_ids'][-50:]))

considered Products if Distributor exercises its options pursuant to Section 7 hereof.

###Question: The date of the contract

###Answer: - 7th day of September, 1999.</s>


In [36]:
# Show how the phi2 tokenizer splits 'hello world': encode to ids, then map the
# ids back to token strings.
TKN_MAP['phi2'].convert_ids_to_tokens(TKN_MAP['phi2'].encode('hello world'))

['hello', 'Ġworld']

In [16]:
# Beginning-of-sequence token id of the currently active tokenizer.
tokenizer.bos_token_id

128000

In [24]:
# Rows 128..134 of the input embedding matrix.
# NOTE(review): the index range is a magic number -- document why these rows
# are of interest.
vec = model.model.embed_tokens.weight[128:135]

In [25]:
# Norm of each selected embedding row (reduced over dim 1).
torch.norm(vec, dim = 1)

tensor([0.6055, 0.6016, 0.6289, 0.6289, 0.6289, 0.6055, 0.6211],
       device='cuda:0', dtype=torch.bfloat16,
       grad_fn=<LinalgVectorNormBackward0>)

In [21]:
# Mean row norm over the whole input embedding matrix, for comparison with the
# norms of individual rows.
torch.norm(model.model.embed_tokens.weight, dim = 1).mean()

tensor(0.5938, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)

In [4]:
# Load the OOD test split with the currently active tokenizer.
# NOTE(review): the path hard-codes 'llama3' instead of using tk_name the way
# data_path does -- confirm this is intended when another tokenizer is active.
# NOTE(review): presumably judge_type_fn selects which samples are kept
# (here: k > 0) -- confirm in CUAD_SFT_Filter_Type.
# The commented-out cache_dir line refers to `args`, which is not defined in
# the visible cells.
test_ds = CUAD_SFT_Filter_Type(
        'data/ood_split/seed42_tr29/llama3/pmt_01/test_data_ood.jsonl',
        tokenizer,
        is_seq2seq = is_seq2seq,
        is_chat = True,
        # cache_dir = Path(args.data_path).parent / 'cache',
        is_test = True,
        judge_type_fn = lambda k: k>0,
    )

100%|███████████████████████████████████| 11808/11808 [00:06<00:00, 1753.68it/s]


In [5]:
# Decode the first test sample to check the full prompt text fed to the model.
print(tokenizer.decode(test_ds[0]['input_ids']))

<|begin_of_text|><|start_header_id|>user<|end_header_id|>

You are a helpful assistant. Review the contract clauses and answer questions. Output the mentioned clauses if exist; otherwise output "No".

###Clauses:
Exhibit 10.16 SUPPLY CONTRACT Contract No: Date: The buyer/End-User: Shenzhen LOHAS Supply Chain Management Co., Ltd. ADD: Tel No. : Fax No. : The seller: ADD: The Contract is concluded and signed by the Buyer and Seller on, in Hong Kong. 1. General provisions 1.1 This is a framework agreement, the terms and conditions are applied to all purchase orders which signed by this agreement (hereinafter referred to as the "order"). 1.2 If the provisions of the agreement are inconsistent with the order, the order shall prevail. Not stated in order content will be subject to the provisions of agreement. Any modification, supplementary, give up should been written records, only to be valid by buyers and sellers authorized representative signature and confirmation, otherwise will be deem