In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import json

# Torch device handle: CUDA when it is available, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else 'cpu')
# Prompt-template selector passed to make_prompt ("post", "zero"; any other value
# selects the example-free template).
method = "post"
model_name = "upstage/llama-30b-instruct-2048"
# Model id with "/" replaced by "_" so it can be embedded in a file name.
model_str = model_name.replace("/", "_")
# Stem of the result file to load; presumably the name of the smaller trained
# model whose predictions are stored there — confirm.
trained_model = "multitask_document"
data_path = f"../../result/{trained_model}.json"

# Output-path scheme, currently disabled:
# if method == "post": ## see small model  
#     save_path = f"../../result/LLM_{model_str}_{trained_model}.json"
#     error_path = f"../../result/LLM_{model_str}_{trained_model}_error.json"
# else: ## zero (unseen)
#     save_path = f"../../result/LLM_{model_str}_{method}.json"
#     error_path = f"../../result/LLM_{model_str}_{method}_error.json"

# Read with an explicit encoding so the result does not depend on the platform
# default (the cells that write result files also use UTF-8).
with open(data_path, "r", encoding="utf-8") as f:
    dataset = json.load(f)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Number of entries loaded from data_path.
len(dataset)

2832

In [3]:
# Checkpoint directory on the local filesystem, keyed by the model id.
model_path = f"/home/jovyan/hdfs-jmt-rungjoo/huggingface_models/{model_name}" # "upstage/llama-30b-instruct-2048"
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Loading options: automatic device map and half-precision (float16) weights.
load_options = {"device_map": "auto", "torch_dtype": torch.float16}
model = AutoModelForCausalLM.from_pretrained(model_path, **load_options)

# Switch to evaluation mode; as the cell's last expression this also displays the model.
model.eval()

Loading checkpoint shards: 100%|██████████| 7/7 [01:19<00:00, 11.38s/it]


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 6656, padding_idx=0)
    (layers): ModuleList(
      (0-59): 60 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=6656, out_features=6656, bias=False)
          (k_proj): Linear(in_features=6656, out_features=6656, bias=False)
          (v_proj): Linear(in_features=6656, out_features=6656, bias=False)
          (o_proj): Linear(in_features=6656, out_features=6656, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=6656, out_features=17920, bias=False)
          (up_proj): Linear(in_features=6656, out_features=17920, bias=False)
          (down_proj): Linear(in_features=17920, out_features=6656, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNo

In [4]:
def make_prompt(query, pred_facets, method):
    """Build the chat-style prompt sent to the LLM for one query.

    Args:
        query: Search query whose facets are requested.
        pred_facets: Comma-separated facets already predicted for the query;
            only the "post" template uses it.
        method: Template selector. "post" asks the model to correct
            `pred_facets` after two worked corrections; "zero" shows two
            example facet lists before asking for the query's facets; any
            other value selects the example-free format instruction.

    Returns:
        The prompt string, ending with the opening words of the assistant's
        answer so the model only has to continue with the facet list.
    """
    if method == "post":
        return "".join([
            "### User:\nThe predicted facets for 'caesars atlantic city' are 'parking, hotels'. But the correct facets are 'caesars atlantic city events, caesars atlantic city jobs, caesars atlantic city parking'\n",
            "The predicted facets for 'vista, ca' are 'parking, hotels'. But the correct facets are 'weather, zip code, population, homes for sale'\n\n",
            f"As in the example above, modify the predicted facets.\nThe predicted facets for '{query}' are '{pred_facets}'. What are the correct facets?\n\n### Assistant:\nThe correct facets for '{query}' are",
        ])

    if method == "zero":
        return "".join([
            "### User:\nThe facets for 'caesars atlantic city' are 'caesars atlantic city events, caesars atlantic city jobs, caesars atlantic city parking'\n",
            "The facets for 'vista, ca' are 'weather, zip code, population, homes for sale'\n\n",
            f"### Assistant:\nThe correct facets for '{query}' are",
        ])

    # Fallback (noshot): only a format description, no worked examples.
    instruction = "### User:\nThe facets for 'query' are 'facets'\nAs in the format above, generate facets related to the query within 5, separated by ','.\n\n"
    answer_stub = f"### Assistant:\nThe facets for '{query}' are"
    return instruction + answer_stub

In [5]:
import re
from tqdm import tqdm

# Prompt template used for this run (replaces any earlier value of `method`).
method = 'noshot'

# Matches from the first to the last single quote on a line of the model's answer.
eng_rule = re.compile(r"'.+'")
test_result = {}   # key -> successfully parsed record
error_result = {}  # key -> record whose answer could not be parsed
for k, data in tqdm(dataset.items()):
    query = data['query']
    pred_facet_list = data['pred']
    pred_facets = ", ".join(pred_facet_list)
    label = data['label']
    options_overall_label = data['options_overall_label']

    prompt = make_prompt(query, pred_facets, method)

    # Generation budget: twice the token length of the existing prediction string.
    label_inputs = tokenizer(pred_facets, return_tensors="pt")
    label_len = label_inputs['input_ids'].shape[1]

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_len = inputs['input_ids'].shape[1]

    # Greedy decoding is requested explicitly so the output is deterministic;
    # sampling knobs such as temperature/top_p only apply when do_sample=True.
    output_ids = model.generate(**inputs, use_cache=True, max_new_tokens=int(label_len*2), do_sample=False)
    # Full decoded text (prompt + continuation), kept for inspection in later cells.
    output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    # Cut the continuation out in token space: slicing the decoded text by
    # len(prompt) is only correct when decoding reproduces the prompt exactly.
    correct_facets = tokenizer.decode(output_ids[0][prompt_len:], skip_special_tokens=True)

    try:
        matches = eng_rule.findall(correct_facets.strip())
        if len(matches) == 1:
            # Exactly one quoted span: treat it as the comma-separated facet list.
            correct_facet_list = [x.strip() for x in matches[0].strip("'").split(",") if x.strip() != ""]
        else:
            # Otherwise fall back to the first line, with surrounding quotes and
            # a trailing period removed.
            correct_facet_list = [x.strip() for x in correct_facets.strip().split("\n")[0].strip("'").strip(".").strip("'").split(",") if x.strip() != ""]
        test_result[k] = {}
        test_result[k]['query'] = query
        test_result[k]['pred'] = correct_facet_list
        test_result[k]['label'] = label
        test_result[k]['options_overall_label'] = options_overall_label
    except Exception:
        # Keep the raw continuation for later inspection. Catching Exception
        # (not a bare except) lets KeyboardInterrupt stop a long run.
        error_result[k] = {}
        error_result[k]['query'] = query
        error_result[k]['pred'] = correct_facets
        error_result[k]['label'] = label
        error_result[k]['options_overall_label'] = options_overall_label

    # Debug run: stop after the first example. Remove to process the whole dataset.
    break

  0%|          | 0/2832 [00:02<?, ?it/s]


In [8]:
# Show the full decoded text (prompt followed by the model's continuation)
# from the most recent generate call.
print(output)

### User:
The facets for 'query' are 'facets'
As in the format above, generate facets related to the query within 5, separated by ','.

### Assistant:
The facets for 'caesars atlantic city' are 'hotel, casino, entertainment, dining, location'.


In [15]:
# NOTE(review): make_prompt as defined in this notebook only special-cases "post"
# and "zero", so 'fewshot' takes its fallback branch. The saved output under this
# cell shows a different template, so it was presumably produced by another
# version of make_prompt — confirm and re-run.
print(make_prompt(query, pred_facets, 'fewshot'))

### User:
The correct facets for 'caesars atlantic city' are 'caesars atlantic city events, caesars atlantic city jobs, caesars atlantic city parking'.
The correct facets for 'vista, ca' are 'weather, zip code, population, homes for sale'.

As in the example above, predict the correct facets.

### Assistant:
The correct facets for 'caesars atlantic city' are


In [6]:
%%time
# Longest label string in the dataset, measured as the number of token ids the
# tokenizer returns for the comma-joined labels (0 when the dataset is empty).
token_lengths = (
    tokenizer(", ".join(data['label']), return_tensors="pt")['input_ids'].shape[1]
    for data in dataset.values()
)
max_len = max(token_lengths, default=0)
print(max_len)

50
CPU times: user 245 ms, sys: 3.13 ms, total: 249 ms
Wall time: 245 ms


## 에러 확인하기

In [None]:
import re
# Quoted span made only of letters, digits, '-', '&', '.', ',' and whitespace.
# Written as a raw string so the regex escapes (\- and \s) reach `re` unchanged
# instead of being parsed as invalid Python string escapes.
eng_rule = re.compile(r"'[a-zA-Z0-9\-&.,\s]+'")
correct_facets = "The correct facets for 'fps' are 'fps windows 10, fps windows 7, fps xbox one, fps ps4'."
parsing = eng_rule.findall(correct_facets)
print(parsing)
# For this sample, parsing[0] is the quoted query and parsing[1] the quoted facet list.
correct_facet_list = [x.strip() for x in parsing[1].strip("'").split(",")]

In [None]:
import json
import os

# Parsed LLM results to post-process.
data_path = "../../result/LLM_multitask_document_related.json"
with open(data_path, "r", encoding="utf-8") as f:
    dataset = json.load(f)

# Entries whose answer could not be parsed. Later cells read `dataset_error`,
# so it is always defined: loaded when the file exists, empty otherwise.
error_path = "../../result/LLM_multitask_related_error.json"
if os.path.exists(error_path):
    with open(error_path, "r", encoding="utf-8") as f:
        dataset_error = json.load(f)
else:
    dataset_error = {}

In [None]:
# Compare the number of entries in `dataset` and in `dataset_error`.
len(dataset), len(dataset_error)

In [None]:
# Start from every entry of `dataset`, then try to recover the entries in
# `dataset_error` by re-parsing their raw answer text.
final_dataset = dict(dataset)
error_list = []

for ind, data in dataset_error.items():
    query, pred = data['query'], data['pred']
    # Remove the quoted query so the only quoted span left should be the facet list.
    filter_pred = pred.replace(f"'{query}'", '')
    correct_facets = eng_rule.findall(filter_pred)

    if len(correct_facets) != 1:
        # Ambiguous or missing facet list: keep the evidence for manual review.
        error_list.append([pred, correct_facets, data['label']])
        continue

    correct_facet_list = [facet.strip() for facet in correct_facets[0].strip("'").split(",")]
    final_dataset[ind] = {
        'query': data['query'],
        'pred': correct_facet_list,
        'label': data['label'],
        'options_overall_label': data['options_overall_label'],
    }

In [None]:
# Rebuild every record of `dataset` with a cleaned facet list: whitespace-only
# facets are dropped and single quotes are stripped from both ends of the rest.
final_dataset = {}
for ind, data in dataset.items():
    final_dataset[ind] = {
        'query': data['query'],
        'pred': [facet.strip("'") for facet in data['pred'] if facet.strip() != ""],
        'label': data['label'],
        'options_overall_label': data['options_overall_label'],
    }

In [None]:
# Spot-check the rebuilt entry stored under key '7'.
final_dataset['7']

In [None]:
# Write the cleaned results to disk as UTF-8 JSON.
save_path = "../../result/LLM_multitask_document_related.json"
with open(save_path, mode="w", encoding='utf-8') as f:
    json.dump(final_dataset, f)

In [None]:
# Write the current `dataset` to a separate file ("...related1.json") as UTF-8 JSON.
save_path = "../../result/LLM_multitask_document_related1.json"
with open(save_path, mode="w", encoding='utf-8') as f:
    json.dump(dataset, f)

In [2]:
# Hand-written sample prompt for the query 'device manager': two worked
# corrections, the instruction, and the opening of the assistant's answer.
xx="""### User:
The predicted facets for 'caesars atlantic city' are 'parking, hotels'. But the correct facets are 'caesars atlantic city events, caesars atlantic city jobs, caesars atlantic city parking'
The predicted facets for 'vista, ca' are 'parking, hotels'. But the correct facets are 'weather, zip code, population, homes for sale'

As in the example above, modify the predicted facets.
The predicted facets for 'device manager' are 'device manager, windows device manager'. What are the correct facets?

### Assistant:
The correct facets for 'device manager' are"""

In [3]:
# Print the sample prompt to check its layout line by line.
print(xx)

### User:
The predicted facets for 'caesars atlantic city' are 'parking, hotels'. But the correct facets are 'caesars atlantic city events, caesars atlantic city jobs, caesars atlantic city parking'
The predicted facets for 'vista, ca' are 'parking, hotels'. But the correct facets are 'weather, zip code, population, homes for sale'

As in the example above, modify the predicted facets.
The predicted facets for 'device manager' are 'device manager, windows device manager'. What are the correct facets?

### Assistant:
The correct facets for 'device manager' are
