## Data dependencies

In [1]:
# Print the SHA-1 of the input CSV so the exact file this notebook ran against is on record.
!sha1sum ../data/cwn-test-500-Taiwan-LLaMa-eval.csv

fd3666f6bdd4c895546832f71f30476d7d310bf1  ../data/cwn-test-500-Taiwan-LLaMa-eval.csv


## Load data

In [2]:
import re
from io import StringIO
import pandas as pd

# Read the evaluation CSV and turn the default row index into a leading
# "serial" column, so every row carries an explicit identifier.
data = (
    pd.read_csv("../data/cwn-test-500-Taiwan-LLaMa-eval.csv")
    .rename_axis("serial")
    .reset_index()
)

In [3]:
# Keep only the columns used to build the prompts below.
subdata = data.loc[:, [
    "serial",
    "given",
    "want",
    "term",
    "pos",
    "gold",
    "perm_gold",
    "complete",
]]

In [4]:
# The original prompt is whatever precedes the first "ASSISTANT" marker in the
# `complete` text, with surrounding whitespace removed.
subdata["ori_prompt"] = subdata["complete"].map(
    lambda completion: completion.partition("ASSISTANT")[0].strip()
)

In [5]:
## What should I use as a key?

In [6]:
# Two-line prompt skeleton: the original prompt, then the assistant turn.
# `{ori_prompt}` and `{resp}` are the placeholders filled in by build_prompt.
cwn_prompt_template = (
    "{ori_prompt}\n"
    "ASSISTANT: {resp}"
)

def build_prompt(fstring, **kwargs):
    """Fill `{name}` placeholders in a template and record where each value lands.

    Placeholders are runs of word characters wrapped in braces. Each one is
    replaced by ``str(kwargs[name])``. Inserted values are never rescanned for
    further placeholders.

    Args:
        fstring: Template text containing zero or more `{name}` placeholders.
        **kwargs: One value per placeholder name used in the template.

    Returns:
        A pair ``(text, var_locations)``. ``text`` is the filled-in template.
        ``var_locations`` has one ``(name, value, start, end)`` tuple per
        placeholder occurrence, in template order, such that
        ``text[start:end] == value``.

    Raises:
        KeyError: If the template names a placeholder missing from ``kwargs``.
    """
    pieces = []
    var_locations = []
    written = 0    # number of characters emitted to the output so far
    consumed = 0   # offset in the template up to which text has been copied

    for placeholder in re.finditer(r'\{(\w+)\}', fstring):
        name = placeholder.group(1)
        value = str(kwargs[name])

        # Copy the literal text between the previous placeholder and this one.
        literal = fstring[consumed:placeholder.start()]
        pieces.append(literal)

        begin = written + len(literal)
        finish = begin + len(value)
        pieces.append(value)
        var_locations.append((name, value, begin, finish))

        written = finish
        consumed = placeholder.end()

    # Whatever follows the last placeholder is copied verbatim.
    pieces.append(fstring[consumed:])
    outstr = "".join(pieces)

    # Sanity check: every recorded span must slice back to its value.
    for _, expected, begin, finish in var_locations:
        assert outstr[begin:finish] == expected

    return outstr, var_locations


In [7]:
# Accumulates one record per generated prompt; filled by the loop below.
prompt_items = []

def format_prompt(ori_prompt, resp):
  """Fill cwn_prompt_template with the given prompt and response.

  Returns the (prompt, var_locations) pair produced by build_prompt.
  """
  return build_prompt(cwn_prompt_template, ori_prompt=ori_prompt, resp=resp)

# Each source row yields two records that share the same original prompt:
# one labelled "emp" built from `gold`, one labelled "perm" built from `perm_gold`.
for _, row in subdata.iterrows():
    variants = [
        (prompt_type, *format_prompt(row.ori_prompt, response))
        for prompt_type, response in (("emp", row.gold), ("perm", row.perm_gold))
    ]
    for prompt_type, prompt_text, locations in variants:
        prompt_items.append({
            "eval_serial": row.serial,
            "term": row.term,
            "pos": row.pos,
            "prompt_type": prompt_type,
            "prompt": prompt_text,
            # locations[1] is the second placeholder in the template ({resp});
            # keep only its (start, end) offsets.
            "resp_loc": locations[1][2:],
        })

In [8]:
# Assemble the collected prompt records into a DataFrame and preview the first rows.
prompt_df = pd.DataFrame(prompt_items)
prompt_df.head()

Unnamed: 0,eval_serial,term,pos,prompt_type,prompt,resp_loc
0,0,<幕>,<Na>,emp,USER: 給定詞條、詞性和例句，創建一個最能代表該詞條的定義。\n詞條： <幕>\n詞性：...,"(187, 201)"
1,0,<幕>,<Na>,perm,USER: 給定詞條、詞性和例句，創建一個最能代表該詞條的定義。\n詞條： <幕>\n詞性：...,"(187, 202)"
2,1,<幕>,<Na>,emp,USER: 給定詞條、詞性和定義，創建一個最能代表該詞條的例句。\n詞條： <幕>\n詞性：...,"(79, 201)"
3,1,<幕>,<Na>,perm,USER: 給定詞條、詞性和定義，創建一個最能代表該詞條的例句。\n詞條： <幕>\n詞性：...,"(79, 274)"
4,2,<幕>,<Na>,emp,USER: 給定詞條、詞性和定義，列出該詞條的同義詞。\n詞條： <幕>\n詞性：<Na>\...,"(202, 276)"


In [9]:
# Write the prompt table without the DataFrame index, then print the file's
# SHA-1 so the exported artifact can be identified later.
# NOTE(review): resp_loc holds tuples; presumably they are stored as text in
# the CSV and must be parsed back by whoever loads this file — confirm.
out_path = "../data/cwn-prompt-500-perm-eval.csv"
prompt_df.to_csv(out_path, index=False)
!sha1sum $out_path

2e101d021306406001c3c4a6c3bf3679bc4149e9  ../data/cwn-prompt-500-perm-eval.csv
