diff --git a/lm_eval/tasks/ja/jsquad.py b/lm_eval/tasks/ja/jsquad.py
index 7b643819d5..29179a717e 100644
--- a/lm_eval/tasks/ja/jsquad.py
+++ b/lm_eval/tasks/ja/jsquad.py
@@ -2,8 +2,8 @@
 JGLUE: Japanese General Language Understanding Evaluation
 https://aclanthology.org/2022.lrec-1.317/
 
-JGLUE, Japanese General Language Understanding Evaluation, is built to measure the general NLU ability in Japanese. 
-JGLUE has been constructed from scratch without translation. 
+JGLUE, Japanese General Language Understanding Evaluation, is built to measure the general NLU ability in Japanese.
+JGLUE has been constructed from scratch without translation.
 
 Homepage: https://github.com/yahoojapan/JGLUE
 """
@@ -40,6 +40,7 @@ class JSQuAD(Task):
     """
     prompt template is taken from [日本語に特化した60億パラメータ規模のGPTモデルの構築と評価](https://www.anlp.jp/proceedings/annual_meeting/2023/pdf_dir/H9-4.pdf)
     """
+
     VERSION = 1.1
     PROMPT_VERSION = 0.1
     DATASET_PATH = "shunk031/JGLUE"
@@ -51,11 +52,11 @@ class JSQuAD(Task):
     # REMOVE_IDS = ['a10743p19q0', 'a10743p19q1', 'a10743p19q2', 'a10743p19q3', 'a13221p1q0', 'a13221p1q1', 'a13221p1q2', 'a13221p1q3', 'a14985p1q0', 'a14985p1q1', 'a14985p1q2', 'a14985p1q3', 'a14985p1q4', 'a14985p93q0', 'a14985p93q1', 'a14985p93q2', 'a14985p93q3', 'a14985p93q4', 'a1540503p36q0', 'a1540503p36q1', 'a1540503p36q2', 'a1540503p36q3', 'a1540503p36q4', 'a18783p1q0', 'a18783p3q0', 'a18783p3q1', 'a18783p3q2', 'a18783p8q0', 'a18873p25q0', 'a18873p25q1', 'a18873p25q2', 'a18873p25q3', 'a18873p26q0', 'a18873p26q1', 'a18873p26q2', 'a20898p10q0', 'a20898p15q0', 'a20898p15q1', 'a20898p15q2', 'a20898p15q3', 'a2164640p22q0', 'a2164640p22q1', 'a2164640p22q2', 'a2164640p22q3', 'a2164640p22q4', 'a22392p20q0', 'a22392p20q1', 'a22392p20q2', 'a22392p20q3', 'a3011628p3q0', 'a3011628p3q1', 'a3011628p3q2', 'a3011628p3q3', 'a3189p4q0', 'a3189p4q1', 'a3189p4q2', 'a369953p0q0', 'a369953p0q1', 'a369953p0q2', 'a369953p0q3', 'a3949p1q0', 'a3949p1q1', 'a4596p0q0', 'a4596p0q1', 'a4596p0q2', 'a4596p0q3', 'a4596p1q0', 'a4596p1q1', 'a4596p1q2', 'a4596p1q3', 'a4596p1q4', 'a4596p38q0', 'a4596p38q1', 'a4596p38q2', 'a4596p38q3', 'a4596p38q4', 'a4768p13q0', 'a4768p13q1', 'a4768p13q2', 'a4768p3q0', 'a4768p3q1', 'a4768p3q2', 'a4768p3q3', 'a4768p8q0', 'a4768p8q1', 'a4768p8q2', 'a51481p0q0', 'a51481p0q1', 'a51481p0q2', 'a51481p10q0', 'a51481p10q1', 'a51481p10q2', 'a51481p10q3', 'a51481p6q0', 'a51481p6q1', 'a51481p6q2', 'a51481p6q3', 'a51481p7q0', 'a51481p7q1', 'a67892p11q0', 'a67892p11q1', 'a67892p11q2', 'a67892p11q3', 'a67892p2q0', 'a8874p6q0', 'a8874p6q1', 'a916079p3q0', 'a916079p3q1', 'a95156p4q0', 'a95156p4q1', 'a95156p4q2', 'a95156p4q3', 'a95156p6q0', 'a95156p6q1', 'a95156p6q2', 'a95156p6q3']
     """
     @mkshing's comment
-    I found that JSQuAD contains errors inside contexts such as below. 
+    I found that JSQuAD contains errors inside contexts, such as the example below.
     ```
     {'id': 'a4596p0q0', 'title': 'ポルトガル', 'context': 'ポルトガル [SEP] 正式名称はポルトガル語で、。通称、 。', 'question': 'ポルトガルね正式名称は何語であるか', 'answers': {'text': ['正式名称はポルトガル語', 'ポルトガル語', 'ポルトガル語'], 'answer_start': [12, 17, 17]}, 'is_impossible': False}
     ```
-    So, I tried to identify all of them and found that the following processing can be okay to detect the ids 
+    So I tried to identify all of them, and found that the following processing detects the affected ids
     ```python
     from datasets import load_dataset
     from transformers import T5Tokenizer
@@ -70,19 +71,20 @@ class JSQuAD(Task):
         remove_ids.append(item["id"])
     ```
     """
+
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
         self.jasquad_metric = datasets.load_metric(jasquad.__file__)
 
     def has_training_docs(self):
         return True
-    
+
     def has_validation_docs(self):
         return True
-    
+
     def has_test_docs(self):
         return False
-    
+
     def training_docs(self):
         return self.dataset["train"]
 
@@ -91,7 +93,7 @@ def validation_docs(self):
         if len(self.REMOVE_IDS) > 0:
             dataset = [item for item in dataset if item["id"] not in self.REMOVE_IDS]
         return dataset
-    
+
     def doc_to_text(self, doc):
         return (
             "[題名]:"
@@ -126,12 +128,19 @@ def construct_requests(self, doc, ctx):
             encode_params = dict(add_special_tokens=False)
         else:
             encode_params = {}
-        max_num_tokens = max([len(encode_fn(answer, **encode_params)) for answer in doc["answers"]["text"]])
+        max_num_tokens = max(
+            [
+                len(encode_fn(answer, **encode_params))
+                for answer in doc["answers"]["text"]
+            ]
+        )
         continuation = rf.greedy_until(ctx, [self.SEP], max_num_tokens)
         return continuation
-    
+
     def process_results(self, doc, results):
-        assert len(results) == 1, f"results should be a list with 1 str element, but is {results}"
+        assert (
+            len(results) == 1
+        ), f"results should be a list with 1 str element, but is {results}"
         continuation = results[0]
         predictions = {
             "id": doc["id"],
@@ -153,7 +162,6 @@ def process_results(self, doc, results):
             ),  # The F-score of predicted tokens versus the gold answer
         }
 
-
     def aggregation(self):
         return {
             "exact_match": partial(
@@ -163,7 +171,7 @@ def aggregation(self):
                 self._squad_agg, "f1"
             ),  # The F-score of predicted tokens versus the gold answer
         }
-    
+
     def higher_is_better(self):
         return {
             "exact_match": True,  # Exact match (the normalized answer exactly match the gold answer)
@@ -171,20 +179,24 @@ def higher_is_better(self):
             "f1": True,  # The F-score of predicted tokens versus the gold answer
         }
 
     def _squad_metric(self, predictions, references):
-        return self.jasquad_metric.compute(predictions=predictions, references=references)
-    
+        return self.jasquad_metric.compute(
+            predictions=predictions, references=references
+        )
 
     def _squad_agg(self, key, item):
         predictions, references = zip(*item)
         return self._squad_metric(predictions=predictions, references=references)[key]
 
+
 class JSQuADWithFintanPrompt(JSQuAD):
     """
     prompt template is taken from [ChatGPT vs BERT: どちらが日本語をより理解できるのか?](https://fintan.jp/page/9126/)
     """
+
     PROMPT_VERSION = 0.2
     DESCRIPTION = "質問に対する回答を文章から一言で抽出してください。回答は名詞で答えてください。\n\n"
     SEP = "\n"
+
     def doc_to_text(self, doc):
         return (
             "文章:"
@@ -195,15 +207,38 @@ def doc_to_text(self, doc):
             + f"{self.SEP}"
             + "回答:"
         )
-    
+
+
+class JSQuADWithFintanPromptV12(JSQuADWithFintanPrompt):
+    """
+    prompt template is taken from [ChatGPT vs BERT: どちらが日本語をより理解できるのか?](https://fintan.jp/page/9126/)
+    """
+
+    VERSION = 1.2
+    DESCRIPTION = "質問に対する回答を題名と文章から一言で抽出してください。回答は名詞で答えてください。\n\n"
+
+    def doc_to_text(self, doc):
+        return (
+            "題名:"
+            + doc["title"]
+            + f"{self.SEP}"
+            + "文章:"
+            + doc["context"].split("[SEP]")[-1].strip()
+            + f"{self.SEP}"
+            + "質問:"
+            + doc["question"]
+            + f"{self.SEP}"
+            + "回答:"
+        )
+
 
 class JSQuADWithJAAlpacaPrompt(JSQuAD):
     """
-    This prompt format was inspired by the below data in fujiki/japanese_alpaca_data. 
+    This prompt format was inspired by the below data in fujiki/japanese_alpaca_data.
     ```
     {
-        'instruction': '与えられた文脈に最も適した文を選択してください。', 
-        'input': '文脈:あなたは親友と現在の仕事の状況について話しています。\nA)私にはあまり選択肢がありません。\nB)他に選択肢がありません。\nC)私には本当に決断する必要がありません。', 
+        'instruction': '与えられた文脈に最も適した文を選択してください。',
+        'input': '文脈:あなたは親友と現在の仕事の状況について話しています。\nA)私にはあまり選択肢がありません。\nB)他に選択肢がありません。\nC)私には本当に決断する必要がありません。',
         'output': 'A) 私には多くの選択肢がありません。'
     }
     ```
@@ -211,23 +246,61 @@ class JSQuADWithJAAlpacaPrompt(JSQuAD):
     - data: https://huggingface.co/datasets/fujiki/japanese_alpaca_data
     - code: https://github.com/Stability-AI/gpt-neox/blob/c130a4edc1120dccec8f02a34eb60d3e8f484cd3/finetune/finetune_base_ja.py#LL118C23-L127C11
     """
+
     PROMPT_VERSION = 0.3
     DESCRIPTION = "以下は、タスクを説明する指示と、文脈のある入力の組み合わせです。要求を適切に満たす応答を書きなさい。\n\n"
     INSTRUCTION = "与えられた文脈から、質問に対する答えを抜き出してください。"
+
+    def doc_to_text(self, doc):
+        """
+        以下は、タスクを説明する指示と、文脈のある入力の組み合わせです。要求を適切に満たす応答を書きなさい。
+
+        ### 指示:
+        {instruction}
+
+        ### 入力:
+        {input}
+
+        ### 応答:
+        {response}
+        """
+        input_text = (
+            f"文脈:{doc['context'].split('[SEP]')[-1].strip()}\n質問:{doc['question']}"
+        )
+        return f"### 指示:\n{self.INSTRUCTION}\n\n### 入力:\n{input_text}\n\n### 応答:\n"
+
+
+class JSQuADWithJAAlpacaPromptV12(JSQuADWithJAAlpacaPrompt):
+    """
+    This prompt format was inspired by the below data in fujiki/japanese_alpaca_data.
+    ```
+    {
+        'instruction': '与えられた文脈に最も適した文を選択してください。',
+        'input': '文脈:あなたは親友と現在の仕事の状況について話しています。\nA)私にはあまり選択肢がありません。\nB)他に選択肢がありません。\nC)私には本当に決断する必要がありません。',
+        'output': 'A) 私には多くの選択肢がありません。'
+    }
+    ```
+    Reference:
+    - data: https://huggingface.co/datasets/fujiki/japanese_alpaca_data
+    - code: https://github.com/Stability-AI/gpt-neox/blob/c130a4edc1120dccec8f02a34eb60d3e8f484cd3/finetune/finetune_base_ja.py#LL118C23-L127C11
+    """
+
+    VERSION = 1.2
+
     def doc_to_text(self, doc):
         """
         以下は、タスクを説明する指示と、文脈のある入力の組み合わせです。要求を適切に満たす応答を書きなさい。
 
-        ### 指示: 
+        ### 指示:
         {instruction}
 
-        ### 入力: 
+        ### 入力:
         {input}
 
-        ### 応答: 
+        ### 応答:
         {response}
         """
-        input_text = f"文脈:{doc['context'].split('[SEP]')[-1].strip()}\n質問:{doc['question']}"
+        input_text = f"文脈:{doc['title']}\n{doc['context'].split('[SEP]')[-1].strip()}\n質問:{doc['question']}"
         return f"### 指示:\n{self.INSTRUCTION}\n\n### 入力:\n{input_text}\n\n### 応答:\n"
 
 
@@ -236,6 +309,7 @@ class JSQuADWithRinnaInstructionSFT(JSQuAD):
     Reference:
     - HF Hub: https://huggingface.co/rinna/japanese-gpt-neox-3.6b-instruction-sft
     """
+
     PROMPT_VERSION = 0.4
     DESCRIPTION = "ユーザー: 与えられた文脈から、質問に対する答えを抜き出してください。システム: 分かりました。"
     SEP = ""
@@ -243,7 +317,19 @@ class JSQuADWithRinnaInstructionSFT(JSQuAD):
 
     def doc_to_text(self, doc):
         input_text = f"文脈:{doc['context'].split('[SEP]')[-1].strip()}{self.SEP}質問:{doc['question']}"
-        # input_text = f"質問:{doc['question']}文脈:{doc['context'].split('[SEP]')[-1].strip()}"
+        return f"ユーザー: {input_text}{self.SEP}システム: "
+
+
+class JSQuADWithRinnaInstructionSFTV12(JSQuADWithRinnaInstructionSFT):
+    """
+    Reference:
+    - HF Hub: https://huggingface.co/rinna/japanese-gpt-neox-3.6b-instruction-sft
+    """
+
+    VERSION = 1.2
+
+    def doc_to_text(self, doc):
+        input_text = f"文脈:{doc['title']}{self.SEP}{doc['context'].split('[SEP]')[-1].strip()}{self.SEP}質問:{doc['question']}"
         return f"ユーザー: {input_text}{self.SEP}システム: "
 
 
@@ -252,23 +338,43 @@ class JSQuADWithRinnaBilingualInstructionSFT(JSQuADWithRinnaInstructionSFT):
     Reference:
     - HF Hub: https://huggingface.co/rinna/bilingual-gpt-neox-4b-instruction-sft
     """
+
     PROMPT_VERSION = 0.5
     DESCRIPTION = "ユーザー: 与えられた文脈から、質問に対する答えを抜き出してください。\nシステム: 分かりました。\n"
     SEP = "\n"
     FEWSHOT_SEP = "\n"
-    
+
+
+class JSQuADWithRinnaBilingualInstructionSFTV12(JSQuADWithRinnaBilingualInstructionSFT):
+    """
+    Reference:
+    - HF Hub: https://huggingface.co/rinna/bilingual-gpt-neox-4b-instruction-sft
+    """
+
+    VERSION = 1.2
+
+    def doc_to_text(self, doc):
+        input_text = f"文脈:{doc['title']}{self.SEP}{doc['context'].split('[SEP]')[-1].strip()}{self.SEP}質問:{doc['question']}"
+        return f"ユーザー: {input_text}{self.SEP}システム: "
+
+
 VERSIONS = [
     JSQuAD,
     JSQuADWithFintanPrompt,
+    JSQuADWithFintanPromptV12,
     JSQuADWithJAAlpacaPrompt,
+    JSQuADWithJAAlpacaPromptV12,
     JSQuADWithRinnaInstructionSFT,
+    JSQuADWithRinnaInstructionSFTV12,
     JSQuADWithRinnaBilingualInstructionSFT,
+    JSQuADWithRinnaBilingualInstructionSFTV12,
 ]
 
 
 def construct_tasks():
     tasks = {}
     for version_class in VERSIONS:
-        tasks[f"jsquad-{version_class.VERSION}-{version_class.PROMPT_VERSION}"] = version_class
+        tasks[
+            f"jsquad-{version_class.VERSION}-{version_class.PROMPT_VERSION}"
+        ] = version_class
     return tasks
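Two short sketches to make the jsquad.py changes above concrete. First, the @mkshing docstring describes a pass over the dataset that collects the corrupted ids, but the diff hunk elides the middle of that snippet. Below is a minimal, self-contained sketch of such a pass; the dangling-punctuation check is an assumption inferred from the quoted sample (the original snippet imports T5Tokenizer, so it may have used a tokenizer-based test instead).

```python
# Hedged sketch: collect ids of JSQuAD items whose context lost a phrase,
# leaving dangling punctuation such as "で、。" or "通称、 。" (see the
# sample quoted in the docstring above). The regex heuristic is an
# assumption, not the PR's confirmed logic.
import re

from datasets import load_dataset

dataset = load_dataset("shunk031/JGLUE", name="JSQuAD")
remove_ids = []
for split in dataset:
    for item in dataset[split]:
        # JSQuAD contexts look like "title [SEP] body"; check only the body.
        body = item["context"].split("[SEP]")[-1].strip()
        if re.search(r"、\s*。", body):
            remove_ids.append(item["id"])
```

Second, the new v1.2 task classes differ from their v0.x parents only in prepending the article title to the context. Rendering the v1.2 Fintan-style template by hand for the sample item quoted above shows the effect (answers omitted for brevity):

```python
# Hedged sketch: reproduce JSQuADWithFintanPromptV12.doc_to_text by hand.
doc = {
    "title": "ポルトガル",
    "context": "ポルトガル [SEP] 正式名称はポルトガル語で、。通称、 。",
    "question": "ポルトガルね正式名称は何語であるか",
}
SEP = "\n"
prompt = (
    "題名:" + doc["title"] + SEP
    + "文章:" + doc["context"].split("[SEP]")[-1].strip() + SEP
    + "質問:" + doc["question"] + SEP
    + "回答:"
)
print(prompt)
# 題名:ポルトガル
# 文章:正式名称はポルトガル語で、。通称、 。
# 質問:ポルトガルね正式名称は何語であるか
# 回答:
```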
"pretrained=cyberagent/open-calm-1b", + "num_fewshot": 3, + "batch_size": null, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} diff --git a/models/cyberagent/cyberagent-open-calm-3b/harness.jsquad-1.2.sh b/models/cyberagent/cyberagent-open-calm-3b/harness.jsquad-1.2.sh new file mode 100644 index 0000000000..32424c0a81 --- /dev/null +++ b/models/cyberagent/cyberagent-open-calm-3b/harness.jsquad-1.2.sh @@ -0,0 +1,3 @@ +MODEL_ARGS="pretrained=cyberagent/open-calm-3b,device_map=auto,torch_dtype=auto" +TASK="jsquad-1.2-0.2" +python main.py --model hf-causal --model_args $MODEL_ARGS --tasks $TASK --num_fewshot "2" --device "cuda" --output_path "models/cyberagent/cyberagent-open-calm-3b/result.jsquad-1.2.json" diff --git a/models/cyberagent/cyberagent-open-calm-3b/result.jsquad-1.2.json b/models/cyberagent/cyberagent-open-calm-3b/result.jsquad-1.2.json new file mode 100644 index 0000000000..b38f1a845b --- /dev/null +++ b/models/cyberagent/cyberagent-open-calm-3b/result.jsquad-1.2.json @@ -0,0 +1,22 @@ +{ + "results": { + "jsquad-1.2-0.2": { + "exact_match": 44.529491220171096, + "f1": 56.02141036867636 + } + }, + "versions": { + "jsquad-1.2-0.2": 1.2 + }, + "config": { + "model": "hf-causal", + "model_args": "pretrained=cyberagent/open-calm-3b,device_map=auto,torch_dtype=auto", + "num_fewshot": 2, + "batch_size": null, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} diff --git a/models/cyberagent/cyberagent-open-calm-7b/harness.jsquad-1.2.sh b/models/cyberagent/cyberagent-open-calm-7b/harness.jsquad-1.2.sh new file mode 100644 index 0000000000..730fdce25d --- /dev/null +++ b/models/cyberagent/cyberagent-open-calm-7b/harness.jsquad-1.2.sh @@ -0,0 +1,3 @@ +MODEL_ARGS="pretrained=cyberagent/open-calm-7b,device_map=auto,torch_dtype=auto" +TASK="jsquad-1.2-0.2" +python main.py --model hf-causal --model_args $MODEL_ARGS --tasks $TASK --num_fewshot "2" --device "cuda" --output_path "models/cyberagent/cyberagent-open-calm-7b/result.jsquad-1.2.json" diff --git a/models/cyberagent/cyberagent-open-calm-7b/result.jsquad-1.2.json b/models/cyberagent/cyberagent-open-calm-7b/result.jsquad-1.2.json new file mode 100644 index 0000000000..6ccffe99f7 --- /dev/null +++ b/models/cyberagent/cyberagent-open-calm-7b/result.jsquad-1.2.json @@ -0,0 +1,22 @@ +{ + "results": { + "jsquad-1.2-0.2": { + "exact_match": 48.10895992796038, + "f1": 60.90961937230767 + } + }, + "versions": { + "jsquad-1.2-0.2": 1.2 + }, + "config": { + "model": "hf-causal", + "model_args": "pretrained=cyberagent/open-calm-7b,device_map=auto,torch_dtype=auto", + "num_fewshot": 2, + "batch_size": null, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} diff --git a/models/cyberagent/cyberagent-open-calm-large/harness.jsquad-1.2.sh b/models/cyberagent/cyberagent-open-calm-large/harness.jsquad-1.2.sh new file mode 100644 index 0000000000..9619e2fbb3 --- /dev/null +++ b/models/cyberagent/cyberagent-open-calm-large/harness.jsquad-1.2.sh @@ -0,0 +1,3 @@ +MODEL_ARGS="pretrained=cyberagent/open-calm-large,use_fast=True,device_map=auto,torch_dtype=auto" +TASK="jsquad-1.2-0.2" +python main.py --model hf-causal --model_args $MODEL_ARGS --tasks $TASK --num_fewshot "3" --device "cuda" --output_path "models/cyberagent-open-calm-large/result.jsquad-1.2.json" diff --git a/models/cyberagent/cyberagent-open-calm-large/result.jsquad-1.2.json 
diff --git a/models/cyberagent/cyberagent-open-calm-large/result.jsquad-1.2.json b/models/cyberagent/cyberagent-open-calm-large/result.jsquad-1.2.json
new file mode 100644
index 0000000000..8f6d038661
--- /dev/null
+++ b/models/cyberagent/cyberagent-open-calm-large/result.jsquad-1.2.json
@@ -0,0 +1,22 @@
+{
+  "results": {
+    "jsquad-1.2-0.2": {
+      "exact_match": 40.4997748761819,
+      "f1": 51.32160467436942
+    }
+  },
+  "versions": {
+    "jsquad-1.2-0.2": 1.2
+  },
+  "config": {
+    "model": "hf-causal",
+    "model_args": "pretrained=cyberagent/open-calm-large,use_fast=True,device_map=auto,torch_dtype=auto",
+    "num_fewshot": 3,
+    "batch_size": null,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
diff --git a/models/cyberagent/cyberagent-open-calm-medium/harness.jsquad-1.2.sh b/models/cyberagent/cyberagent-open-calm-medium/harness.jsquad-1.2.sh
new file mode 100644
index 0000000000..08ce0e4ee9
--- /dev/null
+++ b/models/cyberagent/cyberagent-open-calm-medium/harness.jsquad-1.2.sh
@@ -0,0 +1,3 @@
+MODEL_ARGS="pretrained=cyberagent/open-calm-medium,use_fast=True,device_map=auto,torch_dtype=auto"
+TASK="jsquad-1.2-0.2"
+python main.py --model hf-causal --model_args $MODEL_ARGS --tasks $TASK --num_fewshot "3" --device "cuda" --output_path "models/cyberagent/cyberagent-open-calm-medium/result.jsquad-1.2.json"
diff --git a/models/cyberagent/cyberagent-open-calm-medium/result.jsquad-1.2.json b/models/cyberagent/cyberagent-open-calm-medium/result.jsquad-1.2.json
new file mode 100644
index 0000000000..4de6f23de7
--- /dev/null
+++ b/models/cyberagent/cyberagent-open-calm-medium/result.jsquad-1.2.json
@@ -0,0 +1,22 @@
+{
+  "results": {
+    "jsquad-1.2-0.2": {
+      "exact_match": 29.85141828005403,
+      "f1": 40.49655778214922
+    }
+  },
+  "versions": {
+    "jsquad-1.2-0.2": 1.2
+  },
+  "config": {
+    "model": "hf-causal",
+    "model_args": "pretrained=cyberagent/open-calm-medium,use_fast=True,device_map=auto,torch_dtype=auto",
+    "num_fewshot": 3,
+    "batch_size": null,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
diff --git a/models/llama/llama-7b/harness.jsquad-1.2.sh b/models/llama/llama-7b/harness.jsquad-1.2.sh
new file mode 100644
index 0000000000..5bf324bab7
--- /dev/null
+++ b/models/llama/llama-7b/harness.jsquad-1.2.sh
@@ -0,0 +1,3 @@
+MODEL_ARGS="pretrained=huggyllama/llama-7b,use_accelerate=True,load_in_8bit=True"
+TASK="jsquad-1.2-0.3"
+python main.py --model hf-causal-experimental --model_args $MODEL_ARGS --tasks $TASK --num_fewshot "2" --device "cuda" --output_path "models/llama/llama-7b/result.jsquad-1.2.json" --batch_size 2
diff --git a/models/llama/llama-7b/result.jsquad-1.2.json b/models/llama/llama-7b/result.jsquad-1.2.json
new file mode 100644
index 0000000000..1871e65351
--- /dev/null
+++ b/models/llama/llama-7b/result.jsquad-1.2.json
@@ -0,0 +1,22 @@
+{
+  "results": {
+    "jsquad-1.2-0.3": {
+      "exact_match": 36.24493471409275,
+      "f1": 50.91625240527312
+    }
+  },
+  "versions": {
+    "jsquad-1.2-0.3": 1.2
+  },
+  "config": {
+    "model": "hf-causal-experimental",
+    "model_args": "pretrained=huggyllama/llama-7b,use_accelerate=True,load_in_8bit=True",
+    "num_fewshot": 2,
+    "batch_size": 2,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
+MODEL_ARGS="pretrained=meta-llama/Llama-2-7b-hf,use_accelerate=True,dtype=auto" +TASK="jsquad-1.2-0.3" +python main.py --model hf-causal-experimental --model_args $MODEL_ARGS --tasks $TASK --num_fewshot "2" --device "cuda" --output_path "models/llama2/llama2-2.7b/result.jsquad-1.2.json" --batch_size 2 diff --git a/models/llama2/llama2-2.7b/result.jsquad-1.2.json b/models/llama2/llama2-2.7b/result.jsquad-1.2.json new file mode 100644 index 0000000000..e08c3863ae --- /dev/null +++ b/models/llama2/llama2-2.7b/result.jsquad-1.2.json @@ -0,0 +1,22 @@ +{ + "results": { + "jsquad-1.2-0.3": { + "exact_match": 59.92796037820801, + "f1": 70.8236875084182 + } + }, + "versions": { + "jsquad-1.2-0.3": 1.2 + }, + "config": { + "model": "hf-causal-experimental", + "model_args": "pretrained=meta-llama/Llama-2-7b-hf,use_accelerate=True,dtype=auto", + "num_fewshot": 2, + "batch_size": 2, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} diff --git a/models/llama2/llama2-7b-chat/harness.jsquad-1.2.sh b/models/llama2/llama2-7b-chat/harness.jsquad-1.2.sh new file mode 100644 index 0000000000..02d2c04d11 --- /dev/null +++ b/models/llama2/llama2-7b-chat/harness.jsquad-1.2.sh @@ -0,0 +1,3 @@ +MODEL_ARGS="pretrained=meta-llama/Llama-2-7b-chat-hf,use_accelerate=True,dtype=auto" +TASK="jsquad-1.2-0.3" +python main.py --model hf-causal-experimental --model_args $MODEL_ARGS --tasks $TASK --num_fewshot "2" --device "cuda" --output_path "models/llama2/llama2-7b-chat/result.jsquad-1.2.json" --batch_size 2 diff --git a/models/llama2/llama2-7b-chat/result.jsquad-1.2.json b/models/llama2/llama2-7b-chat/result.jsquad-1.2.json new file mode 100644 index 0000000000..4cc31eb6b0 --- /dev/null +++ b/models/llama2/llama2-7b-chat/result.jsquad-1.2.json @@ -0,0 +1,22 @@ +{ + "results": { + "jsquad-1.2-0.3": { + "exact_match": 62.17919855920756, + "f1": 74.84345935966519 + } + }, + "versions": { + "jsquad-1.2-0.3": 1.2 + }, + "config": { + "model": "hf-causal-experimental", + "model_args": "pretrained=meta-llama/Llama-2-7b-chat-hf,use_accelerate=True,dtype=auto", + "num_fewshot": 2, + "batch_size": 2, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} diff --git a/models/llama2/llama2-7b/harness.jsquad-1.2.sh b/models/llama2/llama2-7b/harness.jsquad-1.2.sh new file mode 100644 index 0000000000..38d2a1a546 --- /dev/null +++ b/models/llama2/llama2-7b/harness.jsquad-1.2.sh @@ -0,0 +1,3 @@ +MODEL_ARGS="pretrained=meta-llama/Llama-2-7b-hf,use_accelerate=True,dtype=auto" +TASK="jsquad-1.2-0.3" +python main.py --model hf-causal-experimental --model_args $MODEL_ARGS --tasks $TASK --num_fewshot "2" --device "cuda" --output_path "models/llama2/llama2-7b/result.jsquad-1.2.json" --batch_size 2 diff --git a/models/llama2/llama2-7b/result.jsquad-1.2.json b/models/llama2/llama2-7b/result.jsquad-1.2.json new file mode 100644 index 0000000000..e08c3863ae --- /dev/null +++ b/models/llama2/llama2-7b/result.jsquad-1.2.json @@ -0,0 +1,22 @@ +{ + "results": { + "jsquad-1.2-0.3": { + "exact_match": 59.92796037820801, + "f1": 70.8236875084182 + } + }, + "versions": { + "jsquad-1.2-0.3": 1.2 + }, + "config": { + "model": "hf-causal-experimental", + "model_args": "pretrained=meta-llama/Llama-2-7b-hf,use_accelerate=True,dtype=auto", + "num_fewshot": 2, + "batch_size": 2, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} diff --git 
diff --git a/models/rinna/rinna-bilingual-gpt-neox-4b-instruction-ppo/harness.jsquad-1.2.sh b/models/rinna/rinna-bilingual-gpt-neox-4b-instruction-ppo/harness.jsquad-1.2.sh
new file mode 100644
index 0000000000..0996e5f292
--- /dev/null
+++ b/models/rinna/rinna-bilingual-gpt-neox-4b-instruction-ppo/harness.jsquad-1.2.sh
@@ -0,0 +1,3 @@
+MODEL_ARGS="pretrained=rinna/bilingual-gpt-neox-4b-instruction-ppo,use_fast=False,device_map=auto,torch_dtype=auto"
+TASK="jsquad-1.2-0.5"
+python main.py --model hf-causal --model_args $MODEL_ARGS --tasks $TASK --num_fewshot "2" --device "cuda" --output_path "models/rinna/rinna-bilingual-gpt-neox-4b-instruction-ppo/result.jsquad-1.2.json"
diff --git a/models/rinna/rinna-bilingual-gpt-neox-4b-instruction-ppo/result.jsquad-1.2.json b/models/rinna/rinna-bilingual-gpt-neox-4b-instruction-ppo/result.jsquad-1.2.json
new file mode 100644
index 0000000000..2614b439f6
--- /dev/null
+++ b/models/rinna/rinna-bilingual-gpt-neox-4b-instruction-ppo/result.jsquad-1.2.json
@@ -0,0 +1,22 @@
+{
+  "results": {
+    "jsquad-1.2-0.5": {
+      "exact_match": 55.94326879783881,
+      "f1": 70.64052956733126
+    }
+  },
+  "versions": {
+    "jsquad-1.2-0.5": 1.2
+  },
+  "config": {
+    "model": "hf-causal",
+    "model_args": "pretrained=rinna/bilingual-gpt-neox-4b-instruction-ppo,use_fast=False,device_map=auto,torch_dtype=auto",
+    "num_fewshot": 2,
+    "batch_size": null,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
diff --git a/models/rinna/rinna-bilingual-gpt-neox-4b-instruction-sft/harness.jsquad-1.2.sh b/models/rinna/rinna-bilingual-gpt-neox-4b-instruction-sft/harness.jsquad-1.2.sh
new file mode 100644
index 0000000000..7112320819
--- /dev/null
+++ b/models/rinna/rinna-bilingual-gpt-neox-4b-instruction-sft/harness.jsquad-1.2.sh
@@ -0,0 +1,3 @@
+MODEL_ARGS="pretrained=rinna/bilingual-gpt-neox-4b-instruction-sft,use_fast=False,device_map=auto,torch_dtype=auto"
+TASK="jsquad-1.2-0.5"
+python main.py --model hf-causal --model_args $MODEL_ARGS --tasks $TASK --num_fewshot "2" --device "cuda" --output_path "models/rinna/rinna-bilingual-gpt-neox-4b-instruction-sft/result.jsquad-1.2.json"
diff --git a/models/rinna/rinna-bilingual-gpt-neox-4b-instruction-sft/result.jsquad-1.2.json b/models/rinna/rinna-bilingual-gpt-neox-4b-instruction-sft/result.jsquad-1.2.json
new file mode 100644
index 0000000000..8a8bf42647
--- /dev/null
+++ b/models/rinna/rinna-bilingual-gpt-neox-4b-instruction-sft/result.jsquad-1.2.json
@@ -0,0 +1,22 @@
+{
+  "results": {
+    "jsquad-1.2-0.5": {
+      "exact_match": 58.66726699684827,
+      "f1": 72.38803519363597
+    }
+  },
+  "versions": {
+    "jsquad-1.2-0.5": 1.2
+  },
+  "config": {
+    "model": "hf-causal",
+    "model_args": "pretrained=rinna/bilingual-gpt-neox-4b-instruction-sft,use_fast=False,device_map=auto,torch_dtype=auto",
+    "num_fewshot": 2,
+    "batch_size": null,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
"models/rinna/rinna-bilingual-gpt-neox-4b/result.jsquad-1.2.json" diff --git a/models/rinna/rinna-bilingual-gpt-neox-4b/result.jsquad-1.2.json b/models/rinna/rinna-bilingual-gpt-neox-4b/result.jsquad-1.2.json new file mode 100644 index 0000000000..32c9c4abf5 --- /dev/null +++ b/models/rinna/rinna-bilingual-gpt-neox-4b/result.jsquad-1.2.json @@ -0,0 +1,22 @@ +{ + "results": { + "jsquad-1.2-0.2": { + "exact_match": 51.32823052678973, + "f1": 61.9390389728309 + } + }, + "versions": { + "jsquad-1.2-0.2": 1.2 + }, + "config": { + "model": "hf-causal", + "model_args": "pretrained=rinna/bilingual-gpt-neox-4b,use_fast=False,device_map=auto,torch_dtype=auto", + "num_fewshot": 2, + "batch_size": null, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} diff --git a/models/rinna/rinna-japanese-gpt-1b/harness.jsquad-1.2.sh b/models/rinna/rinna-japanese-gpt-1b/harness.jsquad-1.2.sh new file mode 100644 index 0000000000..6824f3f3cd --- /dev/null +++ b/models/rinna/rinna-japanese-gpt-1b/harness.jsquad-1.2.sh @@ -0,0 +1,3 @@ +MODEL_ARGS="pretrained=rinna/japanese-gpt-1b,use_fast=False" +TASK="jsquad-1.2-0.2" +python main.py --model hf-causal --model_args $MODEL_ARGS --tasks $TASK --num_fewshot "2" --device "cuda" --output_path "models/rinna/rinna-japanese-gpt-1b/result.jsquad-1.2.json" diff --git a/models/rinna/rinna-japanese-gpt-1b/result.jsquad-1.2.json b/models/rinna/rinna-japanese-gpt-1b/result.jsquad-1.2.json new file mode 100644 index 0000000000..779b473008 --- /dev/null +++ b/models/rinna/rinna-japanese-gpt-1b/result.jsquad-1.2.json @@ -0,0 +1,22 @@ +{ + "results": { + "jsquad-1.2-0.2": { + "exact_match": 30.189104007203962, + "f1": 47.12467642283419 + } + }, + "versions": { + "jsquad-1.2-0.2": 1.2 + }, + "config": { + "model": "hf-causal", + "model_args": "pretrained=rinna/japanese-gpt-1b,use_fast=False", + "num_fewshot": 2, + "batch_size": null, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} diff --git a/models/rinna/rinna-japanese-gpt-neox-3.6b-instruction-ppo/harness.jsquad-1.2.sh b/models/rinna/rinna-japanese-gpt-neox-3.6b-instruction-ppo/harness.jsquad-1.2.sh new file mode 100644 index 0000000000..ce3f084cb9 --- /dev/null +++ b/models/rinna/rinna-japanese-gpt-neox-3.6b-instruction-ppo/harness.jsquad-1.2.sh @@ -0,0 +1,3 @@ +MODEL_ARGS="pretrained=rinna/japanese-gpt-neox-3.6b-instruction-ppo,use_fast=False,device_map=auto,torch_dtype=auto" +TASK="jsquad-1.2-0.4" +python main.py --model hf-causal --model_args $MODEL_ARGS --tasks $TASK --num_fewshot "2" --device "cuda" --output_path "models/rinna/rinna-japanese-gpt-neox-3.6b-instruction-ppo/result.jsquad-1.2.json" diff --git a/models/rinna/rinna-japanese-gpt-neox-3.6b-instruction-ppo/result.jsquad-1.2.json b/models/rinna/rinna-japanese-gpt-neox-3.6b-instruction-ppo/result.jsquad-1.2.json new file mode 100644 index 0000000000..c4ffa9e163 --- /dev/null +++ b/models/rinna/rinna-japanese-gpt-neox-3.6b-instruction-ppo/result.jsquad-1.2.json @@ -0,0 +1,22 @@ +{ + "results": { + "jsquad-1.2-0.4": { + "exact_match": 52.633948671769474, + "f1": 64.387511749343 + } + }, + "versions": { + "jsquad-1.2-0.4": 1.2 + }, + "config": { + "model": "hf-causal", + "model_args": "pretrained=rinna/japanese-gpt-neox-3.6b-instruction-ppo,use_fast=False,device_map=auto,torch_dtype=auto", + "num_fewshot": 2, + "batch_size": null, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": 
diff --git a/models/rinna/rinna-japanese-gpt-neox-3.6b-instruction-ppo/harness.jsquad-1.2.sh b/models/rinna/rinna-japanese-gpt-neox-3.6b-instruction-ppo/harness.jsquad-1.2.sh
new file mode 100644
index 0000000000..ce3f084cb9
--- /dev/null
+++ b/models/rinna/rinna-japanese-gpt-neox-3.6b-instruction-ppo/harness.jsquad-1.2.sh
@@ -0,0 +1,3 @@
+MODEL_ARGS="pretrained=rinna/japanese-gpt-neox-3.6b-instruction-ppo,use_fast=False,device_map=auto,torch_dtype=auto"
+TASK="jsquad-1.2-0.4"
+python main.py --model hf-causal --model_args $MODEL_ARGS --tasks $TASK --num_fewshot "2" --device "cuda" --output_path "models/rinna/rinna-japanese-gpt-neox-3.6b-instruction-ppo/result.jsquad-1.2.json"
diff --git a/models/rinna/rinna-japanese-gpt-neox-3.6b-instruction-ppo/result.jsquad-1.2.json b/models/rinna/rinna-japanese-gpt-neox-3.6b-instruction-ppo/result.jsquad-1.2.json
new file mode 100644
index 0000000000..c4ffa9e163
--- /dev/null
+++ b/models/rinna/rinna-japanese-gpt-neox-3.6b-instruction-ppo/result.jsquad-1.2.json
@@ -0,0 +1,22 @@
+{
+  "results": {
+    "jsquad-1.2-0.4": {
+      "exact_match": 52.633948671769474,
+      "f1": 64.387511749343
+    }
+  },
+  "versions": {
+    "jsquad-1.2-0.4": 1.2
+  },
+  "config": {
+    "model": "hf-causal",
+    "model_args": "pretrained=rinna/japanese-gpt-neox-3.6b-instruction-ppo,use_fast=False,device_map=auto,torch_dtype=auto",
+    "num_fewshot": 2,
+    "batch_size": null,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
diff --git a/models/rinna/rinna-japanese-gpt-neox-3.6b-instruction-sft-v2/harness.jsquad-1.2.sh b/models/rinna/rinna-japanese-gpt-neox-3.6b-instruction-sft-v2/harness.jsquad-1.2.sh
new file mode 100644
index 0000000000..6971a7f55b
--- /dev/null
+++ b/models/rinna/rinna-japanese-gpt-neox-3.6b-instruction-sft-v2/harness.jsquad-1.2.sh
@@ -0,0 +1,3 @@
+MODEL_ARGS="pretrained=rinna/japanese-gpt-neox-3.6b-instruction-sft-v2,use_fast=False,device_map=auto,torch_dtype=auto"
+TASK="jsquad-1.2-0.4"
+python main.py --model hf-causal --model_args $MODEL_ARGS --tasks $TASK --num_fewshot "2" --device "cuda" --output_path "models/rinna/rinna-japanese-gpt-neox-3.6b-instruction-sft-v2/result.jsquad-1.2.json"
diff --git a/models/rinna/rinna-japanese-gpt-neox-3.6b-instruction-sft-v2/result.jsquad-1.2.json b/models/rinna/rinna-japanese-gpt-neox-3.6b-instruction-sft-v2/result.jsquad-1.2.json
new file mode 100644
index 0000000000..5de18a40c1
--- /dev/null
+++ b/models/rinna/rinna-japanese-gpt-neox-3.6b-instruction-sft-v2/result.jsquad-1.2.json
@@ -0,0 +1,22 @@
+{
+  "results": {
+    "jsquad-1.2-0.4": {
+      "exact_match": 47.54615038271049,
+      "f1": 61.633765369013354
+    }
+  },
+  "versions": {
+    "jsquad-1.2-0.4": 1.2
+  },
+  "config": {
+    "model": "hf-causal",
+    "model_args": "pretrained=rinna/japanese-gpt-neox-3.6b-instruction-sft-v2,use_fast=False,device_map=auto,torch_dtype=auto",
+    "num_fewshot": 2,
+    "batch_size": null,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
diff --git a/models/rinna/rinna-japanese-gpt-neox-3.6b-instruction-sft/harness.jsquad-1.2.sh b/models/rinna/rinna-japanese-gpt-neox-3.6b-instruction-sft/harness.jsquad-1.2.sh
new file mode 100644
index 0000000000..9f5ac41609
--- /dev/null
+++ b/models/rinna/rinna-japanese-gpt-neox-3.6b-instruction-sft/harness.jsquad-1.2.sh
@@ -0,0 +1,3 @@
+MODEL_ARGS="pretrained=rinna/japanese-gpt-neox-3.6b-instruction-sft,use_fast=False,device_map=auto,torch_dtype=auto"
+TASK="jsquad-1.2-0.4"
+python main.py --model hf-causal --model_args $MODEL_ARGS --tasks $TASK --num_fewshot "2" --device "cuda" --output_path "models/rinna/rinna-japanese-gpt-neox-3.6b-instruction-sft/result.jsquad-1.2.json"
diff --git a/models/rinna/rinna-japanese-gpt-neox-3.6b-instruction-sft/result.jsquad-1.2.json b/models/rinna/rinna-japanese-gpt-neox-3.6b-instruction-sft/result.jsquad-1.2.json
new file mode 100644
index 0000000000..d2001d0058
--- /dev/null
+++ b/models/rinna/rinna-japanese-gpt-neox-3.6b-instruction-sft/result.jsquad-1.2.json
@@ -0,0 +1,22 @@
+{
+  "results": {
+    "jsquad-1.2-0.4": {
+      "exact_match": 49.34714092751013,
+      "f1": 63.33718413567939
+    }
+  },
+  "versions": {
+    "jsquad-1.2-0.4": 1.2
+  },
+  "config": {
+    "model": "hf-causal",
+    "model_args": "pretrained=rinna/japanese-gpt-neox-3.6b-instruction-sft,use_fast=False,device_map=auto,torch_dtype=auto",
+    "num_fewshot": 2,
+    "batch_size": null,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
--num_fewshot "2" --device "cuda" --output_path "models/rinna/rinna-japanese-gpt-neox-3.6b/result.jsquad-1.2.json" diff --git a/models/rinna/rinna-japanese-gpt-neox-3.6b/result.jsquad-1.2.json b/models/rinna/rinna-japanese-gpt-neox-3.6b/result.jsquad-1.2.json new file mode 100644 index 0000000000..d708afc85f --- /dev/null +++ b/models/rinna/rinna-japanese-gpt-neox-3.6b/result.jsquad-1.2.json @@ -0,0 +1,22 @@ +{ + "results": { + "jsquad-1.2-0.2": { + "exact_match": 49.0094552003602, + "f1": 59.80363888369063 + } + }, + "versions": { + "jsquad-1.2-0.2": 1.2 + }, + "config": { + "model": "hf-causal", + "model_args": "pretrained=rinna/japanese-gpt-neox-3.6b,use_fast=False,device_map=auto,torch_dtype=auto", + "num_fewshot": 2, + "batch_size": null, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +}
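For reference, the TASK names used by the scripts above are generated, not hand-picked: construct_tasks() in lm_eval/tasks/ja/jsquad.py keys each class by f"jsquad-{VERSION}-{PROMPT_VERSION}", so "jsquad-1.2-0.2" is the v1.2 Fintan-style prompt, "-0.3" the JA Alpaca style, "-0.4" the Rinna SFT style, and "-0.5" the Rinna bilingual style. A toy sketch of the naming scheme (stub classes stand in for the real Task subclasses):

```python
# Toy sketch of the registry naming only; the real classes subclass Task
# and live in lm_eval/tasks/ja/jsquad.py.
class JSQuADWithFintanPromptV12:
    VERSION = 1.2
    PROMPT_VERSION = 0.2

class JSQuADWithJAAlpacaPromptV12:
    VERSION = 1.2
    PROMPT_VERSION = 0.3

VERSIONS = [JSQuADWithFintanPromptV12, JSQuADWithJAAlpacaPromptV12]

def construct_tasks():
    # Same f-string scheme as in the diff above.
    return {f"jsquad-{c.VERSION}-{c.PROMPT_VERSION}": c for c in VERSIONS}

print(sorted(construct_tasks()))  # ['jsquad-1.2-0.2', 'jsquad-1.2-0.3']
```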