[ruige/evaluation] modify README.md show_results_mt.py gen_judge_mtbench.py
rui-ye · Mar 7, 2024 · 4e9a949 · 4e9a949
1 parent a8e7bb8
commit 4e9a949
Show file tree

Hide file tree

Showing 3 changed files with 4 additions and 4 deletions.
diff --git a/evaluation/open_ended/README.md b/evaluation/open_ended/README.md
@@ -36,11 +36,11 @@ The judgments will be saved to `data/mtbench/model_judgment/gpt-4-1106-preview_s
 
 - Show the scores for selected models
   ```
-  python show_results_mt.py --model_list [LIST-OF-MODEL-ID]
+  python show_results_mt.py --model_list [LIST-OF-MODEL-ID] --judge_model gpt-4-1106-preview
   ```
 - Show all scores
   ```
-  python show_result.py
+  python show_results_mt.py 
   ```
 
 ## Vicuna and AdvBench

diff --git a/evaluation/open_ended/gen_judge_mtbench.py b/evaluation/open_ended/gen_judge_mtbench.py
@@ -179,7 +179,7 @@ def make_judge_single(judge_model, judge_prompts):
         default="data/judge_prompts.jsonl",
         help="The file of judge prompts.",
     )
-    parser.add_argument("--judge_model", type=str, default="gpt-4")
+    parser.add_argument("--judge_model", type=str, default="gpt-4-1106-preview")
     parser.add_argument("--baseline_model", type=str, default="gpt-3.5-turbo")
     parser.add_argument(
         "--mode",

diff --git a/evaluation/open_ended/show_results_mt.py b/evaluation/open_ended/show_results_mt.py
@@ -29,7 +29,7 @@ def display_result_single(args):
     df_1 = df[df["turn"] == 1].groupby(["model", "turn"]).mean()
     print(df_1.sort_values(by="score", ascending=False))
 
-    if args.bench_name == "mt_bench":
+    if args.bench_name == "mtbench":
         print("\n########## Second turn ##########")
         df_2 = df[df["turn"] == 2].groupby(["model", "turn"]).mean()
         print(df_2.sort_values(by="score", ascending=False))