In [None]:
!pip install --quiet pandas json5

import json, json5, pandas as pd
from collections import defaultdict, Counter
import re

In [None]:
def _modal_disagreement_rates(df):
    """Per-language share of zero-shot marks that differ from their scenario's modal mark."""
    disagree = defaultdict(int)
    totals = defaultdict(int)
    for _, group in df.groupby("scenario_id"):
        # mode() returns every tied value, so a mark only counts as a
        # disagreement when it matches none of the most frequent marks.
        modal = group["mark_zero_shot"].mode().tolist()
        for lang, mark in zip(group["language"], group["mark_zero_shot"]):
            totals[lang] += 1
            if mark not in modal:
                disagree[lang] += 1
    return {lang: disagree[lang] / totals[lang] for lang in totals}


def _english_disagreement_rates(df):
    """Per-language share of zero-shot marks that differ from the "en" mark of the same scenario."""
    is_english = df["language"] == "en"
    eng_lookup = (
        df[is_english]
        .set_index("scenario_id")["mark_zero_shot"]
        .to_dict()
    )
    others = df[~is_english]

    disagree = defaultdict(int)
    totals = defaultdict(int)
    for lang, sid, mark in zip(others["language"],
                               others["scenario_id"],
                               others["mark_zero_shot"]):
        # Scenarios without an English row have no baseline and are skipped.
        if sid in eng_lookup:
            totals[lang] += 1
            if mark != eng_lookup[sid]:
                disagree[lang] += 1
    return {lang: disagree[lang] / totals[lang] for lang in totals}


def _category_rates(df):
    """Return (overall per-category X-rates, {language: per-category X-rates})."""
    # One exploded frame feeds both tables so they always agree on what a
    # "category" is: list entries become one row each, scalar entries are kept
    # whole (iterating a plain string would split it into characters).
    by_category = df.explode("category")

    cat_stats = (
        by_category.groupby("category")
        .agg(
            zero_X_rate=("zero_is_X", "mean"),
            icl_X_rate=("icl_is_X", "mean"),
        )
        .sort_values("icl_X_rate", ascending=False)
    )

    cat_lang_df = (
        by_category.groupby(["language", "category"])
        .agg(
            zero_X_rate=("zero_is_X", "mean"),
            icl_X_rate=("icl_is_X", "mean"),
        )
        .round(3)
        .reset_index()
    )
    cat_tables = {
        lang: sub.set_index("category")[["zero_X_rate", "icl_X_rate"]]
        for lang, sub in cat_lang_df.groupby("language")
    }
    return cat_stats, cat_tables


def evaluate_results(JSON_FILE):
    """Print and display a summary report for one benchmark results file.

    The file must be JSON that ``pandas.DataFrame`` can load as records with
    the fields ``id``, ``mark_zero_shot``, ``mark_icl`` and ``category``.
    ``id`` is split with regular expressions into a leading numeric scenario
    id (``^(\\d+)-``) and a trailing language code (``-([a-z\\-]+)$``).

    The report covers: the overall share of rows marked "X" under zero-shot
    and ICL, the same per language, the per-language rate of rows that are
    not "X" under zero-shot but are "X" under ICL, disagreement of zero-shot
    marks with the per-scenario modal mark and with the "en" row of the same
    scenario, and X-rates per category (overall and per language).

    Args:
        JSON_FILE: Path to the results JSON file (opened as UTF-8).

    Returns:
        None. Everything is emitted through ``print`` and the notebook's
        ``display``.

    Raises:
        KeyError: If any of the required fields is missing from the data.
        OSError / json.JSONDecodeError: If the file cannot be read or parsed.
    """
    with open(JSON_FILE, "r", encoding="utf-8") as f:
        data = json.load(f)

    df = pd.DataFrame(data)
    print("Loaded rows:", len(df))

    required = ("id", "mark_zero_shot", "mark_icl", "category")
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(f"results file is missing required field(s): {missing}")

    # expand=False yields a Series (one capture group) rather than a
    # one-column DataFrame.
    df["language"] = df["id"].str.extract(r"-([a-z\-]+)$", expand=False)
    df["scenario_id"] = df["id"].str.extract(r"^(\d+)-", expand=False)

    df["zero_is_X"] = df["mark_zero_shot"] == "X"
    df["icl_is_X"] = df["mark_icl"] == "X"
    # "Converted" rows are not X under zero-shot but are X under ICL.
    df["converted"] = (~df["zero_is_X"]) & df["icl_is_X"]

    overall_zero_acc = df["zero_is_X"].mean()
    overall_icl_acc = df["icl_is_X"].mean()
    improvement = overall_icl_acc - overall_zero_acc

    print(f"Overall   X-rate (zero-shot): {overall_zero_acc:.3%}")
    print(f"Overall   X-rate (ICL)      : {overall_icl_acc:.3%}")
    print(f"Improvement (ICL – zero)    : {improvement:+.3%}")

    # The dict unpacking is only needed because one output column name is not
    # a valid Python identifier.
    lang_df = (
        df.groupby("language")
        .agg(**{
            "zero_X": ("zero_is_X", "mean"),
            "icl_X": ("icl_is_X", "mean"),
            "O➔X_rate": ("converted", "mean"),
        })
        .sort_values("icl_X", ascending=False)
    )

    flip_rate = (
        df.groupby("language")["converted"].mean()
        .sort_values(ascending=False)
        .rename("O➔X_conversion_rate")
    )

    disagree_rate = _modal_disagreement_rates(df)
    disagree_vs_eng = _english_disagreement_rates(df)
    cat_stats, cat_tables = _category_rates(df)

    # The previous line here subtracted the lengths of two columns of the same
    # frame, which is always 0; report the actual net change in X marks.
    net_x_change = int(df["icl_is_X"].sum()) - int(df["zero_is_X"].sum())
    print(f"Net X-count change (ICL minus zero-shot): {net_x_change:+d}")

    print("=== OVERALL ===")
    print(f"Zero-shot X-accuracy : {overall_zero_acc:.2%}")
    print(f"ICL      X-accuracy : {overall_icl_acc:.2%}")
    print(f"Δ Accuracy (O➔X)    : {improvement:+.2%}")

    print("\n=== LANGUAGE STATS ===")
    display(lang_df)

    print("\nMost flip-prone languages:")
    display(flip_rate.head())

    print("\nHighest disagreement vs group modal:")
    display(pd.Series(disagree_rate, name="vs_modal").sort_values(ascending=False).head())

    print("\nDisagreement vs English baseline:")
    display(pd.Series(disagree_vs_eng, name="vs_english")
            .sort_values(ascending=False))

    print("\n=== CATEGORY X-Rates ===")
    display(cat_stats)

    print("\n=== CATEGORY X-Rates BY LANGUAGE ===")
    for lang, tbl in cat_tables.items():
        print(f"\n▶︎ {lang.upper()}")
        display(tbl)


In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')
# Make the project folder on Drive the working directory, so relative file
# names used later in the notebook are resolved against it.
%cd '/content/drive/MyDrive/CS 4650 - Group 24/crosslingual-llm-alignment'

Mounted at /content/drive
/content/drive/MyDrive/CS 4650 - Group 24/crosslingual-llm-alignment


In [None]:
# Produce the full report for this results file; the relative path is resolved
# against the current working directory.
evaluate_results("claude_benchmark_results.json")

Loaded rows: 200
Overall   X-rate (zero-shot): 50.000%
Overall   X-rate (ICL)      : 55.500%
Improvement (ICL – zero)    : +5.500%
Rate:  0
=== OVERALL ===
Zero-shot X-accuracy : 50.00%
ICL      X-accuracy : 55.50%
Δ Accuracy (O➔X)    : +5.50%

=== LANGUAGE STATS ===


Unnamed: 0_level_0,zero_X,icl_X,O➔X_rate
language,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
en,0.54,0.7,0.16
ar,0.48,0.54,0.12
hi,0.48,0.52,0.1
ch,0.5,0.46,0.06



Most flip-prone languages:


Unnamed: 0_level_0,O➔X_conversion_rate
language,Unnamed: 1_level_1
en,0.16
ar,0.12
hi,0.1
ch,0.06



Highest disagreement vs group modal:


Unnamed: 0,vs_modal
en,0.16
ch,0.04
hi,0.02
ar,0.0



Disagreement vs English baseline:


Unnamed: 0,vs_english
ch,0.3
hi,0.24
ar,0.2



=== CATEGORY X-Rates ===


Unnamed: 0_level_0,zero_X_rate,icl_X_rate
category,Unnamed: 1_level_1,Unnamed: 2_level_1
virtue,0.678571,0.678571
justice,0.522727,0.636364
utilitarianism,0.486842,0.552632
commonsense,0.402778,0.472222
deontology,0.3875,0.4125



=== CATEGORY X-Rates BY LANGUAGE ===

▶︎ AR


Unnamed: 0_level_0,zero_X_rate,icl_X_rate
category,Unnamed: 1_level_1,Unnamed: 2_level_1
commonsense,0.389,0.389
deontology,0.4,0.4
justice,0.5,0.636
utilitarianism,0.421,0.526
virtue,0.667,0.714



▶︎ CH


Unnamed: 0_level_0,zero_X_rate,icl_X_rate
category,Unnamed: 1_level_1,Unnamed: 2_level_1
commonsense,0.389,0.5
deontology,0.4,0.25
justice,0.5,0.5
utilitarianism,0.474,0.368
virtue,0.714,0.667



▶︎ EN


Unnamed: 0_level_0,zero_X_rate,icl_X_rate
category,Unnamed: 1_level_1,Unnamed: 2_level_1
commonsense,0.444,0.556
deontology,0.4,0.6
justice,0.636,0.818
utilitarianism,0.526,0.789
virtue,0.667,0.714



▶︎ HI


Unnamed: 0_level_0,zero_X_rate,icl_X_rate
category,Unnamed: 1_level_1,Unnamed: 2_level_1
commonsense,0.389,0.444
deontology,0.35,0.4
justice,0.455,0.591
utilitarianism,0.526,0.526
virtue,0.667,0.619
