In [1]:
import pandas as pd
import os
import numpy as np

In [2]:
code2lang = {
    'ar': 'ara', 'de': 'deu', 'en': 'eng', 'es': 'spa', 'fr': 'fra', 'hi': 'hin',
    'id': 'ind', 'it': 'ita', 'ja': 'jpn', 'ko': 'kor', 'pt': 'por', 'ru': 'rus',
    'tr': 'tur', 'vi': 'vie', 'zh': 'zho'}
iso3names = {
    'ara': 'Arabic', 'deu': 'German', 'eng': 'English', 'spa': 'Spanish',
    'fra': 'French', 'hin': 'Hindi', 'ind': 'Indonesian', 'ita': 'Italian',
    'jpn': 'Japanese', 'kor': 'Korean', 'por': 'Portuguese', 'rus': 'Russian',
    'tur': 'Turkish', 'vie': 'Vietnamese', 'zho': 'Chinese'
}
llm_dict = {
        'command-r-base': "Command R base", 'command-r': "Command R",
        'command-r-plus-base': "Command R+ base", 'command-r-plus': "Command R+",
        'gpt-3.5-turbo': "GPT-3.5 Turbo", 'gpt-4-turbo': "GPT-4 Turbo",
        'mistral-large': "Mistral Large", 'mistral-8x7b': "Mistral 8x7B",
        'llama-2-instruct': "Llama 2 70B-I", 'llama-3-instruct': "Llama 3 70B-I"}


In [3]:
df= pd.read_csv("../datasets/prompts_language_confusion/results/reproduction_pass_rates_results.csv")

In [4]:
lang_order= ['fra',
 'spa',
 'ita',
 'deu',
 'rus',
 'por',
 'tur',
 'vie',
 'zho',
 'kor',
 'ara',
 'jpn',
 'hin',
 'ind']
report_models = ["command-r",  "command-r-plus", "gpt-3.5-turbo", "gpt-4-turbo", "mistral-large", "mistral-8x7b", 
                 "llama-2-instruct", "llama-3-instruct"]

In [5]:
df["LLM"]=df["model"].map(llm_dict)

In [6]:
df = df[~df["LLM"].isin(["Command R+ base", "Command R base"])]

In [7]:
df["lang_iso"]=df["lang"].map(code2lang)
df['lang_iso'] = pd.Categorical(df['lang_iso'], categories=lang_order, ordered=True)
df["LLM"] = pd.Categorical(df["model"], categories=report_models,ordered=True)
df_sorted= df.sort_values(by=["lang_iso", "LLM"])

df["Lang"]=df["lang_iso"].map(iso3names)

In [8]:
df_cross = df[df["task"]=="crosslingual"]

In [24]:
df_mono = df[df["task"]=="monolingual"]

In [9]:
wpr_langs = ["Russian", "Chinese", "Korean","Arabic", "Japanese", "Hindi"]

# Monolingual LPR

In [87]:
df_mono_lpr = df_mono[["LLM","source", "lpr", "Lang"]]
df_mono_lpr = df_mono_lpr[df_mono_lpr["source"]!="all"]
df_mono_lpr = df_mono_lpr.dropna(subset=["Lang"])
df_mono_lpr = df_mono_lpr.dropna(subset=["lpr"])

In [88]:
len(df_mono_lpr)

176

In [89]:
df_mono_lpr.head(2)

Unnamed: 0,LLM,source,lpr,Lang
60,command-r,aya_human_annotated,1.0,Arabic
62,command-r,aya_human_annotated,0.99,Portuguese


In [90]:
df_mono_lpr_ = df_mono_lpr.groupby(['LLM', "Lang"], observed=True, as_index=False).agg(
        avg_lpr=("lpr", 'mean'))

In [91]:
mono_lpr = df_mono_lpr_.pivot(index='LLM', columns='Lang', values='avg_lpr')

In [92]:
mono_lpr = mono_lpr*100

In [93]:
mono_lpr.to_csv("../datasets/prompts_language_confusion/results/mono_lpr.csv")

# Monolingual WPR

In [94]:
df_mono_wpr = df_mono[df_mono["Lang"].isin(wpr_langs)]

In [95]:
df_mono_wpr = df_mono_wpr.replace({None: np.nan})

In [96]:
df_mono_wpr = df_mono_wpr[["LLM","source", "wpr", "Lang"]]
df_mono_wpr = df_mono_wpr[df_mono_wpr["source"]!="all"]
df_mono_wpr = df_mono_wpr.dropna(subset=["Lang"])
df_mono_wpr = df_mono_wpr.dropna(subset=["wpr"])

In [97]:
len(df_cross_wpr)

144

In [98]:
df_mono_wpr.head(2)

Unnamed: 0,LLM,source,wpr,Lang
60,command-r,aya_human_annotated,0.98,Arabic
64,command-r,aya_human_annotated,0.9,Chinese


In [99]:
df_mono_wpr_ = df_mono_wpr.groupby(['LLM', "Lang"], observed=True, as_index=False).agg(
        avg_wpr=("wpr", 'mean'))

In [100]:
mono_wpr = df_mono_wpr_.pivot(index='LLM', columns='Lang', values='avg_wpr')

In [101]:
mono_wpr = mono_wpr*100

In [102]:
mono_wpr.to_csv("../datasets/prompts_language_confusion/results/mono_wpr.csv")

# Crosslingual WPR

In [103]:
df_cross_wpr = df_cross[df_cross["Lang"].isin(wpr_langs)]

In [104]:
df_cross_wpr = df_cross_wpr.replace({None: np.nan})

In [105]:
df_cross_wpr = df_cross_wpr[["LLM","source", "wpr", "Lang"]]
df_cross_wpr = df_cross_wpr[df_cross_wpr["source"]!="all"]
df_cross_wpr = df_cross_wpr.dropna(subset=["Lang"])
df_cross_wpr = df_cross_wpr.dropna(subset=["wpr"])

In [106]:
len(df_cross_wpr)

144

In [107]:
df_cross_wpr.head(2)

Unnamed: 0,LLM,source,wpr,Lang
0,command-r,complex_prompts,0.91,Arabic
4,command-r,complex_prompts,0.97,Hindi


In [108]:
df_cross_wpr_ = df_cross_wpr.groupby(['LLM', "Lang"], observed=True, as_index=False).agg(
        avg_wpr=("wpr", 'mean'))

In [109]:
cross_wpr = df_cross_wpr_.pivot(index='LLM', columns='Lang', values='avg_wpr')

In [110]:
cross_wpr = cross_wpr*100

In [111]:
cross_wpr.to_csv("../datasets/prompts_language_confusion/results/cross_wpr.csv")

# Crosslingual LPR

In [112]:
df_cross_lpr = df[["LLM","source", "lpr", "Lang"]].dropna()

In [113]:
df_cross_lpr = df_cross_lpr[df_cross_lpr["source"]!="all"]

In [114]:
df_cross_lpr

Unnamed: 0,LLM,source,lpr,Lang
0,command-r,complex_prompts,0.33,Arabic
1,command-r,complex_prompts,0.32,German
2,command-r,complex_prompts,0.40,Spanish
3,command-r,complex_prompts,0.44,French
4,command-r,complex_prompts,0.34,Hindi
...,...,...,...,...
1015,gpt-3.5-turbo,okapi,0.96,Indonesian
1016,gpt-3.5-turbo,okapi,1.00,Italian
1017,gpt-3.5-turbo,okapi,0.98,Portuguese
1018,gpt-3.5-turbo,okapi,0.99,Vietnamese


In [115]:
df_cross_lpr.source.value_counts()

source
okapi                  184
complex_prompts        112
sharegpt               112
dolly_human_edited      40
aya_human_annotated     32
native_prompts          32
Name: count, dtype: int64

In [116]:
df_cross_lpr_ = df_cross_lpr.groupby(['LLM', "Lang"], as_index=False).agg(
        avg_lpr=(f"lpr", 'mean'))

  df_cross_lpr_ = df_cross_lpr.groupby(['LLM', "Lang"], as_index=False).agg(


In [118]:
cross_lpr = df_cross_lpr_.pivot(index='LLM', columns='Lang', values='avg_lpr')

In [121]:
cross_lpr = cross_lpr*100

In [122]:
cross_lpr.to_csv("../datasets/prompts_language_confusion/results/cross_lpr.csv")

In [123]:
cross_lpr

Lang,French,Spanish,Italian,German,Russian,Portuguese,Turkish,Vietnamese,Chinese,Korean,Arabic,Japanese,Hindi,Indonesian
LLM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
command-r,86.833333,84.166667,74.0,72.0,77.0,79.8,75.5,74.0,84.4,77.0,80.833333,74.0,74.25,76.0
command-r-plus,95.5,95.5,95.25,93.75,94.75,92.0,94.0,93.0,92.8,93.25,96.5,95.0,92.75,88.75
gpt-3.5-turbo,94.0,96.5,93.5,92.75,93.75,93.2,92.0,93.5,90.6,92.75,95.333333,90.75,93.75,87.25
gpt-4-turbo,95.0,96.166667,93.5,94.75,92.5,93.8,93.5,92.5,92.4,92.25,94.0,90.75,93.25,89.5
mistral-large,86.0,83.833333,74.25,80.5,72.0,70.6,67.25,48.25,54.2,46.75,42.0,45.25,48.5,52.25
mistral-8x7b,87.166667,84.166667,81.75,80.0,70.5,81.6,79.5,71.0,52.6,58.0,53.5,60.25,47.25,69.0
llama-2-instruct,79.333333,86.5,67.75,54.0,51.0,82.0,26.25,19.55,10.84,3.575,6.333333,14.0,16.25,50.0
llama-3-instruct,70.833333,79.666667,49.25,33.75,48.0,70.8,17.525,16.525,5.8,0.575,26.333333,3.525,40.5,24.25
