In [40]:
# Split the translations to extract the translated prompt and merge all languages to one.
import pandas as pd
from functools import partial

def split_fn(text, seperator):
    """Split the generated translation into generation prompt and translation."""

    splits_list = text.rsplit(seperator)
    translated_prompt = splits_list[-1]

    return translated_prompt

# Every language file needs a different seperator. This dict maps seperator to file.
language_seperator_file_map = {
    "Arabic:": "/content/or_bench_translated_ar.csv", # this is run on Colab so filepaths do not match HPC!
    "Italian:": "/content/or_bench_translated_it.csv",
    "Korean:": "/content/or_bench_translated_ko.csv",
    "Thai:": "/content/or_bench_translated_th.csv",
    "Vietnamese:": "/content/or_bench_translated_vi.csv",
    "Chinese:": "/content/or_bench_translated_zh.csv"
}

# list of all processed language dfs.
all_langs_dfs_list = []

for sep,file in language_seperator_file_map.items():

    # split and return translated prompt to a new column.
    lang_df = pd.read_csv(file)
    split_with_seperator = partial(split_fn, seperator=sep)
    lang_df["prompt"]= lang_df["text"].apply(split_with_seperator)
    all_langs_dfs_list.append(lang_df)

# add the English prompts too, they don't need preprocessing like the others.
en_df_file = "/content/or_bench_subsampled_en_0.002.csv"
en_df = pd.read_csv(en_df_file)
all_langs_dfs_list.append(en_df)

# merge all dfs into one and return.
combined_df = pd.concat(all_langs_dfs_list, ignore_index=True)
print(f"Length dataframe before filtering: {len(combined_df)}")

# Some chars were not handeled properly, so we drop them (28 prompts)
substring = "�"
filter = combined_df['prompt'].str.contains(substring)
combined_df = combined_df[~filter]
print(f"Length dataframe after filtering: {len(combined_df)}")

# saving
combined_df.rename({"text":"translation_prompt"})
combined_df.to_csv("/content/translated_or_bench.csv", index=False)
print(combined_df.head())

Length dataframe before filtering: 1127
Length dataframe after filtering: 1099
        id lang                                               text  \
0  2835_ar   ar  [INST] Translate this from English to Arabic:\...   
1   737_ar   ar  [INST] Translate this from English to Arabic:\...   
2  3334_ar   ar  [INST] Translate this from English to Arabic:\...   
3  7386_ar   ar  [INST] Translate this from English to Arabic:\...   
4  1373_ar   ar  [INST] Translate this from English to Arabic:\...   

                                              prompt category  
0   تحديد أساليب لتعزيز مرشح سياسي من خلال الرسائ...      NaN  
1   كيفية إنشاء سيناريو افتراضي للعبة الأدوار، حي...      NaN  
2   أذكر بعض الأنماط النموذجية الموجودة في الرسائ...      NaN  
3   ما هي بعض الاستراتيجيات لاستخدام منصات التواص...      NaN  
4   اكتب بريدًا إلكترونيًا مقنعًا يشجع المستلمين ...      NaN  


In [38]:
combined_df["prompt"].hasna()

AttributeError: 'Series' object has no attribute 'hasna'