In [1]:
import pandas as pd
from fun import *
from common import generate_mapper
from groq import Groq
from dotenv import load_dotenv
import os
import json

# Data

In [2]:
df = pd.read_csv("../data/translations.csv")
df.head()

Unnamed: 0,lp,src,mt,ref,score,raw,annotators,domain,year,l1,l2
0,en-zh,Police said in a statement at the time that th...,警方在当时的一份声明中表示，他们决定以“沉重的心情”结束对 Jacsun 的搜索，但指出他们...,警方在当时的一份声明中表示，他们在决定结束对杰森的搜寻时，是怀着“沉重的心情”的，并指出他们...,0.238989,80.5,4,news,2020,en,zh
1,en-zh,"Trump will likely veto the resolution, the six...",特 朗 普 可 能 会 否 决 该 决 议 ， 这 是 他 第 六 次 以 总 统 身 份 ...,特朗普很可能行使总统否决权，这将是他成为总统以来第6次否决国会立法。他早在三月已动用否决权，...,0.487842,81.166667,6,news,2020,en,zh
2,en-zh,The man arrived at work about 9.30am after fai...,该名男子未能回复同事的几条短信和电话，于上午9.30左右上班。,该男子大约在上午 9:30 到达工作单位，在此之前，他未能回复同事发来的几条信息和打来的电话。,0.46985,80.5,4,news,2020,en,zh
3,en-zh,Are hopes for a nuclear-free world realistic?,对一个无核世界的希望是现实的吗 ？,实现无核化世界的希望是否现实？,0.626218,81.0,4,news,2020,en,zh
4,en-zh,The California attorney general's office in Ma...,"3月,加利福尼亚总检察长办公室在经过近一年的调查后拒绝提出州刑事指控,当时,总检察长Xavi...",3 月，加州总检察长办公室在完成了近一年的案件调查后，拒绝对两名警察提起加州刑事起诉。总检察...,-0.207602,59.0,4,news,2020,en,zh


In [16]:
with open("../data/prompts/PROMPT4.txt", "r") as f:
    prompt_template = f.read()

print(prompt_template)

Please evaluate the quality of the translation from [ORIGINAL LANGUAGE] to [TRANSLATED LANGUAGE] based on accuracy, fluency, and coherence.
Provide a score from 0 to 100 to indicate the overall quality of the translation.



In [17]:
language_mapper = {
    "en": "English",
    "fi": "Finnish",
    "de": "German",
    "hi": "Hindi",
    "xh": "Xhosa",
    "zh": "Chinese",
    "cs": "Czech",
    "fr": "French",
    "bn": "Bengali",
    "zu": "Zulu",
}

In [18]:
df["l1"] = df["l1"].map(language_mapper)
df["l2"] = df["l2"].map(language_mapper)

In [19]:
def get_conversation_groq(
    src_language: str, trg_language: str, src_sentence: str, trg_sentence: str
) -> str:
    messages = [
        {
            "role": "system",
            "content": f"""Please evaluate the quality of the translation from {src_language} to {trg_language} based on accuracy, fluency, and coherence: \n Provide a score from 0 to 100 to indicate the overall quality of the translation. Provide only the score, nothing else.""",
        },
        {
            "role": "user",
            "content": f'Original sentence: "{src_sentence}" \n Translated sentence: "{trg_sentence}"',
        },
    ]
    return messages

In [20]:
df["message"] = df.apply(
    lambda x: get_conversation_groq(x["l1"], x["l2"], x["src"], x["mt"]), axis=1
)

In [21]:
df.iloc[0].message

[{'role': 'system',
  'content': 'Please evaluate the quality of the translation from English to Chinese based on accuracy, fluency, and coherence: \n Provide a score from 0 to 100 to indicate the overall quality of the translation. Provide only the score, nothing else.'},
 {'role': 'user',
  'content': 'Original sentence: "Police said in a statement at the time that they made the decision to end the search for Jacsun with a "heavy heart," but noted they couldn\'t pinpoint a location in the landfill "to a point that would make continuing the search reasonable."" \n Translated sentence: "警方在当时的一份声明中表示，他们决定以“沉重的心情”结束对 Jacsun 的搜索，但指出他们无法确定垃圾填埋场的位置，“以至于继续搜索是合理的”。"'}]

# API calls

In [22]:
load_dotenv()

True

In [23]:
client = Groq()

In [24]:
input_data = df["message"].to_list()

In [25]:
input_data[0]

[{'role': 'system',
  'content': 'Please evaluate the quality of the translation from English to Chinese based on accuracy, fluency, and coherence: \n Provide a score from 0 to 100 to indicate the overall quality of the translation. Provide only the score, nothing else.'},
 {'role': 'user',
  'content': 'Original sentence: "Police said in a statement at the time that they made the decision to end the search for Jacsun with a "heavy heart," but noted they couldn\'t pinpoint a location in the landfill "to a point that would make continuing the search reasonable."" \n Translated sentence: "警方在当时的一份声明中表示，他们决定以“沉重的心情”结束对 Jacsun 的搜索，但指出他们无法确定垃圾填埋场的位置，“以至于继续搜索是合理的”。"'}]

In [44]:
len(input_data)

2400

In [45]:
# last_idx = 2000
# max_rows = 2401
# outfile = f'../data/gemma/{last_idx}_starting_row.json'

# with open(outfile, 'w') as output_file:
#     output_file.write('[')
#     for i in range(last_idx, len(input_data[:max_rows])):
#         try:
#             sample = input_data[i]
#             chat_completion = client.chat.completions.create(
#                 messages=input_data[i],
#                 model="gemma-7b-it",
#                 temperature=0.5,
#                 max_tokens=1024,
#                 top_p=1,
#                 stop=None,
#                 stream=False,
#             )
#             answer = chat_completion.choices[0].message.content

#             output_sample = {'sentence': sample[-1]['content'],'score': answer, 'id': i}

#             output_file.write(json.dumps(output_sample))

#             if i != (max_rows - 1):
#                 output_file.write(',')
#         except:
#             print('next start by:')
#             print(i + 1)
#             break
#     output_file.write(']')

In [31]:
def recover_all_files(path: str):
    files_content = []

    for filename in os.listdir(path):
        with open(f"{path}/{filename}") as infile:
            file_content = json.load(infile)
            files_content += file_content
    return pd.DataFrame(files_content).sort_values(by="id").reset_index(drop=True)

In [32]:
df_gemma = recover_all_files("../data/gemma")

In [5]:
df_gemma.score.unique()

array(['85', '95', '75', '**95**', '10', '65', '60', '60/100', '30',
       '10/100', '0-20',
       '20/100\n\nThe translation lacks accuracy in vocabulary and grammar, and coherence due to awkward phrasing.',
       '0\n\nThe translation is inaccurate and lacks coherence.',
       '15/100', '0', '70', '30/100', '20', '90', '20/100',
       '15/100\n\n**Issues:**\n- Accuracy: Uses incorrect terminology ("olympiamitali" instead of "medal"), lacks context ("kansa saa vapaapäivän" doesn\'t translate to "people get holiday").\n- Fluency: Word choice and sentence structure are unnatural in English.\n- Coherence: The translation lacks coherence and fails to convey the original meaning.',
       '40', '**85/100**', '15'], dtype=object)

In [6]:
mistakes_mapper = {
    "85": "85",
    "95": "95",
    "75": "75",
    "0": "0",
    "70": "70",
    "60": "60",
    "65": "65",
    "30/100": "30",
    "10": "10",
    "30": "30",
    "20": "20",
    "60/100": "60",
    "90": "90",
    "20/100": "20",
    '15/100\n\n**Issues:**\n- Accuracy: Uses incorrect terminology ("olympiamitali" instead of "medal"), lacks context ("kansa saa vapaapäivän" doesn\'t translate to "people get holiday").\n- Fluency: Word choice and sentence structure are unnatural in English.\n- Coherence: The translation lacks coherence and fails to convey the original meaning.': "15",
    "10/100": "10",
    "15/100": "15",
    "40": "40",
    "**85/100**": "85",
    "15": "15",
    "**95**": "95",
    "0-20": None,
    "20/100\n\nThe translation lacks accuracy in vocabulary and grammar, and coherence due to awkward phrasing.": "20",
    "0\n\nThe translation is inaccurate and lacks coherence.": "0",
}

In [7]:
df_gemma["score"] = df_gemma["score"].map(mistakes_mapper)

In [8]:
df_gemma = df_gemma.sort_values(by=["id"])

In [9]:
df_gemma.score.value_counts(normalize=True)

score
95    0.521467
85    0.428095
75    0.016674
60    0.012505
65    0.005002
30    0.003752
10    0.002918
20    0.002918
70    0.002501
0     0.001667
15    0.001667
90    0.000417
40    0.000417
Name: proportion, dtype: float64

In [10]:
df_gemma.score

0       85
1       85
2       95
3       95
4       85
        ..
2395    85
2396    85
2397    85
2398    85
2399    85
Name: score, Length: 2400, dtype: object

In [11]:
df["gemma_score"] = df_gemma["score"].astype(float)

In [12]:
df[["lp", "raw", "gemma_score"]]

Unnamed: 0,lp,raw,gemma_score
0,en-zh,80.500000,85.0
1,en-zh,81.166667,85.0
2,en-zh,80.500000,95.0
3,en-zh,81.000000,95.0
4,en-zh,59.000000,85.0
...,...,...,...
2395,xh-zu,100.000000,85.0
2396,xh-zu,90.000000,85.0
2397,xh-zu,100.000000,85.0
2398,xh-zu,50.000000,85.0


In [13]:
df["lp"].unique()

array(['en-zh', 'en-cs', 'en-de', 'fi-en', 'de-fr', 'de-cs', 'hi-bn',
       'xh-zu'], dtype=object)

In [14]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

In [19]:
fig = make_subplots(rows=1, cols=1)

fig.add_trace(
    go.Scatter(x=df.index, y=df.raw, mode="markers", name="raw_score"), row=1, col=1
)

fig.add_trace(
    go.Scatter(x=df.index, y=df.gemma_score, mode="markers", name="gemma_score"),
    row=1,
    col=1,
)

fig.show()

In [34]:
df.gemma_score.value_counts(normalize=True)

gemma_score
95.0    0.521467
85.0    0.428095
75.0    0.016674
60.0    0.012505
65.0    0.005002
30.0    0.003752
10.0    0.002918
20.0    0.002918
70.0    0.002501
0.0     0.001667
15.0    0.001667
90.0    0.000417
40.0    0.000417
Name: proportion, dtype: float64

In [28]:
def compute_correlation(
    df: pd.DataFrame,
    ground_truth: str = "raw",
    predicted: str = "gemma_score",
    method: str = "pearson",
    decimals=3,
):
    correlations = {
        "overall": round(df[ground_truth].corr(df[predicted], method=method), decimals)
    }

    for language_pair in df["lp"].unique():
        sub_df = df[df.lp == language_pair]
        correlations[language_pair] = round(
            sub_df[ground_truth].corr(sub_df[predicted], method=method), decimals
        )

    return correlations

In [33]:
for key, value in compute_correlation(df, "raw", "gemma_score").items():
    print(key)

overall
en-zh
en-cs
en-de
fi-en
de-fr
de-cs
hi-bn
xh-zu
