In [47]:
import pandas as pd
from fun import *
from common import generate_mapper
from groq import Groq
from dotenv import load_dotenv
import os
import json

# Data

In [18]:
df = pd.read_csv("../data/translations.csv")
df.head()

Unnamed: 0,lp,src,mt,ref,score,raw,annotators,domain,year,l1,l2
0,en-zh,Police said in a statement at the time that th...,警方在当时的一份声明中表示，他们决定以“沉重的心情”结束对 Jacsun 的搜索，但指出他们...,警方在当时的一份声明中表示，他们在决定结束对杰森的搜寻时，是怀着“沉重的心情”的，并指出他们...,0.238989,80.5,4,news,2020,en,zh
1,en-zh,"Trump will likely veto the resolution, the six...",特 朗 普 可 能 会 否 决 该 决 议 ， 这 是 他 第 六 次 以 总 统 身 份 ...,特朗普很可能行使总统否决权，这将是他成为总统以来第6次否决国会立法。他早在三月已动用否决权，...,0.487842,81.166667,6,news,2020,en,zh
2,en-zh,The man arrived at work about 9.30am after fai...,该名男子未能回复同事的几条短信和电话，于上午9.30左右上班。,该男子大约在上午 9:30 到达工作单位，在此之前，他未能回复同事发来的几条信息和打来的电话。,0.46985,80.5,4,news,2020,en,zh
3,en-zh,Are hopes for a nuclear-free world realistic?,对一个无核世界的希望是现实的吗 ？,实现无核化世界的希望是否现实？,0.626218,81.0,4,news,2020,en,zh
4,en-zh,The California attorney general's office in Ma...,"3月,加利福尼亚总检察长办公室在经过近一年的调查后拒绝提出州刑事指控,当时,总检察长Xavi...",3 月，加州总检察长办公室在完成了近一年的案件调查后，拒绝对两名警察提起加州刑事起诉。总检察...,-0.207602,59.0,4,news,2020,en,zh


In [19]:
with open("../data/prompts/PROMPT4.txt", 'r') as f:
    prompt_template = f.read()

print(prompt_template)

Please evaluate the quality of the translation from [ORIGINAL LANGUAGE] to [TRANSLATED LANGUAGE] based on accuracy, fluency, and coherence:
Provide a score from 0 to 100 to indicate the overall quality of the translation.



In [32]:
language_mapper = {
    'en': 'English',
    'fi': 'Finnish',
    'de': 'German',
    'hi': 'Hindi',
    'xh': 'Xhosa',
    'zh': 'Chinese',
    'cs': 'Czech',
    'fr': 'French',
    'bn': 'Bengali',
    'zu': 'Zulu'
}

In [33]:
df['l1'] = df['l1'].map(language_mapper)
df['l2'] = df['l2'].map(language_mapper)

In [55]:
def get_conversation_groq(src_language: str, trg_language: str, src_sentence: str, trg_sentence: str) -> str:
    messages = [
        {
            "role": "system",
            "content": f"""Please evaluate the quality of the translation from {src_language} to {trg_language} based on accuracy, fluency, and coherence: \n Provide a score from 0 to 100 to indicate the overall quality of the translation. Provide only the score, nothing else."""
        },
        {
            "role": "user",
            "content": f'Original sentence: "{src_sentence}" \n Translated sentence: "{trg_sentence}"'
        },
    ]
    return messages

In [75]:
df['message'] = df.apply(lambda x: get_conversation_groq(x['l1'], x['l2'], x['src'], x['mt']), axis=1)

In [57]:
df.iloc[0].message

[{'role': 'system',
  'content': 'Please evaluate the quality of the translation from English to Chinese based on accuracy, fluency, and coherence: \n Provide a score from 0 to 100 to indicate the overall quality of the translation. Provide only the score, nothing else.'},
 {'role': 'user',
  'content': 'Original sentence: "Police said in a statement at the time that they made the decision to end the search for Jacsun with a "heavy heart," but noted they couldn\'t pinpoint a location in the landfill "to a point that would make continuing the search reasonable."" \n Translated sentence: "警方在当时的一份声明中表示，他们决定以“沉重的心情”结束对 Jacsun 的搜索，但指出他们无法确定垃圾填埋场的位置，“以至于继续搜索是合理的”。"'}]

# API calls

In [20]:
load_dotenv()

True

In [42]:
client = Groq()

In [76]:
input_data = df['message'].to_list()

In [62]:
chat_completion = client.chat.completions.create(
                messages=input_data[i],
                model="llama3-70b-8192",
                temperature=0.5,
                max_tokens=1024,
                top_p=1,
                stop=None,
                stream=False,
            )

In [77]:
input_data[0]

[{'role': 'system',
  'content': 'Please evaluate the quality of the translation from English to Chinese based on accuracy, fluency, and coherence: \n Provide a score from 0 to 100 to indicate the overall quality of the translation. Provide only the score, nothing else.'},
 {'role': 'user',
  'content': 'Original sentence: "Police said in a statement at the time that they made the decision to end the search for Jacsun with a "heavy heart," but noted they couldn\'t pinpoint a location in the landfill "to a point that would make continuing the search reasonable."" \n Translated sentence: "警方在当时的一份声明中表示，他们决定以“沉重的心情”结束对 Jacsun 的搜索，但指出他们无法确定垃圾填埋场的位置，“以至于继续搜索是合理的”。"'}]

In [87]:
last_idx = 0
max_rows = 100
outfile = f'../data/llama_3/{last_idx}_starting_row.json'

with open(outfile, 'w') as output_file:
    output_file.write('[')
    for i in range(last_idx, len(input_data[:max_rows])):
        try:
            sample = input_data[i]
            chat_completion = client.chat.completions.create(
                messages=input_data[i],
                model="llama3-70b-8192",
                temperature=0.5,
                max_tokens=1024,
                top_p=1,
                stop=None,
                stream=False,
            )
            answer = chat_completion.choices[0].message.content

            output_sample = {'sentence': sample[-1]['content'],'score': answer, 'id':i}
            
            output_file.write(json.dumps(output_sample))

            if i != (max_rows-1):
                output_file.write(',')
        except:
            print('next start by:')
            print(i + 1)
            break
    output_file.write(']')
    

next start by:
11


In [88]:
with open(outfile) as file:
    content = json.load(file)

JSONDecodeError: Expecting value: line 1 column 5700 (char 5699)

In [None]:
content

[{'sentence': 'Original sentence: "Police said in a statement at the time that they made the decision to end the search for Jacsun with a "heavy heart," but noted they couldn\'t pinpoint a location in the landfill "to a point that would make continuing the search reasonable."" \n Translated sentence: "警方在当时的一份声明中表示，他们决定以“沉重的心情”结束对 Jacsun 的搜索，但指出他们无法确定垃圾填埋场的位置，“以至于继续搜索是合理的”。"',
  'score': '92',
  'id': 0},
 {'sentence': 'Original sentence: "Trump will likely veto the resolution, the sixth time he will have blocked congressional legislation as president. He already vetoed a measure to end the emergency declaration in March." \n Translated sentence: "特 朗 普 可 能 会 否 决 该 决 议 ， 这 是 他 第 六 次 以 总 统 身 份 阻 止 国 会 立 法 ， 他 已 经 在 3 月 份 否 决 了 一 项 结 束 紧 急 声 明 的 措 施 。"',
  'score': '95',
  'id': 1},
 {'sentence': 'Original sentence: "The man arrived at work about 9.30am after failing to reply to several text messages and phone calls from colleagues." \n Translated sentence: "该名男子未能回复同事的几条短信和电话，于上午9.30左右上