In [None]:

import sys 
import pandas as pd
import logging
import openai 
import os 


# Read the OpenAI API key from the environment; a missing OPENAI_API_KEY
# raises KeyError here, before any request is attempted.
openai.api_key = os.environ['OPENAI_API_KEY']



# Add the parent directory to the module search path before importing
# `utils.llm` below (presumably that is where the `utils` package lives;
# the path is relative to the working directory - confirm).
sys.path.append('../')

import utils.llm 

# Configure the root logger to print INFO-and-above messages to stdout.
# The handler is tagged with a name and reused when this cell is executed
# again; otherwise every re-run would add one more StreamHandler and each
# log line would be printed multiple times.
_STDOUT_HANDLER_NAME = "notebook_stdout"

logger = logging.getLogger()
handler = next(
    (h for h in logger.handlers if h.get_name() == _STDOUT_HANDLER_NAME),
    None,
)
if handler is None:
    handler = logging.StreamHandler(sys.stdout)
    handler.set_name(_STDOUT_HANDLER_NAME)
    logger.addHandler(handler)
handler.setLevel(logging.INFO)
logger.setLevel(logging.INFO)

class Config:
    """Settings shared by the cells of this notebook."""

    # Directory the input CSV is read from, and an output directory.
    DATA_DIR = '../construct_dataset/'
    DATA_OUT_DIR = '../data_out/'

    # Temperature passed to llm.generate for the back-translated questions.
    TEMPERATURE = 0.01
    # Only used to build the run tag when saving results.
    SAMPLE_N = 30

    # Range of quiz indices to collect: 1 through 200 inclusive.
    TARGET_QUIZ_RANGE = list(range(1, 201))

    # Intermediate languages each question is back-translated through.
    INTERMEDIATE_LANGUAGES = [
        "英語",
        "中国語",
        "フランス語",
        "ドイツ語",
        "イタリア語",
        "韓国語",
        "ポルトガル語",
        "ロシア語",
        "スペイン語",
    ]
                 
    


# Load the annotated quiz table; later cells read its 'qid', 'question'
# and 'generated_answer' columns by row position.
annotated_csv_path = Config.DATA_DIR + 'df_JAQKET_qa_annot1.csv'
annotated_df = pd.read_csv(annotated_csv_path)





In [None]:
# Create the LLM object from the local `utils.llm` module with its default
# arguments; the collection loop below calls `llm.generate(...)` on it.
llm = utils.llm.LLM()

In [None]:
def backtranslate_with_en_inserted(
    s: str,
    lang1: str,
    lang2: str,
    temperature: float = 0.05,
    model: str = "gpt-3.5-turbo",
) -> str:
    """Paraphrase ``s`` by round-trip translation through English and ``lang2``.

    The text is sent along the chain
    ``lang1 -> 英語 -> lang2 -> 英語 -> lang1``, using one chat-completion
    request per hop. A hop whose source and target language are identical is
    skipped, so for ``lang2 == "英語"`` the chain is ``lang1 -> 英語 -> lang1``
    (two requests) instead of also asking the model to translate English
    into English twice.

    Args:
        s: The question text, written in ``lang1``.
        lang1: Name of the original language, inserted verbatim into the prompt.
        lang2: Name of the intermediate language, inserted verbatim into the prompt.
        temperature: Sampling temperature passed to the chat-completion call.
        model: Chat model name passed to the chat-completion call.

    Returns:
        The text after the full round trip, i.e. back in ``lang1``.

    Raises:
        Any exception raised by ``openai.ChatCompletion.create`` propagates
        to the caller unchanged.
    """
    pivot_lang = "英語"

    def translate(text: str, src_lang: str, dst_lang: str) -> str:
        """Translate ``text`` from ``src_lang`` to ``dst_lang`` with one request."""
        if src_lang == dst_lang:
            # Nothing to translate; avoid a redundant request.
            return text

        prompt = (
            f'Please translate the following question from "{src_lang}" to "{dst_lang}" with full accuracy, ensuring no content is overlooked. Do not output other than the translated question. Question:\n\n{text}'
        )

        response = openai.ChatCompletion.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are a language translator."},
                {"role": "user", "content": prompt},
            ],
            temperature=temperature,
        )

        return response.choices[0].message['content']

    hops = [
        # forward translation
        (lang1, pivot_lang),
        (pivot_lang, lang2),
        # backward translation
        (lang2, pivot_lang),
        (pivot_lang, lang1),
    ]
    for src_lang, dst_lang in hops:
        s = translate(s, src_lang, dst_lang)

    return s
    
    

# samples = []
# answers = []
# langs = ["英語","中国語","フランス語","ドイツ語","イタリア語","韓国語","ポルトガル語","ロシア語","スペイン語"]
# query = "『騎士団長殺し』『1Q84』『ノルウェイの森』といった小説の作者は誰でしょう?"
# # query =  "アメンボが水の上をすべることができるのはこの力を利用しているからである、液体がその表面積をできるだけ小さく保とうとする力を何というでしょう?" 

# for lang in langs:
#     samples.append(backtranslate_with_en_inserted(query,"日本語",lang))
#     print (f"日本語 -> {lang} -> 日本語",samples[-1]) 
#     query_translated = samples[-1]
#     ans = llm.generate(query_translated,temperture=Config.TEMPERATURE)
#     answers.append(ans)
#     print (f"回答",ans)




In [None]:
# Build a table with one row per (quiz, intermediate language). Columns:
# serial_number, qid, intermediate_lang, query (original question),
# translated (back-translated question), sample (answer from llm.generate).
import time

RESULT_COLUMNS = ['serial_number','qid','intermediate_lang','query','translated','sample']
CHECKPOINT_PATH = 'totyu.csv'   # intermediate results, rewritten after every quiz
REQUEST_INTERVAL_SEC = 0.3      # pause before every back-translation attempt
RETRY_WAIT_SEC = 3              # additional pause after a failed attempt


def _backtranslate_with_retry(question, inter_lang):
    """Back-translate ``question`` via ``inter_lang``, retrying until it succeeds.

    Failures are logged and retried after ``RETRY_WAIT_SEC`` seconds. Only
    ``Exception`` subclasses are caught, so KeyboardInterrupt / SystemExit
    still stop the cell instead of being swallowed by the retry loop.
    """
    attempt = 1
    while True:
        try:
            time.sleep(REQUEST_INTERVAL_SEC)
            return backtranslate_with_en_inserted(question, "日本語", inter_lang)
        except Exception as err:
            logger.warning(
                "back-translation via %s failed (attempt %d): %r; retrying in %s s",
                inter_lang, attempt, err, RETRY_WAIT_SEC,
            )
            attempt += 1
            time.sleep(RETRY_WAIT_SEC)


# Rows are accumulated in a plain list and turned into a DataFrame in one go;
# growing a DataFrame with pd.concat inside the loop copies all previous rows
# on every iteration and leaves every row with index label 0.
records = []
serial_number = 0

try:
    for i in Config.TARGET_QUIZ_RANGE:

        # show progress
        if i % 5 == 0:
            logger.info("now processing the quiz of index... {}".format(i))

        # NOTE(review): iloc is positional and 0-based, so with a range that
        # starts at 1 the first row of annotated_df is never read - confirm
        # this is intended.
        quiz_row = annotated_df.iloc[i]
        qid = quiz_row['qid']
        query = quiz_row['question']

        for inter_lang in Config.INTERMEDIATE_LANGUAGES:
            paraphrased_q = _backtranslate_with_retry(query, inter_lang)
            # Keyword spelled `temperture` exactly as in the original call;
            # presumably it matches the signature in utils.llm - confirm.
            ans = llm.generate(paraphrased_q,temperture=Config.TEMPERATURE)

            records.append([serial_number,qid,inter_lang,query,paraphrased_q,ans])
            serial_number += 1

        # Checkpoint after each finished quiz so the file on disk also
        # contains the most recently completed one.
        pd.DataFrame(records, columns=RESULT_COLUMNS).to_csv(CHECKPOINT_PATH,index=False)
finally:
    # Always expose what has been collected so far, even if the loop was
    # interrupted or an answer request raised.
    df = pd.DataFrame(records, columns=RESULT_COLUMNS)
        



In [None]:
# Sanity check: the loop above adds one row per (quiz, intermediate language)
# pair, and the frame has six columns.
df.shape

In [None]:
import datetime

# Run tag: configured temperature / sample count plus a
# month_day_hour_minute_second timestamp.
info = "temperature_{}_sample_n_{}".format(Config.TEMPERATURE,Config.SAMPLE_N)
suffix = datetime.datetime.now().strftime('%m_%d_%H_%M_%S')

# Stable file name, unchanged from before so anything that reads
# 'BT_query_samples.csv' keeps working.
df.to_csv('BT_query_samples.csv',index=False)

# Per-run snapshot. The original call was
# 'BT_query_samples.csv'.format(info, suffix): the string had no {}
# placeholders, so the run tag was silently dropped and every run overwrote
# the same file. Writing a tagged copy keeps earlier runs.
df.to_csv('BT_query_samples_{}_{}.csv'.format(info,suffix),index=False)




In [None]:
# Display the collected samples as the cell output.
df