In [1]:
import json
import os
import random
import warnings

import numpy as np
import pandas as pd
from tqdm import tqdm

from call_api import *

This notebook generates the data required for SFT. It takes the queries, references, and answers (translating the answers into Chinese), ties them together with the instruction prompt, and saves the result to a JSON file.

In [2]:
# replace these with the paths of your file

file_path = 'train-query-ref.json' # the json file of the references and queries
file_path_ground_truth = 'train_df.csv' # the csv for the training data
output_json_file_path = 'train-multi-nogroundtruth-new-prompt-trans.json' #output path

In [3]:
# System text; construct_prompt passes it to PROMPT_TEMPLATE as `system`,
# which places it between the <SYSTEM> tags.
SYSTEM = """
You are a helpful assistant. 
You are always a reliable assistant that can answer questions with the help of external documents.
"""


# Instruction text with a single {question} placeholder. construct_prompt
# formats it and passes it to PROMPT_TEMPLATE as `instructions_prev`, i.e. the
# <INSTRUCTIONS> section that comes BEFORE <DOCUMENTS> (hence "following
# retrieved documents").
# NOTE(review): the '"' right after "in Chinese" looks like a stray character;
# it is left untouched because it is part of the prompt text.
INSTRUCTIONS_PREV = """
- All contents between <DOCUMENTS> and </DOCUMENTS> are reference information retrieved from an external knowledge base.
- If you cannot answer based on the given information, you will return the sentence \"抱歉，检索到的参考信息并未提供充足的信息，因此无法回答。\".
- Now, answer the following question based on the following retrieved documents:
{question}
- Return your answer in Markdown formatting, and in Chinese".
"""
# No figure

# Instruction text with a single {question} placeholder. construct_prompt
# formats it and passes it to PROMPT_TEMPLATE as `instructions_post`, i.e. the
# <INSTRUCTIONS> section that comes AFTER <DOCUMENTS> (hence "above retrieved
# documents").
# NOTE(review): the '"' right after "in Chinese" looks like a stray character;
# it is left untouched because it is part of the prompt text.
INSTRUCTIONS_POST = """
- All contents between <DOCUMENTS> and </DOCUMENTS> are reference information retrieved from an external knowledge base.
- If you cannot answer based on the given information, you will return the sentence \"抱歉，检索到的参考信息并未提供充足的信息，因此无法回答。\".
- Now, answer the following question based on the above retrieved documents:
{question}
- Return your answer in Markdown formatting, and in Chinese".
"""


# Overall prompt layout. Placeholders filled by construct_prompt:
#   {system}            -> SYSTEM
#   {user_instructions} -> currently always the empty string
#   {instructions_prev} -> INSTRUCTIONS_PREV with the question filled in
#   {context}           -> the references, each wrapped in <reference> tags
#   {instructions_post} -> INSTRUCTIONS_POST with the question filled in
PROMPT_TEMPLATE = """
<SYSTEM>
{system}
</SYSTEM>

<USER_INSTRUCTIONS>
{user_instructions}
</USER_INSTRUCTIONS>

<INSTRUCTIONS>
{instructions_prev}
</INSTRUCTIONS>

<DOCUMENTS>
{context}
</DOCUMENTS>

<INSTRUCTIONS>
{instructions_post}
</INSTRUCTIONS>
"""

def construct_prompt(query : str, references: list, answer: str):
    """Assemble one SFT training record.

    Args:
        query: The question; inserted into both instruction sections.
        references: Reference strings; each one is wrapped in
            ``<reference>...</reference>`` and placed in the documents section.
        answer: The target output stored with the record.

    Returns:
        A dict with the keys ``instruction`` (the fully rendered prompt),
        ``input`` (empty string), ``output`` (``answer``) and ``history``
        (empty list).
    """
    # Render the two question-bearing instruction sections first.
    prev_section = INSTRUCTIONS_PREV.format(question=query)
    post_section = INSTRUCTIONS_POST.format(question=query)

    # Wrap every reference in its own tag, each starting on a new line.
    wrapped_references = []
    for reference in references:
        wrapped_references.append('\n<reference>' + reference + '</reference>')
    documents_section = ''.join(wrapped_references)

    rendered_prompt = PROMPT_TEMPLATE.format(
        system=SYSTEM,
        user_instructions='',
        instructions_prev=prev_section,
        context=documents_section,
        instructions_post=post_section,
    )

    return {
        'instruction': rendered_prompt,
        'input': '',
        'output': answer,
        'history': [],
    }

In [None]:

# Load the query/reference records (JSON) and the ground-truth table (CSV).
data = pd.read_json(file_path)
data_ground_truth = pd.read_csv(file_path_ground_truth)

# Fail fast if a column that the translation loop reads is missing; otherwise
# the problem would only surface as a KeyError in the middle of the slow
# translation run.
for frame_name, frame, required_columns in (
    ('data', data, ('query', 'references')),
    ('data_ground_truth', data_ground_truth, ('queries', 'answers')),
):
    absent_columns = [col for col in required_columns if col not in frame.columns]
    if absent_columns:
        raise KeyError(f'{frame_name} is missing required column(s): {absent_columns}')

# Show the first rows as the cell output.
data.head()

In [5]:

# Prompt used by translate_ch to ask the model for a Chinese translation.
# Its only placeholder is {translate_text} (the answer to translate); the
# question itself is not part of the prompt.
TRANS_PROMPT = '''
Your task is to translate the answer to a question into Chinese.

Answer:
{translate_text}

Follow the following guidelines when translating:
1. If the answer is already in Chinese, please do not make any changes to it. Output it as it is.
2. Do not change the meaning or any factual elements in the answer. Make sure the meaning stays the same.
'''




In [6]:
# Azure OpenAI client used to translate the responses into Chinese.
#
# The API key is read from the AZURE_OPENAI_API_KEY environment variable so
# that a real key never has to be saved inside the notebook. The original
# placeholder string is kept as the fallback, so replacing it by hand still
# works as before.
client = AzureOpenAI(
    api_key=os.environ.get("AZURE_OPENAI_API_KEY", "replace with openai key"),
    azure_endpoint="https://gptforai03.openai.azure.com/",
    api_version=API_VERSION
)

def translate_ch(query: str, answer: str):
    """Translate ``answer`` into Chinese via the chat-completions client.

    Args:
        query: The question the answer belongs to. Kept for interface
            compatibility only: TRANS_PROMPT has no placeholder for it, so it
            is not sent to the model.
        answer: The answer text to translate.

    Returns:
        The text content of the first choice returned by the model. If the
        response carries no text content (``None``), a warning is emitted and
        the untranslated ``answer`` is returned, so that a null output never
        ends up in the generated dataset.
    """
    cur_eval_prompt = TRANS_PROMPT.format(translate_text=answer)

    # NOTE(review): the task is sent as a "system" message that follows an
    # empty "user" message. This ordering is unusual but is kept unchanged to
    # avoid altering what the model receives -- confirm it is intended.
    messages = [
        {"role": "user", "content": ""},
        {"role": "system", "content": cur_eval_prompt}
    ]

    res = client.chat.completions.create(model=Deployment.GPT35_16K, messages=messages, max_tokens=4096)
    resstr = res.choices[0].message.content

    if resstr is None:
        # No text came back; keep the original answer and do not propagate None.
        warnings.warn('translate_ch: response had no text content; keeping the original answer.')
        return answer

    return resstr


In [None]:

# The loop looks up ground-truth rows by the index labels of `data`. Verify up
# front that every label exists there: a mismatch would otherwise raise a
# KeyError part-way through the run, after translation calls have already been
# made and with nothing written to disk.
missing_labels = data.index.difference(data_ground_truth.index)
if len(missing_labels) > 0:
    raise ValueError(
        f'{len(missing_labels)} row label(s) of `data` are absent from '
        f'`data_ground_truth`, e.g. {list(missing_labels[:5])}'
    )

print('Translating...')
result_data = []
for i in tqdm(data.index):
    # Translate the ground-truth answer, then pair it with the query and
    # references of the same row label.
    translated_answer = translate_ch(
        data_ground_truth.loc[i, 'queries'],
        data_ground_truth.loc[i, 'answers'],
    )
    result_data.append(
        construct_prompt(data.loc[i, 'query'], data.loc[i, 'references'], translated_answer)
    )

# ensure_ascii=False keeps the Chinese text readable in the output file.
with open(output_json_file_path, 'w', encoding='utf-8') as fout:
    json.dump(result_data, fout, ensure_ascii=False, indent=4)
