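This notebook processes the raw query-reference data CSV into a more organized three-column DataFrame (`queries`, `references`, `answers`), and then converts that DataFrame into an Alpaca-format JSON file for supervised fine-tuning.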

In [None]:
import pandas as pd


# --- Configuration ----------------------------------------------------------
# Raw input CSV; the code below reads its `queries` and `passages` columns.
file_path = 'train_json.csv'
# Tidy CSV written by this cell (columns: queries, references, answers).
output_file_path = 'train_df.csv'
# Path of the Alpaca-format JSON file written later in this notebook.
output_json_file_path = 'train.json'

# --- Load -------------------------------------------------------------------
data = pd.read_csv(file_path)

# Fail early, with a readable message, if the columns used below are absent.
missing_columns = {'queries', 'passages'} - set(data.columns)
if missing_columns:
    raise KeyError(
        f"{file_path} is missing required column(s): {sorted(missing_columns)}"
    )

print(data.head())

# --- Reshape ----------------------------------------------------------------
# The raw rows are consumed in groups of three:
#   * rows 0, 3, 6, ... supply the query   (column `queries`)
#   * rows 1, 4, 7, ... supply the reference (column `passages`)
# `reset_index(drop=True)` renumbers each slice 0..k-1 so that the three
# Series line up row-by-row when they are combined into one DataFrame.
data_queries = data.iloc[::3]['queries'].reset_index(drop=True)
data_references = data.iloc[1::3]['passages'].reset_index(drop=True)
# NOTE(review): answers are read from the SAME rows as the queries (every third
# row starting at 0), column `passages`. Confirm against the raw file layout
# that this is intended and that it should not be `iloc[2::3]`.
data_answers = data.iloc[::3]['passages'].reset_index(drop=True)

# With 3k+1 raw rows the reference slice is one element shorter than the query
# slice; pandas would silently pad the gap with NaN, and that NaN would end up
# inside a training prompt as the text "nan". Refuse to continue instead.
if len(data_references) != len(data_queries):
    raise ValueError(
        f"Row count of {file_path} ({len(data)}) leaves the last query without "
        f"a reference: {len(data_queries)} queries vs "
        f"{len(data_references)} references."
    )

# Create a DataFrame from the Series
data_processed = pd.DataFrame({
    'queries': data_queries,
    'references': data_references,
    'answers': data_answers
})

# index=False keeps the file at exactly the three data columns; the default
# would prepend the row index as an extra unnamed column.
data_processed.to_csv(output_file_path, index=False)

# Last expression of the cell so that the preview is actually displayed.
data_processed.head()


In [8]:
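'''
Prepare the data in Alpaca format so that it can be used for supervised
fine-tuning (SFT) with LLaMA-Factory.

Dataset format reference:
https://github.com/hiyouga/LLaMA-Factory/tree/main/data

Usage (apparently carried over from a standalone script; this cell does not
read command-line arguments and uses the paths defined in the first code cell):

python prepare_data input_path dataset_path
'''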

import sys
import json
import numpy as np



# System-role text. It is substituted into the `{system}` slot of
# PROMPT_TEMPLATE. Because the triple-quoted literal starts and ends on its own
# line, the value begins and ends with a newline character.
SYSTEM = """
You are a helpful assistant. 
You are always a reliable assistant that can answer questions with the help of external documents.
"""

# Per-sample answering rules. The `{question}` placeholder occurs twice and is
# filled via `INSTRUCTIONS.format(question=...)` before the result is inserted
# into PROMPT_TEMPLATE.
# The fixed fallback sentence (kept in Chinese at runtime) reads roughly:
# "Sorry, the retrieved reference information does not provide sufficient
# information, so the question cannot be answered."
# The `\"` escapes inside the triple-quoted literal produce plain `"` characters.
INSTRUCTIONS = """
- All contents between <DOCUMENTS> and </DOCUMENTS> are reference information retrieved from an external knowledge base.
- If you cannot answer based on the given information, you will return the sentence \"抱歉，检索到的参考信息并未提供充足的信息，因此无法回答。\".
- Please include the complete reference to the figure in your answer, "![figure](xxxx)" for example.
- Now, answer the following question based on the above retrieved documents:
{question}
- Return your answer in Markdown formatting, and in the same language as the question "{question}".
"""

# Full prompt skeleton. Placeholders: `{system}`, `{user_instructions}`,
# `{instructions}` and `{context}`.
# `{instructions}` appears twice, so the same instruction text is rendered both
# before and after the <DOCUMENTS> section.
# NOTE(review): presumably the repetition is intentional (instructions on both
# sides of the documents) — confirm, since the instruction text itself refers
# to "the above retrieved documents".
PROMPT_TEMPLATE = """
<SYSTEM>
{system}
</SYSTEM>

<USER_INSTRUCTIONS>
{user_instructions}
</USER_INSTRUCTIONS>

<INSTRUCTIONS>
{instructions}
</INSTRUCTIONS>

<DOCUMENTS>
{context}
</DOCUMENTS>

<INSTRUCTIONS>
{instructions}
</INSTRUCTIONS>
"""



# TODO(review): open question from the author — should the `user_instructions`
# slot carry the user's own prompt? It is left empty for every sample for now.
#
# Illustrative question/answer pair (translated from the Chinese original):
#   question: "How many cities and counties does the family support program
#              for people with intellectual disabilities cover?"
#   answer:   "As of the end of December 2021, the family support program for
#              people with intellectual disabilities covered 25 cities and
#              counties."


def _to_alpaca_record(row_label):
    """Build one Alpaca-style sample from a row of `data_processed`.

    Args:
        row_label: index label of the row in `data_processed` to convert.

    Returns:
        dict with the keys `instruction` (the fully rendered prompt),
        `input` (always an empty string), `output` (the row's answer) and
        `history` (always an empty list).
    """
    # Fill the question into the instruction text first, then drop the result
    # into the overall prompt skeleton together with the reference passage.
    filled_instructions = INSTRUCTIONS.format(
        question=data_processed.at[row_label, 'queries']
    )
    rendered_prompt = PROMPT_TEMPLATE.format(
        system=SYSTEM,
        instructions=filled_instructions,
        user_instructions='',
        context=data_processed.at[row_label, 'references'],
    )
    return {
        'instruction': rendered_prompt,
        'input': '',
        'output': data_processed.at[row_label, 'answers'],
        'history': [],
    }


# One record per row, in index order.
result_data = []
for row_label in data_processed.index:
    result_data.append(_to_alpaca_record(row_label))

# ensure_ascii=False keeps non-ASCII text readable in the file instead of
# writing \uXXXX escapes; the file itself is encoded as UTF-8.
with open(output_json_file_path, 'w', encoding='utf-8') as json_file:
    json.dump(result_data, json_file, ensure_ascii=False, indent=4)





