In [37]:
from tqdm import tqdm

def create_example(all_data):
    """Flatten HotpotQA-style records into sentence-numbered QA examples.

    Each record must provide ``_id``, ``question``, ``answer``, ``context``
    (a list of ``[title, [sentence, ...]]`` paragraphs) and
    ``supporting_facts`` (a list of ``[title, sentence_index]`` pairs, where
    the index is 0-based within the titled paragraph).

    Sentences are numbered from 1 across the whole document. Every paragraph
    title counts as a sentence of its own and sits right before the sentences
    of its paragraph.

    Args:
        all_data: Iterable of record dicts as described above.

    Returns:
        A list with one dict per record, containing:
          * ``_id`` / ``question``: copied from the record.
          * ``document``: titles (followed by ". ") and raw sentences
            concatenated into a single string.
          * ``sent``: stripped titles (with "." appended) and stripped
            sentences, in document order.
          * ``supporting_num``: 1-based positions in ``sent`` of the
            supporting sentences.
          * ``output``: the record's answer.
    """
    all_result = []
    for data in tqdm(all_data):
        data_id = data["_id"]
        question = data["question"]
        answer = data["answer"]
        context = data["context"]

        # Group the supporting sentence indices by paragraph title.
        support_dic = {}
        for sup_sent in data["supporting_facts"]:
            support_dic.setdefault(sup_sent[0], []).append(sup_sent[1])

        supporting_num_list = []
        sent_list = []
        sentences = ""
        for paragraph in context:
            title, para_sents = paragraph[0], paragraph[1]
            sent_list.append(title.strip() + ".")
            # The title now holds 1-based position len(sent_list), so the
            # paragraph's first real sentence gets the next number.
            first_sent_num = len(sent_list) + 1

            for local_idx in support_dic.get(title, ()):
                # An index outside this paragraph would otherwise be mapped
                # onto a sentence of a later paragraph (or past the end of
                # the document), silently producing a wrong label.
                if 0 <= local_idx < len(para_sents):
                    supporting_num_list.append(first_sent_num + local_idx)

            if sentences == "":
                sentences = title + ". "
            else:
                sentences = sentences + " " + title + ". "
            for sent in para_sents:
                sentences = sentences + sent
                sent_list.append(sent.strip())

        all_result.append(
            {
                "_id": data_id,
                "question": question,
                "document": sentences,
                "sent": sent_list,
                "supporting_num": supporting_num_list,
                "output": answer,
            }
        )

    return all_result
    

In [38]:
import json
# Input file for this run. The commented-out path looks like the training
# split, presumably swapped in by hand when that split is processed.
# file_path = "../data/origin/hotpot_train_v1.1_re.json"
file_path = "../data/origin/hotpot_dev.json"
with open(file_path, mode="r", encoding="utf-8") as src:
    dev_data = json.loads(src.read())

# Convert the raw records into sentence-numbered examples.
input_data = create_example(dev_data)

100%|██████████| 7405/7405 [00:00<00:00, 35359.17it/s]


In [39]:
from transformers import AutoTokenizer
# Tokenizer used below to measure prompt lengths in tokens.
tokenizer = AutoTokenizer.from_pretrained(
    "Qwen/Qwen2.5-3B-Instruct",
)

In [40]:
# Sanity check: number of examples produced by create_example.
len(input_data)

7405

In [30]:
from tqdm import tqdm
# Token budget for one full prompt (system + question + document + answer).
MAX_LENGTH = 2048
all_len = []  # prompt length in tokens for every example, kept or dropped
all_result = []  # only the examples whose prompt fits within MAX_LENGTH

# The end-of-turn marker that closes the user message is identical for every
# example, so it is tokenized once up front.
token_end = tokenizer("<|im_end|>\n", add_special_tokens=False)

for example in tqdm(input_data):
    # NOTE: this rewrites the example in place, so the stripped document is
    # what gets appended to all_result.
    example["document"] = example["document"].strip()

    # Tokenized document, built sentence by sentence.
    token_doc = {"input_ids": [], "attention_mask": []}
    # sentence_position[k] is the 1-based number of the sentence that document
    # token k belongs to. Number 0 is reserved (planned for the instruction).
    sentence_position = []
    for sentence_number, sent in enumerate(example["sent"], start=1):
        token_sent = tokenizer(sent.strip() + " ", add_special_tokens=False)
        sentence_position.extend([sentence_number] * len(token_sent["input_ids"]))
        token_doc["input_ids"] += token_sent["input_ids"]
        token_doc["attention_mask"] += token_sent["attention_mask"]

    # One 0 per end-marker *token*. len(token_end) would measure the dict-like
    # tokenizer output itself (its number of keys), not the token count.
    sentence_position.extend([0] * len(token_end["input_ids"]))
    token_doc["input_ids"] += token_end["input_ids"]
    token_doc["attention_mask"] += token_end["attention_mask"]

    instruction = tokenizer(
        f"<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n**Question:{example['question']}\n**Document:\n",
        add_special_tokens=False,
    )
    response = tokenizer(
        f"<|im_start|>assistant\n**Answer:{example['output'].strip()}<|im_end|>\n", add_special_tokens=False
    )

    # The name `input_ids` is kept at top level because a later cell decodes
    # whatever this loop assigned last.
    input_ids = instruction["input_ids"] + token_doc["input_ids"] + response["input_ids"]
    count = len(input_ids)
    if count <= MAX_LENGTH:
        all_result.append(example)
    all_len.append(count)

100%|██████████| 7405/7405 [00:33<00:00, 221.98it/s]


In [31]:
# Number of examples that fit within the token budget.
print(len(all_result))
# Mean prompt length in tokens over all examples (kept and dropped).
mean_prompt_tokens = sum(all_len) / len(all_len)
print(mean_prompt_tokens)

6913
1495.5786630654964


In [32]:
# Eyeball one fully assembled prompt. `input_ids` is whatever the
# length-filter loop assigned last.
decoded_prompt = tokenizer.decode(input_ids)
print(decoded_prompt)

<|im_start|>system
You are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>
<|im_start|>user
**Question:Blackfin is a family of processors developed by the company that is headquartered in what city?
**Document:
1st Word/1st Word Plus. 1st Word and 1st Word Plus are word processors developed by GST Computer Systems in the 1980s. The original package, 1st Word, was given away free with all Atari STs. The later 1st Word Plus was sold by GST and was more advanced. Atari ST disk magazine ST News was written entirely and exclusively using 1st Word and, later, 1st Word Plus. The first Volume (1986) was distributed as a plain 1st Word . DOC file, after that a custom shell was produced that enabled the 1st Word documents to be displayed in a userfriendly disk magazine shell. Arm Holdings. Arm Holdings (Arm) is a British multinational semiconductor and software design company, owned by SoftBank Group and its Vision Fund. Headquartered in Cambridge, United Kingdom, its prim

In [33]:
# Save the examples that passed the token-length filter.
file_path = "../data/1113data/hotpot_dev.json"
with open(file_path, mode='w', encoding='utf-8') as out_file:
    json.dump(
        all_result,
        out_file,
        ensure_ascii=False,
        indent=4,
    )

In [35]:
# Read the training JSON from the 1113data directory into `data`.
file_path = "../data/1113data/hotpot_train.json"
with open(file_path, mode='r', encoding='utf-8') as src:
    data = json.loads(src.read())

In [36]:
import random
# How many shuffled training examples to keep in the output file.
TRAIN_SUBSET_SIZE = 30000

# Shuffle a copy so `data` keeps its loaded order. Shuffling `data` in place
# made this cell non-idempotent: running it a second time reshuffled the
# already-shuffled list and wrote a different subset. With the same seed and
# length, the copy receives the same permutation the in-place shuffle gave
# on a first run.
random.seed(42)
data_shuffled = list(data)
random.shuffle(data_shuffled)

write_path = "../data/1113data/hotpot_train_shuffle.json"
with open(write_path, 'w', encoding='utf-8') as f:
    json.dump(data_shuffled[:TRAIN_SUBSET_SIZE], f, ensure_ascii=False, indent=4)