In [1]:
import os
import json
from typing import List
from tqdm import tqdm

In [3]:
# Module-level accumulator: every record loaded by
# extract_data_from_json_files() is appended here.
data_all = []


def extract_data_from_json_files(file_paths):
    """Load the given JSON files and append their records to ``data_all``.

    Each path is printed before it is examined. A path is skipped when it
    is not an existing regular file or does not end with ``.json``. A file
    whose top-level JSON value is not a list is treated as a single record.

    Args:
        file_paths: Iterable of path strings to read.

    Returns:
        None. Results are accumulated in the module-level ``data_all`` list.
    """
    for file_path in file_paths:
        print(f'Processing file: {file_path}')

        # Only existing files with a .json suffix are read.
        if not os.path.isfile(file_path) or not file_path.endswith('.json'):
            continue

        with open(file_path, 'r', encoding='utf-8') as handle:
            payload = json.load(handle)

        # A lone top-level object is wrapped so it can be extended like a list.
        records = payload if isinstance(payload, list) else [payload]
        data_all.extend(records)

def find_json_files(directory: str) -> List[str]:
    """Recursively collect the absolute paths of ``.json`` files.

    Args:
        directory: Root directory to walk with ``os.walk``.

    Returns:
        Absolute paths of every file under ``directory`` whose name ends
        with ``.json``, in the order ``os.walk`` yields them.
    """
    collected = []

    for root, _dirs, names in os.walk(directory):
        # Keep only .json entries of this directory, as absolute paths.
        collected.extend(
            os.path.abspath(os.path.join(root, name))
            for name in names
            if name.endswith('.json')
        )

    return collected

# Alternative source directory (currently disabled):
# json_file_paths = find_json_files('/mnt/petrelfs/hujucheng/train/data/raw_base/aya_muri_openhms_sft1v2_xinv3_with_score')
json_file_paths = find_json_files('/mnt/hwfile/opendatalab/peiqizhi/data/aya_muri_openhms_sft1v2_xinv3')
extract_data_from_json_files(json_file_paths)
print(len(data_all))

# Read the selection file: one JSON object per line, each with an '_id' key.
jsonl_path = '/mnt/hwfile/opendatalab/gaoxin/best_hu_9w_enzh_6w.jsonl'
with open(jsonl_path, 'r', encoding='utf-8') as selection_file:
    jsonl_data = list(selection_file)

jsonl_ids = [entry['_id'] for entry in map(json.loads, jsonl_data)]
jsonl_ids_set = set(jsonl_ids)

# The selection file must not list the same '_id' twice.
assert len(jsonl_ids_set) == len(jsonl_ids)

# Keep only the 'messages' field of every loaded record whose '_id' was selected.
final_data = [
    {'messages': record['messages']}
    for record in tqdm(data_all)
    if record['_id'] in jsonl_ids_set
]

# Exactly one loaded record is expected per selected id.
assert len(jsonl_ids) == len(final_data)

output_dir = '/mnt/petrelfs/hujucheng/train/data/10Wbase/15wbaseIFDLanguageBalancedSelected'
# NOTE(review): directory creation is disabled, so output_dir must already
# exist or the open() calls below will fail.
# os.makedirs(output_dir, exist_ok=True)

n = 8
# Size of each part; the final part additionally absorbs the remainder.
chunk_size = len(final_data) // n

# Slice boundaries: n evenly spaced starts followed by the end of the data.
boundaries = [k * chunk_size for k in range(n)]
boundaries.append(len(final_data))

for i, (start, end) in enumerate(zip(boundaries, boundaries[1:])):
    # Write the current slice as its own JSON file, numbered from 1.
    filename = os.path.join(output_dir, f'data_part_{i+1}.json')
    with open(filename, 'w', encoding='utf-8') as out_file:
        json.dump(final_data[start:end], out_file, ensure_ascii=False, indent=4)

print(f'Data split into {n} parts and saved to {output_dir}')

Processing file: /mnt/hwfile/opendatalab/peiqizhi/data/aya_muri_openhms_sft1v2_xinv3/muri_hu_train_val_format.json
Processing file: /mnt/hwfile/opendatalab/peiqizhi/data/aya_muri_openhms_sft1v2_xinv3/aya_collection_100k_format/translated_flan_coqa_format.json
Processing file: /mnt/hwfile/opendatalab/peiqizhi/data/aya_muri_openhms_sft1v2_xinv3/aya_collection_100k_format/translated_paws_sample2000_format.json
Processing file: /mnt/hwfile/opendatalab/peiqizhi/data/aya_muri_openhms_sft1v2_xinv3/aya_collection_100k_format/translated_adversarial_qa_sample3000_format.json
Processing file: /mnt/hwfile/opendatalab/peiqizhi/data/aya_muri_openhms_sft1v2_xinv3/aya_collection_100k_format/translated_hotpotqa_sample10000_format.json
Processing file: /mnt/hwfile/opendatalab/peiqizhi/data/aya_muri_openhms_sft1v2_xinv3/aya_collection_100k_format/translated_joke_explaination_format.json
Processing file: /mnt/hwfile/opendatalab/peiqizhi/data/aya_muri_openhms_sft1v2_xinv3/aya_collection_100k_format/transla

100%|██████████| 4742440/4742440 [00:04<00:00, 964677.96it/s] 


Data split into 8 parts and saved to /mnt/petrelfs/hujucheng/train/data/10Wbase/15wbaseIFDLanguageBalancedSelected
