In [1]:
import json

def create_prompt_completion(entry, field):
    """Build one prompt/completion pair from a single text field of an entry.

    Args:
        entry: Mapping for one record. ``entry[field]`` supplies the prompt
            text and ``entry["distortions"]`` supplies the labels (a list of
            strings, or a single string treated as one label).
        field: Key of ``entry`` whose text becomes the prompt.

    Returns:
        A dict ``{"prompt": ..., "completion": ...}``: the prompt is the
        stripped text followed by a ``###`` separator surrounded by blank
        lines, and the completion is one leading space followed by the labels
        joined with ``", "``. Returns ``None`` when the text is missing, null,
        empty or whitespace-only, or when there are no labels.
    """
    # `.get(field, "")` only falls back when the key is absent; a key that is
    # present with a null value would reach `.strip()` as None and raise.
    prompt = (entry.get(field) or "").strip()
    if not prompt:
        return None

    distortions = entry.get("distortions") or []
    # A bare string is a single label; joining it directly would split it
    # into individual characters.
    if isinstance(distortions, str):
        distortions = [distortions]
    if not distortions:
        return None

    return {
        "prompt": prompt + "\n\n###\n\n",
        "completion": " " + ", ".join(distortions),
    }

def convert_json_to_jsonl(input_json_path, output_jsonl_path):
    """Convert a JSON file of entries into a prompt/completion JSONL file.

    For every entry, the fields ``ori_text``, ``situation`` and ``thoughts``
    are each passed to ``create_prompt_completion``; every non-empty result
    becomes one line of the output file.

    Args:
        input_json_path: Path of the UTF-8 JSON file to read. The loaded value
            is iterated entry by entry (presumably a list of dicts; confirm
            against the data files).
        output_jsonl_path: Path of the UTF-8 JSONL file to write (overwritten).
    """
    source_fields = ("ori_text", "situation", "thoughts")

    with open(input_json_path, "r", encoding="utf-8") as src:
        records = json.load(src)

    # Entry-major, field-minor order; fields that yield nothing are dropped.
    candidates = (
        create_prompt_completion(record, name)
        for record in records
        for name in source_fields
    )
    pairs = [pair for pair in candidates if pair]

    with open(output_jsonl_path, "w", encoding="utf-8") as dst:
        # ensure_ascii=False keeps non-ASCII text readable in the output file.
        dst.writelines(json.dumps(pair, ensure_ascii=False) + "\n" for pair in pairs)

    print(f"JSONL 파일 생성 완료: {output_jsonl_path}")

def merge_jsonl_files(file1_path, file2_path, output_path):
    """Concatenate two JSONL files into ``output_path``, dropping blank lines.

    Records are written in order: all of ``file1_path``, then all of
    ``file2_path``. Every written record is terminated by a newline, so an
    input whose last line has no trailing newline cannot be fused with the
    first record of the next file.

    Args:
        file1_path: Path of the first UTF-8 JSONL file.
        file2_path: Path of the second UTF-8 JSONL file.
        output_path: Path of the merged file (overwritten). It is opened for
            writing before the inputs are read, so it must not be one of them.
    """
    with open(output_path, 'w', encoding='utf-8') as outfile:
        for file_path in (file1_path, file2_path):
            with open(file_path, 'r', encoding='utf-8') as infile:
                for line in infile:
                    # Skip blank lines.
                    if not line.strip():
                        continue
                    # Only a file's final line can lack the terminator.
                    outfile.write(line if line.endswith('\n') else line + '\n')

# Usage example: convert the seed and test JSON files into prompt/completion
# JSONL files, then concatenate both results into one merged JSONL file.
convert_json_to_jsonl("distortions_seed.json", "distortions_seed_finetune_data.jsonl")
convert_json_to_jsonl("distortions_test.json", "distortions_test_finetune_data.jsonl")
merge_jsonl_files('distortions_seed_finetune_data.jsonl', 'distortions_test_finetune_data.jsonl', 'merged_distortions_old.jsonl')


JSONL 파일 생성 완료: distortions_seed_finetune_data.jsonl
JSONL 파일 생성 완료: distortions_test_finetune_data.jsonl


In [2]:
import json

def convert_prompt_completion_to_messages(input_path, output_path):
    """Rewrite a prompt/completion JSONL file in chat ``messages`` format.

    Each non-blank input line is parsed as a JSON object; its ``prompt``
    becomes the ``user`` message and its ``completion`` the ``assistant``
    message, both stripped of surrounding whitespace. Blank lines are skipped
    instead of being passed to ``json.loads``, which would raise on them.

    Args:
        input_path: Path of the UTF-8 JSONL file with ``prompt`` and
            ``completion`` keys (a missing key is treated as an empty string).
        output_path: Path of the UTF-8 JSONL file to write (overwritten).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(input_path, 'r', encoding='utf-8') as fin, open(output_path, 'w', encoding='utf-8') as fout:
        for line in fin:
            # Tolerate blank lines, e.g. a trailing empty line at end of file.
            if not line.strip():
                continue
            data = json.loads(line)
            # strip() removes only surrounding whitespace: a prompt ending in
            # "\n\n###\n\n" keeps its "###" marker in the user message.
            prompt = data.get("prompt", "").strip()
            completion = data.get("completion", "").strip()

            example = {
                "messages": [
                    {"role": "user", "content": prompt},
                    {"role": "assistant", "content": completion}
                ]
            }
            fout.write(json.dumps(example, ensure_ascii=False) + "\n")

# Usage example: convert the merged prompt/completion file to chat-message format.
convert_prompt_completion_to_messages("merged_distortions_old.jsonl", "merged_distortions_output.jsonl")


In [3]:
import json
import random

# File paths
input_file = 'merged_distortions_output.jsonl'
train_file = 'distortions_output_train.jsonl'
valid_file = 'distortions_output_valid.jsonl'

# Read the data, dropping blank lines. Every kept line is normalised to end
# with "\n": if the input's last record had no trailing newline, the shuffle
# could move it mid-file and it would be fused with the next record on write.
with open(input_file, 'r', encoding='utf-8') as f:
    data = [line.rstrip('\n') + '\n' for line in f if line.strip()]

# Shuffle (seed fixed for reproducibility)
# NOTE(review): the split is made per line. If several lines derive from the
# same source entry, related records can land on both sides of the split -
# confirm whether that is acceptable for validation.
random.seed(21)
random.shuffle(data)

# 9:1 split; int() truncates, so the validation share is rounded down.
n_total = len(data)
n_valid = int(n_total * 0.1)
n_train = n_total - n_valid

train_data = data[:n_train]
valid_data = data[n_train:]

# Save to files
with open(train_file, 'w', encoding='utf-8') as f:
    for line in train_data:
        f.write(line)

with open(valid_file, 'w', encoding='utf-8') as f:
    for line in valid_data:
        f.write(line)

print(f"Train: {n_train}개, Valid: {n_valid}개")



Train: 449개, Valid: 49개
