In [1]:
import time
import uuid
import json
import os
import re
from util import find_random_question

# Identifiers for the three writing-problem formats. update_json() stores the
# identifier under each problem's 'WType' key and uses it to decide which
# source fields make up a question's 'example' text:
#   type 1 -> 'article'
#   type 2 -> 'pro1', 'pro2', 'con1', 'con2'
#   type 3 -> 'paragraph', 'guide1', 'guide2', 'guide3'
WRITING_PROBLEM_TYPE_1 = 'WRITING_PROBLEM_TYPE_1'
WRITING_PROBLEM_TYPE_2 = 'WRITING_PROBLEM_TYPE_2'
WRITING_PROBLEM_TYPE_3 = 'WRITING_PROBLEM_TYPE_3'

# Score that update_json() writes into the 'score' field of every question of
# the given problem type.
problem_type_to_score = {
    WRITING_PROBLEM_TYPE_1: 10,
    WRITING_PROBLEM_TYPE_2: 30,
    WRITING_PROBLEM_TYPE_3: 50,
}

def generate_uuid():
    """Return a freshly generated version-1 UUID as a string.

    The call pauses for 10 ms before the identifier is generated.
    """
    time.sleep(0.01)
    identifier = uuid.uuid1()
    return str(identifier)

def get_answer_number(answer, selector):
    """Return the 1-based position of ``answer`` within ``selector``.

    Returns:
        ``None`` when either argument is ``None``; otherwise the position
        (counting from 1) of the first element of ``selector`` that compares
        equal to ``answer``.

    Raises:
        ValueError: if no element of ``selector`` equals ``answer``.
    """
    if answer is None or selector is None:
        return None

    position = next(
        (rank for rank, choice in enumerate(selector, start=1) if choice == answer),
        None,
    )
    if position is None:
        raise ValueError('Answer not found in selector', answer, selector)
    return position

def convert_string_to_list_format(input_string):
    """Render text that is split by markers such as (가), (나) as a list-like string.

    The input is split on every parenthesised single Hangul syllable. The
    non-empty pieces are then taken two at a time, and each pair is emitted as
    one single-quoted item ``'<first> <second>'``. Items are separated by
    ``', '`` and the whole result is wrapped in ``[[`` and ``]]``.

    Returns:
        The formatted string; ``"[[]]"`` when no pieces remain.

    Raises:
        IndexError: if an odd number of pieces remains after filtering, so the
            last piece has no partner.
    """
    # The capturing group makes re.split keep the markers in its output.
    pieces = re.split(r'\s*(\([가-힣]\))\s*', input_string)

    # Drop empty strings, None values and pieces that are a single space.
    kept = [piece for piece in pieces if piece and piece != ' ']

    # Pair consecutive pieces and quote each pair as one item.
    quoted_items = [
        f"'{kept[start]} {kept[start + 1]}'"
        for start in range(0, len(kept), 2)
    ]

    return "[[" + ", ".join(quoted_items) + "]]"

def _type_1_example(item):
    """Example text for a type-1 writing problem: the item's article."""
    return item['article']


def _type_2_example(item):
    """Example text for a type-2 writing problem: pro1, pro2, con1, con2 on separate lines."""
    return f"{item['pro1']}\n{item['pro2']}\n{item['con1']}\n{item['con2']}"


def _type_3_example(item):
    """Example text for a type-3 writing problem: the paragraph, then the three guides joined by '●'."""
    return f"{item['paragraph']}\n{item['guide1']}●{item['guide2']}●{item['guide3']}"


def update_json(json_data, single_problems, topik_type, problem_type):
    """Append one writing problem built from ``single_problems`` to ``json_data``.

    Exam-level keys ('uuid', 'title', 'type', 'year', 'tags', 'problems') are
    filled with defaults only when they are missing. The new problem takes its
    text from the 'question' field of the first item, and gets one question
    entry per item in ``single_problems``.

    Args:
        json_data: exam dictionary; it is modified in place.
        single_problems: non-empty sequence of dictionaries holding the fields
            required by ``problem_type``.
        topik_type: value stored under 'type' when that key is missing.
        problem_type: one of the ``WRITING_PROBLEM_TYPE_*`` constants.

    Returns:
        The same ``json_data`` object, with the problem appended to
        ``json_data['problems']``.

    Raises:
        ValueError: if ``problem_type`` is not a known writing problem type.
            The exam-level defaults have already been applied by then.
    """
    # The uuid is only generated when needed, so it keeps its explicit check.
    if 'uuid' not in json_data:
        json_data['uuid'] = generate_uuid()
    exam_defaults = (
        ('title', 'Generated_TESTN_TOPIK_2_WRITING'),
        ('type', topik_type),
        ('year', 2024),
        ('tags', []),
        ('problems', []),
    )
    for key, fallback in exam_defaults:
        json_data.setdefault(key, fallback)

    entry = {
        'problem': single_problems[0]['question'],
        'WType': problem_type,
    }

    # Pick the function that assembles the 'example' text for this type.
    if problem_type == WRITING_PROBLEM_TYPE_1:
        build_example = _type_1_example
    elif problem_type == WRITING_PROBLEM_TYPE_2:
        build_example = _type_2_example
    elif problem_type == WRITING_PROBLEM_TYPE_3:
        build_example = _type_3_example
    else:
        raise ValueError('Invalid problem type', problem_type)

    entry['questions'] = [
        {
            'question': '',
            'example': build_example(item),
            'score': problem_type_to_score[problem_type],
        }
        for item in single_problems
    ]

    json_data['problems'].append(entry)
    return json_data

def create_json_file(base_path, file_name, data):
    """Write ``data`` as JSON to ``{base_path}/{file_name}``, merging with an existing file.

    When the target file already exists, its content is loaded first and
    updated with ``data``: top-level keys from ``data`` replace the stored
    ones. The merged mapping is then written back. Files are read and written
    with the 'utf-8-sig' encoding, non-ASCII characters are kept as-is, and
    the output is indented by four spaces. A confirmation line is printed
    after the JSON has been dumped.

    Args:
        base_path: directory that holds the target file.
        file_name: name of the JSON file inside ``base_path``.
        data: mapping to store.
    """
    target = f'{base_path}/{file_name}'

    payload = data
    if os.path.exists(target):
        with open(target, 'r', encoding='utf-8-sig') as stored:
            payload = json.load(stored)
        # Shallow merge: new top-level keys win over the ones already on disk.
        payload.update(data)

    with open(target, 'w', encoding='utf-8-sig') as out:
        json.dump(payload, out, ensure_ascii=False, indent=4)
        print(f'{target} has been created successfully.')

def clean_text(text):
    """Normalise one piece of problem text (example, choice or answer).

    Steps, in order:
      1. Text starting with 'http' is returned untouched.
      2. ASCII apostrophes are replaced by the full-width apostrophe '＇'.
      3. If the text contains a ':' whose remainder runs to the end of the
         text without a line break, only the part after that colon is kept.
      4. List numbering such as "1." or "2." is removed.
      5. Line breaks become spaces and surrounding whitespace is stripped.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string.
    """
    if text.startswith('http'):
        return text

    text = text.replace("'", '＇')

    # NOTE(review): this also cuts values such as "10:30" down to "30" —
    # confirm that dropping everything before the colon is always intended.
    match = re.search(r':\s*(.*)$', text)
    if match:
        text = match.group(1).strip()

    # Remove numbering like "1." / "2.". The negative lookahead keeps decimals
    # and dotted dates ("3.5", "2024.09.25") intact; without it their leading
    # digits were deleted as if they were list numbers.
    text = re.sub(r'\d+\.(?!\d)', '', text)
    text = re.sub(r'\n', ' ', text)  # replace line breaks with spaces
    return text.strip()

def assign_scores(total_score, num_questions):
    """Split ``total_score`` over ``num_questions`` questions in 2- and 3-point steps.

    Three two-question cases use fixed splits: 4 -> [2, 2], 5 -> [2, 3] and
    6 -> [3, 3]. In every other case the questions are filled from first to
    last: a question gets 3 points while the score left afterwards still
    covers 2 points for each remaining question, and 2 points otherwise.

    Returns:
        A list of ``num_questions`` integers, each 2 or 3. The list is not
        guaranteed to add up to ``total_score``.
    """
    fixed_two_question_splits = ((4, (2, 2)), (5, (2, 3)), (6, (3, 3)))
    for fixed_total, split in fixed_two_question_splits:
        if total_score == fixed_total and num_questions == 2:
            return list(split)

    allotted = []
    budget = total_score
    # ``still_to_fill`` counts the questions that come after the current one.
    for still_to_fill in range(num_questions - 1, -1, -1):
        points = 3 if budget - 3 >= still_to_fill * 2 else 2
        allotted.append(points)
        budget -= points
    return allotted

def _clean_choice_fields(item):
    """Clean an item's 'selector' entries (in place), 'answer' and 'eval_answer'."""
    options = item["selector"]
    for position, option in enumerate(options):
        options[position] = clean_text(option)
    item["answer"] = clean_text(item["answer"])
    item["eval_answer"] = clean_text(item["eval_answer"])


def exam_scoring(json_data):
    """Clean the texts of every question and assign its score, in place.

    ``json_data`` is a sequence of question groups, one group per problem
    type, in the order of ``scores_per_type``.

    * Types at index 5, 7 and 8 (types 6, 8 and 9) hold questions that carry
      their own list of sub-questions under 'questions'. Each such question
      gets a score of 5, which ``assign_scores`` splits over its
      sub-questions.
    * For every other type, the type's total score is split over the group's
      questions by ``assign_scores``.

    In both cases 'example', the 'selector' entries, 'answer' and
    'eval_answer' are passed through ``clean_text``.

    Returns:
        The same ``json_data`` object.

    Raises:
        IndexError: if ``json_data`` has more groups than ``scores_per_type``.
    """
    scores_per_type = [6, 14, 9, 8, 8, 19, 5, 5, 26]  # total score per problem type
    # Indexes are 0-based, so these are problem types 6, 8 and 9.
    grouped_type_indexes = (5, 7, 8)

    for type_index, questions in enumerate(json_data):
        total_score = scores_per_type[type_index]
        num_questions = len(questions)

        if type_index not in grouped_type_indexes:
            scores = assign_scores(total_score, num_questions)
            for question, score in zip(questions, scores):
                question["example"] = clean_text(question["example"])
                _clean_choice_fields(question)
                question["score"] = score
            continue

        for question in questions:
            question["example"] = clean_text(question["example"])
            question["score"] = 5
            sub_questions = question["questions"]
            # Split the question's 5 points over its sub-questions.
            sub_scores = assign_scores(5, len(sub_questions))
            for sub_question, sub_score in zip(sub_questions, sub_scores):
                sub_question["score"] = sub_score
                _clean_choice_fields(sub_question)

    return json_data

# v1
# def example_preprocess(json_data):
#     temp = []
#     for item in json_data[10:12]:
#         for r in item:
#             temp.append(r)
#     del json_data[10:12]
#     json_data.insert(10, temp) 

#     temp = []
#     for item in json_data[1:4]:
#         for r in item:
#             temp.append(r)
#     del json_data[1:4]
#     json_data.insert(1, temp)
    
#     json_data = exam_scoring(json_data)
#     return json_data

# v2
def example_preprocess(json_data):
    """Merge two runs of question groups into single groups, then score the exam.

    The groups at positions 12-13 are flattened into one group that takes
    their place, and afterwards the groups at positions 1-5 are flattened
    into one group at position 1. ``json_data`` is modified in place and then
    passed to ``exam_scoring``.

    Returns:
        The value returned by ``exam_scoring``.
    """
    # The later range goes first: merging positions 1-5 shortens the list, so
    # doing it second lets 12-13 refer to the original layout.
    merged_late_groups = [entry for group in json_data[12:14] for entry in group]
    json_data[12:14] = [merged_late_groups]

    merged_early_groups = [entry for group in json_data[1:6] for entry in group]
    json_data[1:6] = [merged_early_groups]

    return exam_scoring(json_data)

from datetime import datetime
import glob
# Driver: convert every generated problem file into an exam JSON file.
#
# Each input file is expected to be a JSON list whose first three entries are
# the item lists for writing problem types 1, 2 and 3 (in that order).

# TODO : Change the folder path
folder_path = '/home/shinbg/topik-korea-data/data/reading_1_v2/problem_data'

# Find every file that starts with 'generate_exam_' and ends with '.json'.
file_pattern = os.path.join(folder_path, "generate_exam_*.json")
files = glob.glob(file_pattern)

if not files:
    print("JSON 파일을 찾을 수 없습니다.")
else:
    # TODO : Change the folder path
    base_path = '/home/shinbg/topik-korea-data/data/reading_1_v2/exam_data'
    # The output folder does not change per file, so it is validated once.
    if base_path == '':
        raise ValueError('Please specify the base path')

    # Not used in this cell; kept in case later cells rely on these names.
    Exam_cost_list = []
    for file in files:
        print(f"Processing {file}")
        exam_cost = []

        # 'utf-8-sig' matches the encoding create_json_file() writes and reads
        # UTF-8 with or without a BOM, independent of the system locale.
        # The input file is closed as soon as it has been parsed.
        with open(file, "r", encoding="utf-8-sig") as json_file:
            json_data = json.load(json_file)
        # json_data = example_preprocess(json_data)

        list_of_dict_WRITING_PROBLEM_TYPE_1 = json_data[0]
        list_of_dict_WRITING_PROBLEM_TYPE_2 = json_data[1]
        list_of_dict_WRITING_PROBLEM_TYPE_3 = json_data[2]

        problems_by_type = (
            (WRITING_PROBLEM_TYPE_1, list_of_dict_WRITING_PROBLEM_TYPE_1),
            (WRITING_PROBLEM_TYPE_2, list_of_dict_WRITING_PROBLEM_TYPE_2),
            (WRITING_PROBLEM_TYPE_3, list_of_dict_WRITING_PROBLEM_TYPE_3),
        )

        # Build one exam with one problem per writing type, in type order.
        my_json = {}
        for problem_type, single_problems in problems_by_type:
            my_json = update_json(
                json_data=my_json,
                single_problems=single_problems,
                topik_type='TOPIK_2_WRITING',
                problem_type=problem_type,
            )

        # The output keeps the input's file name; basename copes with any
        # path separator the platform uses.
        file_name = os.path.basename(file)
        create_json_file(base_path, file_name, my_json)

Processing /home/shinbg/topik-korea-data/data/reading_1_v2/problem_data/generate_exam_20240925_0549.json
/home/shinbg/topik-korea-data/data/reading_1_v2/exam_data/generate_exam_20240925_0549.json has been created successfully.
Processing /home/shinbg/topik-korea-data/data/reading_1_v2/problem_data/generate_exam_20240925_0543.json
/home/shinbg/topik-korea-data/data/reading_1_v2/exam_data/generate_exam_20240925_0543.json has been created successfully.
Processing /home/shinbg/topik-korea-data/data/reading_1_v2/problem_data/generate_exam_20240925_0537.json
/home/shinbg/topik-korea-data/data/reading_1_v2/exam_data/generate_exam_20240925_0537.json has been created successfully.
Processing /home/shinbg/topik-korea-data/data/reading_1_v2/problem_data/generate_exam_20240925_0540.json
/home/shinbg/topik-korea-data/data/reading_1_v2/exam_data/generate_exam_20240925_0540.json has been created successfully.
Processing /home/shinbg/topik-korea-data/data/reading_1_v2/problem_data/generate_exam_202409