In [2]:
import time
import uuid
import json
import os
import re
from util import find_random_question

def generate_uuid():
    """Return a newly generated version-1 (time-based) UUID as a string.

    The call pauses for 10 ms before the identifier is created.
    """
    time.sleep(0.01)
    fresh_id = uuid.uuid1()
    return str(fresh_id)

def get_answer_number(answer, selector):
    """Return the 1-based position of ``answer`` within ``selector``.

    Both the answer and every choice are compared after removing every
    character that is not a Hangul syllable, an ASCII letter, a digit or
    whitespace, so punctuation differences do not prevent a match.

    Args:
        answer: The answer text, or ``None``.
        selector: Iterable of choice strings, or ``None``.

    Returns:
        The 1-based index of the first matching choice, or ``None`` when
        either argument is ``None``.

    Raises:
        ValueError: If no choice matches; the normalised answer and the
            selector are attached as exception arguments.
    """
    if answer is None or selector is None:
        return None

    # Compile once and normalise the answer once: both are loop-invariant.
    non_word_chars = re.compile(r"[^\uAC00-\uD7A30-9a-zA-Z\s]")
    answer = non_word_chars.sub("", answer)

    for position, choice in enumerate(selector, start=1):
        if non_word_chars.sub("", choice) == answer:
            return position
    raise ValueError('Answer not found in selector', answer, selector)

def convert_string_to_list_format(input_string):
    """Render a "(가) ... (나) ..." passage as a nested-list-looking string.

    The input is split on markers made of a single Hangul syllable in
    parentheses, e.g. ``(가)``.  Each marker is joined with the sentence that
    follows it, and the items are emitted as ``[['(가) ...', '(나) ...']]``.

    Unlike blind pairing of consecutive fragments, every marker is paired with
    the text that actually follows it, so malformed input no longer raises
    ``IndexError`` or shifts the pairing:

    * text before the first marker becomes its own item;
    * a marker with no following sentence is emitted on its own.

    Args:
        input_string: The raw passage text.

    Returns:
        A string of the form ``[['item', 'item', ...]]`` (``[[]]`` when the
        input yields no items).  Quotes inside items are not escaped.
    """
    # With one capturing group, re.split returns
    # [leading, marker, text, marker, text, ...]: markers sit at odd indices.
    parts = re.split(r'\s*(\([가-힣]\))\s*', input_string)

    items = []
    leading_text = parts[0]
    if leading_text and leading_text != ' ':
        items.append(leading_text)

    for marker, sentence in zip(parts[1::2], parts[2::2]):
        if sentence and sentence != ' ':
            items.append(f"{marker} {sentence}")
        else:
            items.append(marker)

    quoted_items = ", ".join(f"'{item}'" for item in items)
    return "[[" + quoted_items + "]]"

def _question_qe_type(problem_type, detailed_problem_type):
    """Look up the QEType, preferring the detailed problem type when given."""
    lookup_key = problem_type if detailed_problem_type is None else detailed_problem_type
    return problem_type_to_QEType[lookup_key]


def _build_answers(selector, problem_type):
    """Build one answer entry (uuid, AType, answer text) per choice."""
    return [
        {
            'uuid': generate_uuid(),
            'AType': problem_type_to_AType[problem_type],
            'answer': choice,
        }
        for choice in selector
    ]


def _new_problem(problem_type):
    """Create a problem entry carrying the fields shared by both layouts."""
    return {
        'uuid': generate_uuid(),
        'problem': problem_type_to_question[problem_type],
        'PType': problem_type,
        'EType': problem_type_to_EType[problem_type],
    }


def _question_base(source, question_text, problem_type, detailed_problem_type):
    """Create a question entry with uuid, QEType, text, score and right answer."""
    return {
        'uuid': generate_uuid(),
        'QEType': _question_qe_type(problem_type, detailed_problem_type),
        'questionProblem': question_text,
        'score': source['score'],
        'rightAnswer': get_answer_number(source['eval_answer'].strip(), source['selector']),
    }


def _standalone_question(single_problem, problem_type, problem_type_num, detailed_problem_type):
    """Build a question for a problem type that has no shared passage."""
    question = _question_base(single_problem, '', problem_type, detailed_problem_type)
    raw_example = single_problem['example']

    if question['QEType'] == 'IMAGE' or problem_type_num == 12:
        # Image questions and type 12 keep the example exactly as provided.
        question['example'] = raw_example
    elif problem_type_num in (1, 2):
        # Types 1 and 2 show the example text as the question itself.
        question['questionProblem'] = raw_example
        question['example'] = ""
    elif problem_type_num == 5:
        question['example'] = convert_string_to_list_format(raw_example)
    else:
        question['example'] = str([[raw_example]])

    question['answers'] = _build_answers(single_problem['selector'], problem_type)
    return question


def _grouped_question(q, problem_type, detailed_problem_type):
    """Build one sub-question of a passage-based problem."""
    question = _question_base(q, q['question'], problem_type, detailed_problem_type)
    try:
        question['example'] = str([[q['example']]])
    except KeyError:
        # Sub-questions without their own example get an empty string.
        question['example'] = ""
    question['answers'] = _build_answers(q['selector'], problem_type)
    return question


def update_json(json_data, single_problems, problem_type, detailed_problem_type=None):
    """Append the problems of one problem type to an exam document.

    Missing top-level fields (``uuid``, ``title``, ``type``, ``year``,
    ``problems``) are filled in first.  The numeric suffix of ``problem_type``
    then selects one of two layouts:

    * not listed in ``problem_types_dependent_on_example``: a single problem
      entry is appended whose ``questions`` hold one question per item of
      ``single_problems``;
    * listed there: one problem entry is appended per item, with the item's
      ``example`` as the problem example and its ``questions`` as
      sub-questions.

    Relies on the module-level tables ``problem_type_to_question``,
    ``problem_type_to_EType``, ``problem_type_to_QEType``,
    ``problem_type_to_AType`` and ``problem_types_dependent_on_example``.

    Args:
        json_data: Exam document (dict) to extend; modified in place.
        single_problems: Generated items for this problem type.
        problem_type: Key such as ``'READING_2_PROBLEM_TYPE_7'``; the part
            after the last underscore must be an integer.
        detailed_problem_type: Optional key used instead of ``problem_type``
            for the QEType lookup only.

    Returns:
        The same ``json_data`` object.

    Raises:
        KeyError: If a lookup table or an item lacks a required key.
        ValueError: If a right answer is not found among the choices (from
            ``get_answer_number``) or the type suffix is not an integer.
    """
    if 'uuid' not in json_data:
        # Not setdefault(): generate_uuid() must only run when needed.
        json_data['uuid'] = generate_uuid()
    json_data.setdefault('title', 'Generated_TESTN_TOPIK_2_READING')
    json_data.setdefault('type', 'TOPIK_2_READING')
    json_data.setdefault('year', 2024)
    json_data.setdefault('problems', [])

    # Numeric suffix of the problem type, e.g. 7 for '..._TYPE_7'.
    problem_type_num = int(problem_type.split('_')[-1])

    if problem_type_num not in problem_types_dependent_on_example:
        problem = _new_problem(problem_type)
        # Stored via str() (Python repr), not json.dumps; kept as-is because
        # the format expected by consumers is not visible here.
        problem['example'] = str({
            "conversation": None,
            "answers": None,
            "selected": None
        })
        problem['questions'] = [
            _standalone_question(single_problem, problem_type, problem_type_num, detailed_problem_type)
            for single_problem in single_problems
        ]
        json_data['problems'].append(problem)
    else:
        # Passage-based types: every item becomes its own problem entry.
        for single_problem in single_problems:
            problem = _new_problem(problem_type)
            if problem['EType'] == 'IMAGE':
                problem['example'] = single_problem['example']
            else:
                problem['example'] = str({
                    "conversation": str([single_problem['example']]),
                    "answers": None,
                    "selected": None
                })
            problem['questions'] = [
                _grouped_question(q, problem_type, detailed_problem_type)
                for q in single_problem['questions']
            ]
            json_data['problems'].append(problem)

    return json_data

def create_json_file(base_path, file_name, data):
    """Write ``data`` as JSON to ``<base_path>/<file_name>``.

    When the target already exists, its content is loaded first and updated
    with ``data`` (keys in ``data`` win), and the merged result is written
    back.  Files are read and written with the ``utf-8-sig`` encoding, and a
    confirmation line is printed after the dump.
    """
    target = '{}/{}'.format(base_path, file_name)

    payload = data
    if os.path.exists(target):
        # Merge on top of whatever is already stored in the file.
        with open(target, 'r', encoding='utf-8-sig') as source:
            payload = json.load(source)
        payload.update(data)

    with open(target, 'w', encoding='utf-8-sig') as sink:
        json.dump(payload, sink, ensure_ascii=False, indent=4)
        print(f'{target} has been created')

def clean_text(text):
    """Normalise a piece of generated text.

    Strings starting with ``http`` are returned unchanged.  Otherwise:
    ASCII apostrophes become full-width ones; if a colon can be matched with
    the remainder running to the end of the string, only the part after it is
    kept; every "digits followed by a dot" sequence is removed; newlines
    become spaces; and surrounding whitespace is stripped.
    """
    if text.startswith('http'):
        return text

    cleaned = text.replace("'", '＇')

    after_colon = re.search(r':\s*(.*)$', cleaned)
    if after_colon is not None:
        cleaned = after_colon.group(1).strip()

    # Drop numbering such as "1." or "2." wherever it occurs.
    cleaned = re.sub(r'\d+\.', '', cleaned)
    # Flatten line breaks into single spaces.
    cleaned = cleaned.replace('\n', ' ')
    return cleaned.strip()

# TODO: exam_scoring still needs to be revised.
def _clean_choice_fields(item):
    """Apply clean_text to the selector entries, answer and eval_answer in place."""
    selector = item["selector"]
    for position, choice in enumerate(selector):
        selector[position] = clean_text(choice)
    item["answer"] = clean_text(item["answer"])
    item["eval_answer"] = clean_text(item["eval_answer"])


def _score_passage_group(group, group_score, sub_scores):
    """Clean a passage-based problem and assign its group and sub-question scores."""
    group["example"] = clean_text(group["example"])
    group["score"] = group_score
    for sub_index, sub_question in enumerate(group["questions"]):
        # Indexing (not zip) keeps the IndexError raised when a group has
        # more sub-questions than there are scores.
        sub_question["score"] = sub_scores[sub_index]
        _clean_choice_fields(sub_question)


def exam_scoring(json_data):
    """Clean the text fields of generated exam data and assign scores in place.

    ``json_data`` is iterated as a sequence of sections; the section index
    decides how its entries are treated:

    * indices 7, 8, 13, 14, 15: passage groups scored 4, whose sub-questions
      are scored 2 each (at most two sub-questions);
    * index 16: passage groups scored 6, whose sub-questions are scored 2
      each (at most three sub-questions);
    * index 12: single questions scored 2 whose ``example`` is left as-is;
    * every other index: single questions scored 2 whose ``example`` is
      cleaned as well.

    In all cases the ``selector`` entries, ``answer`` and ``eval_answer`` are
    passed through ``clean_text``.

    Args:
        json_data: Sequence of sections, each a list of question dicts.

    Returns:
        The same ``json_data`` object, modified in place.

    Raises:
        KeyError: If an entry lacks a field that is cleaned or read.
        IndexError: If a passage group has more sub-questions than scores.
    """
    two_question_group_indices = (7, 8, 13, 14, 15)
    three_question_group_index = 16
    raw_example_index = 12

    for type_index, questions in enumerate(json_data):
        if type_index in two_question_group_indices:
            for group in questions:
                _score_passage_group(group, 4, [2, 2])
        elif type_index == three_question_group_index:
            for group in questions:
                _score_passage_group(group, 6, [2, 2, 2])
        else:
            for question in questions:
                if type_index != raw_example_index:
                    question["example"] = clean_text(question["example"])
                _clean_choice_fields(question)
                question["score"] = 2
    return json_data

def example_preprocess(json_data):
    """Pre-process generated exam data; currently this only runs exam_scoring."""
    return exam_scoring(json_data)

from datetime import datetime
import glob
# Input/output locations.  Alternative paths used on other machines:
# folder_path = 'C:/Users/wlsdu/Desktop/developer/final_project/topik/topik-korea-data/data/problem_data'
# base_path = 'C:/Users/User/Desktop/temp/uni/final_project/topik-korea-data/crawling/data/json'
folder_path = '/home/shinbg/topik-korea-data/data/reading_2_v2/problem_data'
base_path = '/home/shinbg/topik-korea-data/data/reading_2_v2/exam_data'

# Find every file that starts with 'generate_exam_' and ends with '.json'.
file_pattern = os.path.join(folder_path, "generate_exam_*.json")
files = glob.glob(file_pattern)

# Lookup tables read by update_json.  They do not depend on the file being
# processed, so they are defined once instead of once per file.
problem_types_dependent_on_example = [7, 8, 13, 14, 15, 16]

problem_type_to_question = {
    'READING_2_PROBLEM_TYPE_1': '※ (   )에 들어갈 말로 가장 알맞은 것을 고르십시오.',
    'READING_2_PROBLEM_TYPE_2': '※ 밑줄 친 부분과 의미가 가장 비슷한 것을 고르십시오.',
    'READING_2_PROBLEM_TYPE_3': '※ 다음은 무엇에 대한 글인지 고르십시오.',
    'READING_2_PROBLEM_TYPE_4': '※ 다음 글 또는 그래프의 내용과 같은 것을 고르십시오.',
    'READING_2_PROBLEM_TYPE_5': '※ 다음을 순서에 맞게 배열한 것을 고르십시오.',
    'READING_2_PROBLEM_TYPE_6': '※ (   )에 들어갈 말로 가장 알맞은 것을 고르십시오.',
    'READING_2_PROBLEM_TYPE_7': '※ 다음을 읽고 물음에 답하십시오.',
    'READING_2_PROBLEM_TYPE_8': '※ 다음을 읽고 물음에 답하십시오.',
    'READING_2_PROBLEM_TYPE_9': '※ 다음 신문 기사의 제목을 가장 잘 설명한 것을 고르십시오.',
    'READING_2_PROBLEM_TYPE_10': '※ 다음을 읽고 글의 내용과 같은 것을 고르십시오.',
    'READING_2_PROBLEM_TYPE_11': '※ 다음을 읽고 글의 주제로 가장 알맞은 것을 고르십시오.',
    'READING_2_PROBLEM_TYPE_12': '※ 주어진 문장이 들어갈 곳으로 가장 알맞은 것을 고르십시오.',
    'READING_2_PROBLEM_TYPE_13': '※ 다음을 읽고 물음에 답하십시오.',
    'READING_2_PROBLEM_TYPE_14': '※ 다음을 읽고 물음에 답하십시오.',
    'READING_2_PROBLEM_TYPE_15': '※ 다음을 읽고 물음에 답하십시오.',
    'READING_2_PROBLEM_TYPE_16': '※ 다음을 읽고 물음에 답하십시오.'
}

# Every problem type 1..16 uses a TEXT example and TEXT answers.
problem_type_to_EType = {
    f'READING_2_PROBLEM_TYPE_{number}': 'TEXT' for number in range(1, 17)
}
problem_type_to_AType = {
    f'READING_2_PROBLEM_TYPE_{number}': 'TEXT' for number in range(1, 17)
}

problem_type_to_QEType = {
    'READING_2_PROBLEM_TYPE_1': 'TEXT',
    'READING_2_PROBLEM_TYPE_2': 'TEXT',
    'READING_2_PROBLEM_TYPE_3': 'IMAGE',
    'READING_2_PROBLEM_TYPE_4_1': 'TEXT',   # content-matching question, TEXT variant
    'READING_2_PROBLEM_TYPE_4_2': 'IMAGE',  # content-matching question, IMAGE variant
    'READING_2_PROBLEM_TYPE_5': 'TEXT',
    'READING_2_PROBLEM_TYPE_6': 'TEXT',
    'READING_2_PROBLEM_TYPE_7': 'TEXT',
    'READING_2_PROBLEM_TYPE_8': 'TEXT',
    'READING_2_PROBLEM_TYPE_9': 'TEXT',
    'READING_2_PROBLEM_TYPE_10': 'TEXT',
    'READING_2_PROBLEM_TYPE_11': 'TEXT',
    'READING_2_PROBLEM_TYPE_12': 'TEXT',
    'READING_2_PROBLEM_TYPE_13': 'TEXT',
    'READING_2_PROBLEM_TYPE_14': 'TEXT',
    'READING_2_PROBLEM_TYPE_15': 'TEXT',
    'READING_2_PROBLEM_TYPE_16': 'TEXT',
}

# One (problem_type, detailed_problem_type) pair per section of the input
# list, in input order.  Type 4 occupies two consecutive sections.
exam_sections = [
    ('READING_2_PROBLEM_TYPE_1', None),
    ('READING_2_PROBLEM_TYPE_2', None),
    ('READING_2_PROBLEM_TYPE_3', None),
    ('READING_2_PROBLEM_TYPE_4', 'READING_2_PROBLEM_TYPE_4_1'),
    ('READING_2_PROBLEM_TYPE_4', 'READING_2_PROBLEM_TYPE_4_2'),
    ('READING_2_PROBLEM_TYPE_5', None),
    ('READING_2_PROBLEM_TYPE_6', None),
    ('READING_2_PROBLEM_TYPE_7', None),
    ('READING_2_PROBLEM_TYPE_8', None),
    ('READING_2_PROBLEM_TYPE_9', None),
    ('READING_2_PROBLEM_TYPE_10', None),
    ('READING_2_PROBLEM_TYPE_11', None),
    ('READING_2_PROBLEM_TYPE_12', None),
    ('READING_2_PROBLEM_TYPE_13', None),
    ('READING_2_PROBLEM_TYPE_14', None),
    ('READING_2_PROBLEM_TYPE_15', None),
    ('READING_2_PROBLEM_TYPE_16', None),
]

if not files:
    print("JSON 파일을 찾을 수 없습니다.")
else:
    if base_path == '':
        raise ValueError('Please specify the base path')

    for file in files:
        print(f"Processing {file}")
        # utf-8-sig reads UTF-8 with or without a BOM, independent of the
        # platform's default encoding.  The file is closed before processing.
        with open(file, "r", encoding="utf-8-sig") as json_file:
            json_data = json.load(json_file)
        json_data = example_preprocess(json_data)

        my_json = {}
        for section_index, (problem_type, detailed_problem_type) in enumerate(exam_sections):
            my_json = update_json(
                json_data=my_json,
                single_problems=json_data[section_index],
                problem_type=problem_type,
                detailed_problem_type=detailed_problem_type,
            )

        # basename also handles the platform's native path separator.
        file_name = os.path.basename(file)
        create_json_file(base_path, file_name, my_json)

Processing /home/shinbg/topik-korea-data/data/reading_2_v2/problem_data/generate_exam_20240925_0912.json
/home/shinbg/topik-korea-data/data/reading_2_v2/exam_data/generate_exam_20240925_0912.json has been created
Processing /home/shinbg/topik-korea-data/data/reading_2_v2/problem_data/generate_exam_20240925_0845.json
/home/shinbg/topik-korea-data/data/reading_2_v2/exam_data/generate_exam_20240925_0845.json has been created
Processing /home/shinbg/topik-korea-data/data/reading_2_v2/problem_data/generate_exam_20240925_0958.json
/home/shinbg/topik-korea-data/data/reading_2_v2/exam_data/generate_exam_20240925_0958.json has been created
Processing /home/shinbg/topik-korea-data/data/reading_2_v2/problem_data/generate_exam_20240925_0805.json
/home/shinbg/topik-korea-data/data/reading_2_v2/exam_data/generate_exam_20240925_0805.json has been created
Processing /home/shinbg/topik-korea-data/data/reading_2_v2/problem_data/generate_exam_20240925_0841.json
/home/shinbg/topik-korea-data/data/reading_

KeyboardInterrupt: 