In [20]:
from datasets import load_dataset, Dataset, DatasetDict, load_from_disk, concatenate_datasets
import pandas as pd
import random

In [2]:
race_dataset = load_dataset("ehovy/race", "all")

Generating test split: 100%|██████████| 4934/4934 [00:00<00:00, 169331.63 examples/s]
Generating train split: 100%|██████████| 87866/87866 [00:00<00:00, 361070.85 examples/s]
Generating validation split: 100%|██████████| 4887/4887 [00:00<00:00, 243543.13 examples/s]


In [3]:
cloth_dataset = load_dataset("AndyChiang/cloth")

Generating train split: 100%|██████████| 76850/76850 [00:00<00:00, 137471.71 examples/s]
Generating validation split: 100%|██████████| 11067/11067 [00:00<00:00, 157265.62 examples/s]
Generating test split: 100%|██████████| 11516/11516 [00:00<00:00, 172892.89 examples/s]


In [4]:
import random

cloth_train = cloth_dataset['train']
train_len = len(cloth_train)

index_to_letter = ["A", "B", "C", "D"]

korean_question = [
    "빈칸에 들어갈 말로 적절한 것은?",
    "빈칸에 들어갈 적절한 말을 고르시오.",
    "다음 중 빈칸에 들어가기 가장 적절한 말은?"
]

example_id = []
articles = []
questions = []
options = []
labels = []

for i in range(train_len):
    example_id.append(i)
    modified_setence = cloth_train[i]['sentence'].replace(' [MASK] ', '_')
    articles.append(modified_setence)
    rand_num = random.random()
    if(rand_num < 0.7):
      questions.append("Which word best fits in the blank?")
    else:
      questions.append(random.choice(korean_question))

    choices = cloth_train[i]['distractors'] + [cloth_train[i]['answer']]
    shuffled = random.sample(choices, k=4)

    options.append(shuffled)
    labels.append(index_to_letter[shuffled.index(cloth_train[i]['answer'])])

data = {
    "example_id": example_id,
    "article": articles,
    "question": questions,
    "options": options,
    "answer": labels  # 문자열 "A", "B", "C", "D"
}

new_cloth_dataset_train = Dataset.from_dict(data)


In [6]:
race_train = race_dataset['train']
train_len = len(race_train)

example_id = []
articles = []
questions = []
options = []
labels = []

for i in range(train_len):
    example_id.append(i)
    articles.append(race_train[i]['article'])

    fixed_question = race_train[i]['question'].replace('_', ' _')
    questions.append(fixed_question)
    options.append(race_train[i]['options'])
    labels.append(race_train[i]['answer'])

data = {
    "example_id": example_id,
    "article": articles,
    "question": questions,
    "options": options,
    "answer": labels  # 문자열 "A", "B", "C", "D"
}

new_race_dataset_train = Dataset.from_dict(data)

In [7]:
cloth_validation = cloth_dataset['validation']
validation_len = len(cloth_validation)

index_to_letter = ["A", "B", "C", "D"]

example_id = []
articles = []
questions = []
options = []
labels = []

korean_question = [
    "빈칸에 들어갈 말로 적절한 것은?",
    "빈칸에 들어갈 적절한 말을 고르시오.",
    "다음 중 빈칸에 들어가기 가장 적절한 말은?"
]

for i in range(validation_len):
    example_id.append(i)
    modified_setence = cloth_validation[i]['sentence'].replace(' [MASK] ', '_')
    articles.append(modified_setence)
    rand_num = random.random()
    if(rand_num < 0.7):
      questions.append("Which word best fits in the blank?")
    else:
      questions.append(random.choice(korean_question))

    choices = cloth_validation[i]['distractors'] + [cloth_validation[i]['answer']]
    shuffled = random.sample(choices, k=4)

    options.append(shuffled)
    labels.append(index_to_letter[shuffled.index(cloth_validation[i]['answer'])])

data = {
    "example_id": example_id,
    "article": articles,
    "question": questions,
    "options": options,
    "answer": labels  # 문자열 "A", "B", "C", "D"
}

new_cloth_dataset_validation = Dataset.from_dict(data)


In [8]:
race_validation = race_dataset['validation']
train_len = len(race_validation)

example_id = []
articles = []
questions = []
options = []
labels = []

for i in range(train_len):
    example_id.append(i)
    articles.append(race_validation[i]['article'])
    fixed_question = race_validation[i]['question'].replace('_', ' _')
    questions.append(fixed_question)
    options.append(race_validation[i]['options'])
    labels.append(race_validation[i]['answer'])

data = {
    "example_id": example_id,
    "article": articles,
    "question": questions,
    "options": options,
    "answer": labels  # 문자열 "A", "B", "C", "D"
}

new_race_dataset_validation = Dataset.from_dict(data)

In [9]:
cloth_test = cloth_dataset['test']
test_len = len(cloth_test)

index_to_letter = ["A", "B", "C", "D"]

example_id = []
articles = []
questions = []
options = []
labels = []

korean_question = [
    "빈칸에 들어갈 말로 적절한 것은?",
    "빈칸에 들어갈 적절한 말을 고르시오.",
    "다음 중 빈칸에 들어가기 가장 적절한 말은?"
]

for i in range(validation_len):
    example_id.append(i)
    modified_setence = cloth_test[i]['sentence'].replace(' [MASK] ', '_')
    articles.append(modified_setence)
    rand_num = random.random()
    if(rand_num < 0.7):
      questions.append("Which word best fits in the blank?")
    else:
      questions.append(random.choice(korean_question))

    choices = cloth_test[i]['distractors'] + [cloth_test[i]['answer']]
    shuffled = random.sample(choices, k=4)

    options.append(shuffled)
    labels.append(index_to_letter[shuffled.index(cloth_test[i]['answer'])])

data = {
    "example_id": example_id,
    "article": articles,
    "question": questions,
    "options": options,
    "answer": labels  # 문자열 "A", "B", "C", "D"
}

new_cloth_dataset_test = Dataset.from_dict(data)


In [10]:
race_test = race_dataset['test']
train_len = len(race_test)

example_id = []
articles = []
questions = []
options = []
labels = []

for i in range(train_len):
    example_id.append(i)
    articles.append(race_test[i]['article'])
    fixed_question = race_test[i]['question'].replace('_', ' _')
    questions.append(fixed_question)
    options.append(race_test[i]['options'])
    labels.append(race_test[i]['answer'])

data = {
    "example_id": example_id,
    "article": articles,
    "question": questions,
    "options": options,
    "answer": labels  # 문자열 "A", "B", "C", "D"
}

new_race_dataset_test = Dataset.from_dict(data)

In [11]:
ds_dict_cloth = DatasetDict({
    "train": new_cloth_dataset_train,
    "validation": new_cloth_dataset_validation,
    "test": new_cloth_dataset_test
})

ds_dict_race = DatasetDict({
    "train": new_race_dataset_train,
    "validation": new_race_dataset_validation,
    "test": new_race_dataset_test
})

In [12]:
ds_dict_cloth.save_to_disk("my_cloth")
ds_dict_race.save_to_disk("my_race")

Saving the dataset (1/1 shards): 100%|██████████| 76850/76850 [00:00<00:00, 1597752.87 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 11067/11067 [00:00<00:00, 937687.86 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 11067/11067 [00:00<00:00, 1104410.24 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 87866/87866 [00:00<00:00, 769837.72 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 4887/4887 [00:00<00:00, 411911.97 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 4934/4934 [00:00<00:00, 573482.68 examples/s]


In [13]:
real_loaded_cloth = load_from_disk("my_cloth")
real_loaded_race = load_from_disk("my_race")

In [14]:
merged_dataset_train = concatenate_datasets([real_loaded_cloth['train'], real_loaded_race['train']])
merged_dataset_validation = concatenate_datasets([real_loaded_cloth['validation'], real_loaded_race['validation']])
merged_dataset_test = concatenate_datasets([real_loaded_cloth['test'], real_loaded_race['test']])

In [15]:
merged_dataset_dict = DatasetDict({
    "train": merged_dataset_train,
    "validation": merged_dataset_validation,
    "test": merged_dataset_test
})

In [16]:
merged_dataset_dict.save_to_disk("my_merged")

Saving the dataset (1/1 shards): 100%|██████████| 164716/164716 [00:00<00:00, 1173340.88 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 15954/15954 [00:00<00:00, 1151083.31 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 16001/16001 [00:00<00:00, 1120193.92 examples/s]


In [17]:
race_dataset_middle = load_dataset("ehovy/race", "middle")
race_dataset_high = load_dataset("ehovy/race", "high")

Generating test split: 100%|██████████| 1436/1436 [00:00<00:00, 196933.71 examples/s]
Generating train split: 100%|██████████| 25421/25421 [00:00<00:00, 458226.73 examples/s]
Generating validation split: 100%|██████████| 1436/1436 [00:00<00:00, 262475.29 examples/s]
Generating test split: 100%|██████████| 3498/3498 [00:00<00:00, 168270.53 examples/s]
Generating train split: 100%|██████████| 62445/62445 [00:00<00:00, 324179.43 examples/s]
Generating validation split: 100%|██████████| 3451/3451 [00:00<00:00, 234181.80 examples/s]


In [18]:
race_middle_train = race_dataset_middle['train']
train_len = len(race_middle_train)

example_id = []
articles = []
questions = []
options = []
labels = []

for i in range(train_len):
    example_id.append(i)
    articles.append(race_middle_train[i]['article'])

    fixed_question = race_middle_train[i]['question'].replace('_', ' _')
    questions.append(fixed_question)
    options.append(race_middle_train[i]['options'])
    labels.append(race_middle_train[i]['answer'])

data = {
    "example_id": example_id,
    "article": articles,
    "question": questions,
    "options": options,
    "answer": labels  # 문자열 "A", "B", "C", "D"
}

new_race_middle_dataset_train = Dataset.from_dict(data)

In [19]:
race_middle_validation = race_dataset_middle['validation']
train_len = len(race_middle_validation)

example_id = []
articles = []
questions = []
options = []
labels = []

for i in range(train_len):
    example_id.append(i)
    articles.append(race_middle_validation[i]['article'])

    fixed_question = race_middle_validation[i]['question'].replace('_', ' _')
    questions.append(fixed_question)
    options.append(race_middle_validation[i]['options'])
    labels.append(race_middle_validation[i]['answer'])

data = {
    "example_id": example_id,
    "article": articles,
    "question": questions,
    "options": options,
    "answer": labels  # 문자열 "A", "B", "C", "D"
}

new_race_middle_dataset_validation = Dataset.from_dict(data)

In [20]:
race_middle_test = race_dataset_middle['test']
train_len = len(race_middle_test)

example_id = []
articles = []
questions = []
options = []
labels = []

for i in range(train_len):
    example_id.append(i)
    articles.append(race_middle_test[i]['article'])

    fixed_question = race_middle_test[i]['question'].replace('_', ' _')
    questions.append(fixed_question)
    options.append(race_middle_test[i]['options'])
    labels.append(race_middle_test[i]['answer'])

data = {
    "example_id": example_id,
    "article": articles,
    "question": questions,
    "options": options,
    "answer": labels  # 문자열 "A", "B", "C", "D"
}

new_race_middle_dataset_test = Dataset.from_dict(data)

In [21]:
ds_dict_race_middle = DatasetDict({
    "train": new_race_middle_dataset_train,
    "validation": new_race_middle_dataset_validation,
    "test": new_race_middle_dataset_test
})

In [22]:
ds_dict_race_middle.save_to_disk('my_race_middle')

Saving the dataset (1/1 shards): 100%|██████████| 25421/25421 [00:00<00:00, 805957.96 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1436/1436 [00:00<00:00, 335469.56 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1436/1436 [00:00<00:00, 220858.07 examples/s]


In [24]:
race_high_train = race_dataset_high['train']
train_len = len(race_high_train)

example_id = []
articles = []
questions = []
options = []
labels = []

for i in range(train_len):
    example_id.append(i)
    articles.append(race_high_train[i]['article'])

    fixed_question = race_high_train[i]['question'].replace('_', ' _')
    questions.append(fixed_question)
    options.append(race_high_train[i]['options'])
    labels.append(race_high_train[i]['answer'])

data = {
    "example_id": example_id,
    "article": articles,
    "question": questions,
    "options": options,
    "answer": labels  # 문자열 "A", "B", "C", "D"
}

new_race_high_dataset_train = Dataset.from_dict(data)

In [25]:
race_high_validation = race_dataset_high['validation']
train_len = len(race_high_validation)

example_id = []
articles = []
questions = []
options = []
labels = []

for i in range(train_len):
    example_id.append(i)
    articles.append(race_high_validation[i]['article'])

    fixed_question = race_high_validation[i]['question'].replace('_', ' _')
    questions.append(fixed_question)
    options.append(race_high_validation[i]['options'])
    labels.append(race_high_validation[i]['answer'])

data = {
    "example_id": example_id,
    "article": articles,
    "question": questions,
    "options": options,
    "answer": labels  # 문자열 "A", "B", "C", "D"
}

new_race_high_dataset_validation = Dataset.from_dict(data)

In [26]:
race_high_test = race_dataset_high['test']
train_len = len(race_high_test)

example_id = []
articles = []
questions = []
options = []
labels = []

for i in range(train_len):
    example_id.append(i)
    articles.append(race_high_test[i]['article'])

    fixed_question = race_high_test[i]['question'].replace('_', ' _')
    questions.append(fixed_question)
    options.append(race_high_test[i]['options'])
    labels.append(race_high_test[i]['answer'])

data = {
    "example_id": example_id,
    "article": articles,
    "question": questions,
    "options": options,
    "answer": labels  # 문자열 "A", "B", "C", "D"
}

new_race_high_dataset_test = Dataset.from_dict(data)

In [27]:
ds_dict_race_middle = DatasetDict({
    "train": new_race_high_dataset_train,
    "validation": new_race_high_dataset_validation,
    "test": new_race_high_dataset_test
})

In [28]:
ds_dict_race_middle.save_to_disk('my_race_high')

Saving the dataset (1/1 shards): 100%|██████████| 62445/62445 [00:00<00:00, 771988.51 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 3451/3451 [00:00<00:00, 516413.11 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 3498/3498 [00:00<00:00, 552959.54 examples/s]


In [44]:
import pandas as pd
from sklearn.model_selection import train_test_split
import ast

In [68]:
df1 = pd.read_csv('한국문제csv/dataset_part1_fix.csv')  # 파일 경로를 정확히 지정
df2 = pd.read_csv('한국문제csv/dataset_part2_fix.csv')  # 파일 경로를 정확히 지정
df3 = pd.read_csv('한국문제csv/koreaExamDataset2018검고.csv')  # 파일 경로를 정확히 지정
df4 = pd.read_csv('한국문제csv/koreaExamDataset2019검고.csv')  # 파일 경로를 정확히 지정
df5 = pd.read_csv('한국문제csv/koreaExamDataset2020검고.csv')  # 파일 경로를 정확히 지정
df6 = pd.read_csv('한국문제csv/koreaExamDataset2021검고.csv')  # 파일 경로를 정확히 지정
df7 = pd.read_csv('한국문제csv/koreaExamDataset2022검고.csv')  # 파일 경로를 정확히 지정
df8 = pd.read_csv('한국문제csv/koreaExamDataset2023검고.csv')  # 파일 경로를 정확히 지정
df9 = pd.read_csv('한국문제csv/koreaExamDataset2024검고.csv')  # 파일 경로를 정확히 지정

df_all = pd.concat([df1, df2, df3, df4, df5, df6, df7, df8, df9], ignore_index=True)

print(df_all.shape)
print(df_all)

(1237, 5)
          example_id                                            article  \
0     2020_06_31.txt  Some people have defined wildlife damage manag...   
1     2020_06_32.txt  Through recent decades academic archaeologists...   
2     2020_06_33.txt  Digital technology accelerates dematerializati...   
3     2020_06_34.txt  Not all Golden Rules are alike; two kinds emer...   
4     2020_09_31.txt  When you begin to tell a story again that you ...   
...              ...                                                ...   
1232  2024_04_21.txt  What is a 3D printer? It’s like a normal print...   
1233  2024_04_22.txt  On New Year’s Day, my friend and I planned to ...   
1234  2024_04_23.txt  Today, pets such as dogs, cats, and rabbits ho...   
1235  2024_04_24.txt  Humans are social beings. We cannot live alone...   
1236  2024_04_25.txt  Humans are social beings. We cannot live alone...   

                                               question  \
0                         다음 빈

In [69]:
df_all = df_all.dropna(subset=['answer'])

In [70]:
train_df, temp_df = train_test_split(
    df_all,
    test_size=0.2,
    random_state=42,
    shuffle=True
)

# 3. 남은 20%를 다시 50:50으로 → val 10%, test 10%
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    random_state=42,
    shuffle=True
)

# 4. 확인
print(f"Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}")


Train: 989, Val: 124, Test: 124


In [71]:
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)


In [72]:
train_df['options'] = train_df['options'].apply(ast.literal_eval)
val_df['options'] = val_df['options'].apply(ast.literal_eval)
test_df['options'] = test_df['options'].apply(ast.literal_eval)

In [73]:
print(train_df)


         example_id                                            article  \
0    2019_06_30.txt  John was once in the office of a manager, Mich...   
1    2023_09_36.txt  \nWhen two natural bodies of water stand at di...   
2    2021_09_36.txt  \nIn the fifth century B.C.E., the Greek philo...   
3    2018_11_35.txt  In the context of SNS, media literacy has been...   
4    2024_09_21.txt  Gold plating in the project means needlessly e...   
..              ...                                                ...   
984  2022_04_13.txt  A : Mary’s birthday is coming. _ ?\nB : Good i...   
985  2023_02_22.txt  ◦ Don’t put up a tent right next to the river....   
986  2023_04_07.txt  ◦ John, _ many countries are there in Asia?\n◦...   
987  2020_04_15.txt  To get to my office from Central Station, take...   
988  2023_04_03.txt  Many animals like to play with toys. **For exa...   

                                              question  \
0                      밑줄 친 부분이 가리키는 대상이 나머지 넷과 다른 것은

In [74]:
data = {
    "example_id": list(range(len(train_df))),
    "article": train_df["article"].tolist(),
    "question": train_df["question"].tolist(),
    "options": train_df["options"].tolist(),
    "answer": train_df["answer"].tolist(),  # "A", "B", "C", "D"
}

new_train_df_dataset_train = Dataset.from_dict(data)

In [75]:
data = {
    "example_id": list(range(len(val_df))),
    "article": val_df["article"].tolist(),
    "question": val_df["question"].tolist(),
    "options": val_df["options"].tolist(),
    "answer": val_df["answer"].tolist(),  # "A", "B", "C", "D"
}

new_train_df_dataset_val = Dataset.from_dict(data)

In [76]:
data = {
    "example_id": list(range(len(test_df))),
    "article": test_df["article"].tolist(),
    "question": test_df["question"].tolist(),
    "options": test_df["options"].tolist(),
    "answer": test_df["answer"].tolist(),  # "A", "B", "C", "D"
}

new_train_df_dataset_test = Dataset.from_dict(data)

In [77]:
ds_dict_race_korean = DatasetDict({
    "train": new_train_df_dataset_train,
    "validation": new_train_df_dataset_val,
    "test": new_train_df_dataset_test
})

In [78]:
ds_dict_race_korean.save_to_disk('my_korean')

Saving the dataset (1/1 shards): 100%|██████████| 989/989 [00:00<00:00, 140663.50 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 124/124 [00:00<00:00, 47114.20 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 124/124 [00:00<00:00, 35537.66 examples/s]


In [79]:
my_merged = load_from_disk("my_merged")
my_korean = load_from_disk("my_korean")

In [80]:
merged_dataset_train = concatenate_datasets([my_merged['train'], my_korean['train']])
merged_dataset_validation = concatenate_datasets([my_merged['validation'], my_korean['validation']])
merged_dataset_test = concatenate_datasets([my_merged['test'], my_korean['test']])

In [81]:
merged_dataset_dict = DatasetDict({
    "train": merged_dataset_train,
    "validation": merged_dataset_validation,
    "test": merged_dataset_test
})

In [82]:
merged_dataset_dict.save_to_disk("my_full_merged")

Saving the dataset (1/1 shards): 100%|██████████| 165705/165705 [00:00<00:00, 911570.82 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 16078/16078 [00:00<00:00, 961037.76 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 16125/16125 [00:00<00:00, 982055.09 examples/s] 


In [97]:
from datasets import load_from_disk
from collections import defaultdict

my_korean = load_from_disk('my_korean')

In [98]:
my_korean_train = my_korean['train']
my_korean_val = my_korean['validation']
my_korean_test = my_korean['test']

In [88]:
question_counter = defaultdict(int)

for example in my_korean_test:
    question = example["question"].strip()  # 필요 시 .lower() 도 가능
    question_counter[question] += 1

In [89]:
for i, (question, count) in enumerate(question_counter.items()):
    print(f"[{count}회] {question}")

[1회] 다음 중 밑줄 친 부분의 뜻으로 가장 적절한 것을 고르시오.
[5회] 다음 대화의 빈칸에 들어갈 말로 가장 적절한 것을 고르시오.
[1회] 다음 글에서 밑줄 친 It(it)이 가리키는 것으로 가장 적절한 것은?
[7회] 다음 글의 요지로 가장 적절한 것은?
[1회] 다음 대화에서 B가 제안을 거절한 이유는?
[1회] 다음 빈칸에 들어갈 말로 가장 적절한 것은?
[1회] 글의 흐름으로 보아, 주어진 문장이 들어가기에 가장 적절한 곳을 고르시오.
They also rated how generally extroverted those fake extroverts appeared, based on their recorded voices and body language.
[12회] 다음 빈칸에 들어갈 말로 가장 적절한 것을 고르시오.
[3회] 다음 글에서 필자가 주장하는 바로 가장 적절한 것은?
[1회] 다음 Songkran에 대한 설명과 일치하지 않는 것은?
[1회] 다음 박물관에 대한 안내문의 내용과 일치하지 않는 것은?
[1회] 대화의 빈칸에 들어갈 말로 가장 적절한 것을 고르시오.
[2회] 다음 밑줄 친 부분의 뜻으로 가장 적절한 것을 고르시오.
[2회] 다음 글의 내용을 한 문장으로 요약하고자 한다. 빈칸 (A), (B)에 들어갈 말로 가장 적절한 것은?
[1회] seahorse에 관한 다음 글의 내용과 일치하지 않는 것은?
[1회] 다음 글에서 전체 흐름과 관계 없는 문장을 고르시오.
[5회] 다음 글의 주제로 가장 적절한 것은?
[3회] 윗글의 빈칸에 들어갈 말로 가장 적절한 것은?
[1회] Andy Warhol에 관한 다음 글에서 언급되지 않은 것은?
[1회] 다음 빈칸에 공통으로 들어갈 말로 가장 적절한 것을 고르시오.
[1회] 다음 글에서 Mike가 책을 빌리지 못한 이유로 가장 적절한 것은?
[1회] 다음 글에서 코끼리가 발로 땅을 치는 이유로 가장 적절한 것은?
[1회] 글의 흐름으로 보아, 주어진 문장이 들어가기에 가장 적절한 곳을 고르시

빈칸 문제

In [99]:
import re
from collections import defaultdict

# 유형별 키워드 패턴 정의
category_patterns = {
    '빈칸 추론': [
        r'빈칸에(?:서|)\s*가장(?:\s*적절한|)\s*것', r'다음 글의 빈칸', r'밑줄 친 부분에 들어갈 말', r'다음 빈칸에', r'문맥에 맞는 낱말', r'빈칸에 들어갈 말', r'빈칸'
    ],
    '문장 순서 배열': [
        r'글의 순서로 가장 적절한(?: 것|)', r'글 다음에 이어질 순서', r'주어진 글 다음에 올 순서',
    ],
    '주제/요지/제목': [
        r'주제(?:로|는)', r'요지(?:는|를)', r'제목(?:으로|은|을)', r'다음 글의 제목', r'글의 요지', r'글을 쓴 목적'
    ],
    '문장 삽입': [
        r'가장 적절한 (?:위치|곳)', r'글의 어느 위치에', r'다음 문장이 들어(?:가기에|갈 위치로)', r'문장을 넣기 가장 좋은 곳',
    ],
    '지칭 대상 파악': [
        r'(?:밑줄 친|밑줄 친 단어인)\s*\w+\s*이 가리키는 것', r'it\s*이(?:란|는|)\s*무엇을 가리키는가',
        r'(?:밑줄 친|밑줄 친 단어인)\s*[\w가-힣]+\s*(?:\([\w가-힣]+\))?\s*이\s*가리키는 것(?:으로 가장 적절한 것|)',
        r'가리키는 대상이 나머지', r'의미\s*하는 바로'

    ],
    '어휘/어법 추론': [
        r'밑줄 친 단어의 뜻', r'다음 중 문맥상 알맞은 것', r'가장 적절한 어휘', r'다음 중 어법상 옳은 것', r'어법에 맞는',
        r'어법', r'문맥상 낱말의 쓰임'
    ],
    '내용 일치/불일치': [
        r'내용과 일치하는 것', r'일치하지 않는 것은', r'글의 내용과 다른 것', r'다음 중 옳지 않은 것', r'글의 내용과 일치'
    ],
    '심경/태도': [
        r'심경(?:의 변화|)', r'태도로 가장 적절한 것', r'어떤 태도를 가지고 있는가',
    ],
    '정보 확인': [
        r'언급되지 않은 것', r'글에 나타난 정보로 옳지 않은 것', r'다음 중 알 수 없는 것은',
    ]
}

def classify_question(text):
    for category, patterns in category_patterns.items():
        for pattern in patterns:
            if re.search(pattern, text, re.IGNORECASE):
                return category
    return '기타'

# 예시 문제 리스트 (실제 사용 시 파일 읽기로 대체 가능)
questions = [
    i for i in question_counter.keys()
]

# 분류 실행
classified = defaultdict(list)

for example in my_korean_test:
    q_text = example['question']  # 또는 'text'나 'question_text' 등 실제 필드명 확인 필요
    category = classify_question(q_text)
    classified[category].append({'question': q_text, **example})

# 결과 출력
for category, qs in classified.items():
    print(f"\n[{category}] ({len(qs)}문항)")
    for q in qs:
        print(f"- {q}")



[기타] (29문항)
- {'question': '다음 중 밑줄 친 부분의 뜻으로 가장 적절한 것을 고르시오.', 'example_id': 0, 'article': 'After driving for two hours, we arrived home **at last**.', 'options': ['우선', '마침내', '요약하면', '예를 들면'], 'answer': 'B'}
- {'question': '다음 대화에서 B가 제안을 거절한 이유는?', 'example_id': 4, 'article': 'A: Let’s go to the movies.\nB : Sorry, I can’t. I have a cold.', 'options': ['감기에 걸려서', '날씨가 더워서', '숙제가 많아서', '동생을 돌봐야 해서'], 'answer': 'A'}
- {'question': '다음 글에서 필자가 주장하는 바로 가장 적절한 것은?', 'example_id': 9, 'article': 'We say to ourselves: “There is plenty of time. I’ll manage\nsomehow or other when the time comes for action.” We are\nrather proud of our ability to meet emergencies. So we do not\nplan and take precautions to prevent emergencies from\narising. It is too easy to drift through school and college,\ntaking the traditional, conventional studies that others take,\nfollowing the lines of least resistance, electing “snap courses,”\nand going with the crowd. It is too easy to take the attitude:\n“First 

In [100]:
categorized_dataset_dict = DatasetDict({
    category: Dataset.from_list(examples)
    for category, examples in classified.items()
})

# 확인
print(categorized_dataset_dict)

DatasetDict({
    기타: Dataset({
        features: ['question', 'example_id', 'article', 'options', 'answer'],
        num_rows: 29
    })
    빈칸 추론: Dataset({
        features: ['question', 'example_id', 'article', 'options', 'answer'],
        num_rows: 34
    })
    지칭 대상 파악: Dataset({
        features: ['question', 'example_id', 'article', 'options', 'answer'],
        num_rows: 8
    })
    주제/요지/제목: Dataset({
        features: ['question', 'example_id', 'article', 'options', 'answer'],
        num_rows: 20
    })
    문장 삽입: Dataset({
        features: ['question', 'example_id', 'article', 'options', 'answer'],
        num_rows: 7
    })
    내용 일치/불일치: Dataset({
        features: ['question', 'example_id', 'article', 'options', 'answer'],
        num_rows: 8
    })
    정보 확인: Dataset({
        features: ['question', 'example_id', 'article', 'options', 'answer'],
        num_rows: 4
    })
    문장 순서 배열: Dataset({
        features: ['question', 'example_id', 'article', 'options', 'an

In [101]:
categorized_dataset_dict.save_to_disk('categorized_test')

Saving the dataset (1/1 shards): 100%|██████████| 29/29 [00:00<00:00, 4570.50 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 34/34 [00:00<00:00, 13196.96 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 8/8 [00:00<00:00, 2419.38 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 20/20 [00:00<00:00, 4605.58 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 7/7 [00:00<00:00, 2824.99 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 8/8 [00:00<00:00, 2302.35 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 4/4 [00:00<00:00, 981.18 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 8/8 [00:00<00:00, 3252.03 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 4/4 [00:00<00:00, 1623.34 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 2/2 [00:00<00:00, 822.65 examples/s]
