In [None]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import re
import json

In [None]:
import re

def clean_text(text):
    """
    Strip LaTeX math, LaTeX commands and special characters from ``text``.

    Only ASCII letters, digits, Hangul syllables and single spaces survive;
    everything else is dropped and runs of whitespace are collapsed.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string with leading/trailing whitespace removed.
    """
    # 1a. Remove display math ($$ ... $$) first. If the inline pattern ran
    #     first it would match the empty "$$" delimiters themselves and leave
    #     the formula body behind. DOTALL lets a display block span lines.
    cleaned_text = re.sub(r'\$\$.*?\$\$', '', text, flags=re.DOTALL)

    # 1b. Remove inline math ($ ... $), limited to a single line.
    cleaned_text = re.sub(r'\$.*?\$', '', cleaned_text)

    # 2. Remove LaTeX commands together with ALL of their consecutive brace
    #    arguments (e.g. \mathbf{...}, \frac{...}{...}), so later arguments
    #    do not leak into the output.
    cleaned_text = re.sub(r'\\[a-zA-Z]+(?:\{.*?\})+', '', cleaned_text)

    # 3. Remove zero-width / control characters (\u200c, \u200b, \x07).
    cleaned_text = re.sub(r'[\u200c\u200b\x07]', '', cleaned_text)

    # 4. Drop every character that is not an ASCII letter, a digit, a Hangul
    #    syllable or whitespace.
    cleaned_text = re.sub(r'[^a-zA-Z0-9가-힣\s]', '', cleaned_text)

    # 5. Collapse whitespace runs into one space and trim both ends.
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()

    return cleaned_text

In [None]:
import pandas as pd
import numpy as np
import pandas as pd

# Pickle files to process, listed WITHOUT the '.pkl' extension (it is appended
# on load, and the outputs are written next to the input with a '수정' suffix).
all_folders = [
    '<파일 경로들>'

]

# Root folder under which the per-lecture "*_content_list.json" files are looked up.
MINERU_ROOT = '/content/drive/MyDrive/YouniB/MinerU추출데이터/대학교 PDF'

# Parsed content lists keyed by JSON path, so a file shared by many rows is
# read and parsed only once instead of once per row.
_content_list_cache = {}


def merge_text(content_list, st, en):
    """
    Concatenate and clean the text of all entries on pages ``st``..``en``.

    Args:
        content_list: Iterable of entries; an entry is used only when it has
            both a "page_idx" and a "text" key (presumably dicts parsed from
            a content-list JSON file - confirm against the data).
        st: First page index to include (inclusive).
        en: Last page index to include (inclusive).

    Returns:
        The matching texts joined by spaces and passed through ``clean_text``.
    """
    page_texts = [
        content["text"]
        for content in content_list
        if "page_idx" in content
        and st <= content['page_idx'] <= en
        and "text" in content
    ]
    return clean_text(' '.join(page_texts))


def process_row(row):
    """
    Build the cleaned context text for one question row.

    The JSON path is derived from the row's '과목명' and '강의명' values and the
    page range is taken from the row's 'orig_page' pair.

    Args:
        row: A DataFrame row providing '과목명', '강의명' and 'orig_page'.

    Returns:
        The merged, cleaned text, or "" when the JSON file is missing or any
        other error occurs (the problem is printed, not raised, so one bad
        row does not abort the whole run).
    """
    lecture_name, chapter_name = row['과목명'], row['강의명']
    add_file_name = f"{lecture_name}/{chapter_name}/auto/{chapter_name}_content_list.json"
    json_file_path = os.path.join(MINERU_ROOT, add_file_name)

    try:
        content_list = _content_list_cache.get(json_file_path)
        if content_list is None:
            with open(json_file_path, 'r', encoding='utf-8') as json_file:
                content_list = json.load(json_file)
            # Only successful loads are cached; failures are retried (and
            # reported) for every row that needs the file.
            _content_list_cache[json_file_path] = content_list

        # Extract the page range and merge the text found on those pages.
        st, en = row['orig_page']
        return merge_text(content_list, st, en)
    except FileNotFoundError:
        print(f"파일을 찾을 수 없습니다: {json_file_path}")
        return ""
    except Exception as e:
        print(f"오류 발생: {str(e)}, 파일: {json_file_path}")
        return ""


for orig_file in all_folders:
    # NOTE(review): unpickling can execute arbitrary code - only load files
    # from a trusted source.
    df = pd.read_pickle(orig_file + '.pkl')

    # Keep the start/end page of each question as a two-element list.
    df['orig_page'] = df.apply(lambda row: [row['st_page'], row['en_page']], axis=1)

    # Split the newline-separated options into a list. Rows without options
    # become short-answer questions and keep the 'NaN' string placeholder.
    options_split = df['options'].str.split('\n')
    no_options = options_split.isna()
    df['Option'] = options_split.fillna('NaN')
    df['type'] = np.where(no_options, '단답식', '객관식')

    df = df.drop(columns=['st_page', 'en_page', 'options', 'file_path', 'page'])

    # Attach the source text for each question's page range.
    df['context'] = df.apply(process_row, axis=1)

    df = df.rename(columns={
        'orig_page': 'Page',
        'question': 'Question',
        'answer': 'Answer',
        'Option': 'Options',
        'context': 'Context',
    })

    df.to_pickle(orig_file + '수정.pkl')
    df.to_json(orig_file + '수정.json', orient='records', force_ascii=False, indent=4)


In [None]:
# Print the column names, non-null counts and dtypes of `df`, i.e. the frame
# produced by the last iteration of the processing loop.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 123 entries, 0 to 122
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Question  123 non-null    object
 1   Answer    123 non-null    object
 2   type      123 non-null    object
 3   강의명       123 non-null    object
 4   과목명       123 non-null    object
 5   Page      123 non-null    object
 6   Options   123 non-null    object
 7   Context   123 non-null    object
dtypes: object(8)
memory usage: 7.8+ KB


In [None]:
# Display `df` (the frame left bound by the last iteration of the processing loop).
df

Unnamed: 0,Question,Answer,type,강의명,과목명,Page,Options,Context
0,Which of the following is NOT a pure substance?,C,객관식,Chapter_01,서강대학교-일반화학1,"[10, 14]","[A. Element, B. Compound, C. Mixture, D. None ...",Classification of Matter Based on Composition ...
1,Which of the following is a pure substance?,C,객관식,Chapter_01,서강대학교-일반화학1,"[10, 14]","[A. Element, B. Compound, C. Both a and b, D. ...",Classification of Matter Based on Composition ...
2,What are the two types of pure substances?,Elements and compounds.,단답식,Chapter_01,서강대학교-일반화학1,"[10, 14]",,Classification of Matter Based on Composition ...
3,Give an example of a heterogeneous mixture.,Granite,단답식,Chapter_01,서강대학교-일반화학1,"[10, 14]",,Classification of Matter Based on Composition ...
4,Which of the following is NOT a physical prope...,D,객관식,Chapter_01,서강대학교-일반화학1,"[13, 17]","[A. Boiling point, B. Density, C. Mass, D. Fla...",Classification of MatterMixtures Mixtures exhi...
...,...,...,...,...,...,...,...,...
118,"If the anion in the acid ends in -ate, what sh...",-ic acid,단답식,Chapter_02,서강대학교-일반화학1,"[58, 62]",,EXERCISE 211 Determining the Formula of an Oxy...
119,Which of the following statements is NOT part ...,C,객관식,Chapter_02,서강대학교-일반화학1,"[7, 11]",[A. Each element is composed of extremely sma...,21 THE ATOMIC THEORY OF MATTER Daltons Postula...
120,"According to Dalton's atomic theory, which of ...",B,객관식,Chapter_02,서강대학교-일반화학1,"[7, 11]",[A. They are formed by the destruction of ato...,21 THE ATOMIC THEORY OF MATTER Daltons Postula...
121,What is the key concept in the law of conserva...,The total mass of materials present before and...,단답식,Chapter_02,서강대학교-일반화학1,"[7, 11]",,21 THE ATOMIC THEORY OF MATTER Daltons Postula...
