## 데이터 및 라이브러리 로딩

In [1]:
import json
import re

In [4]:
# Location of the pre-processed, tagged source text.
data_path = r'G:\내 드라이브\LAB_works\법률 LLM 프로젝트\data\데이터 전처리\2. 가공 텍스트\R078r3e_processed_corrected_tagged.txt'

# Read the file as raw bytes and decode each line as UTF-8
# (every element keeps its original line ending).
with open(data_path, 'rb') as source:
    lines = [raw_line.decode('utf-8') for raw_line in source]

# Drop the final line of the file.
lines = lines[:-1]

# Debug aid: print(''.join(lines))

## 챕터 구분

In [5]:
# Split the document into chapters.

# Find where each chapter starts: a line beginning with exactly two
# whitespace characters, then a number followed by a period.
pattern_chapter = r'^\s{2}\d{1,}\.'
idxL_chapterStart = [
    position
    for position, text in enumerate(lines)
    if re.match(pattern_chapter, text) is not None
]

# A chapter ends on the line just before the next chapter starts; the last
# chapter ends on the last line. The first "end" (the line before the first
# chapter) is discarded -- this raises IndexError when no chapter was found.
idxL_chapterEnd = [idx_start - 1 for idx_start in idxL_chapterStart]
del idxL_chapterEnd[0]
idxL_chapterEnd += [len(lines) - 1]

# Slice out the lines of every chapter (end index inclusive).
linesL_chapter = [
    lines[first:last + 1]
    for first, last in zip(idxL_chapterStart, idxL_chapterEnd)
]

## 전역 변수 정의

In [6]:
# (tag, regex) pairs, one per structural level. The regexes are written
# without a leading "^"; lineTagger anchors them at the start of the line.
patternL = [
    ("main", r'\s*\d{1,}\.\s+'),            # "N. "        chapter main title
    ("art", r'\s*(\d{1,}\.){2}\s+'),        # "N.N. "      article
    ("para", r'\s*(\d{1,}\.){3}\s+'),       # "N.N.N. "    paragraph
    ("paraS", r'\s*(\d{1,}\.){4}\s+'),      # "N.N.N.N. "  sub-paragraph
    ("item", r'\s*\([a-hj-u]\)\s+'),        # "(a)".."(u)" except "(i)"
    ("itemS", r'\s*\([ivx]{1,}\)\s+'),      # "(i)", "(ii)", ... built from i/v/x
    ("dsc", r'\s[A-Z]'),                    # free text; may need refinement later
]

# Individual names for each entry of the table above.
(
    pattern_main_title,
    pattern_article,
    pattern_paragraph,
    pattern_sub_paragraph,
    pattern_item,
    pattern_sub_item,
    pattern_description,
) = patternL

# Short tag -> key name used in the generated JSON structure.
dict_tagName = dict(
    [
        ("item", "Item"),
        ("itemS", "Sub-item"),
        ("dsc", "Description"),
        ("main", "Chapter main title"),
        ("art", "Article"),
        ("para", "Paragraph"),
        ("paraS", "Sub-paragraph"),
    ]
)

# Tags of the numbered levels; used as scan boundaries by the loaders.
tags_stage = ["main", "art", "para", "paraS"]

## 기능 함수 정의

### 세부 체계 구분

In [14]:
## Function definition

def lineTagger(linesL_input, flag_return_onlyTag=True):
    """Tag every line with the structural level whose pattern it matches.

    Each (tag, regex) pair of the module-level ``patternL`` is anchored at
    the start of the line and tested against every line. The resulting
    (line index, tag) tuples are sorted by line index.

    An "(i)" line is ambiguous: it can be item "i" or the roman-numeral
    sub-item "i", and the patterns always tag it as a sub-item ("itemS").
    After tagging, the lines following each item "(h)" are inspected and an
    "(i)" that is really the next item is re-tagged as "item".

    NOTE(review): the disambiguation indexes the tag list by line index, so
    it assumes every line matched exactly one pattern -- confirm against the
    input data.

    Args:
        linesL_input: list of text lines (one chapter).
        flag_return_onlyTag: when True return only the tags, otherwise
            return the (line index, tag) tuples.

    Returns:
        list of tag strings, or list of (index, tag) tuples.
    """
    ## Test every line against every pattern and record the matches
    tupleL_tagInfo = []
    for tag, pattern in patternL:
        regex = re.compile(r"^" + pattern)
        tupleL_tagInfo.extend(
            (index, tag) for index, line in enumerate(linesL_input) if regex.match(line)
        )

    ## Ascending order by line index
    tupleL_tagInfo.sort()

    ## Special case: item letter "i" collides with sub-item roman numeral "i"
    pattern_ish = r'^\s*\(h\)\s'
    pattern_isi = r'^\s*\(i\)\s'
    pattern_isii = r'^\s*\(ii\)\s'

    n_lines = len(linesL_input)
    n_tags = len(tupleL_tagInfo)

    # Check point 1: an item "(h)" must exist before an item "(i)" can follow.
    idxL_ish = [index for index, line in enumerate(linesL_input) if re.match(pattern_ish, line)]
    for idx_ish in idxL_ish:

        # Check point 2: does item (h) have its own sub-items? If the second
        # line after (h) is "(ii)", the "(i)" right after (h) is a sub-item.
        # The bounds check keeps an (h) near the end of the chapter from
        # raising IndexError.
        if idx_ish + 2 < n_lines and re.match(pattern_isii, linesL_input[idx_ish + 2]):
            # Walk to the end of the consecutive run of sub-item tags.
            idx_cursor = idx_ish + 3
            while idx_cursor < n_tags and tupleL_tagInfo[idx_cursor][1] == "itemS":
                idx_cursor += 1

            # Check point 3-1: if the last line of that run is "(i)", it is
            # item (i) rather than a sub-item, so convert its tag.
            idx_last = idx_cursor - 1
            if idx_last < n_lines and re.match(pattern_isi, linesL_input[idx_last]):
                tupleL_tagInfo[idx_last] = (idx_last, "item")

        else:
            # Check point 3-2: item (h) has no sub-items, so a sub-item tag
            # on the very next line is treated as item (i).
            idx_next = idx_ish + 1
            if idx_next < n_tags and tupleL_tagInfo[idx_next][1] == "itemS":
                tupleL_tagInfo[idx_next] = (idx_next, "item")

    ## Return the result
    if flag_return_onlyTag:
        return [tag for idx, tag in tupleL_tagInfo]
    return tupleL_tagInfo

### 인덱스 검색

In [15]:
def idxFinder(tags, idx_start, tag_target, tag_end=False):
    """Collect the indices of ``tag_target`` in ``tags`` from ``idx_start`` on.

    Scanning stops at the end of ``tags`` or, when ``tag_end`` is given, at
    the first tag that is one of the stop tags (that index is not examined
    further, even if it equals ``tag_target``).

    Args:
        tags: sequence of tag strings.
        idx_start: index at which scanning begins.
        tag_target: tag whose positions are collected.
        tag_end: a single stop tag (str) or a collection of stop tags. A
            falsy value (the default) means "scan to the end".

    Returns:
        list of matching indices in ascending order.
    """
    # A bare string must be treated as ONE stop tag. Using ``in`` directly on
    # a string is a substring test, so e.g. tag_end="itemS" would wrongly
    # stop on "item" as well.
    if not tag_end:
        stop_tags = ()
    elif isinstance(tag_end, str):
        stop_tags = (tag_end,)
    else:
        stop_tags = tuple(tag_end)

    idxL_target = []
    for idx_cursor in range(idx_start, len(tags)):
        tag = tags[idx_cursor]
        if tag in stop_tags:
            break
        if tag == tag_target:
            idxL_target.append(idx_cursor)

    return idxL_target

### 문장 패턴 반환

In [16]:
def patternFinder(target_tag, base_patternL):
    """Return the regex paired with ``target_tag`` in ``base_patternL``.

    ``base_patternL`` is a list of (tag, pattern) pairs; the pattern of the
    first pair whose tag equals ``target_tag`` is returned. ``ValueError``
    is raised when the tag is not present.
    """
    pairs = list(base_patternL)
    tag_names = [name for name, _ in pairs]
    return pairs[tag_names.index(target_tag)][1]

### 입력 패턴을 바탕으로 문장 분해

In [17]:
def lineParser(line, break_pattern, sel_group, opt_strip=False):
    """Search ``line`` for ``break_pattern`` and return the selected groups.

    Args:
        line: text to search.
        break_pattern: regular expression containing capture groups.
        sel_group: iterable of group numbers to extract, in output order.
        opt_strip: when True, surrounding whitespace is stripped from every
            extracted group.

    Returns:
        list with one string per entry of ``sel_group``.
    """
    found = re.search(break_pattern, line)
    return [
        found.group(number).strip() if opt_strip else found.group(number)
        for number in sel_group
    ]

### JSON 구성 요소 생성 및 입력

In [18]:
def listLoader(lines, tags, base_dict, target_tag, pivot_idx, target_tag_subList=None):
    """Collect the ``target_tag`` lines that follow a pivot line into ``base_dict``.

    Lines tagged ``target_tag`` are gathered from ``pivot_idx + 1`` up to the
    next numbered-level tag (``tags_stage``). If at least one line is found,
    the stripped lines are stored under ``dict_tagName[target_tag]``.

    When ``target_tag_subList`` is given, a collected line that is directly
    followed by a line with that tag is stored as ``[text, [sub texts...]]``
    instead of a plain string.

    Args:
        lines: text lines of the chapter.
        tags: tag of each line, parallel to ``lines``.
        base_dict: dictionary that receives the collected list (mutated).
        target_tag: tag of the lines to collect.
        pivot_idx: index of the line that owns the collected lines.
        target_tag_subList: optional tag of nested lines.

    Returns:
        None. ``base_dict`` is left untouched when nothing is found.
    """
    idxL_loader = idxFinder(tags, pivot_idx + 1, target_tag, tag_end=tags_stage)
    if not idxL_loader:
        return

    # Nested lines belong to an entry only up to the next entry of the same
    # kind OR the next numbered level; without the latter, the scan for the
    # last entry of a section would run on into the following sections.
    tags_subEnd = list(tags_stage) + [target_tag]
    idx_last = len(tags) - 1

    lines_loader = []
    for idx_loader in idxL_loader:
        text = lines[idx_loader].strip()
        has_subList = (
            target_tag_subList
            and idx_loader < idx_last
            and tags[idx_loader + 1] == target_tag_subList
        )
        if has_subList:
            idxL_subLoader = idxFinder(tags, idx_loader + 1, target_tag_subList, tag_end=tags_subEnd)
            lines_subLoader = [lines[idx_subLoader].strip() for idx_subLoader in idxL_subLoader]
            lines_loader.append([text, lines_subLoader])
        else:
            lines_loader.append(text)

    base_dict[dict_tagName[target_tag]] = lines_loader

In [19]:
def dictLoader(lines, tags, base_dict, pivot_pattern, pivot_idx, opt_dsc=False, opt_item=False, opt_itemS=False):
    """Create the dictionary for one numbered line and attach it to ``base_dict``.

    The pivot line is split into its numbering (matched by ``pivot_pattern``)
    and the remaining text. A new dictionary is stored in ``base_dict`` under
    the stripped numbering and optionally filled with descriptions and items.

    Args:
        lines: text lines of the chapter.
        tags: tag of each line, parallel to ``lines``.
        base_dict: parent dictionary (mutated).
        pivot_pattern: regex matching the numbering of the pivot line.
        pivot_idx: index of the pivot line.
        opt_dsc: when True, store the text of the pivot line followed by the
            subsequent "dsc" lines under "Description".
        opt_item: when True, store the subsequent "item" lines.
        opt_itemS: when True (together with ``opt_item``), nest "itemS" lines
            under their item.

    Returns:
        The newly created dictionary.
    """
    # The pivot pattern may contain capture groups of its own; the trailing
    # "(.*)" group comes after the wrapper group and all of those.
    idx_group_text = re.compile(pivot_pattern).groups + 2
    numbering, dsc_0 = lineParser(
        lines[pivot_idx],
        rf"({pivot_pattern})" + r"(.*)",
        sel_group=[1, idx_group_text],
        opt_strip=True,
    )
    dict_loaded = base_dict[numbering] = {}

    if opt_dsc:
        listLoader(lines, tags, dict_loaded, target_tag="dsc", pivot_idx=pivot_idx)
        # The text on the pivot line itself is always the first description.
        dict_loaded.setdefault("Description", []).insert(0, dsc_0)

    if opt_item:
        listLoader(
            lines,
            tags,
            dict_loaded,
            target_tag="item",
            target_tag_subList="itemS" if opt_itemS else None,
            pivot_idx=pivot_idx,
        )

    return dict_loaded

In [20]:
## Standalone sublistLoader (currently unused)

def sublistLoader(lines, tags, base_dict, target_tag, pivot_idx, opt_parse=False, opt_strip=False):
    """Attach the ``target_tag`` lines that follow a pivot line to its list entry.

    Lines tagged ``target_tag`` are gathered from ``pivot_idx + 1`` up to the
    next numbered-level tag or the next line with the pivot's own tag. The
    pivot's entry is then looked up in ``base_dict[dict_tagName[pivot tag]]``
    and replaced by ``[entry, [sub texts...]]``.

    Args:
        lines: text lines of the chapter.
        tags: tag of each line, parallel to ``lines``.
        base_dict: dictionary holding the pivot's list (mutated).
        target_tag: tag of the nested lines to collect.
        pivot_idx: index of the pivot line.
        opt_parse: when True, the entry is looked up by the pivot line's text
            after its numbering; otherwise by the whole stripped line.
        opt_strip: strip the parsed text (only used with ``opt_parse``).

    Returns:
        None. Nothing is changed when no nested line is found.

    Raises:
        KeyError / ValueError: when the pivot's list or entry is not present
            in ``base_dict``.
    """
    tag_pivot = tags[pivot_idx]
    tags_end = list(tags_stage) + [tag_pivot]

    idxL_loader = idxFinder(tags, pivot_idx + 1, target_tag, tag_end=tags_end)
    lines_loader = [lines[idx_loader].strip() for idx_loader in idxL_loader]
    if not lines_loader:
        return

    if opt_parse:
        pattern_pivot = patternFinder(tag_pivot, patternL)
        # The text group follows the wrapper group and any groups inside the
        # pivot pattern, so its number depends on the pattern.
        idx_group_text = re.compile(pattern_pivot).groups + 2
        ele_searchingKey = lineParser(
            lines[pivot_idx],
            rf"({pattern_pivot})" + r"(.*)",
            sel_group=[idx_group_text],
            opt_strip=opt_strip,
        )[0]
    else:
        ele_searchingKey = lines[pivot_idx].strip()

    # Resolve the list name before using it for the lookup.
    fullName_pivot = dict_tagName[tag_pivot]
    entries = base_dict[fullName_pivot]
    idx_pivotInlist = entries.index(ele_searchingKey)

    # The located entry equals the search key, i.e. it is a plain string and
    # cannot be appended to; replace it with a [text, sub-list] pair.
    entries[idx_pivotInlist] = [entries[idx_pivotInlist], lines_loader]

## 워킹 스페이스

### 개별 챕터

In [23]:
# Convert a single chapter (index 4 of the chapter list) into a nested dict.
lines_chapter = linesL_chapter[4]
tags_chapter = lineTagger(lines_chapter, flag_return_onlyTag=True)

# Container for the chapter
dict_chapter = {}

# Chapter heading: "<number>. <title>". Split on the first period only so a
# title that itself contains a period does not break the unpacking.
line_main = lines_chapter[0]
num_chapter, title_chapter = [elm.strip() for elm in line_main.split('.', 1)]

dict_chapter["Chapter"] = num_chapter
dict_chapter["Title"] = title_chapter

# Descriptions and items that belong directly to the chapter heading
listLoader(lines_chapter, tags_chapter, dict_chapter, target_tag="dsc", pivot_idx=0)
listLoader(lines_chapter, tags_chapter, dict_chapter, target_tag="item", target_tag_subList="itemS", pivot_idx=0)

# Numbering patterns of the three nested levels (loop-invariant)
pattern_art = patternFinder("art", patternL)
pattern_para = patternFinder("para", patternL)
pattern_paraS = patternFinder("paraS", patternL)

# Articles -> paragraphs -> sub-paragraphs
idxL_art = idxFinder(tags=tags_chapter, idx_start=0, tag_target="art")
for idx_art in idxL_art:

    dict_art = dictLoader(lines_chapter, tags_chapter, dict_chapter, pattern_art, idx_art, opt_dsc=True, opt_item=True, opt_itemS=True)

    # Paragraphs of this article: up to the next article
    idxL_para = idxFinder(tags=tags_chapter, idx_start=idx_art+1, tag_target="para", tag_end=["art"])
    for idx_para in idxL_para:

        dict_para = dictLoader(lines_chapter, tags_chapter, dict_art, pattern_para, idx_para, opt_dsc=True, opt_item=True, opt_itemS=True)

        # Sub-paragraphs of this paragraph: up to the next paragraph or article
        idxL_paraS = idxFinder(tags=tags_chapter, idx_start=idx_para+1, tag_target="paraS", tag_end=["art", "para"])
        for idx_paraS in idxL_paraS:

            dictLoader(lines_chapter, tags_chapter, dict_para, pattern_paraS, idx_paraS, opt_dsc=True, opt_item=True, opt_itemS=True)

### 문서 전체

In [100]:
# Convert every chapter and collect the results in one document dictionary.
dict_full = {}
dict_full["Resource"] = "Addendum 77: UN Regulation No. 78"
dict_full["Title"] = "Agreement: Concerning the Adoption of Harmonized Technical United Nations Regulations for Wheeled Vehicles, Equipment and Parts which can be Fitted and/or be Used on Wheeled Vehicles and the Conditions for Reciprocal Recognition of Approvals Granted on the Basis of these United Nations Regulations"
dict_full["Version"] = "Revision 3"

# Numbering patterns of the three nested levels (loop-invariant)
pattern_art = patternFinder("art", patternL)
pattern_para = patternFinder("para", patternL)
pattern_paraS = patternFinder("paraS", patternL)

for lines_chapter in linesL_chapter:
    tags_chapter = lineTagger(lines_chapter, flag_return_onlyTag=True)

    # Container for the chapter
    dict_chapter = {}

    # Chapter heading: "<number>. <title>". Split on the first period only so
    # a title that itself contains a period does not break the unpacking.
    line_main = lines_chapter[0]
    num_chapter, title_chapter = [elm.strip() for elm in line_main.split('.', 1)]

    dict_chapter["Chapter"] = num_chapter
    dict_chapter["Title"] = title_chapter

    # Descriptions and items that belong directly to the chapter heading
    listLoader(lines_chapter, tags_chapter, dict_chapter, target_tag="dsc", pivot_idx=0)
    listLoader(lines_chapter, tags_chapter, dict_chapter, target_tag="item", target_tag_subList="itemS", pivot_idx=0)

    # Articles -> paragraphs -> sub-paragraphs
    idxL_art = idxFinder(tags=tags_chapter, idx_start=0, tag_target="art")
    for idx_art in idxL_art:

        dict_art = dictLoader(lines_chapter, tags_chapter, dict_chapter, pattern_art, idx_art, opt_dsc=True, opt_item=True, opt_itemS=True)

        # Paragraphs of this article: up to the next article
        idxL_para = idxFinder(tags=tags_chapter, idx_start=idx_art+1, tag_target="para", tag_end=["art"])
        for idx_para in idxL_para:

            dict_para = dictLoader(lines_chapter, tags_chapter, dict_art, pattern_para, idx_para, opt_dsc=True, opt_item=True, opt_itemS=True)

            # Sub-paragraphs of this paragraph: up to the next paragraph or article
            idxL_paraS = idxFinder(tags=tags_chapter, idx_start=idx_para+1, tag_target="paraS", tag_end=["art", "para"])
            for idx_paraS in idxL_paraS:

                dictLoader(lines_chapter, tags_chapter, dict_para, pattern_paraS, idx_paraS, opt_dsc=True, opt_item=True, opt_itemS=True)

    # Register the finished chapter under its number
    dict_full[f"chapter {num_chapter}"] = dict_chapter

## JSON 직렬화 처리 후 파일 저장

### 문서 전체

In [106]:
# Serialize the whole document dictionary and write it to a JSON file.
with open("R078r3e_fullchpter_converted_ver1.json", mode="w") as file:
    file.write(json.dumps(dict_full))

### 개별 챕터

In [118]:
# Keys of dict_full that hold a chapter (the metadata keys do not contain
# the word "chapter").
keys_chapter = [key for key in dict_full if "chapter" in key]

# Write one JSON file per stored chapter, numbered in storage order. Iterating
# over the keys themselves (rather than over the number of parsed chapters)
# avoids an IndexError should the two counts ever differ.
for number_of_chapter, key_chapter in enumerate(keys_chapter):
    with open(f"R078r3e_chapter{number_of_chapter+1}_converted_ver1.json", "w") as file:
        json.dump(dict_full[key_chapter], file)