## 라이브러리 임포트

In [11]:
import tiktoken
import json
import pprint as ppr

## 데이터 임포트

In [67]:
data_path = r'G:\내 드라이브\LAB_works\법률 LLM 프로젝트\data\데이터 전처리\3. JSON 컨버팅\R078r3e_chapter5_converted_ver1.json'

# Load the converted chapter JSON into a nested dict.
# The encoding is passed explicitly: JSON interchange text is UTF-8, while
# open() would otherwise fall back to the platform's locale encoding.
with open(data_path, 'r', encoding='utf-8') as source:
    dict_chapter = json.load(source)

# Pretty-print the loaded structure as a visual check.
ppr.pprint(dict_chapter)

{'5.1.': {'5.1.1.': {'Description': ['Each vehicle shall meet each of the '
                                     'tests specified for a vehicle of its '
                                     'category and for those brake features on '
                                     'the vehicle.']},
          '5.1.10.': {'Description': ['For vehicles that use hydraulic fluid '
                                      'for brake force transmission, the '
                                      'master cylinder shall: "Item"'],
                      'Item': ['(a) Have a sealed, covered, separate reservoir '
                               'for each brake system;',
                               '(b) Have a minimum reservoir capacity '
                               'equivalent to 1.5 times the total fluid '
                               'displacement required to satisfy the new to '
                               'fully worn lining  condition with the worst '
                               'case brake ad

## 함수 및 전역 변수 정의

In [4]:
# Shared tokenizer: tiktoken's "cl100k_base" encoding.
tokenizer = tiktoken.get_encoding("cl100k_base")


def tiktoken_len(text):
    """Return how many tokens ``tokenizer`` produces when encoding ``text``."""
    return len(tokenizer.encode(text))

In [76]:
# Chapter-level metadata read from the loaded chapter dict.
num_chapter, title_chapter = dict_chapter['Chapter'], dict_chapter["Title"]

## 워킹 코드

### 토큰 사이즈 계산 및 청크 단위 선정

In [70]:
# Token budget per chunk group: a group is closed as soon as the running token
# count of its paragraphs reaches this value.
MAX_TOKENS_PER_CHUNK = 500

# Top-level keys of dict_chapter that carry chapter metadata, not articles.
METADATA_KEYS = ("Chapter", "Title")

# Maps each article key to a list of chunk groups. Every group is a tuple of
# keys into that article's dict, beginning with the article's first key.
dict_chunkGroupInfo = {}
keys_art = list(dict_chapter.keys())
for key_art in keys_art:
    # Skip metadata by name rather than by position, so the result does not
    # depend on where those entries sit in the JSON object.
    if key_art in METADATA_KEYS:
        continue

    keys_para = list(dict_chapter[key_art].keys())
    # The first key is treated as the article header (in the data shown it is
    # 'Description'): it is prepended to every group and its tokens are not
    # counted against the budget.
    key_header = keys_para[0]

    list_chunkGroups = []
    chunkGroup = []
    tokenLength_sum = 0
    for key_para in keys_para[1:]:
        # Token cost is measured on the string form of the paragraph's value.
        tokenLength_sum += tiktoken_len(str(dict_chapter[key_art][key_para]))
        chunkGroup.append(key_para)

        if tokenLength_sum >= MAX_TOKENS_PER_CHUNK:
            list_chunkGroups.append((key_header, *chunkGroup))
            chunkGroup = []
            tokenLength_sum = 0

    # Flush the paragraphs left over after the last full group.
    if chunkGroup:
        list_chunkGroups.append((key_header, *chunkGroup))

    # An article with no paragraphs still yields one group holding its header.
    if not list_chunkGroups:
        list_chunkGroups.append((key_header,))

    dict_chunkGroupInfo[key_art] = list_chunkGroups

In [71]:
# Show the computed chunk groups for inspection.
dict_chunkGroupInfo

{'5.1.': [('Description',
   '5.1.1.',
   '5.1.2.',
   '5.1.3.',
   '5.1.4.',
   '5.1.5.',
   '5.1.6.',
   '5.1.7.'),
  ('Description',
   '5.1.8.',
   '5.1.9.',
   '5.1.10.',
   '5.1.11.',
   '5.1.12.',
   '5.1.13.'),
  ('Description', '5.1.14.', '5.1.15.', '5.1.16.'),
  ('Description', '5.1.17.', '5.1.18.')],
 '5.2.': [('Description', '5.2.1.', '5.2.2.', '5.2.3.')],
 '5.3.': [('Description', '5.3.1.', '5.3.2.', '5.3.3.')],
 '5.4.': [('Description',)]}

### 청크 단위 별로 하위 dict 생성

In [99]:
# Starts empty; one record per chunk group is appended to it later.
list_chapter_by_chunkGroupInfo = []

In [100]:
# Build one record per chunk group: the article dict cut down to the keys of
# that group, nested under the article key and a chapter-level key.
chapter_label = f"chapter.{num_chapter}_{title_chapter}"
for art_key, art_groups in dict_chunkGroupInfo.items():
    art_content = dict_chapter[art_key]
    for group_keys in art_groups:
        # Keys that are missing from the article dict are silently left out.
        selected = {}
        for para_key in group_keys:
            if para_key in art_content:
                selected[para_key] = art_content[para_key]
        list_chapter_by_chunkGroupInfo.append({chapter_label: {art_key: selected}})

In [101]:
# Pretty-print the chunk records to check their nesting.
ppr.pprint(list_chapter_by_chunkGroupInfo)

[{'chapter.5_Specifications': {'5.1.': {'5.1.1.': {'Description': ['Each '
                                                                   'vehicle '
                                                                   'shall meet '
                                                                   'each of '
                                                                   'the tests '
                                                                   'specified '
                                                                   'for a '
                                                                   'vehicle of '
                                                                   'its '
                                                                   'category '
                                                                   'and for '
                                                                   'those '
                                                                

### 메타 데이터를 기록하여 JSONL 형식으로 저장

In [102]:
# Serialize each chunk record as one JSON object per line (JSON Lines).
with open("R078r3e_chapter5_chunked.jsonl", mode="w", encoding="utf-8") as file:
    file.writelines(
        json.dumps(record) + "\n" for record in list_chapter_by_chunkGroupInfo
    )

In [103]:
# Read the JSONL file back and echo it as a check on what was written.
# The encoding is stated explicitly (UTF-8, as the file is written) instead of
# relying on the platform's locale default.
with open("R078r3e_chapter5_chunked.jsonl", encoding="utf-8") as f:
    for line in f:
        # Each line keeps its trailing newline, so print() leaves a blank line
        # between records.
        print(line)

{"chapter.5_Specifications": {"5.1.": {"Description": ["Brake system requirements"], "5.1.1.": {"Description": ["Each vehicle shall meet each of the tests specified for a vehicle of its category and for those brake features on the vehicle."]}, "5.1.2.": {"Description": ["Service brake system control operation", "Vehicles shall have configurations that enable a rider to actuate the service brake system control while seated in the normal driving position and with both hands on the steering control."]}, "5.1.3.": {"Description": ["Secondary brake system control operation", "Vehicles shall have configurations that enable a rider to actuate the secondary brake system control while seated in the normal driving position and with at least one hand on the steering control."]}, "5.1.4.": {"Description": ["Parking brake system", "If a parking brake system is fitted, it shall hold the vehicle stationary on the slope prescribed in paragraph 1.1.4. of Annex 3.", "The parking brake system shall: \"It