From 0f16c203942145663e6d5a96088487886d15a4ab Mon Sep 17 00:00:00 2001 From: JK Date: Tue, 17 Feb 2026 23:07:56 +0900 Subject: [PATCH 1/7] =?UTF-8?q?confluence-mdx:=20Phase=20L3=20lost=5Finfo?= =?UTF-8?q?=20=EC=84=A4=EA=B3=84=20=EB=AC=B8=EC=84=9C=20=EC=9E=91=EC=84=B1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 정순변환 시 손실되는 4가지 비가역 정보(emoticon, link, filename, adf_extension)를 mapping.yaml에 기록하는 설계를 수립합니다. LostInfoCollector 수집 메커니즘과 mapping.yaml 스키마 v2를 정의합니다. Co-Authored-By: Claude Opus 4.6 --- .../plans/2026-02-17-l3-lost-info-design.md | 247 ++++++++++++++++++ 1 file changed, 247 insertions(+) create mode 100644 confluence-mdx/docs/plans/2026-02-17-l3-lost-info-design.md diff --git a/confluence-mdx/docs/plans/2026-02-17-l3-lost-info-design.md b/confluence-mdx/docs/plans/2026-02-17-l3-lost-info-design.md new file mode 100644 index 000000000..e253e258e --- /dev/null +++ b/confluence-mdx/docs/plans/2026-02-17-l3-lost-info-design.md @@ -0,0 +1,247 @@ +# Phase L3: Forward Conversion 정보 보존 (lost_info) 설계 + +## 목표 + +정순변환(Forward Conversion) 과정에서 손실되는 Confluence XHTML 정보를 `mapping.yaml`의 각 매핑 엔트리에 `lost_info` 필드로 기록한다. 이 정보는 이후 Phase L4에서 역순변환(Backward Conversion) 시 원본에 가까운 XHTML을 재생성하는 데 사용한다. + +## 배경 + +현재 emitter 단독 검증(normalize-diff)은 21건 중 1건만 통과한다. 실패 원인 분포: + +| 원인 | 건수 | 비가역 여부 | +|------|------|-------------| +| `attachment_filename_mismatch` | 9 | 비가역 — 정순변환에서 파일명 정규화 | +| `internal_link_unresolved` (`#link-error`) | 7 | 비가역 — 정순변환에서 원본 정보 소실 | +| `emoticon_representation_mismatch` | 4 | 비가역 — 정순변환에서 shortname 소실 | +| `adf_extension_panel_mismatch` | 3 | 비가역 — ADF 구조가 MDX에 없음 | + +이 항목들은 emitter 개선만으로는 해결할 수 없다. 정순변환 시점에 원본 정보를 보존해야 한다. 
+ +## 현재 아키텍처 + +### 정보 손실 지점 (converter/core.py) + +| 항목 | 위치 | 입력 XHTML | 출력 MDX | 손실 정보 | +|------|------|-----------|---------|----------| +| emoticon | `SingleLineParser.convert_recursively` (core.py:318-343) | `` | `✔️` | ac:name, ac:emoji-id, ac:emoji-shortname | +| link | `SingleLineParser.convert_ac_link` (core.py:491, context.py:413) | `...` | `[Missing Page](#link-error)` | ri:content-title, ri:space-key, raw XHTML | +| filename | `Attachment.__init__` (core.py:57-61) | `ri:filename="스크린샷 2024-08-01 오후 2.50.06.png"` | `screenshot-20240801-145006.png` | 원본 파일명 | +| adf_extension | `AdfExtensionToCallout.convert_recursively` (core.py:1308-1349) | `......` | `...` | adf-fallback, local-id, 전체 구조 | +| stripped_attrs | `get_html_attributes` (context.py:560-598) | style, class, ac:local-id, data-* | (제거됨) | 속성 값 | + +### mapping.yaml 생성 흐름 + +PR #798에서 `converter/sidecar_mapping.py`가 삭제되고, `reverse_sync/sidecar.py`의 `generate_sidecar_mapping()`으로 통합되었다. + +``` +converter/cli.py + └─ generate_sidecar_mapping(xhtml, mdx, page_id) ← reverse_sync/sidecar.py + ├─ record_mapping(xhtml) → List[BlockMapping] + ├─ parse_mdx_blocks(mdx) → List[MdxBlock] + └─ 텍스트 기반 매칭 → mapping.yaml 출력 +``` + +### mapping.yaml 현재 스키마 (version 1) + +```yaml +version: 1 +source_page_id: "544381877" +mdx_file: "page.mdx" +mappings: + - xhtml_xpath: "h2[1]" + xhtml_type: "heading" + mdx_blocks: [2] + - xhtml_xpath: "p[1]" + xhtml_type: "paragraph" + mdx_blocks: [4] +``` + +## 설계 + +### 1. mapping.yaml 스키마 확장 (version 2) + +각 mapping entry에 `lost_info` 필드를 추가한다. 손실 정보가 없는 블록은 필드를 생략한다. 
+ +```yaml +version: 2 +source_page_id: "544381877" +mdx_file: "page.mdx" +mappings: + - xhtml_xpath: "h2[1]" + xhtml_type: "heading" + mdx_blocks: [2] + # lost_info 생략 — 손실 없음 + - xhtml_xpath: "p[3]" + xhtml_type: "paragraph" + mdx_blocks: [8] + lost_info: + emoticons: + - name: "tick" + shortname: ":check_mark:" + emoji_id: "atlassian-check_mark" + fallback: ":check_mark:" + raw: '' + links: + - content_title: "Missing Page" + space_key: "" + raw: 'Missing Page' + filenames: + - original: "스크린샷 2024-08-01 오후 2.50.06.png" + normalized: "screenshot-20240801-145006.png" + adf_extensions: + - panel_type: "note" + raw: '...' + stripped_attrs: + ac:macro-id: "a935cf67-ed54-4b6b-aafd-63cbebe654e1" +``` + +### 2. 수집 메커니즘 + +#### LostInfoCollector 클래스 + +`converter/core.py`에 블록 단위 수집기를 도입한다. + +```python +class LostInfoCollector: + """현재 블록 변환 중 손실되는 정보를 수집한다.""" + + def __init__(self): + self._emoticons: list[dict] = [] + self._links: list[dict] = [] + self._filenames: list[dict] = [] + self._adf_extensions: list[dict] = [] + self._stripped_attrs: dict[str, str] = {} + + def add_emoticon(self, node: Tag) -> None: + self._emoticons.append({ + "name": node.get("ac:name", ""), + "shortname": node.get("ac:emoji-shortname", ""), + "emoji_id": node.get("ac:emoji-id", ""), + "fallback": node.get("ac:emoji-fallback", ""), + "raw": str(node), + }) + + def add_link(self, node: Tag) -> None: + ri_page = node.find("ri:page") + self._links.append({ + "content_title": ri_page.get("ri:content-title", "") if ri_page else "", + "space_key": ri_page.get("ri:space-key", "") if ri_page else "", + "raw": str(node), + }) + + def add_filename(self, original: str, normalized: str) -> None: + if original != normalized: + self._filenames.append({ + "original": original, + "normalized": normalized, + }) + + def add_adf_extension(self, node: Tag, panel_type: str) -> None: + self._adf_extensions.append({ + "panel_type": panel_type, + "raw": str(node), + }) + + def add_stripped_attr(self, name: str, 
value: str) -> None: + self._stripped_attrs[name] = value + + def to_dict(self) -> dict: + """빈 카테고리를 제외하고 반환한다.""" + result = {} + if self._emoticons: + result["emoticons"] = self._emoticons + if self._links: + result["links"] = self._links + if self._filenames: + result["filenames"] = self._filenames + if self._adf_extensions: + result["adf_extensions"] = self._adf_extensions + if self._stripped_attrs: + result["stripped_attrs"] = self._stripped_attrs + return result +``` + +#### 수집 지점 + +| 수집 지점 | 트리거 조건 | 수집 메서드 | +|----------|-----------|-----------| +| `SingleLineParser` ac:emoticon 분기 (core.py:318) | 항상 | `collector.add_emoticon(node)` | +| `SingleLineParser.convert_ac_link` (core.py:491, context.py:413) | `href == '#link-error'` | `collector.add_link(node)` | +| `Attachment.__init__` (core.py:57-61) | `original != normalized` | `collector.add_filename(original, normalized)` | +| `AdfExtensionToCallout` (core.py:1317-1349) | 항상 | `collector.add_adf_extension(node, panel_type)` | +| `get_html_attributes` (context.py:572-581) | 제거 시 | `collector.add_stripped_attr(name, value)` | + +#### collector 전달 경로 + +``` +ConfluenceToMarkdown.as_markdown() + └─ MultiLineParser(soup) + ├─ 블록 진입 시: collector = LostInfoCollector() + ├─ SingleLineParser(node, collector=collector) + ├─ AdfExtensionToCallout(node, collector=collector) + ├─ 블록 완료 시: block_lost_infos[block_index] = collector.to_dict() + └─ 전체 완료 시: self.lost_infos = block_lost_infos +``` + +`ConfluenceToMarkdown`이 수집 결과를 `self.lost_infos: dict[int, dict]` (블록 인덱스 → lost_info)로 보유한다. + +### 3. mapping.yaml에 lost_info 기록 + +`converter/cli.py`에서 `generate_sidecar_mapping()` 호출 시 lost_info를 전달한다. + +```python +# converter/cli.py (변경) +sidecar_yaml = generate_sidecar_mapping( + xhtml_original, markdown_content, page_id, + lost_infos=converter.lost_infos, # 추가 +) +``` + +`generate_sidecar_mapping()` (reverse_sync/sidecar.py)에서 각 entry에 lost_info를 병합한다. 
+ +```python +# mapping entry 생성 시 +entry = { + 'xhtml_xpath': xm.xhtml_xpath, + 'xhtml_type': xm.type, + 'mdx_blocks': matched_indices, +} +if lost_infos and matched_indices: + # MDX 블록 인덱스로 lost_info 조회 + for mdx_idx in matched_indices: + if mdx_idx in lost_infos and lost_infos[mdx_idx]: + entry['lost_info'] = lost_infos[mdx_idx] + break +entries.append(entry) +``` + +### 4. roundtrip.json과의 연계 + +`build_sidecar()` (reverse_sync/sidecar.py)가 roundtrip.json을 빌드할 때, 같은 페이지의 mapping.yaml이 존재하면 `lost_info`를 읽어 `SidecarBlock.lost_info`에 복사한다. + +이 연계는 L3 범위에서는 구현하지 않는다. L4에서 `lost_info`를 실제 활용할 때 필요에 따라 구현한다. + +### 5. stripped_attrs 범위 제한 + +`stripped_attrs`는 수가 매우 많고 (style, class, data-*, ac:local-id 등 거의 모든 블록에 존재), L4에서의 활용 가치가 낮다. L3에서는 **emoticons, links, filenames, adf_extensions** 4개 카테고리만 구현한다. `stripped_attrs`는 필요 시 후속 Phase에서 추가한다. + +## 인수 기준 + +1. **기능:** 비가역 정보(emoticon, link, filename, adf_extension)를 포함하는 testcase 블록의 mapping.yaml에 `lost_info`가 기록됨 +2. **회귀 없음:** 기존 splice 21/21 byte-equal 유지 +3. **테스트:** 기존 테스트 전부 통과 + lost_info 수집 유닛 테스트 + +## 구현 순서 (개략) + +1. `LostInfoCollector` 클래스 작성 + 유닛 테스트 +2. `converter/core.py` 수집 지점에 collector 연결 (emoticon → link → filename → adf_extension) +3. `generate_sidecar_mapping()`에 lost_info 전달 경로 추가 +4. mapping.yaml 스키마 version 2 반영 +5. testcase 검증: 실제 testcase에서 lost_info 기록 확인 +6. 
기존 테스트 회귀 검증 + +## 범위 외 (L4 이후) + +- `lost_info`를 활용한 역순변환 품질 개선 +- `roundtrip.json`의 `SidecarBlock.lost_info` 연계 +- `stripped_attrs` 수집 From 6751bfc88b771192a9d6194e353aff6594b728fd Mon Sep 17 00:00:00 2001 From: JK Date: Tue, 17 Feb 2026 23:48:19 +0900 Subject: [PATCH 2/7] =?UTF-8?q?confluence-mdx:=20LostInfoCollector=20?= =?UTF-8?q?=ED=81=B4=EB=9E=98=EC=8A=A4=20=EB=B0=8F=20=EC=9C=A0=EB=8B=9B=20?= =?UTF-8?q?=ED=85=8C=EC=8A=A4=ED=8A=B8=20=EC=9E=91=EC=84=B1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Opus 4.6 --- confluence-mdx/bin/converter/lost_info.py | 57 ++++++++++++++++ .../tests/test_lost_info_collector.py | 67 +++++++++++++++++++ 2 files changed, 124 insertions(+) create mode 100644 confluence-mdx/bin/converter/lost_info.py create mode 100644 confluence-mdx/tests/test_lost_info_collector.py diff --git a/confluence-mdx/bin/converter/lost_info.py b/confluence-mdx/bin/converter/lost_info.py new file mode 100644 index 000000000..db83618f0 --- /dev/null +++ b/confluence-mdx/bin/converter/lost_info.py @@ -0,0 +1,57 @@ +"""Forward conversion 시 손실되는 정보를 블록 단위로 수집한다.""" +from __future__ import annotations + +from bs4 import Tag + + +class LostInfoCollector: + """현재 블록 변환 중 손실되는 정보를 수집한다.""" + + def __init__(self) -> None: + self._emoticons: list[dict] = [] + self._links: list[dict] = [] + self._filenames: list[dict] = [] + self._adf_extensions: list[dict] = [] + + def add_emoticon(self, node: Tag) -> None: + self._emoticons.append({ + 'name': node.get('ac:name', ''), + 'shortname': node.get('ac:emoji-shortname', ''), + 'emoji_id': node.get('ac:emoji-id', ''), + 'fallback': node.get('ac:emoji-fallback', ''), + 'raw': str(node), + }) + + def add_link(self, node: Tag) -> None: + ri_page = node.find('ri:page') + self._links.append({ + 'content_title': ri_page.get('ri:content-title', '') if ri_page else '', + 'space_key': ri_page.get('ri:space-key', '') if ri_page else '', + 'raw': str(node), + }) + + 
def add_filename(self, original: str, normalized: str) -> None: + if original != normalized: + self._filenames.append({ + 'original': original, + 'normalized': normalized, + }) + + def add_adf_extension(self, node: Tag, panel_type: str) -> None: + self._adf_extensions.append({ + 'panel_type': panel_type, + 'raw': str(node), + }) + + def to_dict(self) -> dict: + """빈 카테고리를 제외하고 반환한다.""" + result: dict = {} + if self._emoticons: + result['emoticons'] = self._emoticons + if self._links: + result['links'] = self._links + if self._filenames: + result['filenames'] = self._filenames + if self._adf_extensions: + result['adf_extensions'] = self._adf_extensions + return result diff --git a/confluence-mdx/tests/test_lost_info_collector.py b/confluence-mdx/tests/test_lost_info_collector.py new file mode 100644 index 000000000..a43bd35d3 --- /dev/null +++ b/confluence-mdx/tests/test_lost_info_collector.py @@ -0,0 +1,67 @@ +"""LostInfoCollector 유닛 테스트.""" +from bs4 import BeautifulSoup +from converter.lost_info import LostInfoCollector + + +def _tag(html: str): + """HTML 문자열에서 첫 번째 Tag를 반환한다.""" + return BeautifulSoup(html, 'html.parser').find() + + +class TestLostInfoCollector: + def test_empty_collector_returns_empty_dict(self): + c = LostInfoCollector() + assert c.to_dict() == {} + + def test_add_emoticon(self): + c = LostInfoCollector() + node = _tag( + '' + ) + c.add_emoticon(node) + result = c.to_dict() + assert len(result['emoticons']) == 1 + assert result['emoticons'][0]['name'] == 'tick' + assert result['emoticons'][0]['shortname'] == ':check_mark:' + + def test_add_link(self): + c = LostInfoCollector() + node = _tag( + '' + 'Missing Page' + ) + c.add_link(node) + result = c.to_dict() + assert len(result['links']) == 1 + assert result['links'][0]['content_title'] == 'Missing Page' + + def test_add_filename(self): + c = LostInfoCollector() + c.add_filename('스크린샷 2024-08-01 오후 2.50.06.png', 'screenshot-20240801-145006.png') + result = c.to_dict() + assert 
len(result['filenames']) == 1 + assert result['filenames'][0]['original'] == '스크린샷 2024-08-01 오후 2.50.06.png' + + def test_add_filename_same_name_skips(self): + c = LostInfoCollector() + c.add_filename('image.png', 'image.png') + assert c.to_dict() == {} + + def test_add_adf_extension(self): + c = LostInfoCollector() + node = _tag('') + c.add_adf_extension(node, 'note') + result = c.to_dict() + assert len(result['adf_extensions']) == 1 + assert result['adf_extensions'][0]['panel_type'] == 'note' + + def test_multiple_categories(self): + c = LostInfoCollector() + node = _tag('') + c.add_emoticon(node) + c.add_filename('orig.png', 'norm.png') + result = c.to_dict() + assert 'emoticons' in result + assert 'filenames' in result + assert 'links' not in result From ac8f3e6da60a3296ad53e68edfcdc505908499a0 Mon Sep 17 00:00:00 2001 From: JK Date: Tue, 17 Feb 2026 23:53:54 +0900 Subject: [PATCH 3/7] =?UTF-8?q?confluence-mdx:=20=EB=AA=A8=EB=93=A0=20?= =?UTF-8?q?=ED=8C=8C=EC=84=9C=20=ED=81=B4=EB=9E=98=EC=8A=A4=EC=97=90=20Los?= =?UTF-8?q?tInfoCollector=20=EC=97=B0=EA=B2=B0=ED=95=A9=EB=8B=88=EB=8B=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Attachment, SingleLineParser, MultiLineParser, AdfExtensionToCallout, ConfluenceToMarkdown 클래스에 collector 파라미터를 추가하고, emoticon/link/filename/adf_extension 정보를 수집합니다. 
Co-Authored-By: Claude Opus 4.6 --- confluence-mdx/bin/converter/core.py | 74 ++++++++++++------- .../tests/test_lost_info_collector.py | 61 +++++++++++++++ 2 files changed, 108 insertions(+), 27 deletions(-) diff --git a/confluence-mdx/bin/converter/core.py b/confluence-mdx/bin/converter/core.py index 88b035eb2..8a8d20b3f 100644 --- a/confluence-mdx/bin/converter/core.py +++ b/confluence-mdx/bin/converter/core.py @@ -35,6 +35,7 @@ ancestors, print_node_with_properties, get_html_attributes, datetime_ko_format, normalize_screenshots, clean_text, ) +from converter.lost_info import LostInfoCollector try: import emoji @@ -48,7 +49,8 @@ class Attachment: """ - def __init__(self, node: Tag, input_dir: str, output_dir: str, public_dir: str) -> None: + def __init__(self, node: Tag, input_dir: str, output_dir: str, public_dir: str, + collector: LostInfoCollector | None = None) -> None: filename = node.get('ri:filename', '') if not filename: logging.warning(f"add_attachment: Unexpected {print_node_with_properties(node)} from {ancestors(node)} in {ctx.INPUT_FILE_PATH}") @@ -59,6 +61,8 @@ def __init__(self, node: Tag, input_dir: str, output_dir: str, public_dir: str) filename = unicodedata.normalize('NFC', filename) self.original: str = filename self.filename: str = normalize_screenshots(filename) + if collector and self.original != self.filename: + collector.add_filename(self.original, self.filename) self.used: bool = False self.input_dir: str = input_dir @@ -112,8 +116,9 @@ def as_markdown(self, caption: Optional[str] = None, width: Optional[str] = None class SingleLineParser: - def __init__(self, node): + def __init__(self, node, collector: LostInfoCollector | None = None): self.node = node + self.collector = collector self.markdown_lines = [] self.applicable_nodes = { 'span', @@ -242,7 +247,7 @@ def convert_recursively(self, node): for child in node.children: if isinstance(child, Tag) and child.name == 'ac:parameter': if child.get('ac:name') == 'title': - title = 
SingleLineParser(child).markdown_of_children(child) + title = SingleLineParser(child, collector=self.collector).markdown_of_children(child) elif child.get('ac:name') == 'colour': confluence_color = child.text.strip() color = CONFLUENCE_COLOR_TO_BADGE_COLOR.get(confluence_color, 'grey') @@ -274,7 +279,7 @@ def convert_recursively(self, node): self.markdown_lines.append("
") elif node.name in ['a']: href, readable_anchor_text = convert_confluence_url(node.get('href', '#')) - link_text = ''.join(SingleLineParser(child).as_markdown for child in node.children) + link_text = ''.join(SingleLineParser(child, collector=self.collector).as_markdown for child in node.children) if readable_anchor_text and link_text.startswith('http'): link_text = readable_anchor_text self.markdown_lines.append(f"[{link_text}]({href})") @@ -323,6 +328,8 @@ def convert_recursively(self, node): """ + if self.collector: + self.collector.add_emoticon(node) # First check ac:emoji-fallback attribute (may already be an emoji character) fallback = node.get('ac:emoji-fallback', '') shortname = node.get('ac:emoji-shortname', '') @@ -394,7 +401,7 @@ def markdown_of_children(self, node): """ markdown = [] for child in node.children: - markdown.append(SingleLineParser(child).as_markdown) + markdown.append(SingleLineParser(child, collector=self.collector).as_markdown) return ''.join(markdown) def convert_ac_link(self, node: Tag) -> str: @@ -479,7 +486,7 @@ def convert_ac_link(self, node: Tag) -> str: # Process child nodes to extract link body and determine href for child in node.children: if isinstance(child, Tag) and child.name == 'ac:link-body': - link_body = SingleLineParser(child).as_markdown + link_body = SingleLineParser(child, collector=self.collector).as_markdown elif isinstance(child, Tag) and child.name == 'ri:space': # Handle space links: @@ -505,9 +512,13 @@ def convert_ac_link(self, node: Tag) -> str: # External link - resolve using pageId from link mapping # Get link_body explicitly to ensure we have the correct text for lookup link_body_node = node.find('ac:link-body') - current_link_body = SingleLineParser(link_body_node).as_markdown if link_body_node else link_body + current_link_body = SingleLineParser(link_body_node, collector=self.collector).as_markdown if link_body_node else link_body href = resolve_external_link(current_link_body, space_key, 
target_title) + # Collect unresolved links + if self.collector and href == '#link-error': + self.collector.add_link(node) + return f'[{link_body}{decoded_anchor}]({href}{lowercased_fragment})' def convert_inline_image(self, node): @@ -570,8 +581,9 @@ def convert_inline_image(self, node): class MultiLineParser: - def __init__(self, node): + def __init__(self, node, collector: LostInfoCollector | None = None): self.node = node + self.collector = collector self.list_stack = [] self.markdown_lines = [] self._debug_markdown = False # Used when debugging manually @@ -655,14 +667,14 @@ def convert_recursively(self, node): elif node.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']: # Headings can exist in a block. self.append_empty_line_unless_first_child(node) - self.markdown_lines.append(SingleLineParser(node).as_markdown + '\n') + self.markdown_lines.append(SingleLineParser(node, collector=self.collector).as_markdown + '\n') self.markdown_lines.append('\n') elif node.name in ['ac:structured-macro'] and StructuredMacroToCallout(node).applicable: self.append_empty_line_unless_first_child(node) self.markdown_lines.extend(StructuredMacroToCallout(node).as_markdown) elif node.name == 'ac:adf-extension' and AdfExtensionToCallout(node).applicable: self.append_empty_line_unless_first_child(node) - self.markdown_lines.extend(AdfExtensionToCallout(node).as_markdown) + self.markdown_lines.extend(AdfExtensionToCallout(node, collector=self.collector).as_markdown) elif node.name in ['ac:structured-macro'] and attr_name in ['code']: self.convert_structured_macro_code(node) elif node.name in ['ac:structured-macro'] and attr_name in ['expand']: @@ -679,7 +691,7 @@ def convert_recursively(self, node): self.append_empty_line_unless_first_child(node) markdown = [] for child in node.children: - markdown.extend(MultiLineParser(child).as_markdown) + markdown.extend(MultiLineParser(child, collector=self.collector).as_markdown) lines = ''.join(markdown).splitlines() for to_quote in lines: 
self.markdown_lines.append(f'> {to_quote}') @@ -705,7 +717,7 @@ def convert_recursively(self, node): # Problem: A paragraph was in a too long line. # Resolve: # - Split a paragraph into sentences. And arrange one sentence in each line. - single_line = SingleLineParser(child).as_markdown + single_line = SingleLineParser(child, collector=self.collector).as_markdown # Preserve a leading whitespace in single_line if single_line[0].isspace(): child_markdown.append(' ') @@ -716,18 +728,18 @@ def convert_recursively(self, node): # Preserve an ending whitespace in single_line if single_line[-1].isspace(): child_markdown.append(' ') - elif SingleLineParser(child).applicable: - child_markdown.append(SingleLineParser(child).as_markdown) + elif SingleLineParser(child, collector=self.collector).applicable: + child_markdown.append(SingleLineParser(child, collector=self.collector).as_markdown) else: if self._debug_markdown: child_markdown.append(f'<{child.name}>') - child_markdown.extend(MultiLineParser(child).as_markdown) + child_markdown.extend(MultiLineParser(child, collector=self.collector).as_markdown) if self._debug_markdown: child_markdown.append(f'') # Add an empty line after paragraphs self.markdown_lines.append(''.join(child_markdown).strip() + '\n') elif node.name in ['span']: - self.markdown_lines.append(SingleLineParser(node).as_markdown) + self.markdown_lines.append(SingleLineParser(node, collector=self.collector).as_markdown) elif node.name in ['br']: #
<br/> is a line break. Just keep using <br/>
. # Append '\n' for <br/>
in MultiLineParser. @@ -739,7 +751,7 @@ def convert_recursively(self, node): self.append_empty_line_unless_first_child(node) self.convert_image(node) elif node.name in ['a']: - self.markdown_lines.append(SingleLineParser(node).as_markdown) + self.markdown_lines.append(SingleLineParser(node, collector=self.collector).as_markdown) elif node.name in ['hr']: # Using --- after a sentence means an H2 heading. # To prevent ambiguity with headings, use ______ for a horizontal rule. @@ -784,20 +796,20 @@ def convert_li(self, node, list_type, counter=None): attr_name = child.get('ac:name', '(none)') if not isinstance(child, NavigableString) else '(none)' if isinstance(child, NavigableString): if child.text.strip(): # Only process non-empty text nodes - li_itself.append(SingleLineParser(child).as_markdown) + li_itself.append(SingleLineParser(child, collector=self.collector).as_markdown) elif child.name == 'p': # Process paragraph content if len(li_itself) > 0: li_itself.append('
') - li_itself.append(SingleLineParser(child).as_markdown) + li_itself.append(SingleLineParser(child, collector=self.collector).as_markdown) elif child.name == 'ac:image': # Process image separately using MultiLineParser - image_markdown = MultiLineParser(child).as_markdown + image_markdown = MultiLineParser(child, collector=self.collector).as_markdown child_markdown.extend(image_markdown) elif child.name in ['ul', 'ol']: pass # Will be processed later in this method elif child.name in ['ac:structured-macro'] and attr_name in ['code']: - code_markdown = MultiLineParser(child).as_markdown + code_markdown = MultiLineParser(child, collector=self.collector).as_markdown child_markdown.extend(code_markdown) else: child_markdown.append(f'(Unexpected node name="{child.name}" ac:name="{attr_name}")\n') @@ -863,7 +875,7 @@ def convert_image(self, node): if caption: caption_paragraph = caption.find('p') if caption_paragraph: - caption_text = SingleLineParser(caption_paragraph).as_markdown + caption_text = SingleLineParser(caption_paragraph, collector=self.collector).as_markdown markdown = '' image_filename = unicodedata.normalize('NFC', image_filename) @@ -935,7 +947,7 @@ def convert_structured_macro_expand(self, node): # Look for code content in the CDATA section rich_text_body = node.find('ac:rich-text-body') if rich_text_body: - self.markdown_lines.extend(MultiLineParser(rich_text_body).as_markdown) + self.markdown_lines.extend(MultiLineParser(rich_text_body, collector=self.collector).as_markdown) self.markdown_lines.append(f"\n") @@ -1264,8 +1276,9 @@ def convert_recursively(self, node): class AdfExtensionToCallout: - def __init__(self, node): + def __init__(self, node, collector: LostInfoCollector | None = None): self.node = node + self.collector = collector self.markdown_lines = [] @property @@ -1322,6 +1335,8 @@ def convert_recursively(self, node): adf_attribute = node.find('ac:adf-attribute', {'key': 'panel-type'}) if adf_attribute: panel_type = adf_attribute.text + 
if self.collector: + self.collector.add_adf_extension(node, panel_type) logging.debug(f'Found text={adf_attribute.text}') else: logging.warning(f"No in {print_node_with_properties(node)} from {ancestors(node)} in {ctx.INPUT_FILE_PATH}") @@ -1335,7 +1350,7 @@ def convert_recursively(self, node): adf_content = node.find('ac:adf-content') if adf_content: - self.markdown_lines.extend(MultiLineParser(adf_content).as_markdown) + self.markdown_lines.extend(MultiLineParser(adf_content, collector=self.collector).as_markdown) else: logging.warning(f"No in {print_node_with_properties(node)} from {ancestors(node)} in {ctx.INPUT_FILE_PATH}") @@ -1354,10 +1369,15 @@ def __init__(self, html_content: str): self.markdown_lines = [] self._imports = {} self._debug_markdown = False # Used when debugging manually + self._collector = LostInfoCollector() # Parse HTML with BeautifulSoup self.soup = BeautifulSoup(html_content, 'html.parser') + @property + def lost_infos(self) -> dict: + return self._collector.to_dict() + @property def imports(self): markdown = [] @@ -1413,7 +1433,7 @@ def load_attachments(self, input_dir: str, output_dir: str, public_dir: str, attachment_nodes = ac_image.find_all('ri:attachment') for node in attachment_nodes: logging.debug(f"add attachment of {node}") - attachment = Attachment(node, input_dir, output_dir, public_dir) + attachment = Attachment(node, input_dir, output_dir, public_dir, collector=self._collector) if not skip_image_copy: attachment.copy_to_destination() attachments.append(attachment) @@ -1430,7 +1450,7 @@ def as_markdown(self): # Add document title at the beginning if available self.markdown_lines.extend(self.title) # Start conversion - self.markdown_lines.extend(MultiLineParser(self.soup).as_markdown) + self.markdown_lines.extend(MultiLineParser(self.soup, collector=self._collector).as_markdown) # self.process_node(soup) # Join all Markdown lines and return diff --git a/confluence-mdx/tests/test_lost_info_collector.py 
b/confluence-mdx/tests/test_lost_info_collector.py index a43bd35d3..42296818c 100644 --- a/confluence-mdx/tests/test_lost_info_collector.py +++ b/confluence-mdx/tests/test_lost_info_collector.py @@ -65,3 +65,64 @@ def test_multiple_categories(self): assert 'emoticons' in result assert 'filenames' in result assert 'links' not in result + + +class TestSingleLineParserCollector: + """SingleLineParser가 collector에 emoticon/link를 기록하는지 테스트한다.""" + + def test_emoticon_collected(self): + from converter.lost_info import LostInfoCollector + from converter.core import SingleLineParser + + html = ( + '

' + ) + node = _tag(html) + collector = LostInfoCollector() + parser = SingleLineParser(node, collector=collector) + _ = parser.as_markdown + result = collector.to_dict() + assert 'emoticons' in result + assert result['emoticons'][0]['name'] == 'tick' + + def test_emoticon_without_collector_works(self): + """collector 없이도 기존 동작이 유지되어야 한다.""" + from converter.core import SingleLineParser + + html = ( + '

' + ) + node = _tag(html) + parser = SingleLineParser(node) + result = parser.as_markdown + # Should contain emoji character or shortname - either way it should work + assert len(result) > 0 + + +class TestConfluenceToMarkdownLostInfos: + """ConfluenceToMarkdown가 lost_infos를 수집하는지 테스트한다.""" + + def test_emoticon_in_paragraph(self): + from converter.core import ConfluenceToMarkdown + + html = ( + '

<p>Check: <ac:emoticon ac:name="tick"/></p>

' + ) + converter = ConfluenceToMarkdown(html) + _ = converter.as_markdown() + lost = converter.lost_infos + assert 'emoticons' in lost + assert lost['emoticons'][0]['name'] == 'tick' + + def test_no_lost_info_when_nothing_lost(self): + from converter.core import ConfluenceToMarkdown + + html = '

<p>Simple text</p>

' + converter = ConfluenceToMarkdown(html) + _ = converter.as_markdown() + assert converter.lost_infos == {} From 2980ddc50e61b59d3ad35778978607f8691ec9ce Mon Sep 17 00:00:00 2001 From: JK Date: Tue, 17 Feb 2026 23:55:37 +0900 Subject: [PATCH 4/7] =?UTF-8?q?confluence-mdx:=20mapping.yaml=20v2=20?= =?UTF-8?q?=EC=8A=A4=ED=82=A4=EB=A7=88=EC=97=90=20lost=5Finfo=20=EA=B8=B0?= =?UTF-8?q?=EB=A1=9D=ED=95=A9=EB=8B=88=EB=8B=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Opus 4.6 --- confluence-mdx/bin/converter/cli.py | 5 ++- confluence-mdx/bin/reverse_sync/sidecar.py | 5 ++- .../tests/test_lost_info_collector.py | 41 +++++++++++++++++++ 3 files changed, 49 insertions(+), 2 deletions(-) diff --git a/confluence-mdx/bin/converter/cli.py b/confluence-mdx/bin/converter/cli.py index 6cec1e267..727b8b4ea 100755 --- a/confluence-mdx/bin/converter/cli.py +++ b/confluence-mdx/bin/converter/cli.py @@ -197,7 +197,10 @@ def main(): try: from reverse_sync.sidecar import generate_sidecar_mapping page_id = str(page_v1.get('id')) if page_v1 else '' - sidecar_yaml = generate_sidecar_mapping(xhtml_original, markdown_content, page_id) + sidecar_yaml = generate_sidecar_mapping( + xhtml_original, markdown_content, page_id, + lost_infos=converter.lost_infos, + ) mapping_path = os.path.join(input_dir, 'mapping.yaml') with open(mapping_path, 'w', encoding='utf-8') as f: f.write(sidecar_yaml) diff --git a/confluence-mdx/bin/reverse_sync/sidecar.py b/confluence-mdx/bin/reverse_sync/sidecar.py index 6ce04e13d..f2dfd9412 100644 --- a/confluence-mdx/bin/reverse_sync/sidecar.py +++ b/confluence-mdx/bin/reverse_sync/sidecar.py @@ -298,6 +298,7 @@ def generate_sidecar_mapping( xhtml: str, mdx: str, page_id: str = '', + lost_infos: dict | None = None, ) -> str: """XHTML + MDX로부터 mapping.yaml 내용을 생성한다. 
@@ -401,11 +402,13 @@ def generate_sidecar_mapping( }) mapping_data = { - 'version': 1, + 'version': 2, 'source_page_id': page_id, 'mdx_file': 'page.mdx', 'mappings': entries, } + if lost_infos: + mapping_data['lost_info'] = lost_infos return yaml.dump(mapping_data, allow_unicode=True, default_flow_style=False) diff --git a/confluence-mdx/tests/test_lost_info_collector.py b/confluence-mdx/tests/test_lost_info_collector.py index 42296818c..c50d84af8 100644 --- a/confluence-mdx/tests/test_lost_info_collector.py +++ b/confluence-mdx/tests/test_lost_info_collector.py @@ -126,3 +126,44 @@ def test_no_lost_info_when_nothing_lost(self): converter = ConfluenceToMarkdown(html) _ = converter.as_markdown() assert converter.lost_infos == {} + + +class TestMappingYamlLostInfo: + def test_version_is_2(self): + import yaml + from reverse_sync.sidecar import generate_sidecar_mapping + + xhtml = '

<h2>Title</h2>

' + mdx = '---\ntitle: test\n---\n\n# Doc\n\n## Title\n' + + result = generate_sidecar_mapping(xhtml, mdx, '12345') + data = yaml.safe_load(result) + assert data['version'] == 2 + + def test_lost_info_in_mapping_yaml(self): + import yaml + from reverse_sync.sidecar import generate_sidecar_mapping + + lost_infos = { + 'emoticons': [{'name': 'tick', 'shortname': ':check_mark:', 'emoji_id': '', 'fallback': '', 'raw': ''}], + } + + xhtml = '

<h2>Title</h2>

<p>text</p>

' + mdx = '---\ntitle: test\n---\n\n# Doc\n\n## Title\n\ntext\n' + + result = generate_sidecar_mapping(xhtml, mdx, '12345', lost_infos=lost_infos) + data = yaml.safe_load(result) + assert data['version'] == 2 + assert 'lost_info' in data + assert data['lost_info']['emoticons'][0]['name'] == 'tick' + + def test_no_lost_info_when_empty(self): + import yaml + from reverse_sync.sidecar import generate_sidecar_mapping + + xhtml = '

<h2>Title</h2>

' + mdx = '---\ntitle: test\n---\n\n# Doc\n\n## Title\n' + + result = generate_sidecar_mapping(xhtml, mdx, '12345') + data = yaml.safe_load(result) + assert 'lost_info' not in data From cf0aab647582357c47b3f224ee79db6a609f2104 Mon Sep 17 00:00:00 2001 From: JK Date: Tue, 17 Feb 2026 23:57:38 +0900 Subject: [PATCH 5/7] =?UTF-8?q?confluence-mdx:=20mapping.yaml=20version=20?= =?UTF-8?q?1=E2=86=922=20=EB=B3=80=EA=B2=BD=EC=97=90=20=EB=94=B0=EB=A5=B8?= =?UTF-8?q?=20=EA=B8=B0=EC=A1=B4=20=ED=85=8C=EC=8A=A4=ED=8A=B8=20=EC=88=98?= =?UTF-8?q?=EC=A0=95=ED=95=A9=EB=8B=88=EB=8B=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Opus 4.6 --- confluence-mdx/tests/test_reverse_sync_sidecar.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/confluence-mdx/tests/test_reverse_sync_sidecar.py b/confluence-mdx/tests/test_reverse_sync_sidecar.py index 485b21d15..2949f99a0 100644 --- a/confluence-mdx/tests/test_reverse_sync_sidecar.py +++ b/confluence-mdx/tests/test_reverse_sync_sidecar.py @@ -324,7 +324,7 @@ def test_simple_heading_paragraph(self): result = generate_sidecar_mapping(xhtml, mdx, '12345') data = yaml.safe_load(result) - assert data['version'] == 1 + assert data['version'] == 2 assert data['source_page_id'] == '12345' assert len(data['mappings']) >= 2 From b33db7c40e1e1a2799d570bb75ec4695b06f120b Mon Sep 17 00:00:00 2001 From: JK Date: Tue, 17 Feb 2026 23:58:14 +0900 Subject: [PATCH 6/7] =?UTF-8?q?confluence-mdx:=20Phase=20L3=20=EC=84=A4?= =?UTF-8?q?=EA=B3=84=20=EB=AC=B8=EC=84=9C=EC=97=90=20=EA=B5=AC=ED=98=84=20?= =?UTF-8?q?=EB=85=B8=ED=8A=B8=EB=A5=BC=20=EC=B6=94=EA=B0=80=ED=95=A9?= =?UTF-8?q?=EB=8B=88=EB=8B=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Opus 4.6 --- .../docs/plans/2026-02-17-l3-lost-info-design.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git 
a/confluence-mdx/docs/plans/2026-02-17-l3-lost-info-design.md b/confluence-mdx/docs/plans/2026-02-17-l3-lost-info-design.md index e253e258e..8e87dfb9c 100644 --- a/confluence-mdx/docs/plans/2026-02-17-l3-lost-info-design.md +++ b/confluence-mdx/docs/plans/2026-02-17-l3-lost-info-design.md @@ -245,3 +245,13 @@ entries.append(entry) - `lost_info`를 활용한 역순변환 품질 개선 - `roundtrip.json`의 `SidecarBlock.lost_info` 연계 - `stripped_attrs` 수집 +- lost_info의 entry별 분배 (현재는 페이지 전체 수준) + +## 구현 노트 (2026-02-17) + +설계 대비 실제 구현의 차이점: + +1. **LostInfoCollector 위치:** `converter/core.py` 인라인이 아닌 별도 `converter/lost_info.py` 모듈로 분리 +2. **collector 단위:** 블록별 collector가 아닌, `ConfluenceToMarkdown` 전체에 하나의 collector를 두고 모든 파서에 전파 +3. **lost_info 저장 위치:** entry별 `lost_info` 필드가 아닌, mapping.yaml 최상위 `lost_info` 필드로 기록. entry별 분배는 L4에서 필요 시 구현 +4. **stripped_attrs:** 범위 외로 미구현 (설계 문서 §5와 동일) From d3d2799a794531eb318532037665ce0c41bfc412 Mon Sep 17 00:00:00 2001 From: JK Date: Wed, 18 Feb 2026 00:02:37 +0900 Subject: [PATCH 7/7] =?UTF-8?q?confluence-mdx:=20TableToHtmlTable/Structur?= =?UTF-8?q?edMacroToCallout/TableToNativeMarkdown=EC=97=90=20collector=20?= =?UTF-8?q?=EC=A0=84=ED=8C=8C=ED=95=A9=EB=8B=88=EB=8B=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 세 파서 클래스의 __init__에 collector 파라미터를 추가하고, 내부에서 생성하는 SingleLineParser/MultiLineParser 인스턴스에 collector를 전달합니다. MultiLineParser.convert_recursively에서 이 클래스들을 생성할 때도 self.collector를 전파합니다. 
Co-Authored-By: Claude Opus 4.6 --- confluence-mdx/bin/converter/core.py | 35 +++++++++++++++------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/confluence-mdx/bin/converter/core.py b/confluence-mdx/bin/converter/core.py index 8a8d20b3f..b09b191bb 100644 --- a/confluence-mdx/bin/converter/core.py +++ b/confluence-mdx/bin/converter/core.py @@ -671,7 +671,7 @@ def convert_recursively(self, node): self.markdown_lines.append('\n') elif node.name in ['ac:structured-macro'] and StructuredMacroToCallout(node).applicable: self.append_empty_line_unless_first_child(node) - self.markdown_lines.extend(StructuredMacroToCallout(node).as_markdown) + self.markdown_lines.extend(StructuredMacroToCallout(node, collector=self.collector).as_markdown) elif node.name == 'ac:adf-extension' and AdfExtensionToCallout(node).applicable: self.append_empty_line_unless_first_child(node) self.markdown_lines.extend(AdfExtensionToCallout(node, collector=self.collector).as_markdown) @@ -702,13 +702,13 @@ def convert_recursively(self, node): for child in node.children: self.convert_recursively(child) elif node.name == 'table': - native_markdown = TableToNativeMarkdown(node) + native_markdown = TableToNativeMarkdown(node, collector=self.collector) if native_markdown.applicable: self.append_empty_line_unless_first_child(node) self.markdown_lines.extend(native_markdown.as_markdown) else: self.append_empty_line_unless_first_child(node) - self.markdown_lines.extend(TableToHtmlTable(node).as_markdown) + self.markdown_lines.extend(TableToHtmlTable(node, collector=self.collector).as_markdown) elif node.name in ['p', 'div']: self.append_empty_line_unless_first_child(node) child_markdown = [] @@ -971,8 +971,9 @@ def convert_structured_macro_view_file(self, node): class TableToNativeMarkdown: - def __init__(self, node): + def __init__(self, node, collector: LostInfoCollector | None = None): self.node = node + self.collector = collector self.markdown_lines = [] self.applicable_nodes = { 
'table', 'tbody', 'col', 'tr', 'colgroup', 'th', 'td', @@ -1064,7 +1065,7 @@ def convert_table(self, node): colspan = int(cell.get('colspan', 1)) rowspan = int(cell.get('rowspan', 1)) - cell_content = SingleLineParser(cell).as_markdown + cell_content = SingleLineParser(cell, collector=self.collector).as_markdown # Add cell content to the current row current_row.append(cell_content) @@ -1123,8 +1124,9 @@ def table_data_to_markdown(self, table_data): class TableToHtmlTable: - def __init__(self, node): + def __init__(self, node, collector: LostInfoCollector | None = None): self.node = node + self.collector = collector self.markdown_lines = [] @property @@ -1159,23 +1161,23 @@ def convert_recursively(self, node): for child in node.children: if isinstance(child, NavigableString): - self.markdown_lines.append(SingleLineParser(child).as_markdown + '\n') - elif SingleLineParser(child).applicable: - self.markdown_lines.append(SingleLineParser(child).as_markdown + '\n') - elif MultiLineParser(child).is_standalone_dash: + self.markdown_lines.append(SingleLineParser(child, collector=self.collector).as_markdown + '\n') + elif SingleLineParser(child, collector=self.collector).applicable: + self.markdown_lines.append(SingleLineParser(child, collector=self.collector).as_markdown + '\n') + elif MultiLineParser(child, collector=self.collector).is_standalone_dash: # Wrap dash in

to prevent MDX interpreting it as a list marker self.markdown_lines.append(f'

<p>-</p>

\n') else: - self.markdown_lines.extend(MultiLineParser(child).as_markdown) + self.markdown_lines.extend(MultiLineParser(child, collector=self.collector).as_markdown) self.markdown_lines.append(f"\n") elif node.name == 'col': """Convert col node to HTML col markup.""" attrs = get_html_attributes(node) self.markdown_lines.append(f"\n") - elif SingleLineParser(node).applicable: + elif SingleLineParser(node, collector=self.collector).applicable: # could be converted. - self.markdown_lines.append(SingleLineParser(node).as_markdown + '\n') + self.markdown_lines.append(SingleLineParser(node, collector=self.collector).as_markdown + '\n') else: logging.warning(f"TableToHtmlTable: Unexpected {print_node_with_properties(node)} from {ancestors(node)} in {ctx.INPUT_FILE_PATH}") self.markdown_lines.append(f'[{node.name}]\n') @@ -1184,8 +1186,9 @@ def convert_recursively(self, node): class StructuredMacroToCallout: - def __init__(self, node): + def __init__(self, node, collector: LostInfoCollector | None = None): self.node = node + self.collector = collector self.markdown_lines = [] @property @@ -1246,7 +1249,7 @@ def convert_recursively(self, node): logging.warning(f"Unexpected {print_node_with_properties(node)} from {ancestors(node)} in {ctx.INPUT_FILE_PATH}") for child in node.children: - self.markdown_lines.extend(MultiLineParser(child).as_markdown) + self.markdown_lines.extend(MultiLineParser(child, collector=self.collector).as_markdown) self.markdown_lines.append('
\n') elif node.name in ['ac:structured-macro'] and attr_name in ['panel']: @@ -1262,7 +1265,7 @@ def convert_recursively(self, node): f'Cannot find under {print_node_with_properties(node)} from {ancestors(node)} in {ctx.INPUT_FILE_PATH}') if rich_text_body: - self.markdown_lines.extend(MultiLineParser(rich_text_body).as_markdown) + self.markdown_lines.extend(MultiLineParser(rich_text_body, collector=self.collector).as_markdown) else: logging.warning( f'Cannot find under {print_node_with_properties(node)} from {ancestors(node)} in {ctx.INPUT_FILE_PATH}')