# DOCX → Markdown list preservation test
This notebook converts a sample `.docx` to Markdown using available libraries and checks whether bulleted/numbered lists are preserved.

In [3]:
from pathlib import Path
import re

DOCX_PATH = Path('23 - Test chapter - A walk to the river.docx')
assert DOCX_PATH.exists(), f'File not found: {DOCX_PATH}'

# Try imports lazily so the notebook works with whatever is installed.
try:
    import mammoth  # Preferred for DOCX→Markdown
    HAS_MAMMOTH = True
except Exception:
    HAS_MAMMOTH = False

try:
    import pypandoc  # Requires pandoc installed on system
    HAS_PYPANDOC = True
except Exception:
    HAS_PYPANDOC = False

try:
    from docx import Document  # python-docx for ground-truth list detection
    HAS_PYTHON_DOCX = True
except Exception:
    HAS_PYTHON_DOCX = False

def convert_docx_to_markdown(docx_path: Path):
    """Return (markdown_text, method) using the first available converter.
    Tries: mammoth → pypandoc. Raises RuntimeError if none available.
    """
    if HAS_MAMMOTH:
        with open(docx_path, 'rb') as f:
            result = mammoth.convert_to_markdown(f)
        return result.value, 'mammoth'
    if HAS_PYPANDOC:
        md = pypandoc.convert_file(str(docx_path), 'md', extra_args=['--wrap=none'])
        return md, 'pypandoc'
    raise RuntimeError('No DOCX→Markdown converter available. Install `mammoth` or `pypandoc` + `pandoc`.')

def is_list_paragraph(paragraph) -> bool:
    """Heuristic: detect if a python-docx paragraph is part of a list (bulleted/numbered).
    Requires python-docx.
    """
    p = getattr(paragraph, '_p', None)
    if p is None or p.pPr is None:
        return False
    return p.pPr.numPr is not None

def extract_list_items_from_docx(docx_path: Path):
    """Return list of (text) for list paragraphs from DOCX using python-docx.
    Empty/whitespace-only items are filtered out.
    """
    if not HAS_PYTHON_DOCX:
        return None
    doc = Document(str(docx_path))
    items = []
    for p in doc.paragraphs:
        if is_list_paragraph(p):
            text = p.text.strip()
            if text:
                items.append(text)
    return items

LIST_LINE_RE = re.compile(r'^(\s*[-*+] \S|\s*\d+[.)] \S)')

def extract_list_lines_from_markdown(md: str):
    """Return list of raw Markdown lines that look like list items.
    Matches bullets (-, *, +) and ordered lists (1., 1)).
    """
    lines = md.splitlines()
    return [ln for ln in lines if LIST_LINE_RE.match(ln)]

def normalize(s: str) -> str:
    # Lowercase + collapse internal whitespace for fuzzy matching
    return re.sub(r'\s+', ' ', s).strip().lower()


In [4]:
# Convert DOCX → Markdown
try:
    md_text, method = convert_docx_to_markdown(DOCX_PATH)
    print(f'Converter: {method}')
except Exception as e:
    print('ERROR: ', e)
    print('Hint: pip install mammoth  OR  pip install pypandoc (and install pandoc)')
    raise

# Analyze lists in the produced Markdown
md_list_lines = extract_list_lines_from_markdown(md_text)
print(f'Markdown list lines detected: {len(md_list_lines)}')

# If python-docx is available, establish ground truth from the source DOCX
docx_items = extract_list_items_from_docx(DOCX_PATH)
if docx_items is None:
    print('python-docx not installed — skipping ground-truth comparison.')
else:
    print(f'DOCX list items detected: {len(docx_items)}')
    norm_md = [normalize(ln) for ln in md_list_lines]
    matched = 0
    for item in docx_items:
        ni = normalize(item)
        if any(ni in ln for ln in norm_md):
            matched += 1
    print(f'Matched list items in Markdown: {matched}/{len(docx_items)}')
    if matched == len(docx_items) and len(docx_items) > 0:
        print('RESULT: PASS — All DOCX list items found in Markdown.')
    elif len(docx_items) == 0:
        print('RESULT: INCONCLUSIVE — No list items in DOCX sample.')
    else:
        print('RESULT: FAIL — Some list items not found in Markdown.')

# Optionally preview the first few Markdown list lines
print('Sample Markdown list lines:')
for ln in md_list_lines[:10]:
    print('  ', ln)

Converter: mammoth
Markdown list lines detected: 13
DOCX list items detected: 13
Matched list items in Markdown: 9/13
RESULT: FAIL — Some list items not found in Markdown.
Sample Markdown list lines:
   - Ayiara *visits Mo to apologize* for being so totally sidetracked and unavailable for two days
   - They __take a walk and__ he asks
   	- What’s so special about being a bloodwriter \(before the magicky stuff\)
   	- How does she like being a calligrapher
   		- Reveals secret that she’s the only one after promises not to tell anyone
   	- If there was one thing she could say to convert someone, what would it be?
   - She asks
   	- How he got into carpentry
   		- Built barracks for the army and had a knack for it
   - Mo isn’t packing up – he’s thinking through business opportunities
