Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,33 @@
3. 실행 명령어
4. `pyproject.toml` 또는 의존성 목록

### 업로드 입력 지원 범위 / 제약 / 준비 권장사항

#### 지원 범위
- CSV: `input_type=csv` 또는 파일 업로드 시 기본 경로로 처리
- Excel: `.xlsx`(OOXML) 지원, 시트 목록 조회(`/api/sheets`) + 시트 선택 후 CSV 정규화
- 문서: `.pdf`, `.docx`, `.pptx` 표 추출(`/api/document/extract`) 후 선택 테이블 분석

#### 제약
- Excel은 현재 `.xlsx`만 지원(`.xls` 바이너리 포맷 미지원)
- Excel 시트는 **첫 행 헤더 필수**, 빈 헤더/중복 헤더는 에러 처리
- 비어 있는 시트(실데이터 없음)는 분석 불가
- PDF는 암호화/스캔 이미지 기반 문서에서 표 추출이 실패할 수 있음
- 문서 표 추출 실패 시 `/api/analyze`는 HTTP 400과 함께 `error` + `error_detail` + `preprocessing_stage=table_extraction` 필드를 담은 JSON을 반환

#### 권장 파일 준비 방법
- CSV/Excel 공통
  - 첫 행을 명확한 컬럼명(중복/공백 없음)으로 구성
  - 숫자 컬럼은 단위/통화를 가능한 한 일관되게 정리
  - 완전히 빈 행/열은 사전 제거
- Excel
  - 분석 대상 시트를 분리(요약 시트/원본 시트 혼합 최소화)
  - 병합 셀(merged cell)/복잡 서식보다 표 형태(행-열) 우선
- 문서(PDF/Word/PPT)
  - 스캔본보다 텍스트 기반 원본 사용 권장
  - 테이블 경계(`|`, 탭, 명확한 셀 구분)가 보존된 원본이 유리
  - 추출 신뢰도가 낮거나 실패하면 CSV로 변환한 뒤 CSV 업로드 경로 사용을 권장

---

## 1) 이번 문서에서 바로 할 일
Expand Down
23 changes: 23 additions & 0 deletions tests/test_ui_contract.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
from pathlib import Path


def _app_js_text() -> str:
    """Read ``bitnet_tools/ui/app.js`` and return its contents as text.

    The path is anchored one level above the directory that contains this
    test file (``parents[1]`` of the resolved file path) and the file is
    decoded as UTF-8.
    """
    base_dir = Path(__file__).resolve().parents[1]
    script_path = base_dir.joinpath('bitnet_tools', 'ui', 'app.js')
    return script_path.read_text(encoding='utf-8')


def test_api_error_detail_priority_is_consistent_for_post_and_get():
    """The error-message fallback expression must occur at least twice in app.js.

    Presumably one occurrence belongs to the POST helper and one to the GET
    helper; this test only checks the count of the literal expression.
    """
    fallback_expr = "data?.error_detail || data?.error || JSON.stringify(data || {})"
    occurrences = _app_js_text().count(fallback_expr)
    assert occurrences >= 2


def test_ui_failure_status_messages_are_defined_consistently():
    """Every expected failure ``setStatus(...)`` call must appear verbatim in app.js."""
    required_calls = (
        "setStatus('입력 전처리 실패')",
        "setStatus('차트 작업 실패')",
        "setStatus('분석 실패')",
        "setStatus('멀티 분석 실패')",
        "setStatus('모델 실행 실패')",
    )
    script = _app_js_text()
    for call in required_calls:
        assert call in script
176 changes: 176 additions & 0 deletions tests/test_web.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,93 @@
import time
import threading
import urllib.request
import urllib.error
import json
from pathlib import Path

import base64
import io
import zipfile

import bitnet_tools.web as web
from http.server import ThreadingHTTPServer


def _xlsx_column_letters(index):
    """Return the spreadsheet column name for a 1-based column *index*.

    Uses bijective base-26, so 1 -> ``A``, 26 -> ``Z``, 27 -> ``AA``.
    """
    letters = []
    while index > 0:
        index, remainder = divmod(index - 1, 26)
        letters.append(chr(ord('A') + remainder))
    return ''.join(reversed(letters))


def _xlsx_sheet_xml(rows):
    """Build a minimal SpreadsheetML worksheet XML string from *rows*.

    Args:
        rows: Iterable of rows; each row is an iterable of cell values.
            ``None`` cells are omitted, ``int``/``float`` values are written
            as numeric ``<v>`` cells, and every other value is written via
            ``str()`` as an inline string with ``&``, ``<`` and ``>`` escaped.

    Returns:
        The worksheet document as a ``str``.
    """
    row_nodes = []
    for r_idx, row in enumerate(rows, start=1):
        cell_nodes = []
        for c_idx, val in enumerate(row, start=1):
            if val is None:
                continue
            # Column names must roll over past Z (AA, AB, ...); a plain
            # chr() offset would emit non-letter characters from column 27 on.
            ref = f'{_xlsx_column_letters(c_idx)}{r_idx}'
            if isinstance(val, (int, float)):
                cell_nodes.append(f'<c r="{ref}"><v>{val}</v></c>')
            else:
                escaped = str(val).replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
                cell_nodes.append(f'<c r="{ref}" t="inlineStr"><is><t>{escaped}</t></is></c>')
        row_nodes.append(f'<row r="{r_idx}">{"".join(cell_nodes)}</row>')
    return (
        '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>'
        '<worksheet xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main">'
        f'<sheetData>{"".join(row_nodes)}</sheetData>'
        '</worksheet>'
    )


def _make_xlsx_b64(sheet_map):
    """Build an in-memory ``.xlsx``-style zip archive and return it base64-encoded.

    The archive contains only the parts written here: one
    ``xl/worksheets/sheet{N}.xml`` per entry of *sheet_map*, plus
    ``xl/workbook.xml`` and ``xl/_rels/workbook.xml.rels``.

    Args:
        sheet_map: Mapping of sheet name -> rows (in the form accepted by
            ``_xlsx_sheet_xml``). Iteration order determines the 1-based
            sheet id, relationship id and worksheet file number.

    Returns:
        An ASCII ``str`` holding the base64 encoding of the zip bytes.
    """
    # Function-scope import, matching the local-import style used elsewhere
    # in this test module.
    from xml.sax.saxutils import quoteattr

    workbook_sheets = []
    rels = []
    mem = io.BytesIO()
    with zipfile.ZipFile(mem, 'w') as zf:
        for idx, (name, rows) in enumerate(sheet_map.items(), start=1):
            rid = f'rId{idx}'
            # quoteattr() escapes &, <, > and quotes, so sheet names with
            # special characters still produce well-formed XML.
            workbook_sheets.append(
                f'<sheet name={quoteattr(str(name))} sheetId="{idx}" r:id="{rid}"/>'
            )
            rels.append(
                f'<Relationship Id="{rid}" '
                'Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/worksheet" '
                f'Target="worksheets/sheet{idx}.xml"/>'
            )
            zf.writestr(f'xl/worksheets/sheet{idx}.xml', _xlsx_sheet_xml(rows))

        workbook_xml = (
            '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>'
            '<workbook xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main" '
            'xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">'
            f'<sheets>{"".join(workbook_sheets)}</sheets>'
            '</workbook>'
        )
        rel_xml = (
            '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>'
            '<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">'
            f'{"".join(rels)}'
            '</Relationships>'
        )
        zf.writestr('xl/workbook.xml', workbook_xml)
        zf.writestr('xl/_rels/workbook.xml.rels', rel_xml)
    # Read the buffer only after the ``with`` block has closed the archive,
    # so the zip central directory has been written.
    return base64.b64encode(mem.getvalue()).decode('ascii')


def _run_server():
    """Start a ``ThreadingHTTPServer`` with ``web.Handler`` in a daemon thread.

    The server binds to ``127.0.0.1`` with port ``0``; callers read the
    actual port from ``server.server_port``.

    Returns:
        ``(server, thread)`` - the running server and the thread executing
        its ``serve_forever`` loop.
    """
    httpd = ThreadingHTTPServer(('127.0.0.1', 0), web.Handler)
    worker = threading.Thread(target=httpd.serve_forever, daemon=True)
    worker.start()
    return httpd, worker


def _post_json(url, payload):
    """POST *payload* as JSON to *url* and return ``(status_code, parsed_body)``.

    HTTP error statuses are not raised to the caller: an
    ``urllib.error.HTTPError`` is caught and its status code and JSON body
    are returned in the same ``(code, body)`` shape as a success.

    Args:
        url: Full request URL.
        payload: JSON-serializable object sent as the request body.

    Returns:
        Tuple of the HTTP status code and the decoded JSON response body.
    """
    req = urllib.request.Request(
        url,
        data=json.dumps(payload).encode('utf-8'),
        headers={'Content-Type': 'application/json'},
        method='POST',
    )
    try:
        with urllib.request.urlopen(req) as resp:
            return resp.getcode(), json.loads(resp.read().decode('utf-8'))
    except urllib.error.HTTPError as exc:
        # HTTPError wraps the open response; close it after reading so the
        # underlying socket is not left for the garbage collector.
        try:
            raw = exc.read()
        finally:
            exc.close()
        return exc.code, json.loads(raw.decode('utf-8'))


def test_submit_and_get_chart_job_done(monkeypatch, tmp_path):
Expand Down Expand Up @@ -106,3 +188,97 @@ def test_coerce_document_payload_to_csv_text():
assert source == 'sample.docx'
assert 'h1,h2' in csv_text
assert meta['table_id'] == 'docx_table_1'


def test_excel_single_sheet_normalization():
    """A one-sheet workbook sent as an ``excel`` payload is converted to CSV text."""
    rows = [['region', 'amount'], ['seoul', 100], ['busan', 120]]
    payload = {
        'input_type': 'excel',
        'name': 'sales.xlsx',
        'file_base64': _make_xlsx_b64({'Sales': rows}),
    }

    source, csv_text, meta = web._coerce_csv_text_from_file_payload(payload)

    assert source == 'sales.xlsx'
    for expected_line in ('region,amount', 'seoul,100'):
        assert expected_line in csv_text
    # No sheet name was supplied, so the placeholder value is expected.
    assert meta['sheet_name'] == '<first_sheet>'


def test_excel_multi_sheet_selection_uses_target_sheet():
    """Selecting ``Summary`` yields only that sheet's rows, not those of ``Raw``."""
    workbook = {
        'Raw': [['c1', 'c2'], ['a', 1]],
        'Summary': [['city', 'score'], ['busan', 9]],
    }
    encoded = _make_xlsx_b64(workbook)

    csv_text = web._normalize_excel_base64_to_csv_text(encoded, sheet_name='Summary')

    for expected_line in ('city,score', 'busan,9'):
        assert expected_line in csv_text
    assert 'c1,c2' not in csv_text


def test_excel_empty_sheet_raises_validation_error():
    """A sheet with no rows is rejected with a ``ValueError``."""
    import pytest

    encoded = _make_xlsx_b64({'Empty': []})
    expected_message = 'selected sheet has no non-empty rows'

    with pytest.raises(ValueError, match=expected_message):
        web._normalize_excel_base64_to_csv_text(encoded, sheet_name='Empty')


def test_excel_header_validation_rejects_empty_and_duplicate_columns():
    """Empty and duplicated header cells each raise ``ValueError`` with a specific message."""
    import pytest

    # (workbook contents, regex expected in the error message)
    cases = (
        ({'BadHeader': [['id', ''], [1, 2]]}, 'empty header at index 1'),
        ({'DupHeader': [['id', 'id'], [1, 2]]}, 'duplicated header'),
    )
    for sheet_map, message in cases:
        encoded = _make_xlsx_b64(sheet_map)
        with pytest.raises(ValueError, match=message):
            web._normalize_excel_base64_to_csv_text(encoded)


def test_document_extract_api_success_and_failure_payload_contract():
    """Check the response shape of ``/api/document/extract`` for success and failure.

    A DOCX payload must return HTTP 200 with a non-empty ``tables`` list.
    A PDF payload containing only an image marker must also return HTTP 200,
    with an empty ``tables`` list plus ``failure_reason`` and
    ``failure_detail``.
    """
    server, thread = _run_server()
    base = f'http://127.0.0.1:{server.server_port}'
    try:
        ok_code, ok_body = _post_json(base + '/api/document/extract', {
            'input_type': 'document',
            'source_name': 'ok.docx',
            'file_base64': _make_docx_b64(),
        })
        assert ok_code == 200
        assert ok_body['tables']

        fail_code, fail_body = _post_json(base + '/api/document/extract', {
            'input_type': 'document',
            'source_name': 'scan.pdf',
            'file_base64': base64.b64encode(b'%PDF-1.4\n<< /Subtype /Image >>\n').decode('ascii'),
        })
        # Extraction failure is reported in the body, not via the status code.
        assert fail_code == 200
        assert fail_body['tables'] == []
        assert fail_body['failure_reason'] == '스캔 이미지'
        assert fail_body['failure_detail']
    finally:
        server.shutdown()
        thread.join(timeout=1)
        # shutdown() only stops serve_forever(); server_close() is what
        # releases the listening socket.
        server.server_close()


def test_analyze_document_fallback_error_uses_error_and_error_detail():
    """Check the error shape of ``/api/analyze`` when document table extraction fails.

    A PDF payload carrying an ``/Encrypt`` entry must produce HTTP 400 with
    ``error``, ``error_detail`` and ``preprocessing_stage`` fields.
    """
    server, thread = _run_server()
    base = f'http://127.0.0.1:{server.server_port}'
    try:
        code, body = _post_json(base + '/api/analyze', {
            'input_type': 'document',
            'source_name': 'locked.pdf',
            'file_base64': base64.b64encode(b'%PDF-1.4\n1 0 obj\n<< /Encrypt 2 0 R >>\nendobj\n').decode('ascii'),
            'question': '요약',
        })
        assert code == 400
        assert body['error'] == 'document table extraction failed'
        assert 'error_detail' in body
        assert body['preprocessing_stage'] == 'table_extraction'
    finally:
        server.shutdown()
        thread.join(timeout=1)
        # shutdown() only stops serve_forever(); server_close() is what
        # releases the listening socket.
        server.server_close()