diff --git a/README.md b/README.md index 045f3db..87b65ab 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ ## 0) 현재 완성도 빠른 진단 -현 시점 기준 기능 완성도(실사용 관점): **약 96%** +현 시점 기준 기능 완성도(실사용 관점): **약 97%** - 완료 - CSV 기초 요약(행/열/결측/숫자 통계) @@ -15,17 +15,20 @@ - 단일 CSV + 다중 CSV CLI 분석(`report`, `multi-analyze`) - 컬럼별 결측/고유/상위값 비율 산출 - 다중 CSV 분석용 코드 가이드(판다스 예시 코드 자동 생성) + - 인사이트 룰 엔진(결측/이상치/드리프트 경고) + - 파일 프로파일 캐시(.bitnet_cache)로 재분석 가속 - 다중 CSV 자동 시각화 차트 생성(histogram/boxplot/top bar/scatter/missing-bar, matplotlib 설치 시) - 브라우저 UI(`bitnet-analyze ui`) + - 웹 UI 대시보드(JSON 붙여넣기 기반 KPI/인사이트 뷰) - **윈도우 데스크톱 UI(`bitnet-analyze desktop`, `BitNet_Desktop_Start.bat`)** - 남은 과제 - - 대시보드형 시각화 UI 고도화(필터/드릴다운) - - 데이터 전처리 규칙(날짜/카테고리 자동 인식) 고도화 - - 수십 MB 이상 다중 파일에서 차트 생성 최적화(샘플링/청크화) + - 대시보드 상호작용 고도화(파일 업로드 기반 멀티 분석 원클릭) + - 대규모 차트 생성 최적화(청크-스트리밍 렌더러) ### 처리 규모 가이드 - 단일/다중 CSV 분석(`analyze`, `multi-analyze`)은 스트리밍 누적 통계를 사용해 수십 MB 수준까지 안정 처리하도록 개선됨 +- `multi-analyze`는 파일 단위 캐시(`.bitnet_cache`)를 사용해 재실행 성능을 개선 - 차트 생성(`--charts-dir`)은 matplotlib 기반이며 파일을 메모리에 적재해 그리므로 더 큰 파일에서는 샘플링 전략 권장 ### 파일 붙여넣기 분석 가능 범위 @@ -227,6 +230,9 @@ bitnet-analyze report sample.csv --question "핵심 요약" --out analysis_repor # 8) 다중 CSV 통합 분석(JSON+MD+코드가이드) bitnet-analyze multi-analyze a.csv b.csv c.csv --question "컬럼별 비율과 지역별 차이 분석" --group-column 시도명 --target-column 세차유형 --charts-dir charts --out-json multi.json --out-report multi.md + +# 캐시 없이 재분석 +bitnet-analyze multi-analyze a.csv b.csv --question "비교" --no-cache --out-json fresh.json --out-report fresh.md ``` --- diff --git a/bitnet_tools/cli.py b/bitnet_tools/cli.py index b5605c7..f278035 100644 --- a/bitnet_tools/cli.py +++ b/bitnet_tools/cli.py @@ -79,6 +79,7 @@ def _build_parser() -> argparse.ArgumentParser: default=None, help="Optional directory to save visualization charts", ) + multi_parser.add_argument("--no-cache", action="store_true", help="Disable file profile cache") report_parser = subparsers.add_parser("report", help="Build markdown summary report from CSV") report_parser.add_argument("csv", type=Path, help="Input CSV path") @@ -124,6 +125,7 @@ def main(argv: list[str] | None = None) -> int: args.question, group_column=args.group_column, target_column=args.target_column, + use_cache=not args.no_cache, ) if args.charts_dir is not None: try: diff --git a/bitnet_tools/multi_csv.py b/bitnet_tools/multi_csv.py index 729c78a..ee48828 100644 --- a/bitnet_tools/multi_csv.py +++ b/bitnet_tools/multi_csv.py @@ -1,15 +1,19 @@ from __future__ import annotations import csv +import hashlib import json import math import random from collections import Counter, defaultdict +from datetime import datetime from pathlib import Path from typing import Any from .analysis import _to_float +CACHE_DIR = Path('.bitnet_cache') + def _quantile(sorted_values: list[float], q: float) -> float: if not sorted_values: @@ -51,27 +55,59 @@ def _reservoir_sample(values: list[float], new_value: float, seen: int, cap: int values[idx] = new_value -def _finalize_group_ratio_table( - table: dict[str, Counter[str]], - group_col: str, - target_col: str, -) -> dict[str, Any]: +def _reservoir_sample_str(values: list[str], new_value: str, seen: int, cap: int) -> None: + if cap <= 0: + return + if len(values) < cap: + values.append(new_value) + return + idx = random.randint(0, seen - 1) + if idx < cap: + values[idx] = new_value + + +def _finalize_group_ratio_table(table: dict[str, Counter[str]], group_col: str, target_col: str) -> dict[str, Any]: ratio_table: dict[str, Any] = {} for g, counter in table.items(): total = sum(counter.values()) ratio_table[g] = { k: { - "count": v, - "ratio": round(v / total, 6) if total else 0.0, + 'count': v, + 'ratio': round(v / total, 6) if total else 0.0, } for k, v in counter.items() } - - return { - "group_column": group_col, - "target_column": target_col, - "groups": ratio_table, - } + return {'group_column': group_col, 'target_column': target_col, 'groups': ratio_table} + + +def _looks_like_date(value: str) -> bool: + candidates = ["%Y-%m-%d", "%Y/%m/%d", "%Y.%m.%d", "%Y-%m-%d %H:%M:%S"] + for fmt in candidates: + try: + datetime.strptime(value, fmt) + return True + except ValueError: + continue + return False + + +def _infer_semantic_type(col: str, dtype: str, samples: list[str], unique_ratio: float) -> str: + lower = col.lower() + if dtype == 'float' and ('lat' in lower or '위도' in col): + return 'geo_latitude' + if dtype == 'float' and ('lon' in lower or '경도' in col or 'lng' in lower): + return 'geo_longitude' + if dtype == 'string': + non_empty = [s for s in samples if s] + if non_empty: + date_hits = sum(1 for s in non_empty if _looks_like_date(s)) + if date_hits / len(non_empty) >= 0.7: + return 'date' + if unique_ratio <= 0.2: + return 'category' + if dtype == 'float': + return 'numeric' + return 'text' def _profile_csv_stream( @@ -79,17 +115,19 @@ def _profile_csv_stream( group_column: str | None = None, target_column: str | None = None, outlier_sample_cap: int = 20000, + value_sample_cap: int = 300, ) -> dict[str, Any]: - with path.open("r", encoding="utf-8-sig", newline="") as f: + with path.open('r', encoding='utf-8-sig', newline='') as f: reader = csv.DictReader(f) if reader.fieldnames is None: - raise ValueError(f"CSV header not found: {path}") + raise ValueError(f'CSV header not found: {path}') columns = [str(c) for c in reader.fieldnames] missing = {c: 0 for c in columns} non_missing = {c: 0 for c in columns} unique_sets: dict[str, set[str]] = {c: set() for c in columns} value_counts: dict[str, Counter[str]] = {c: Counter() for c in columns} + value_samples: dict[str, list[str]] = {c: [] for c in columns} numeric_positive = {c: 0 for c in columns} numeric_zero = {c: 0 for c in columns} @@ -99,7 +137,6 @@ def _profile_csv_stream( numeric_mins: dict[str, float] = {} numeric_maxs: dict[str, float] = {} text_seen = {c: False for c in columns} - numeric_outlier_samples: dict[str, list[float]] = {c: [] for c in columns} group_target_counter: dict[str, Counter[str]] = defaultdict(Counter) @@ -108,20 +145,20 @@ def _profile_csv_stream( for row in reader: row_count += 1 if group_column and target_column and group_column in columns and target_column in columns: - g = (row.get(group_column) or "").strip() - t = (row.get(target_column) or "").strip() + g = (row.get(group_column) or '').strip() + t = (row.get(target_column) or '').strip() if g and t: group_target_counter[g][t] += 1 for col in columns: - raw = (row.get(col) or "").strip() - if raw == "": + raw = (row.get(col) or '').strip() + if raw == '': missing[col] += 1 continue - non_missing[col] += 1 unique_sets[col].add(raw) value_counts[col][raw] += 1 + _reservoir_sample_str(value_samples[col], raw, non_missing[col], value_sample_cap) num = _to_float(raw) if num is None: @@ -142,12 +179,7 @@ def _profile_csv_stream( else: numeric_zero[col] += 1 - _reservoir_sample( - numeric_outlier_samples[col], - num, - numeric_counts[col], - outlier_sample_cap, - ) + _reservoir_sample(numeric_outlier_samples[col], num, numeric_counts[col], outlier_sample_cap) dtypes: dict[str, str] = {} numeric_stats: dict[str, dict[str, float]] = {} @@ -156,24 +188,20 @@ def _profile_csv_stream( for col in columns: count = numeric_counts[col] if count > 0 and not text_seen[col]: - dtypes[col] = "float" + dtypes[col] = 'float' numeric_stats[col] = { - "count": float(count), - "mean": float(numeric_sums[col] / count), - "min": float(numeric_mins[col]), - "max": float(numeric_maxs[col]), + 'count': float(count), + 'mean': float(numeric_sums[col] / count), + 'min': float(numeric_mins[col]), + 'max': float(numeric_maxs[col]), } else: - dtypes[col] = "string" + dtypes[col] = 'string' nn = non_missing[col] top = value_counts[col].most_common(5) top_values = [ - { - "value": v, - "count": cnt, - "ratio": round(cnt / row_count, 6) if row_count else 0.0, - } + {'value': v, 'count': cnt, 'ratio': round(cnt / row_count, 6) if row_count else 0.0} for v, cnt in top ] @@ -181,79 +209,114 @@ def _profile_csv_stream( numeric_distribution: dict[str, float] = {} if numeric_total: numeric_distribution = { - "positive_ratio": round(numeric_positive[col] / numeric_total, 6), - "zero_ratio": round(numeric_zero[col] / numeric_total, 6), - "negative_ratio": round(numeric_negative[col] / numeric_total, 6), - "outlier_ratio": _outlier_ratio(numeric_outlier_samples[col]), + 'positive_ratio': round(numeric_positive[col] / numeric_total, 6), + 'zero_ratio': round(numeric_zero[col] / numeric_total, 6), + 'negative_ratio': round(numeric_negative[col] / numeric_total, 6), + 'outlier_ratio': _outlier_ratio(numeric_outlier_samples[col]), } - dominant_value_ratio = top_values[0]["ratio"] if top_values else 0.0 + unique_ratio = round(len(unique_sets[col]) / nn, 6) if nn else 0.0 + dominant_value_ratio = top_values[0]['ratio'] if top_values else 0.0 profiles[col] = { - "missing_count": missing[col], - "missing_ratio": round(missing[col] / row_count, 6) if row_count else 0.0, - "non_missing_count": nn, - "unique_count": len(unique_sets[col]), - "unique_ratio": round(len(unique_sets[col]) / nn, 6) if nn else 0.0, - "dominant_value_ratio": dominant_value_ratio, - "top_values": top_values, - "numeric_distribution": numeric_distribution, - "dtype": dtypes[col], + 'missing_count': missing[col], + 'missing_ratio': round(missing[col] / row_count, 6) if row_count else 0.0, + 'non_missing_count': nn, + 'unique_count': len(unique_sets[col]), + 'unique_ratio': unique_ratio, + 'dominant_value_ratio': dominant_value_ratio, + 'top_values': top_values, + 'numeric_distribution': numeric_distribution, + 'dtype': dtypes[col], + 'semantic_type': _infer_semantic_type(col, dtypes[col], value_samples[col], unique_ratio), } summary = { - "row_count": row_count, - "column_count": len(columns), - "columns": columns, - "dtypes": dtypes, - "missing_counts": missing, - "numeric_stats": numeric_stats, + 'row_count': row_count, + 'column_count': len(columns), + 'columns': columns, + 'dtypes': dtypes, + 'missing_counts': missing, + 'numeric_stats': numeric_stats, } group_target_ratio: dict[str, Any] | None = None if group_column and target_column and group_column in columns and target_column in columns: - group_target_ratio = _finalize_group_ratio_table( - group_target_counter, - group_column, - target_column, - ) + group_target_ratio = _finalize_group_ratio_table(group_target_counter, group_column, target_column) - return { - "summary": summary, - "column_profiles": profiles, - "group_target_ratio": group_target_ratio, - } + return {'summary': summary, 'column_profiles': profiles, 'group_target_ratio': group_target_ratio} def _schema_drift(files: list[dict[str, Any]], shared_columns: list[str]) -> dict[str, Any]: drift: dict[str, Any] = {} for col in shared_columns: - dtypes = [f["column_profiles"][col]["dtype"] for f in files if col in f["column_profiles"]] - missing_ratios = [f["column_profiles"][col]["missing_ratio"] for f in files if col in f["column_profiles"]] - dominant_ratios = [f["column_profiles"][col]["dominant_value_ratio"] for f in files if col in f["column_profiles"]] + dtypes = [f['column_profiles'][col]['dtype'] for f in files if col in f['column_profiles']] + missing_ratios = [f['column_profiles'][col]['missing_ratio'] for f in files if col in f['column_profiles']] + dominant_ratios = [f['column_profiles'][col]['dominant_value_ratio'] for f in files if col in f['column_profiles']] means = [] for f in files: - stats = f["summary"]["numeric_stats"].get(col) + stats = f['summary']['numeric_stats'].get(col) if stats: - means.append(stats["mean"]) + means.append(stats['mean']) drift[col] = { - "dtype_changed": len(set(dtypes)) > 1, - "missing_ratio_range": round(max(missing_ratios) - min(missing_ratios), 6) if missing_ratios else 0.0, - "dominant_value_ratio_range": round(max(dominant_ratios) - min(dominant_ratios), 6) if dominant_ratios else 0.0, - "mean_range": round(max(means) - min(means), 6) if means else 0.0, + 'dtype_changed': len(set(dtypes)) > 1, + 'missing_ratio_range': round(max(missing_ratios) - min(missing_ratios), 6) if missing_ratios else 0.0, + 'dominant_value_ratio_range': round(max(dominant_ratios) - min(dominant_ratios), 6) if dominant_ratios else 0.0, + 'mean_range': round(max(means) - min(means), 6) if means else 0.0, } return drift +def _cache_key(path: Path, group_column: str | None, target_column: str | None) -> str: + st = path.stat() + raw = f"{path.resolve()}|{st.st_size}|{st.st_mtime_ns}|{group_column}|{target_column}" + return hashlib.sha256(raw.encode('utf-8')).hexdigest() + + +def _load_cached_profile(path: Path, group_column: str | None, target_column: str | None) -> dict[str, Any] | None: + CACHE_DIR.mkdir(exist_ok=True) + cp = CACHE_DIR / f"{_cache_key(path, group_column, target_column)}.json" + if not cp.exists(): + return None + try: + return json.loads(cp.read_text(encoding='utf-8')) + except Exception: + return None + + +def _save_cached_profile(path: Path, group_column: str | None, target_column: str | None, data: dict[str, Any]) -> None: + CACHE_DIR.mkdir(exist_ok=True) + cp = CACHE_DIR / f"{_cache_key(path, group_column, target_column)}.json" + cp.write_text(json.dumps(data, ensure_ascii=False), encoding='utf-8') + + +def _generate_insights(files: list[dict[str, Any]], schema_drift: dict[str, Any]) -> list[str]: + insights: list[str] = [] + for f in files: + for col, prof in f['column_profiles'].items(): + if prof['missing_ratio'] >= 0.2: + insights.append(f"{f['path']}:{col} 결측비율이 높음({prof['missing_ratio']:.2%})") + out_ratio = prof['numeric_distribution'].get('outlier_ratio', 0.0) + if out_ratio >= 0.1: + insights.append(f"{f['path']}:{col} 이상치 비율이 높음({out_ratio:.2%})") + for col, drift in schema_drift.items(): + if drift['dtype_changed']: + insights.append(f"공통 컬럼 {col}의 타입이 파일 간 다르게 탐지됨") + if drift['mean_range'] > 0: + insights.append(f"공통 컬럼 {col}의 평균 범위 변화: {drift['mean_range']:.4f}") + return insights[:30] + + def analyze_multiple_csv( csv_paths: list[Path], question: str, group_column: str | None = None, target_column: str | None = None, + use_cache: bool = True, ) -> dict[str, Any]: if not csv_paths: - raise ValueError("at least one CSV path is required") + raise ValueError('at least one CSV path is required') files: list[dict[str, Any]] = [] all_columns: list[set[str]] = [] @@ -261,49 +324,46 @@ def analyze_multiple_csv( for path in csv_paths: if not path.exists(): - raise FileNotFoundError(f"CSV file not found: {path}") + raise FileNotFoundError(f'CSV file not found: {path}') - profiled = _profile_csv_stream( - path, - group_column=group_column, - target_column=target_column, - ) - total_rows += profiled["summary"]["row_count"] - all_columns.append(set(profiled["summary"]["columns"])) + profiled = _load_cached_profile(path, group_column, target_column) if use_cache else None + if profiled is None: + profiled = _profile_csv_stream(path, group_column=group_column, target_column=target_column) + if use_cache: + _save_cached_profile(path, group_column, target_column, profiled) + total_rows += profiled['summary']['row_count'] + all_columns.append(set(profiled['summary']['columns'])) files.append( { - "path": str(path), - "question": question, - "summary": profiled["summary"], - "column_profiles": profiled["column_profiles"], - "group_target_ratio": profiled["group_target_ratio"], + 'path': str(path), + 'question': question, + 'summary': profiled['summary'], + 'column_profiles': profiled['column_profiles'], + 'group_target_ratio': profiled['group_target_ratio'], } ) shared_columns = sorted(set.intersection(*all_columns)) if all_columns else [] union_columns = sorted(set.union(*all_columns)) if all_columns else [] + schema_drift = _schema_drift(files, shared_columns) return { - "question": question, - "file_count": len(files), - "total_row_count": total_rows, - "shared_columns": shared_columns, - "union_columns": union_columns, - "files": files, - "schema_drift": _schema_drift(files, shared_columns), - "code_guidance": build_code_guidance(shared_columns, group_column, target_column), + 'question': question, + 'file_count': len(files), + 'total_row_count': total_rows, + 'shared_columns': shared_columns, + 'union_columns': union_columns, + 'files': files, + 'schema_drift': schema_drift, + 'insights': _generate_insights(files, schema_drift), + 'code_guidance': build_code_guidance(shared_columns, group_column, target_column), } -def build_code_guidance( - shared_columns: list[str], - group_column: str | None = None, - target_column: str | None = None, -) -> dict[str, str]: - join_key = shared_columns[0] if shared_columns else "공통키컬럼" - - group_block = "" +def build_code_guidance(shared_columns: list[str], group_column: str | None = None, target_column: str | None = None) -> dict[str, str]: + join_key = shared_columns[0] if shared_columns else '공통키컬럼' + group_block = '' if group_column and target_column: group_block = ( f"ratio_tbl = (merged.groupby('{group_column}')['{target_column}'].value_counts(normalize=True)" @@ -333,80 +393,84 @@ def build_code_guidance( ) return { - "recommended_steps": ( - "1) 공통 키 컬럼 확인 후 병합\n" - "2) 컬럼별 결측/고유값/상위값 비율 확인\n" - "3) 수치형 컬럼 비율(양수/0/음수), 이상치 비율, 분포 확인\n" - "4) 그룹 컬럼 기준 타깃 비율 분석(예: 시도명-세차유형)\n" - "5) 파일 간 스키마 변화/평균 변화 범위 확인" + 'recommended_steps': ( + '1) 공통 키 컬럼 확인 후 병합\n' + '2) 컬럼별 결측/고유값/상위값 비율 확인\n' + '3) 수치형 컬럼 비율(양수/0/음수), 이상치 비율, 분포 확인\n' + '4) 그룹 컬럼 기준 타깃 비율 분석(예: 시도명-세차유형)\n' + '5) 파일 간 스키마 변화/평균 변화 범위 확인' ), - "pandas_example": pandas_code, + 'pandas_example': pandas_code, } def build_multi_csv_markdown(result: dict[str, Any]) -> str: lines = [ - "# 다중 CSV 분석 리포트", - "", + '# 다중 CSV 분석 리포트', + '', f"- 질문: {result['question']}", f"- 파일 수: {result['file_count']}", f"- 전체 행 수: {result['total_row_count']}", f"- 공통 컬럼: {', '.join(result['shared_columns']) if result['shared_columns'] else '(없음)'}", - "", + '', ] - for file_info in result["files"]: + if result.get('insights'): + lines.extend(['## 핵심 인사이트', '']) + for it in result['insights'][:10]: + lines.append(f"- {it}") + lines.append('') + + for file_info in result['files']: lines.extend( [ f"## 파일: {file_info['path']}", - "", + '', f"- 행 수: {file_info['summary']['row_count']}", f"- 열 수: {file_info['summary']['column_count']}", - "", - "| 컬럼 | 타입 | 결측비율 | 고유비율 | 대표값비율 |", - "|---|---|---:|---:|---:|", + '', + '| 컬럼 | 타입 | 의미타입 | 결측비율 | 고유비율 | 대표값비율 |', + '|---|---|---|---:|---:|---:|', ] ) - for col in file_info["summary"]["columns"]: - prof = file_info["column_profiles"][col] + for col in file_info['summary']['columns']: + prof = file_info['column_profiles'][col] lines.append( - f"| {col} | {prof['dtype']} | {prof['missing_ratio']:.4f} | {prof['unique_ratio']:.4f} | {prof['dominant_value_ratio']:.4f} |" + f"| {col} | {prof['dtype']} | {prof.get('semantic_type','')} | {prof['missing_ratio']:.4f} | {prof['unique_ratio']:.4f} | {prof['dominant_value_ratio']:.4f} |" ) - if file_info.get("group_target_ratio"): - gtr = file_info["group_target_ratio"] - lines.extend(["", f"- 그룹비율: {gtr['group_column']} x {gtr['target_column']}"]) - lines.append("") + if file_info.get('group_target_ratio'): + gtr = file_info['group_target_ratio'] + lines.extend(['', f"- 그룹비율: {gtr['group_column']} x {gtr['target_column']}"]) + lines.append('') - lines.extend(["## 파일 간 스키마/분포 변화", "", "| 컬럼 | 타입변화 | 결측비율범위 | 대표값비율범위 | 평균범위 |", "|---|---|---:|---:|---:|"]) - for col, drift in result["schema_drift"].items(): + lines.extend(['## 파일 간 스키마/분포 변화', '', '| 컬럼 | 타입변화 | 결측비율범위 | 대표값비율범위 | 평균범위 |', '|---|---|---:|---:|---:|']) + for col, drift in result['schema_drift'].items(): lines.append( f"| {col} | {drift['dtype_changed']} | {drift['missing_ratio_range']:.4f} | {drift['dominant_value_ratio_range']:.4f} | {drift['mean_range']:.4f} |" ) - charts = result.get("charts") + charts = result.get('charts') if charts: - lines.extend(["", "## 생성된 차트 파일", ""]) + lines.extend(['', '## 생성된 차트 파일', '']) for file_path, chart_paths in charts.items(): lines.append(f"- {file_path}") for c in chart_paths: lines.append(f" - {c}") - lines.extend( - [ - "", - "## 코드 가이드", - "", - "```text", - result["code_guidance"]["recommended_steps"], - "```", - "", - "```python", - result["code_guidance"]["pandas_example"], - "```", - ] - ) - - return "\n".join(lines) + lines.extend([ + '', + '## 코드 가이드', + '', + '```text', + result['code_guidance']['recommended_steps'], + '```', + '', + '```python', + result['code_guidance']['pandas_example'], + '```', + ]) + + return '\n'.join(lines) def result_to_json(result: dict[str, Any]) -> str: diff --git a/bitnet_tools/ui/app.js b/bitnet_tools/ui/app.js index 9312dd4..519f8a3 100644 --- a/bitnet_tools/ui/app.js +++ b/bitnet_tools/ui/app.js @@ -8,6 +8,10 @@ const summary = document.getElementById('summary'); const prompt = document.getElementById('prompt'); const answer = document.getElementById('answer'); +const dashboardJson = document.getElementById('dashboardJson'); +const dashboardCards = document.getElementById('dashboardCards'); +const dashboardInsights = document.getElementById('dashboardInsights'); + let latestPrompt = ''; csvFile.addEventListener('change', async (e) => { @@ -68,3 +72,35 @@ runBtn.addEventListener('click', async () => { const data = await res.json(); answer.textContent = res.ok ? data.answer : (data.error || 'error'); }); + +document.getElementById('renderDashboardBtn').addEventListener('click', () => { + dashboardCards.innerHTML = ''; + dashboardInsights.textContent = ''; + + let parsed; + try { + parsed = JSON.parse(dashboardJson.value || '{}'); + } catch { + dashboardInsights.textContent = 'JSON 형식이 올바르지 않습니다.'; + return; + } + + const cardItems = [ + ['파일 수', parsed.file_count ?? '-'], + ['총 행 수', parsed.total_row_count ?? '-'], + ['공통 컬럼 수', (parsed.shared_columns || []).length], + ['인사이트 수', (parsed.insights || []).length], + ]; + + cardItems.forEach(([k, v]) => { + const div = document.createElement('div'); + div.className = 'card'; + div.innerHTML = `${k}${v}`; + dashboardCards.appendChild(div); + }); + + const insights = parsed.insights || []; + dashboardInsights.textContent = insights.length + ? insights.map((x, i) => `${i + 1}. ${x}`).join('\n') + : '인사이트 항목이 없습니다.'; +}); diff --git a/bitnet_tools/ui/index.html b/bitnet_tools/ui/index.html index bb7d18b..cf5224d 100644 --- a/bitnet_tools/ui/index.html +++ b/bitnet_tools/ui/index.html @@ -9,7 +9,7 @@

BitNet CSV Analyzer

-

CSV 업로드 → 자동 요약 → BitNet 답변까지 한 번에.

+

CSV 업로드 → 자동 요약 → BitNet 답변 + 멀티 분석 대시보드.

@@ -53,6 +53,17 @@

생성 프롬프트

BitNet 응답


       
+ +
+

멀티 분석 대시보드(JSON)

+

`multi-analyze` 결과 JSON을 붙여넣고 시각적으로 확인하세요.

+ +
+ +
+
+

+      
diff --git a/bitnet_tools/ui/styles.css b/bitnet_tools/ui/styles.css index 0ee1949..8585750 100644 --- a/bitnet_tools/ui/styles.css +++ b/bitnet_tools/ui/styles.css @@ -56,3 +56,20 @@ pre { max-height: 320px; overflow: auto; } +.cards { + margin-top: 12px; + display: grid; + grid-template-columns: repeat(auto-fit, minmax(140px, 1fr)); + gap: 8px; +} +.card { + background: #0b1220; + border: 1px solid #334155; + border-radius: 8px; + padding: 10px; + display: flex; + flex-direction: column; + gap: 6px; +} +.card strong { color: var(--muted); font-size: 12px; } +.card span { font-size: 18px; font-weight: 700; } diff --git a/tests/test_analysis.py b/tests/test_analysis.py index 5fa0806..a3a4517 100644 --- a/tests/test_analysis.py +++ b/tests/test_analysis.py @@ -102,3 +102,27 @@ def test_multi_csv_large_row_count(tmp_path): assert result["total_row_count"] == 5000 assert result["files"][0]["summary"]["row_count"] == 5000 + + +def test_multi_csv_semantic_type_and_insights(tmp_path): + p = tmp_path / "typed.csv" + p.write_text("dt,lat,val,cat\n2024-01-01,37.5,1,A\n2024-01-02,37.6,1000,A\n", encoding="utf-8") + + result = analyze_multiple_csv([p], "의미타입") + prof = result["files"][0]["column_profiles"] + + assert prof["dt"]["semantic_type"] == "date" + assert prof["lat"]["semantic_type"] in {"geo_latitude", "numeric"} + assert isinstance(result.get("insights"), list) + + +def test_multi_csv_cache_created(tmp_path, monkeypatch): + import bitnet_tools.multi_csv as multi + + monkeypatch.setattr(multi, "CACHE_DIR", tmp_path / ".cache") + p = tmp_path / "cache.csv" + p.write_text("a,b\n1,2\n", encoding="utf-8") + + result = multi.analyze_multiple_csv([p], "캐시") + assert result["file_count"] == 1 + assert any((tmp_path / ".cache").glob("*.json"))