Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
182 changes: 182 additions & 0 deletions bitnet_tools/explain.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
from __future__ import annotations

import csv
import re
from datetime import datetime
from pathlib import Path
from typing import Any

_DATE_FORMATS = ["%Y-%m-%d", "%Y/%m/%d", "%Y.%m.%d", "%Y-%m-%d %H:%M:%S"]


def _parse_date(value: str) -> datetime | None:
    """Parse *value* against the known date formats; None when nothing matches."""
    candidate = str(value or '').strip()
    if not candidate:
        return None
    for pattern in _DATE_FORMATS:
        try:
            parsed = datetime.strptime(candidate, pattern)
        except ValueError:
            continue
        return parsed
    return None


def _extract_unit(value: str) -> str | None:
text = str(value or '').strip()
if not text or not any(ch.isdigit() for ch in text):
return None
m = re.search(r'([A-Za-z가-힣/%]+)$', text)
if not m:
return None
unit = m.group(1)
if len(unit) > 8:
return None
return unit.lower()


def _rule_missing_concentration(path: str, profiles: dict[str, dict[str, Any]]) -> dict[str, Any] | None:
targets = [(col, p.get('missing_ratio', 0.0)) for col, p in profiles.items() if p.get('missing_ratio', 0.0) >= 0.2]
if not targets:
return None
col, ratio = max(targets, key=lambda x: x[1])
score = round(min(100.0, ratio * 100.0), 2)
return {
'rule': '결측 집중',
'score': score,
'reason': f"{Path(path).name}:{col} 결측이 집중됨 (결측비율 {ratio:.1%})",
'evidence': {'column': col, 'missing_ratio': ratio},
}


def _rule_category_bias(path: str, profiles: dict[str, dict[str, Any]]) -> dict[str, Any] | None:
best: tuple[str, dict[str, Any]] | None = None
for col, p in profiles.items():
if p.get('dtype') != 'string' and p.get('semantic_type') != 'category':
continue
dom = float(p.get('dominant_value_ratio', 0.0))
if dom < 0.65:
continue
if best is None or dom > float(best[1].get('dominant_value_ratio', 0.0)):
best = (col, p)
if best is None:
return None
col, p = best
top_values = p.get('top_values') or []
top_val = top_values[0]['value'] if top_values else '(unknown)'
dom = float(p.get('dominant_value_ratio', 0.0))
score = round(min(100.0, dom * 100.0), 2)
return {
'rule': '특정 카테고리 편중',
'score': score,
'reason': f"{Path(path).name}:{col} 값 '{top_val}' 편중 ({dom:.1%})",
'evidence': {'column': col, 'dominant_value_ratio': dom, 'top_value': top_val},
}


def _rule_unit_mismatch(path: str, profiles: dict[str, dict[str, Any]]) -> dict[str, Any] | None:
    """Flag the column whose sampled top values mix two or more unit suffixes."""
    winner: dict[str, Any] | None = None

    for name, profile in profiles.items():
        # Accumulate the sampled ratio covered by each distinct unit suffix.
        unit_coverage: dict[str, float] = {}
        for entry in (profile.get('top_values') or []):
            unit = _extract_unit(entry.get('value', ''))
            if unit:
                unit_coverage[unit] = unit_coverage.get(unit, 0.0) + float(entry.get('ratio', 0.0))

        if len(unit_coverage) < 2:
            continue
        total = sum(unit_coverage.values())
        if total < 0.2:
            continue

        score = round(min(100.0, (len(unit_coverage) - 1) * 18 + total * 50), 2)
        candidate = {
            'rule': '단위 불일치',
            'score': score,
            'reason': f"{Path(path).name}:{name} 다중 단위 혼재 ({', '.join(sorted(unit_coverage.keys()))})",
            'evidence': {'column': name, 'units': sorted(unit_coverage.keys()), 'coverage': round(total, 4)},
        }
        if winner is None or candidate['score'] > winner['score']:
            winner = candidate

    return winner


def _rule_recent_change(path: str, csv_path: Path, profiles: dict[str, dict[str, Any]]) -> dict[str, Any] | None:
    """Flag a numeric column whose recent-window mean shifted >= 50% vs. the prior window.

    The CSV is consumed in a single streamed pass: instead of materializing every
    raw row with ``list(csv.DictReader(f))`` (O(file) extra memory on top of
    profiling), only the parsed ``(datetime, {column: value})`` pairs per date
    column are retained, keeping the analysis usable on large files.

    Returns the best-scoring candidate dict (rule/score/reason/evidence), or
    None when no date column, no float column, or no sufficient shift exists.
    """
    date_cols = [col for col, p in profiles.items() if p.get('semantic_type') == 'date']
    num_cols = [col for col, p in profiles.items() if p.get('dtype') == 'float']
    if not date_cols or not num_cols:
        return None

    # One streamed pass over the file: parse each row's numerics once, then
    # attach the values to every date column whose cell parses as a date.
    series_by_date_col: dict[str, list[tuple[datetime, dict[str, float]]]] = {
        col: [] for col in date_cols
    }
    with csv_path.open('r', encoding='utf-8-sig', newline='') as f:
        for row in csv.DictReader(f):
            values: dict[str, float] = {}
            for ncol in num_cols:
                # NOTE(review): plain float() after comma-stripping; profiling
                # reportedly normalizes numerics via _to_float (currency/percent/
                # parenthesized forms) — confirm and align to avoid silently
                # skipping cells the profiler already counted as numeric.
                raw = str(row.get(ncol, '')).strip().replace(',', '')
                if not raw:
                    continue
                try:
                    values[ncol] = float(raw)
                except ValueError:
                    continue
            if not values:
                continue
            for date_col in date_cols:
                dt = _parse_date(row.get(date_col, ''))
                if dt is not None:
                    # values is shared read-only across date columns.
                    series_by_date_col[date_col].append((dt, values))

    best: dict[str, Any] | None = None
    for date_col in date_cols:
        series = series_by_date_col[date_col]
        if len(series) < 6:
            continue

        series.sort(key=lambda x: x[0])
        # Compare the last fifth of the series (min 3 points) against the
        # equally-sized window just before it.
        window = max(3, len(series) // 5)
        prev = series[-2 * window:-window]
        recent = series[-window:]
        if not prev or not recent:
            continue

        for ncol in num_cols:
            prev_vals = [v[ncol] for _, v in prev if ncol in v]
            recent_vals = [v[ncol] for _, v in recent if ncol in v]
            if len(prev_vals) < 2 or len(recent_vals) < 2:
                continue
            prev_mean = sum(prev_vals) / len(prev_vals)
            recent_mean = sum(recent_vals) / len(recent_vals)
            # Guard against division by a (near-)zero previous mean.
            baseline = max(abs(prev_mean), 1e-9)
            change_ratio = abs(recent_mean - prev_mean) / baseline
            if change_ratio < 0.5:
                continue
            score = round(min(100.0, change_ratio * 100.0), 2)
            candidate = {
                'rule': '최근 급변',
                'score': score,
                'reason': f"{Path(path).name}:{ncol} 최근 평균 급변 ({change_ratio:.1%})",
                'evidence': {
                    'date_column': date_col,
                    'column': ncol,
                    'prev_mean': round(prev_mean, 4),
                    'recent_mean': round(recent_mean, 4),
                    'change_ratio': round(change_ratio, 4),
                },
            }
            if best is None or candidate['score'] > best['score']:
                best = candidate
    return best


def generate_reason_candidates(path: str, csv_path: Path, profiles: dict[str, dict[str, Any]], top_k: int = 3) -> list[dict[str, Any]]:
    """Run every heuristic rule and return the top_k highest-scoring candidates.

    Candidates are ranked by (score, rule name) descending; rules that found
    nothing return None and are dropped.
    """
    rule_outputs = (
        _rule_missing_concentration(path, profiles),
        _rule_category_bias(path, profiles),
        _rule_unit_mismatch(path, profiles),
        _rule_recent_change(path, csv_path, profiles),
    )
    ranked = sorted(
        (c for c in rule_outputs if c is not None),
        key=lambda c: (c.get('score', 0.0), c.get('rule', '')),
        reverse=True,
    )
    return ranked[:top_k]
15 changes: 15 additions & 0 deletions bitnet_tools/multi_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from typing import Any

from .analysis import _to_float
from .explain import generate_reason_candidates

CACHE_DIR = Path('.bitnet_cache')
UNIQUE_BITMAP_SIZE = 65536
Expand Down Expand Up @@ -340,6 +341,9 @@ def _generate_insights(files: list[dict[str, Any]], schema_drift: dict[str, Any]
insights.append(f"공통 컬럼 {col}의 타입이 파일 간 다르게 탐지됨")
if drift['mean_range'] > 0:
insights.append(f"공통 컬럼 {col}의 평균 범위 변화: {drift['mean_range']:.4f}")
for f in files:
for reason in f.get('reason_candidates', [])[:3]:
insights.append(f"{f['path']} 이유후보[{reason['rule']}] {reason['reason']}")
return insights[:30]


Expand Down Expand Up @@ -402,12 +406,22 @@ def analyze_multiple_csv(
'summary': profiled['summary'],
'column_profiles': profiled['column_profiles'],
'group_target_ratio': profiled['group_target_ratio'],
'reason_candidates': generate_reason_candidates(
str(path),
path,
profiled['column_profiles'],
),
}
)

shared_columns = sorted(set.intersection(*all_columns)) if all_columns else []
union_columns = sorted(set.union(*all_columns)) if all_columns else []
schema_drift = _schema_drift(files, shared_columns)
all_reason_candidates: list[dict[str, Any]] = []
for f in files:
for reason in f.get('reason_candidates', []):
all_reason_candidates.append({'file': f['path'], **reason})
all_reason_candidates.sort(key=lambda x: x.get('score', 0.0), reverse=True)

return {
'question': question,
Expand All @@ -418,6 +432,7 @@ def analyze_multiple_csv(
'files': files,
'schema_drift': schema_drift,
'insights': _generate_insights(files, schema_drift),
'reason_candidates': all_reason_candidates[:3],
'code_guidance': build_code_guidance(shared_columns, group_column, target_column),
}

Expand Down
8 changes: 8 additions & 0 deletions bitnet_tools/ui/app.js
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ const UI = {
filterColumn: document.getElementById('filterColumn'),
filterType: document.getElementById('filterType'),
insightList: document.getElementById('insightList'),
reasonCandidates: document.getElementById('reasonCandidates'),
insightDrilldown: document.getElementById('insightDrilldown'),
geoLatCol: document.getElementById('geoLatCol'),
geoLonCol: document.getElementById('geoLonCol'),
Expand Down Expand Up @@ -661,6 +662,13 @@ function renderDashboard(data) {
? insights.map((x, i) => `${i + 1}. ${x}`).join('\n')
: '인사이트 항목이 없습니다.';

const reasons = Array.isArray(data.reason_candidates) ? data.reason_candidates : [];
if (UI.reasonCandidates) {
UI.reasonCandidates.textContent = reasons.length
? reasons.map((x, i) => `${i + 1}. [${x.rule}] score=${x.score}\n- 파일: ${x.file}\n- 근거: ${x.reason}`).join('\n\n')
: '이유 후보가 없습니다.';
}

appState.structuredInsights = buildStructuredInsights(data);
renderFilters();
renderInsightList();
Expand Down
3 changes: 3 additions & 0 deletions bitnet_tools/ui/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,9 @@ <h3>인사이트 필터</h3>
<h3>인사이트 리스트</h3>
<div id="insightList" class="insight-list"></div>

<h3>이유 후보 보기</h3>
<pre id="reasonCandidates">이유 후보가 없습니다.</pre>

<h3>드릴다운(근거 데이터)</h3>
<pre id="insightDrilldown">인사이트를 선택하면 근거 데이터가 표시됩니다.</pre>

Expand Down
38 changes: 38 additions & 0 deletions tests/test_explain.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
from bitnet_tools.multi_csv import analyze_multiple_csv


def test_reason_candidates_score_sort_and_reason_text(tmp_path):
    """End-to-end: candidates exist, are sorted by score, and carry rule text."""
    rows = [
        'dt,cat,amount,desc',
        '2024-01-01,A,100,10kg',
        '2024-01-02,A,110,12kg',
        '2024-01-03,A,120,1lb',
        '2024-01-04,A,,3kg',
        '2024-01-05,B,105,2lb',
        '2024-01-06,A,,5kg',
        '2024-01-07,A,1000,2lb',
        '2024-01-08,A,1200,6kg',
        '2024-01-09,A,1300,7kg',
        '2024-01-10,A,1400,8kg',
    ]
    csv_file = tmp_path / 'reason_case.csv'
    csv_file.write_text('\n'.join(rows) + '\n', encoding='utf-8')

    reasons = analyze_multiple_csv([csv_file], '이상치 이유를 보여줘')['reason_candidates']

    assert 1 <= len(reasons) <= 3
    scores = [item['score'] for item in reasons]
    assert scores == sorted(scores, reverse=True)

    for item in reasons:
        assert item['score'] > 0
        assert item['rule']
        assert item['reason']

    combined = ' '.join(item['reason'] for item in reasons)
    assert any(keyword in combined for keyword in ['결측', '편중', '단위', '급변'])