diff --git a/engine/crates/lex-core/src/converter/explain.rs b/engine/crates/lex-core/src/converter/explain.rs index db3bb1d..2d520e9 100644 --- a/engine/crates/lex-core/src/converter/explain.rs +++ b/engine/crates/lex-core/src/converter/explain.rs @@ -12,8 +12,14 @@ use super::cost::{conn_cost, script_cost, DefaultCostFunction}; use super::features::{is_single_char_kanji_penalised, is_te_form_kanji_penalised}; use super::lattice::{build_lattice, Lattice}; use super::postprocess::{postprocess_observed, PostprocessContext, PostprocessObserver}; +use super::reranker::compute_history_boost; use super::viterbi::{viterbi_nbest, ScoredPath}; +// Re-export so downstream crates (e.g. lex-cli) can name the type behind +// `ExplainPath::history_breakdown` — the definition lives in the crate-private +// `reranker` module. +pub use super::reranker::HistoryBoostBreakdown; + /// Full diagnostic result for a single reading. #[derive(Debug, Serialize)] pub struct ExplainResult { @@ -57,8 +63,14 @@ pub struct ExplainPath { pub viterbi_cost: i64, /// Cost delta from structure reranking. pub rerank_delta: i64, - /// Total history boost applied (negative = better). + /// Per-component history boost (raw sums + whole-path × 5). + pub history_breakdown: HistoryBoostBreakdown, + /// History boost actually subtracted from the cost (post-normalization; positive = lower cost). pub history_boost: i64, + /// Segment count `history_rerank_at` used as the normalization denominator. + /// May differ from `segments.len()` when `group_segments` later merged + /// adjacent segments — keep this value when reporting `/N segs`. + pub history_segment_count: usize, /// Final cost after all adjustments. pub final_cost: i64, } @@ -91,38 +103,84 @@ pub struct ExplainSegment { // Observer that captures cost snapshots for explain diagnostics // --------------------------------------------------------------------------- -/// Build a key string from a ScoredPath for cost tracking across pipeline stages. 
-/// Uses ASCII control characters (US=\x1f, RS=\x1e) as delimiters to avoid -/// collisions with any reading/surface content. -fn path_key(path: &ScoredPath) -> String { - path.segments - .iter() - .map(|s| format!("{}\x1f{}", s.reading, s.surface)) - .collect::>() - .join("\x1e") +/// Snapshot of one path's state at the post-rerank / pre-history-rerank stage. +/// +/// Captured here (and not after history_rerank) so the recorded breakdown is +/// computed on the same segments that history_rerank actually scored — the +/// pipeline later runs rewriters that add new candidates and `group_segments` +/// that merges adjacent segments, both of which would invalidate a recompute +/// against the final path. +#[derive(Default, Clone, Copy)] +struct PreHistorySnapshot { + /// Cost after resegment + rerank, before any history adjustment. + cost: i64, + /// Per-component history boost (raw sums + whole-path × 5). + breakdown: HistoryBoostBreakdown, + /// Boost actually subtracted from `cost` by `history_rerank_at`. + applied_boost: i64, + /// Segment count at the moment `history_rerank_at` saw the path. May differ + /// from the final `segments.len()` after `group_segments` merges adjacent + /// segments — kept here so the displayed `/N segs` matches the denominator + /// actually used during normalization. + segment_count: usize, } -#[derive(Default)] -struct ExplainObserver { +/// Diagnostic observer. +/// +/// Keys are `ScoredPath::surface_key()` — i.e. the concatenated surface — so +/// that lookups survive `group_segments` (which merges adjacent segments but +/// preserves the overall surface). Paths that only appear after history_rerank +/// (rewriter-added candidates: numeric, katakana, kanji variants) are absent +/// from these maps and fall back to zero in the caller. +struct ExplainObserver<'a> { + history: Option<&'a UserHistory>, + now: u64, /// viterbi_cost before resegment/rerank — the raw Viterbi output. 
original_costs: HashMap, - /// viterbi_cost after resegment + rerank (before history_rerank). - post_rerank_costs: HashMap, + /// State at the post-rerank / pre-history-rerank boundary. + pre_history: HashMap, +} + +impl<'a> ExplainObserver<'a> { + fn new(history: Option<&'a UserHistory>, now: u64) -> Self { + Self { + history, + now, + original_costs: HashMap::new(), + pre_history: HashMap::new(), + } + } } -impl PostprocessObserver for ExplainObserver { +impl PostprocessObserver for ExplainObserver<'_> { fn after_viterbi(&mut self, paths: &[ScoredPath]) { self.original_costs = paths .iter() - .map(|p| (path_key(p), p.viterbi_cost)) + .map(|p| (p.surface_key(), p.viterbi_cost)) .collect(); } fn after_rerank(&mut self, paths: &[ScoredPath]) { - self.post_rerank_costs = paths - .iter() - .map(|p| (path_key(p), p.viterbi_cost)) - .collect(); + self.pre_history.clear(); + for p in paths { + let (breakdown, applied) = match self.history { + Some(h) => { + let b = compute_history_boost(p, h, self.now); + let a = b.applied(p.segments.len()); + (b, a) + } + None => (HistoryBoostBreakdown::default(), 0), + }; + self.pre_history.insert( + p.surface_key(), + PreHistorySnapshot { + cost: p.viterbi_cost, + breakdown, + applied_boost: applied, + segment_count: p.segments.len(), + }, + ); + } } } @@ -220,7 +278,8 @@ pub fn explain( let oversample = (n * 3).max(50); let mut raw_paths = viterbi_nbest(&lattice, &cost_fn, oversample); - let mut observer = ExplainObserver::default(); + let now = crate::user_history::now_epoch(); + let mut observer = ExplainObserver::new(history, now); let ctx = PostprocessContext { lattice: &lattice, conn, @@ -228,30 +287,40 @@ pub fn explain( history, kana, n, + now, }; let final_paths = postprocess_observed(&mut raw_paths, &ctx, &mut observer); let paths: Vec = final_paths .iter() .map(|scored| { - let key = path_key(scored); + let key = scored.surface_key(); + // Look up snapshots by surface_key — preserved through group_segments. 
+ // Rewriter-added candidates (numeric / katakana / kanji variants) are + // synthesised after history_rerank and have no snapshot, so they fall + // back to zero history boost and use the final cost for `viterbi_cost`. let original = observer .original_costs .get(&key) .copied() .unwrap_or(scored.viterbi_cost); - let post_rerank = observer - .post_rerank_costs + let snapshot = observer + .pre_history .get(&key) .copied() - .unwrap_or(original); - let rerank_delta = post_rerank - original; - let history_boost = post_rerank - scored.viterbi_cost; + .unwrap_or(PreHistorySnapshot { + cost: original, + breakdown: HistoryBoostBreakdown::default(), + applied_boost: 0, + segment_count: scored.segments.len(), + }); ExplainPath { segments: explain_segments(scored, conn, dict), viterbi_cost: original, - rerank_delta, - history_boost, + rerank_delta: snapshot.cost - original, + history_breakdown: snapshot.breakdown, + history_boost: snapshot.applied_boost, + history_segment_count: snapshot.segment_count, final_cost: scored.viterbi_cost, } }) @@ -364,6 +433,13 @@ pub fn format_text(result: &ExplainResult) -> String { " viterbi={:<8} rerank={:<+8} history={:<+8} -> final={}\n", path.viterbi_cost, path.rerank_delta, -path.history_boost, path.final_cost, )); + let hb = &path.history_breakdown; + if hb.unigram_sum != 0 || hb.bigram_sum != 0 || hb.whole_path_boost != 0 { + out.push_str(&format!( + " history: uni_sum={:<+7} bi_sum={:<+7} whole×5={:<+7} (/{} segs)\n", + -hb.unigram_sum, -hb.bigram_sum, -hb.whole_path_boost, path.history_segment_count, + )); + } } out @@ -419,6 +495,86 @@ mod tests { wh.final_cost < w.final_cost, "final cost should be lower with history boost" ); + // Single-segment きょう→京: the whole-path boost must fire. The + // per-segment unigram is also recorded (same reading+surface), so + // unigram_sum is nonzero too; with no adjacent pair, bigram_sum is 0. 
+ assert!( + wh.history_breakdown.whole_path_boost > 0, + "whole-path boost should fire for explicit full-input selection" + ); + assert_eq!( + wh.history_breakdown.bigram_sum, 0, + "single-segment path has no bigram pairs" + ); + } + } + + #[test] + fn test_explain_history_breakdown_empty_without_history() { + let dict = test_dict(); + let result = explain(&dict, None, None, "きょう", 5); + for path in &result.paths { + assert_eq!(path.history_boost, 0); + assert_eq!(path.history_breakdown.unigram_sum, 0); + assert_eq!(path.history_breakdown.bigram_sum, 0); + assert_eq!(path.history_breakdown.whole_path_boost, 0); + } + } + + #[test] + fn test_explain_history_segment_count_consistent_with_boost() { + // The reported `history_segment_count` is the denominator that + // `history_rerank` used at normalization time. Without `group_segments` + // (no conn passed here) the pre-history and final segmentation match, + // so the field must equal `segments.len()` AND + // `history_breakdown.applied(history_segment_count)` must reproduce + // the displayed `history_boost`. Regression for PR #247 R2. + let dict = test_dict(); + let mut h = UserHistory::new(); + h.record(&[("きょう".into(), "京".into())]); + + let result = explain(&dict, None, Some(&h), "きょう", 5); + for path in &result.paths { + assert_eq!( + path.history_segment_count, + path.segments.len(), + "without grouping, history_segment_count should equal segments.len()", + ); + assert_eq!( + path.history_boost, + path.history_breakdown.applied(path.history_segment_count), + "history_boost must equal applied(history_segment_count)", + ); + } + } + + #[test] + fn test_explain_unrelated_paths_have_zero_history_boost() { + // Paths whose surface does NOT match the recorded history must show a + // zero breakdown regardless of how they entered the final candidate set: + // - Real Viterbi paths that simply don't match (lookup hit, zero score). 
+ // - Rewriter-added paths (katakana / kanji variants) that were + // synthesised after history_rerank, so the observer never saw them. + // + // Regression for the PR #247 R1 review: previously the breakdown was + // recomputed against the final (post-grouping / post-rewriter) path, + // which could produce non-zero values for paths that never received + // an actual boost in `history_rerank_at`. + let dict = test_dict(); + let mut h = UserHistory::new(); + h.record(&[("きょう".into(), "京".into())]); + + let result = explain(&dict, None, Some(&h), "きょう", 10); + for path in result.paths.iter().filter(|p| p.surface() != "京") { + assert_eq!( + path.history_boost, + 0, + "non-matching surface {:?} must not receive a history boost", + path.surface(), + ); + assert_eq!(path.history_breakdown.unigram_sum, 0); + assert_eq!(path.history_breakdown.bigram_sum, 0); + assert_eq!(path.history_breakdown.whole_path_boost, 0); } } diff --git a/engine/crates/lex-core/src/converter/postprocess.rs b/engine/crates/lex-core/src/converter/postprocess.rs index 5f45cf0..8401f78 100644 --- a/engine/crates/lex-core/src/converter/postprocess.rs +++ b/engine/crates/lex-core/src/converter/postprocess.rs @@ -42,6 +42,10 @@ pub(crate) struct PostprocessContext<'a> { pub history: Option<&'a UserHistory>, pub kana: &'a str, pub n: usize, + /// Timestamp passed to `history_rerank_at`. Pinning it here lets diagnostic + /// observers compute breakdowns against the exact value the pipeline will + /// use, so the two cannot drift apart when execution crosses a second boundary. 
+ pub now: u64, } // --------------------------------------------------------------------------- @@ -65,6 +69,7 @@ pub(super) fn postprocess( history, kana, n, + now: crate::user_history::now_epoch(), }; postprocess_observed(paths, &ctx, &mut NoopObserver) .into_iter() @@ -112,7 +117,7 @@ pub(crate) fn postprocess_observed( }; if let Some(h) = ctx.history { - reranker::history_rerank(paths, h); + reranker::history_rerank_at(paths, h, ctx.now); } let mut top: Vec = paths.drain(..ctx.n.min(paths.len())).collect(); diff --git a/engine/crates/lex-core/src/converter/reranker.rs b/engine/crates/lex-core/src/converter/reranker.rs index 3304b7d..4c0848a 100644 --- a/engine/crates/lex-core/src/converter/reranker.rs +++ b/engine/crates/lex-core/src/converter/reranker.rs @@ -98,41 +98,82 @@ pub fn rerank( debug!(paths_out = paths.len()); } -/// Apply user-history boosts to N-best paths, then re-sort. +/// Breakdown of the history boost contributions for a single path. +/// +/// Per-segment unigram/bigram sums are raw (pre-normalization). The actually +/// applied boost is `(unigram_sum + bigram_sum) / max(seg_count, 1) + whole_path_boost`, +/// available via [`Self::applied`]. +#[derive(Debug, Default, Clone, Copy, serde::Serialize)] +pub struct HistoryBoostBreakdown { + /// Sum of per-segment unigram boosts (before /seg_count normalization). + pub unigram_sum: i64, + /// Sum of per-pair bigram boosts (before /seg_count normalization). + pub bigram_sum: i64, + /// Whole-path unigram boost × 5 (not normalized). + pub whole_path_boost: i64, +} + +impl HistoryBoostBreakdown { + /// Boost actually subtracted from viterbi_cost, given the path's segment count. + pub fn applied(&self, seg_count: usize) -> i64 { + let n = (seg_count.max(1)) as i64; + (self.unigram_sum + self.bigram_sum) / n + self.whole_path_boost + } +} + +/// Compute the history boost breakdown for a single path without mutating it. 
+/// +/// Mirrors the contribution logic used by [`history_rerank_at`] so callers +/// (e.g. `explain`) can inspect each component. +pub fn compute_history_boost( + path: &ScoredPath, + history: &UserHistory, + now: u64, +) -> HistoryBoostBreakdown { + let mut unigram_sum: i64 = 0; + for seg in &path.segments { + unigram_sum += history.unigram_boost(&seg.reading, &seg.surface, now); + } + let mut bigram_sum: i64 = 0; + for pair in path.segments.windows(2) { + bigram_sum += + history.bigram_boost(&pair[0].surface, &pair[1].reading, &pair[1].surface, now); + } + let whole_path_boost = + history.unigram_boost(&path.full_reading(), &path.surface_key(), now) * 5; + HistoryBoostBreakdown { + unigram_sum, + bigram_sum, + whole_path_boost, + } +} + +/// Apply user-history boosts to N-best paths using the given `now`, then re-sort. +/// +/// Callers that also want to inspect the breakdown (e.g. `explain`) should pass +/// the same `now` they used with [`compute_history_boost`]; otherwise the +/// stored breakdown can drift from the boost actually subtracted here when +/// execution crosses a second boundary. /// /// Unigram and bigram boosts are subtracted from each path's cost so that /// learned candidates float to the top. Because this operates on complete /// paths (not individual lattice nodes), it cannot cause the fragmentation /// problems that in-Viterbi boosting could. -pub fn history_rerank(paths: &mut [ScoredPath], history: &UserHistory) { +/// +/// Per-segment boosts are normalized by segment count: fragmented paths +/// (e.g. き→機 + が + し + ます) would otherwise accumulate boosts from common +/// particles across ALL prior conversions, gaining a structural advantage over +/// compound paths. The whole-path boost is the strongest signal and is not +/// normalized — it only fires when the full reading→surface was explicitly +/// selected. 
+pub fn history_rerank_at(paths: &mut [ScoredPath], history: &UserHistory, now: u64) { let _span = debug_span!("history_rerank", paths_count = paths.len()).entered(); if paths.is_empty() { return; } - let now = crate::user_history::now_epoch(); for path in paths.iter_mut() { - // Per-segment boosts normalized by segment count. Fragmented paths - // (e.g. き→機 + が + し + ます) accumulate boosts from common particles - // (が, し, は, etc.) across ALL prior conversions, giving them a structural - // advantage over compound paths. Dividing by segment count neutralizes this. - let seg_count = path.segments.len().max(1) as i64; - let mut seg_boost: i64 = 0; - for seg in &path.segments { - seg_boost += history.unigram_boost(&seg.reading, &seg.surface, now); - } - for pair in path.segments.windows(2) { - seg_boost += - history.bigram_boost(&pair[0].surface, &pair[1].reading, &pair[1].surface, now); - } - let mut boost = seg_boost / seg_count; - - // Whole-path boost (not normalized): reward paths whose full reading→surface - // has been explicitly selected. This is the strongest learning signal and is - // not subject to cross-reading contamination. 
- let full_reading = path.full_reading(); - let full_surface = path.surface_key(); - boost += history.unigram_boost(&full_reading, &full_surface, now) * 5; - path.viterbi_cost -= boost; + let breakdown = compute_history_boost(path, history, now); + path.viterbi_cost -= breakdown.applied(path.segments.len()); } paths.sort_by_key(|p| p.viterbi_cost); debug!(best_cost = paths.first().map(|p| p.viterbi_cost)); diff --git a/engine/crates/lex-core/src/converter/tests/reranker.rs b/engine/crates/lex-core/src/converter/tests/reranker.rs index e43508f..a24b5a0 100644 --- a/engine/crates/lex-core/src/converter/tests/reranker.rs +++ b/engine/crates/lex-core/src/converter/tests/reranker.rs @@ -1,7 +1,7 @@ -use crate::converter::reranker::{history_rerank, rerank}; +use crate::converter::reranker::{history_rerank_at, rerank}; use crate::converter::viterbi::{RichSegment, ScoredPath}; use crate::dict::connection::ConnectionMatrix; -use crate::user_history::UserHistory; +use crate::user_history::{now_epoch, UserHistory}; #[test] fn test_rerank_penalizes_fragmented_path() { @@ -258,7 +258,7 @@ fn test_history_rerank_unigram_boost_reorders() { }, ]; - history_rerank(&mut paths, &h); + history_rerank_at(&mut paths, &h, now_epoch()); // "京" should be boosted to first place assert_eq!(paths[0].segments[0].surface, "京"); @@ -312,7 +312,7 @@ fn test_history_rerank_bigram_boost() { }, ]; - history_rerank(&mut paths, &h); + history_rerank_at(&mut paths, &h, now_epoch()); // "今日は" path should be boosted (both unigram + bigram) to first assert_eq!(paths[0].segments[0].surface, "今日"); @@ -345,7 +345,7 @@ fn test_history_rerank_empty_history_preserves_order() { }, ]; - history_rerank(&mut paths, &h); + history_rerank_at(&mut paths, &h, now_epoch()); assert_eq!(paths[0].segments[0].surface, "亜"); assert_eq!(paths[0].viterbi_cost, 1000); @@ -357,10 +357,44 @@ fn test_history_rerank_empty_history_preserves_order() { fn test_history_rerank_empty_paths() { let h = UserHistory::new(); let mut paths: 
Vec = Vec::new(); - history_rerank(&mut paths, &h); + history_rerank_at(&mut paths, &h, now_epoch()); assert!(paths.is_empty()); } +#[test] +fn test_history_rerank_at_matches_compute_history_boost() { + // Contract: `history_rerank_at` must subtract exactly the value reported + // by `compute_history_boost(...).applied(seg_count)` when given the same + // `now`. The `explain` observer relies on this so its precomputed + // breakdown matches what the pipeline actually applied. Regression for + // PR #247 R3. + use crate::converter::reranker::compute_history_boost; + + let mut h = UserHistory::new(); + h.record(&[("きょう".into(), "京".into())]); + let now = 1_700_000_000; + + let path_before = ScoredPath { + segments: vec![RichSegment { + reading: "きょう".into(), + surface: "京".into(), + left_id: 0, + right_id: 0, + word_cost: 0, + }], + viterbi_cost: 10_000, + }; + let expected_applied = + compute_history_boost(&path_before, &h, now).applied(path_before.segments.len()); + + let initial_cost = path_before.viterbi_cost; + let mut paths = vec![path_before]; + history_rerank_at(&mut paths, &h, now); + let actual_applied = initial_cost - paths[0].viterbi_cost; + + assert_eq!(actual_applied, expected_applied); +} + /// Build a connection matrix where all transitions cost the given value. fn uniform_conn(cost: i16) -> ConnectionMatrix { let num_ids = 4;