@@ -41,6 +41,40 @@ A reusable evaluation framework for LLM-as-Judge and multi-agent workflows.
4141- 🔗 **Multi-agent coordination** with DAG-based report aggregation
4242- 📋 **Claims validation** for factual claim extraction and source verification
4343
44+ ## Architecture
45+
46+ ```
47+ ┌───────────────────────────────────────────────────────────┐
48+ │                 SummaryReport (GO/NO-GO)                  │
49+ │   ┌──────────────────────┐     ┌──────────────────────┐   │
50+ │   │   Embedded Reports   │     │    Team Sections     │   │
51+ │   │   (Full-Fidelity)    │     │    (Task Results)    │   │
52+ │   └──────────────────────┘     └──────────────────────┘   │
53+ └───────────────────────────────────────────────────────────┘
54+                               ▲
55+               ┌───────────────┴───────────────┐
56+               │                               │
57+ ┌─────────────┴─────────────┐   ┌─────────────┴─────────────┐
58+ │     Rubric (rubric/)      │   │  ClaimsReport (claims/)   │
59+ │  ┌─────────────────────┐  │   │  ┌─────────────────────┐  │
60+ │  │ Category Results    │  │   │  │ Claims + Validation │  │
61+ │  │ (pass/partial/fail) │  │   │  │ (verified/rejected) │  │
62+ │  ├─────────────────────┤  │   │  ├─────────────────────┤  │
63+ │  │ Findings            │  │   │  │ Sources             │  │
64+ │  │ (severity-based)    │  │   │  │ (external/internal) │  │
65+ │  └─────────────────────┘  │   │  └─────────────────────┘  │
66+ │   LLM-as-Judge scoring    │   │     Fact verification     │
67+ └───────────────────────────┘   └───────────────────────────┘
68+ ```
69+
70+ **Three complementary report types:**
71+
72+ | Package | Purpose | Evaluation Type |
73+ | ---------| ---------| -----------------|
74+ | ` rubric/ ` | Categorical scoring with findings | Subjective (LLM-as-Judge) |
75+ | ` claims/ ` | Fact verification with sources | Objective (source-backed) |
76+ | ` summary/ ` | GO/NO-GO aggregation | Deterministic |
77+
4478## Installation
4579
4680``` bash
@@ -51,38 +85,38 @@ go get github.com/plexusone/structured-evaluation
5185
5286| Package | Description |
5387| ---------| -------------|
54- | ` evaluation ` | EvaluationReport, CategoryResult, Finding, Severity types |
55- | ` summary ` | SummaryReport, TeamSection, TaskResult for GO/NO-GO checks |
88+ | ` rubric ` | Rubric, CategoryResult, Finding, Severity types for LLM-as-Judge |
5689| ` claims ` | ClaimsReport, Claim, Validation, Verdict for source verification |
90+ | ` summary ` | SummaryReport, TeamSection, TaskResult for GO/NO-GO checks |
5791| ` combine ` | DAG-based report aggregation using Kahn's algorithm |
5892| ` render/box ` | Box-format terminal renderer for summary reports |
59- | ` render/detailed ` | Detailed terminal renderer for evaluation reports |
93+ | ` render/detailed ` | Detailed terminal renderer for rubric reports |
6094| ` render/terminal ` | ANSI-colored terminal renderer with UTF8 icons |
6195| ` render/markdown ` | Markdown report renderer |
6296| ` schema ` | JSON Schema generation and embedding |
6397
6498## Report Types
6599
66- ### Evaluation Report (LLM-as-Judge)
100+ ### Rubric (LLM-as-Judge)
67101
68102For subjective quality assessments with detailed findings:
69103
70104``` go
71- import " github.com/plexusone/structured-evaluation/evaluation "
105+ import " github.com/plexusone/structured-evaluation/rubric "
72106
73- report := evaluation. NewEvaluationReport (" prd" , " document.md" )
74- report.AddCategory (evaluation .CategoryResult {
107+ report := rubric. NewRubric (" prd" , " document.md" )
108+ report.AddCategoryResult (rubric .CategoryResult {
75109 Category : " problem_definition" ,
76- Score : evaluation .ScorePass ,
110+ Score : rubric .ScorePass ,
77111 Reasoning : " Clear problem statement with measurable goals" ,
78112})
79- report.AddFinding (evaluation .Finding {
80- Severity : evaluation .SeverityMedium ,
113+ report.AddFinding (rubric .Finding {
114+ Severity : rubric .SeverityMedium ,
81115 Category : " metrics" ,
82116 Title : " Missing baseline metrics" ,
83117 Recommendation : " Add current baseline measurements" ,
84118})
85- report.Finalize (" sevaluation check document.md" )
119+ report.Finalize (nil , " sevaluation check document.md" )
86120```
87121
88122### Summary Report (GO/NO-GO)
@@ -150,10 +184,10 @@ Following InfoSec conventions:
150184Default criteria allow zero blocking findings; strict criteria additionally cap medium findings and require all categories to pass:
151185
152186``` go
153- criteria := evaluation .DefaultPassCriteria ()
187+ criteria := rubric .DefaultPassCriteria ()
154188// MaxCritical: 0, MaxHigh: 0, MaxMedium: -1 (unlimited), RequireAllPass: false
155189
156- criteria := evaluation .StrictPassCriteria ()
190+ criteria := rubric .StrictPassCriteria ()
157191// MaxCritical: 0, MaxHigh: 0, MaxMedium: 3, RequireAllPass: true
158192```
159193
@@ -204,30 +238,33 @@ Schemas are embedded for runtime validation:
204238``` go
205239import " github.com/plexusone/structured-evaluation/schema"
206240
207- evalSchema := schema.EvaluationSchemaJSON
241+ rubricSchema := schema.RubricSchemaJSON
242+ claimsSchema := schema.ClaimsSchemaJSON
208243summarySchema := schema.SummarySchemaJSON
209244```
210245
211- ## Rubrics (v0.4.0)
246+ ## RubricSet (v0.4.0)
212247
213248Define explicit criteria for consistent categorical evaluations:
214249
215250``` go
216- rubric := evaluation.NewRubric (" quality" , " Output quality" ).
217- WithPassCriteria (" Meets all requirements, no significant issues" ).
218- WithPartialCriteria (" Meets most requirements, minor issues" ).
219- WithFailCriteria (" Missing key requirements or major issues" )
251+ cat := rubric.NewCategory (" quality" , " Output Quality" , " Overall quality assessment" ).
252+ WithPassPartialFail (
253+ []string {" Meets all requirements, no significant issues" },
254+ []string {" Meets most requirements, minor issues" },
255+ []string {" Missing key requirements or major issues" },
256+ )
220257
221258// Use the default PRD rubric set
222- rubricSet := evaluation .DefaultPRDRubricSet ()
259+ rubricSet := rubric .DefaultPRDRubricSet ()
223260```
224261
225262## Judge Metadata (v0.2.0)
226263
227264Track LLM judge configuration for reproducibility:
228265
229266``` go
230- judge := evaluation .NewJudgeMetadata (" claude-3-opus" ).
267+ judge := rubric .NewJudgeMetadata (" claude-3-opus" ).
231268 WithProvider (" anthropic" ).
232269 WithPrompt (" prd-eval-v1" , " 1.0" ).
233270 WithTemperature (0.0 ).
@@ -241,11 +278,11 @@ report.SetJudge(judge)
241278Compare two outputs against each other instead of scoring each one on an absolute scale:
242279
243280``` go
244- comparison := evaluation .NewPairwiseComparison (input, outputA, outputB)
245- comparison.SetWinner (evaluation .WinnerA , " A is more accurate" , 0.9 )
281+ comparison := rubric .NewPairwiseComparison (input, outputA, outputB)
282+ comparison.SetWinner (rubric .WinnerA , " A is more accurate" , 0.9 )
246283
247284// Aggregate multiple comparisons
248- result := evaluation .ComputePairwiseResult (comparisons)
285+ result := rubric .ComputePairwiseResult (comparisons)
249286// result.WinRateA, result.OverallWinner
250287```
251288
@@ -254,7 +291,7 @@ result := evaluation.ComputePairwiseResult(comparisons)
254291Combine evaluations from multiple judges:
255292
256293``` go
257- result := evaluation .AggregateEvaluations (evaluations, evaluation .AggregationMajority )
294+ result := rubric .AggregateEvaluations (evaluations, rubric .AggregationMajority )
258295
259296// Methods: AggregationMajority, AggregationConservative, AggregationOptimistic
260297// result.Agreement - inter-judge agreement (0-1)
@@ -268,15 +305,15 @@ Use 1-5 numeric scales for human comparison studies:
268305
269306``` go
270307// Create a Likert-scale category
271- cat := evaluation .NewCategory (" quality" , " Content Quality" , " Overall quality" ).
272- WithLikert5 (evaluation .StandardLikert5Anchors ())
308+ cat := rubric .NewCategory (" quality" , " Content Quality" , " Overall quality" ).
309+ WithLikert5 (rubric .StandardLikert5Anchors ())
273310
274311// Record a Likert score (automatically maps to categorical)
275- result := evaluation .NewCategoryResultFromLikert (" quality" , 4 , config, " Good quality" )
312+ result := rubric .NewCategoryResultFromLikert (" quality" , 4 , config, " Good quality" )
276313// result.Score = ScorePass, result.NumericScore = 4.0
277314
278315// Or record both categorical and numeric
279- result := evaluation .NewCategoryResultWithNumeric (" quality" , evaluation .ScorePass , 4.5 , " reasoning" )
316+ result := rubric .NewCategoryResultWithNumeric (" quality" , rubric .ScorePass , 4.5 , " reasoning" )
280317```
281318
282319## Inter-Rater Reliability (v0.5.0)
@@ -285,14 +322,14 @@ Compare LLM evaluations with human ground truth:
285322
286323``` go
287324// Compute IRR metrics
288- metrics := evaluation .ComputeIRRFromResults (humanResults, llmResults)
325+ metrics := rubric .ComputeIRRFromResults (humanResults, llmResults)
289326
290327fmt.Printf (" Exact Agreement: %.1f%% \n " , metrics.ExactAgreement *100 )
291328fmt.Printf (" Adjacent Agreement: %.1f%% \n " , metrics.AdjacentAgreement *100 )
292329fmt.Printf (" Pearson r: %.3f \n " , metrics.PearsonCorrelation )
293330
294331// Categorical agreement with confusion matrix
295- agreement := evaluation .ComputeCategoricalAgreement (humanResults, llmResults)
332+ agreement := rubric .ComputeCategoricalAgreement (humanResults, llmResults)
296333```
297334
298335## Claims Validation (v0.6.0)
@@ -326,15 +363,15 @@ if report.IsPassing() {
326363Archive full-fidelity reports within SummaryReport:
327364
328365``` go
329- summary := summary.NewSummaryReport (" project" , " v1.0.0" , " RELEASE" )
366+ report := summary.NewSummaryReport (" project" , " v1.0.0" , " RELEASE" )
330367
331368// Embed detailed reports
332- summary. EmbedEvaluationReport (" quality-review" , evalReport )
333- summary .EmbedClaimsReport (" source-validation" , claimsReport)
369+ report. EmbedRubricReport (" quality-review" , rubricReport )
370+ report .EmbedClaimsReport (" source-validation" , claimsReport)
334371
335372// Retrieve later
336- var eval evaluation. EvaluationReport
337- summary. GetEmbeddedEvaluation (" quality-review" , &eval )
373+ var r rubric. Rubric
374+ report. GetEmbeddedRubricReport (" quality-review" , &r )
338375```
339376
340377## OmniObserve Integration
0 commit comments