From e4edb7b7b6aac14d7629b5243ffbbe35f4e66254 Mon Sep 17 00:00:00 2001 From: yisz Date: Thu, 22 Feb 2024 12:40:33 -0800 Subject: [PATCH] docs update --- .../content/docs/examples/0_single_metric.md | 2 +- .../docs/getting-started/Introduction.md | 31 +++---- docs/src/content/docs/index.mdx | 2 + .../Deterministic/python_ast_similarity.md | 2 +- .../Code/Deterministic/string_match.md | 2 +- .../Generation/Deterministic/correctness.md | 2 +- .../Generation/Deterministic/faithfulness.md | 2 +- .../flesch_kincaid_readability.md | 2 +- .../Generation/LLM-Based/llm_correctness.md | 2 +- .../Generation/LLM-Based/llm_faithfulness.md | 2 +- .../Generation/LLM-Based/llm_relevance.md | 2 +- .../metrics/Generation/LLM-Based/llm_style.md | 2 +- .../Semantic/bert_answer_relevance.md | 2 +- .../Semantic/bert_answer_similarity.md | 2 +- .../Semantic/deberta_answer_scores.md | 2 +- .../Deterministic/precision_recall.md | 2 +- .../Deterministic/rank_aware_metrics.md | 2 +- .../LLM-Based/llm_context_coverage.md | 2 +- .../LLM-Based/llm_context_precision.md | 2 +- docs/src/content/docs/pipeline/overview.md | 22 +++-- .../content/docs/pipeline/pipeline_eval.md | 92 +++++++++++++++++++ 21 files changed, 138 insertions(+), 43 deletions(-) create mode 100644 docs/src/content/docs/pipeline/pipeline_eval.md diff --git a/docs/src/content/docs/examples/0_single_metric.md b/docs/src/content/docs/examples/0_single_metric.md index 02fbfe1..7839edf 100644 --- a/docs/src/content/docs/examples/0_single_metric.md +++ b/docs/src/content/docs/examples/0_single_metric.md @@ -25,5 +25,5 @@ dataset = Dataset([q]) metric = PrecisionRecallF1() # Let's calculate the metric for the first datum -print(metric.calculate(**dataset.datum(0))) # alternatively `metric.calculate(**q)` +print(metric(**dataset.datum(0))) # alternatively `metric.calculate(**q)` ``` \ No newline at end of file diff --git a/docs/src/content/docs/getting-started/Introduction.md b/docs/src/content/docs/getting-started/Introduction.md index 8aa3ba1..1968e63 100644 --- a/docs/src/content/docs/getting-started/Introduction.md +++ b/docs/src/content/docs/getting-started/Introduction.md @@ -9,34 +9,29 @@ sidebar: ## What is continuous-eval? -`continuous-eval` is an open-source package created for the scientific and practical evaluation of LLM application pipelines. Currently, it focuses on retrieval-augmented generation (RAG) pipelines. +`continuous-eval` is an open-source package created for granular and holistic evaluation of GenAI application pipelines. -## Why another eval package? + -Good LLM evaluation should help you reliably identify weaknesses in the pipeline, inform what actions to take, and accelerate your development from prototype to production. - -As AI developers, we always wanted to put LLM Evaluation as part of our CI/CD pipeline just like any other part of software, but today it remains challenging because: - -**Human evaluation is trustworthy but not scalable** -- Eyeballing can only be done on a small dataset, and it has to be repeated for any pipeline update -- User feedback is spotty and lacks granularity - -**Using LLMs to evaluate LLMs is expensive, slow and difficult to trust** -- Can be very costly and slow to run at scale -- Can be biased towards certain answers and often doesn’t align well with human evaluation ## How is continuous-eval different? -- **The Most Complete RAG Metric Library**: mix and match Deterministic, Semantic and LLM-based metrics. +- **Modularized Evaluation**: Measure each module in the pipeline with tailored metrics. 
-- **Trustworthy Ensemble Metrics**: easily build a close-to-human ensemble evaluation pipeline with mathematical guarantees.
+- **Comprehensive Metric Library**: Covers Retrieval-Augmented Generation (RAG), Code Generation, Agent Tool Use, Classification and a variety of other LLM use cases. Mix and match Deterministic, Semantic and LLM-based metrics.
 
-- **Cheaper and Faster Evaluation**: our hybrid pipeline slashes cost by up to 15x compared to pure LLM-based metrics, and reduces eval time on large datasets from hours to minutes.
+- **Leverage User Feedback in Evaluation**: Easily build a close-to-human ensemble evaluation pipeline with mathematical guarantees.
+
+- **Synthetic Dataset Generation**: Generate large-scale synthetic datasets to test your pipeline.
 
-- **Tailored to Your Data**: our evaluation pipeline can be customized to your use case and leverages the data you trust. We can help you curate a golden dataset if you don’t have one.
 
 ## Resources
 
-- **Blog Post: Practical Guide to RAG Pipeline Evaluation:** [Part 1: Retrieval](https://medium.com/relari/a-practical-guide-to-rag-pipeline-evaluation-part-1-27a472b09893), [Part 2: Generation](https://medium.com/relari/a-practical-guide-to-rag-evaluation-part-2-generation-c79b1bde0f5d)
+- **Blog Posts:**
+  - Practical Guide to RAG Pipeline Evaluation: [Part 1: Retrieval](https://medium.com/relari/a-practical-guide-to-rag-pipeline-evaluation-part-1-27a472b09893)
+  - Practical Guide to RAG Pipeline Evaluation: [Part 2: Generation](https://medium.com/relari/a-practical-guide-to-rag-evaluation-part-2-generation-c79b1bde0f5d)
+  - How important is a Golden Dataset for LLM evaluation? [link](https://medium.com/relari/how-important-is-a-golden-dataset-for-llm-pipeline-evaluation-4ef6deb14dc5)
+
 - **Discord:** Join our community of LLM developers [Discord](https://discord.gg/GJnM8SRsHr)
 - **Reach out to founders:** [Email](mailto:founders@relari.ai) or [Schedule a chat](https://cal.com/yizhang/continuous-eval)
 
diff --git a/docs/src/content/docs/index.mdx b/docs/src/content/docs/index.mdx
index 2c31836..5a04d67 100644
--- a/docs/src/content/docs/index.mdx
+++ b/docs/src/content/docs/index.mdx
@@ -5,6 +5,8 @@ description: Making LLM development a science rather than an art.
**Start today with continuous-eval and make your LLM development a science not an art!** + + import { LinkCard, CardGrid } from '@astrojs/starlight/components'; import { Icon } from '@astrojs/starlight/components'; diff --git a/docs/src/content/docs/metrics/Code/Deterministic/python_ast_similarity.md b/docs/src/content/docs/metrics/Code/Deterministic/python_ast_similarity.md index b6f6d17..a43b162 100644 --- a/docs/src/content/docs/metrics/Code/Deterministic/python_ast_similarity.md +++ b/docs/src/content/docs/metrics/Code/Deterministic/python_ast_similarity.md @@ -34,7 +34,7 @@ datum = { }, metric = PythonASTSimilarity() -print(metric.calculate(**datum)) +print(metric(**datum)) ``` ### Example Output diff --git a/docs/src/content/docs/metrics/Code/Deterministic/string_match.md b/docs/src/content/docs/metrics/Code/Deterministic/string_match.md index f318a1f..e997ffb 100644 --- a/docs/src/content/docs/metrics/Code/Deterministic/string_match.md +++ b/docs/src/content/docs/metrics/Code/Deterministic/string_match.md @@ -29,7 +29,7 @@ datum = { }, metric = CodeStringMatch() -print(metric.calculate(**datum)) +print(metric(**datum)) ``` ### Example Output diff --git a/docs/src/content/docs/metrics/Generation/Deterministic/correctness.md b/docs/src/content/docs/metrics/Generation/Deterministic/correctness.md index fa0abd5..dc6baad 100644 --- a/docs/src/content/docs/metrics/Generation/Deterministic/correctness.md +++ b/docs/src/content/docs/metrics/Generation/Deterministic/correctness.md @@ -60,7 +60,7 @@ datum = { } metric = DeterministicAnswerCorrectness() -print(metric.calculate(**datum)) +print(metric(**datum)) ``` ### Example Output diff --git a/docs/src/content/docs/metrics/Generation/Deterministic/faithfulness.md b/docs/src/content/docs/metrics/Generation/Deterministic/faithfulness.md index 6be9691..0e27c91 100644 --- a/docs/src/content/docs/metrics/Generation/Deterministic/faithfulness.md +++ b/docs/src/content/docs/metrics/Generation/Deterministic/faithfulness.md @@ -60,7 +60,7 @@ datum = { } metric = DeterministicFaithfulness() -print(metric.calculate(**datum)) +print(metric(**datum)) ``` ### Example Output diff --git a/docs/src/content/docs/metrics/Generation/Deterministic/flesch_kincaid_readability.md b/docs/src/content/docs/metrics/Generation/Deterministic/flesch_kincaid_readability.md index 3e94647..386fbfd 100644 --- a/docs/src/content/docs/metrics/Generation/Deterministic/flesch_kincaid_readability.md +++ b/docs/src/content/docs/metrics/Generation/Deterministic/flesch_kincaid_readability.md @@ -32,7 +32,7 @@ datum = { } metric = FleschKincaidReadability() -print(metric.calculate(**datum)) +print(metric(**datum)) ``` ### Example Output diff --git a/docs/src/content/docs/metrics/Generation/LLM-Based/llm_correctness.md b/docs/src/content/docs/metrics/Generation/LLM-Based/llm_correctness.md index 6e8de95..97b6649 100644 --- a/docs/src/content/docs/metrics/Generation/LLM-Based/llm_correctness.md +++ b/docs/src/content/docs/metrics/Generation/LLM-Based/llm_correctness.md @@ -36,7 +36,7 @@ datum = { } metric = LLMBasedAnswerCorrectness(LLMFactory("gpt-4-1106-preview")) -print(metric.calculate(**datum)) +print(metric(**datum)) ``` ### Sample Output diff --git a/docs/src/content/docs/metrics/Generation/LLM-Based/llm_faithfulness.md b/docs/src/content/docs/metrics/Generation/LLM-Based/llm_faithfulness.md index 8b65b29..4e314cf 100644 --- a/docs/src/content/docs/metrics/Generation/LLM-Based/llm_faithfulness.md +++ b/docs/src/content/docs/metrics/Generation/LLM-Based/llm_faithfulness.md @@ 
-58,7 +58,7 @@ datum = { "answer": "Shakespeare wrote 'Romeo and Juliet'", } metric = LLMBasedAnswerCorrectness(LLMFactory("gpt-4-1106-preview")) -print(metric.calculate(**datum)) +print(metric(**datum)) ``` ### Sample Output diff --git a/docs/src/content/docs/metrics/Generation/LLM-Based/llm_relevance.md b/docs/src/content/docs/metrics/Generation/LLM-Based/llm_relevance.md index dcc1e70..399c55e 100644 --- a/docs/src/content/docs/metrics/Generation/LLM-Based/llm_relevance.md +++ b/docs/src/content/docs/metrics/Generation/LLM-Based/llm_relevance.md @@ -28,7 +28,7 @@ datum = { } metric = LLMBasedAnswerRelevance(LLMFactory("gpt-4-1106-preview")) -print(metric.calculate(**datum)) +print(metric(**datum)) ``` ### Sample Output diff --git a/docs/src/content/docs/metrics/Generation/LLM-Based/llm_style.md b/docs/src/content/docs/metrics/Generation/LLM-Based/llm_style.md index 8375b8e..f3890ac 100644 --- a/docs/src/content/docs/metrics/Generation/LLM-Based/llm_style.md +++ b/docs/src/content/docs/metrics/Generation/LLM-Based/llm_style.md @@ -32,7 +32,7 @@ datum = { } metric = LLMBasedAnswerRelevance(LLMFactory("gpt-4-1106-preview")) -print(metric.calculate(**datum)) +print(metric(**datum)) ``` ### Sample Output diff --git a/docs/src/content/docs/metrics/Generation/Semantic/bert_answer_relevance.md b/docs/src/content/docs/metrics/Generation/Semantic/bert_answer_relevance.md index 52256dc..d7a5114 100644 --- a/docs/src/content/docs/metrics/Generation/Semantic/bert_answer_relevance.md +++ b/docs/src/content/docs/metrics/Generation/Semantic/bert_answer_relevance.md @@ -35,7 +35,7 @@ datum = { } metric = BertAnswerSimilarity() -print(metric.calculate(**datum)) +print(metric(**datum)) ``` ### Example Output diff --git a/docs/src/content/docs/metrics/Generation/Semantic/bert_answer_similarity.md b/docs/src/content/docs/metrics/Generation/Semantic/bert_answer_similarity.md index ee55bf2..e45b0fa 100644 --- a/docs/src/content/docs/metrics/Generation/Semantic/bert_answer_similarity.md +++ b/docs/src/content/docs/metrics/Generation/Semantic/bert_answer_similarity.md @@ -35,7 +35,7 @@ datum = { } metric = BertAnswerSimilarity() -print(metric.calculate(**datum)) +print(metric(**datum)) ``` ### Example Output diff --git a/docs/src/content/docs/metrics/Generation/Semantic/deberta_answer_scores.md b/docs/src/content/docs/metrics/Generation/Semantic/deberta_answer_scores.md index 191859a..e0eda22 100644 --- a/docs/src/content/docs/metrics/Generation/Semantic/deberta_answer_scores.md +++ b/docs/src/content/docs/metrics/Generation/Semantic/deberta_answer_scores.md @@ -45,7 +45,7 @@ datum = { } metric = DebertaAnswerScores() -print(metric.calculate(**datum)) +print(metric(**datum)) reverse_metric = DebertaAnswerScores(reverse=True) print(reverse_metric.calculate(**datum)) diff --git a/docs/src/content/docs/metrics/Retrieval/Deterministic/precision_recall.md b/docs/src/content/docs/metrics/Retrieval/Deterministic/precision_recall.md index b7f3fdd..06dc47f 100644 --- a/docs/src/content/docs/metrics/Retrieval/Deterministic/precision_recall.md +++ b/docs/src/content/docs/metrics/Retrieval/Deterministic/precision_recall.md @@ -94,7 +94,7 @@ datum = { } metric = PrecisionRecallF1(RougeChunkMatch()) -print(metric.calculate(**datum)) +print(metric(**datum)) ``` ### Example Output diff --git a/docs/src/content/docs/metrics/Retrieval/Deterministic/rank_aware_metrics.md b/docs/src/content/docs/metrics/Retrieval/Deterministic/rank_aware_metrics.md index fe1f93b..5140eb7 100644 --- 
a/docs/src/content/docs/metrics/Retrieval/Deterministic/rank_aware_metrics.md
+++ b/docs/src/content/docs/metrics/Retrieval/Deterministic/rank_aware_metrics.md
@@ -50,7 +50,7 @@ datum = {
 }
 
 metric = RankedRetrievalMetrics(RougeChunkMatch())
-print(metric.calculate(**datum))
+print(metric(**datum))
 ```
 
 ### Example Output
diff --git a/docs/src/content/docs/metrics/Retrieval/LLM-Based/llm_context_coverage.md b/docs/src/content/docs/metrics/Retrieval/LLM-Based/llm_context_coverage.md
index 7d9f746..0319cfe 100644
--- a/docs/src/content/docs/metrics/Retrieval/LLM-Based/llm_context_coverage.md
+++ b/docs/src/content/docs/metrics/Retrieval/LLM-Based/llm_context_coverage.md
@@ -37,7 +37,7 @@ datum = {
 }
 
 metric = LLMBasedContextCoverage(LLMFactory("gpt-4-1106-preview"))
-print(metric.calculate(**datum))
+print(metric(**datum))
 ```
 
 ### Sample Output
diff --git a/docs/src/content/docs/metrics/Retrieval/LLM-Based/llm_context_precision.md b/docs/src/content/docs/metrics/Retrieval/LLM-Based/llm_context_precision.md
index 2cf9b59..5c6daac 100644
--- a/docs/src/content/docs/metrics/Retrieval/LLM-Based/llm_context_precision.md
+++ b/docs/src/content/docs/metrics/Retrieval/LLM-Based/llm_context_precision.md
@@ -43,7 +43,7 @@ datum = {
 }
 
 metric = LLMBasedContextPrecision(LLMFactory("gpt-4-1106-preview"), log_relevance_by_context=True)
-print(metric.calculate(**datum))
+print(metric(**datum))
 ```
 
 Note: optional variable `log_relevance_by_context` outputs `LLM_based_context_relevance_by_context` - the LLM judgement of relevance to answer the question per context retrieved.
diff --git a/docs/src/content/docs/pipeline/overview.md b/docs/src/content/docs/pipeline/overview.md
index 9f82ce3..ab3b8df 100644
--- a/docs/src/content/docs/pipeline/overview.md
+++ b/docs/src/content/docs/pipeline/overview.md
@@ -1,8 +1,8 @@
 ---
-title: Overview
+title: Pipeline
 sidebar:
   badge:
-    text: beta
+    text: new
     variant: tip
 ---
@@ -18,16 +18,16 @@ Consider the following example:
 
 ```d2
 direction: right
-Retriever -> LLM
+Retriever -> Reranker -> Generator
 ```
 
-In the example above, the pipeline consists of two modules: a retriever and a language model (LLM).
+In the example above, the pipeline consists of three modules: a retriever, a reranker, and an LLM generator.
 
 ```python
 from continuous_eval.eval import Module, Pipeline, Dataset
 from typing import List, Dict
 
-dataset = Dataset("dataset_folder")
+dataset = Dataset("dataset_folder") # This is the dataset you will use to evaluate the pipeline.
 
 retriever = Module(
     name="Retriever",
@@ -35,13 +35,19 @@ retriever = Module(
     output=List[Dict[str, str]],
 )
 
+reranker = Module(
+    name="Reranker",
+    input=retriever,
+    output=List[Dict[str, str]],
+)
+
 llm = Module(
     name="LLM",
-    input=retriever,
+    input=reranker,
     output=str,
 )
 
-pipeline = Pipeline([retriever, llm], dataset=dataset)
+pipeline = Pipeline([retriever, reranker, llm], dataset=dataset)
+print(pipeline.graph_repr()) # visualize the pipeline in Mermaid graph format
 ```
-> We will talk about the dataset later
diff --git a/docs/src/content/docs/pipeline/pipeline_eval.md b/docs/src/content/docs/pipeline/pipeline_eval.md
new file mode 100644
index 0000000..f931fd0
--- /dev/null
+++ b/docs/src/content/docs/pipeline/pipeline_eval.md
@@ -0,0 +1,92 @@
+---
+title: Evaluators and Tests
+sidebar:
+  badge:
+    text: new
+    variant: tip
+---
+
+## Adding Evaluators and Tests to a Pipeline
+
+You can optionally add `eval` and `tests` to the modules whose performance you want to measure.
+
+Use the `eval` field to select relevant evaluation metrics:
+- Select the metrics and specify their inputs according to the data fields each metric requires: `MetricName().use(data_fields)`.
+- Metric inputs can be referenced using items from three sources:
+  - **From `dataset`**: e.g. `ground_truth_context = dataset.ground_truth_context`
+  - **From the current module**: e.g. `answer = ModuleOutput()`
+  - **From prior modules**: e.g. `retrieved_context = ModuleOutput(DocumentsContent, module=base_retriever)`, where
+  `DocumentsContent = ModuleOutput(lambda x: [z["page_content"] for z in x])` selects specific items from the prior module's output
+
+
+Use the `tests` field to define specific performance criteria:
+- Select the testing class `GreaterOrEqualThan` or `MeanGreaterOrEqualThan` to run the test over each datapoint or over the dataset mean, respectively
+- Define `test_name`, `metric_name` (must be one of the metrics calculated by the module's `eval`), and `min_value`.
+
+
+
+Below is a full example of a two-module pipeline.
+
+```python
+from continuous_eval.eval import Module, Pipeline, Dataset, ModuleOutput
+from continuous_eval.metrics.retrieval import PrecisionRecallF1 # Deterministic metric
+from continuous_eval.metrics.generation.text import (
+    FleschKincaidReadability, # Deterministic metric
+    DebertaAnswerScores, # Semantic metric
+    LLMBasedFaithfulness, # LLM-based metric
+)
+from typing import List, Dict
+from continuous_eval.eval.tests import GreaterOrEqualThan, MeanGreaterOrEqualThan
+dataset = Dataset("data/eval_golden_dataset")
+
+Documents = List[Dict[str, str]]
+DocumentsContent = ModuleOutput(lambda x: [z["page_content"] for z in x])
+
+base_retriever = Module(
+    name="base_retriever",
+    input=dataset.question,
+    output=Documents,
+    eval=[
+        PrecisionRecallF1().use(  # Reference-based metric that compares the Retrieved Context with the Ground Truths
+            retrieved_context=DocumentsContent,
+            ground_truth_context=dataset.ground_truth_contexts,
+        ),
+    ],
+    tests=[
+        GreaterOrEqualThan(  # Set a test using context_recall, a metric calculated by PrecisionRecallF1()
+            test_name="Context Recall", metric_name="context_recall", min_value=0.9
+        ),
+    ],
+)
+
+llm = Module(
+    name="answer_generator",
+    input=base_retriever,
+    output=str,
+    eval=[
+        FleschKincaidReadability().use(  # Reference-free metric that only uses the output of the module
+            answer=ModuleOutput()
+        ),
+        DebertaAnswerScores().use(  # Reference-based metric that compares the Answer with the Ground Truths
+            answer=ModuleOutput(), ground_truth_answers=dataset.ground_truths
+        ),
+        LLMBasedFaithfulness().use(  # Reference-free metric that uses output from a prior module (Retrieved Context) to evaluate the answer
+            answer=ModuleOutput(),
+            retrieved_context=ModuleOutput(DocumentsContent, module=base_retriever),  # DocumentsContent from the base_retriever module
+            question=dataset.question,
+        ),
+    ],
+    tests=[
+        MeanGreaterOrEqualThan(  # Compares the aggregate result over the dataset against the min_value
+            test_name="Readability", metric_name="flesch_reading_ease", min_value=20.0
+        ),
+        GreaterOrEqualThan(  # Compares each result in the dataset against the min_value
+            test_name="Deberta Entailment", metric_name="deberta_entailment", min_value=0.8
+        ),
+    ],
+)
+
+pipeline = Pipeline([base_retriever, llm], dataset=dataset)
+print(pipeline.graph_repr())  # visualize the pipeline in Mermaid graph format
+```
\ No newline at end of file
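
As a quick check of the calling convention this patch standardizes on, metrics can be invoked directly as callables, with `metric.calculate(**datum)` remaining available as an equivalent form (as noted in `0_single_metric.md`). The sketch below is illustrative and not part of the patch: the `RougeChunkMatch` import path and the exact keyword arguments (`retrieved_context`, `ground_truth_context`) are assumptions based on the examples above and may differ between versions.

```python
# Minimal sketch (assumptions noted above) of the callable-metric convention.
from continuous_eval.metrics.retrieval import PrecisionRecallF1, RougeChunkMatch  # import path assumed

# Example datum with a matching and a non-matching retrieved chunk
datum = {
    "retrieved_context": [
        "Paris is the capital of France.",
        "Berlin is the capital of Germany.",
    ],
    "ground_truth_context": ["Paris is the capital of France."],
}

metric = PrecisionRecallF1(RougeChunkMatch())
print(metric(**datum))            # metrics are callable, as used throughout these docs
print(metric.calculate(**datum))  # equivalent explicit form, still available
```

Both calls should produce the same result, so existing snippets that use `.calculate` keep working.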