# Quality Testing for Agentic Workflow

This notebook tests the quality of the LangGraph agentic workflow by:
- Reading questions from `input/quality.csv`
- Running **all questions in parallel** through the workflow to generate answers
- Evaluating answers using Google Gemini 2.5 Flash for quality metrics:
  - **Correctness (1-5)**: Factual accuracy of the answer
  - **Fluency (1-5)**: Language quality and readability
  - **Relevance (1-5)**: How well the answer addresses the question
  - **Coverage (1-5)**: Completeness of the answer
- Saving results to `output/out_quality.csv`

In [18]:
import sys
import os
import pandas as pd
from pathlib import Path
from tqdm.notebook import tqdm
import json
import concurrent.futures
from threading import Lock

# Add project root to path
project_root = Path(os.getcwd()).parent
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

from dotenv import load_dotenv
load_dotenv(project_root / ".env", override=True)

print(f"Project root: {project_root}")

Project root: /home/scsng/Desktop/Projects/Job_application/pwc/pwc_genai_task


In [19]:
from utils.chat_client import ChatClient
from utils.agents.agentic_workflow import AgenticWorkflow
from utils.rag.vector_db import QdrantDB
from google import genai
from google.genai import types

# Initialize chat client for agentic workflow
def get_chat_client():
    return ChatClient(
        base_url=os.getenv("INFERENCE_API_URL"),
        model=os.getenv("MODEL", "default-model"),
        api_key=os.getenv("INFERENCE_API_KEY")
    )

# Initialize agentic workflow
def get_agentic_workflow():
    """Initialize the agentic workflow with LLM."""
    chat_client = get_chat_client()
    max_task_count = int(os.getenv("MAX_TASK_COUNT", "3"))
    return AgenticWorkflow(
        llm=chat_client.llm,
        vector_db=QdrantDB(
            collection_name=os.getenv("COLLECTION_NAME"),
            qdrant_host=os.getenv("QDRANT_HOST"),
            embedding_model=os.getenv("EMBEDDING_MODEL"),
            qdrant_api_key=os.getenv("QDRANT_API_KEY"),
            top_k=int(os.getenv("TOP_K", "4"))
        ),
        max_task_count=max_task_count
    )

# Initialize Gemini client for quality evaluation
def get_gemini_client():
    """Initialize Google Gemini 2.5 Flash client for evaluation."""
    api_key = os.getenv("GOOGLE_API_KEY")
    if not api_key:
        raise ValueError("GOOGLE_API_KEY environment variable must be set")
    return genai.Client(api_key=api_key)

print("Initializing workflow...")
workflow = get_agentic_workflow()
print("Workflow initialized successfully!")

print("Initializing Gemini client...")
gemini_client = get_gemini_client()
print("Gemini client initialized successfully!")

Initializing workflow...


  client = QdrantClient(**client_options)


Workflow initialized successfully!
Initializing Gemini client...
Gemini client initialized successfully!


In [20]:
# Load quality test data
input_file = Path("input/quality.csv")
df = pd.read_csv(input_file, sep=";")

print(f"Loaded {len(df)} questions from {input_file}")
print(f"\nColumns: {list(df.columns)}")
df.head()

Loaded 15 questions from input/quality.csv

Columns: ['Topic', 'Complex (yes,no)', 'Question', 'Answer', 'Correctness (1-5)', 'Fluency (1-5)', 'Relevance (1-5)', 'Coverage(1-5)', 'Short review']


Unnamed: 0,Topic,"Complex (yes,no)",Question,Answer,Correctness (1-5),Fluency (1-5),Relevance (1-5),Coverage(1-5),Short review
0,Criminal Law,no,What is the principle of legality according to...,,,,,,
1,Criminal Law,yes,Explain the difference between intentional and...,,,,,,
2,Civil Law,no,"According to the Civil Code, when does a perso...",,,,,,
3,Civil Law,yes,Distinguish between legal capacity and capacit...,,,,,,
4,Public Procurement,no,What language requirements apply to public pro...,,,,,,


In [21]:
def generate_answer(question: str) -> str:
    """Generate an answer using the agentic workflow."""
    try:
        answer = workflow.invoke(question)
        return answer
    except Exception as e:
        print(f"Error generating answer: {e}")
        return f"Error: {e}"

## Step 1: Generate Answers (Parallel)

In [22]:
# Process a single question for answer generation
print_lock = Lock()

def generate_single_answer(row_data: tuple) -> dict:
    """Generate answer for a single question."""
    idx, row = row_data
    topic = row.get("Topic", "")
    is_complex = row.get("Complex (yes,no)", "no")
    question = row["Question"]
    reference_answer = row.get("Answer", "")
    
    with print_lock:
        print(f"[Q{idx + 1}] Starting: {question[:60]}...")
    
    # Generate answer using agentic workflow
    generated_answer = generate_answer(question)
    
    with print_lock:
        print(f"[Q{idx + 1}] Done - Answer generated ({len(generated_answer)} chars)")
    
    return {
        "idx": idx,
        "Topic": topic,
        "Complex (yes,no)": is_complex,
        "Question": question,
        "Answer": generated_answer,
        "Reference_Answer": reference_answer
    }

In [None]:
# Generate answers for all questions in PARALLEL
question_col = "Question"

# Filter out empty rows
df_filtered = df[df[question_col].notna() & (df[question_col].str.strip() != "")].copy()

# Prepare data for parallel processing
rows_to_process = [(idx, row) for idx, row in df_filtered.iterrows()]

print(f"Generating answers for {len(rows_to_process)} questions in PARALLEL...\n")
print("="*60)

# Use ThreadPoolExecutor for parallel execution
MAX_WORKERS = min(len(rows_to_process), 10)  # Process up to 10 questions simultaneously

answers_results = []
with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    # Submit all tasks
    future_to_row = {executor.submit(generate_single_answer, row_data): row_data for row_data in rows_to_process}
    
    # Collect results as they complete
    for future in tqdm(concurrent.futures.as_completed(future_to_row), total=len(rows_to_process), desc="Generating Answers"):
        try:
            result = future.result()
            answers_results.append(result)
        except Exception as e:
            row_data = future_to_row[future]
            print(f"Error processing question {row_data[0]}: {e}")

# Sort results by original index to maintain order
answers_results = sorted(answers_results, key=lambda x: x["idx"])

print("\n" + "="*60)
print(f"Completed generating {len(answers_results)} answers!")

Generating answers for 15 questions in PARALLEL...

[Q1] Starting: What is the principle of legality according to Section 1 of ...
[Q2] Starting: Explain the difference between intentional and negligent com...
[Q3] Starting: According to the Civil Code, when does a person reach majori...
[Q4] Starting: Distinguish between legal capacity and capacity to act under...
[Q5] Starting: What language requirements apply to public procurement proce...
[Q6] Starting: How is 'innovation' defined in the Public Procurement Act, a...
[Q7] Starting: According to Section 2 of the Labor Code, what specific grou...
[Q8] Starting: Under what conditions can an employer restrict a worker's pe...
[Q9] Starting: Describe the core principles of Hungary's statehood includin...
[Q10] Starting: Compare how the principle of 'good faith and fair dealing' i...


Generating Answers:   0%|          | 0/15 [00:00<?, ?it/s]

[Q5] Done - Answer generated (662 chars)
[Q11] Starting: In which specific cases does Hungarian criminal law apply to...
[Q6] Done - Answer generated (819 chars)
[Q12] Starting: What are the requirements for the 'Modellválasztás' (Model S...
[Q1] Done - Answer generated (501 chars)
[Q13] Starting: Outline the architectural requirements for the LangGraph wor...
[Q13] Done - Answer generated (296 chars)
[Q14] Starting: What is the legal status of a human foetus and who is entitl...
[Q8] Done - Answer generated (1065 chars)
[Q15] Starting: What are the rules for integrating 'Tools' and the 'RAG Alre...
[Q15] Done - Answer generated (296 chars)
[Q7] Done - Answer generated (1175 chars)
[Q2] Done - Answer generated (1918 chars)
[Q3] Done - Answer generated (1423 chars)
[Q4] Done - Answer generated (1334 chars)
[Q10] Done - Answer generated (2258 chars)
[Q9] Done - Answer generated (1447 chars)
[Q14] Done - Answer generated (833 chars)


In [None]:
# Preview generated answers
answers_df = pd.DataFrame(answers_results)
print(f"Generated {len(answers_df)} answers\n")

# Show preview
preview_df = answers_df[["idx", "Topic", "Question", "Answer"]].copy()
preview_df["Question"] = preview_df["Question"].str[:50] + "..."
preview_df["Answer"] = preview_df["Answer"].str[:80] + "..."
display(preview_df)

Generated 15 answers



Unnamed: 0,idx,Topic,Question,Answer
0,0,Criminal Law,What is the principle of legality according to...,The principle of legality according to Section...
1,1,Criminal Law,Explain the difference between intentional and...,The difference between intentional and neglige...
2,2,Civil Law,"According to the Civil Code, when does a perso...",The Civil Code does not explicitly state when ...
3,3,Civil Law,Distinguish between legal capacity and capacit...,The Hungarian Civil Code distinguishes between...
4,4,Public Procurement,What language requirements apply to public pro...,The language requirements for public procureme...
5,5,Public Procurement,How is 'innovation' defined in the Public Proc...,The Public Procurement Act does not provide a ...
6,6,Labor Law,"According to Section 2 of the Labor Code, what...",The Hungarian Labor Code covers specific group...
7,7,Labor Law,Under what conditions can an employer restrict...,The Hungarian Labor Code allows employers to r...
8,8,Fundamental Law,Describe the core principles of Hungary's stat...,The core principles of Hungary's statehood are...
9,9,Legal Principles,Compare how the principle of 'good faith and f...,The principle of 'good faith and fair dealing'...


## Step 2: Evaluate Answers with Gemini (Parallel)

In [None]:
# Quality evaluation prompt for Gemini 2.5 Flash
QUALITY_EVALUATION_PROMPT = """You are an expert evaluator assessing the quality of answers generated by an AI legal assistant.

Given a question and an answer, evaluate the answer on the following criteria (score 1-5 for each):

1. **Correctness (1-5)**: How factually accurate is the answer?
   - 1: Completely incorrect or contains major factual errors
   - 2: Contains significant errors but some correct elements
   - 3: Mostly correct with minor errors
   - 4: Correct with only trivial issues
   - 5: Completely accurate and factually correct

2. **Fluency (1-5)**: How well-written and readable is the answer?
   - 1: Incomprehensible or severely broken language
   - 2: Poor grammar/structure, hard to understand
   - 3: Acceptable but has some awkward phrasing
   - 4: Well-written with minor issues
   - 5: Excellent, professional-quality writing

3. **Relevance (1-5)**: How well does the answer address the question?
   - 1: Completely off-topic or irrelevant
   - 2: Partially relevant but misses the main point
   - 3: Addresses the question but with unnecessary tangents
   - 4: Directly relevant with minor deviations
   - 5: Perfectly addresses exactly what was asked

4. **Coverage (1-5)**: How comprehensive is the answer?
   - 1: Missing almost all important information
   - 2: Covers only a small portion of what's needed
   - 3: Covers main points but lacks important details
   - 4: Comprehensive with minor omissions
   - 5: Complete and thorough coverage

Respond ONLY with a valid JSON object in this exact format:
{"correctness": <1-5>, "fluency": <1-5>, "relevance": <1-5>, "coverage": <1-5>, "review": "<brief 1-2 sentence review>"}

Do not include any other text or explanation outside the JSON."""

def evaluate_answer_quality(question: str, answer: str, topic: str = "", reference_answer: str = "") -> dict:
    """Evaluate answer quality using Google Gemini 2.5 Flash."""
    user_content = f"Topic: {topic}\n\nQuestion: {question}\n\nAnswer to evaluate:\n{answer}"
    
    if reference_answer and pd.notna(reference_answer) and str(reference_answer).strip():
        user_content += f"\n\nReference answer (for context):\n{reference_answer}"
    
    try:
        response = gemini_client.models.generate_content(
            model="gemini-2.5-flash",
            contents=f"{QUALITY_EVALUATION_PROMPT}\n\n{user_content}",
            config=types.GenerateContentConfig(
                temperature=0.1,
                max_output_tokens=256
            )
        )
        
        result_text = response.text.strip()
        # Clean up the response - remove markdown code blocks if present
        if result_text.startswith("```json"):
            result_text = result_text[7:]
        if result_text.startswith("```"):
            result_text = result_text[3:]
        if result_text.endswith("```"):
            result_text = result_text[:-3]
        result_text = result_text.strip()
        
        # Parse JSON response
        result = json.loads(result_text)
        return {
            "correctness": int(result.get("correctness", 0)),
            "fluency": int(result.get("fluency", 0)),
            "relevance": int(result.get("relevance", 0)),
            "coverage": int(result.get("coverage", 0)),
            "review": result.get("review", "")
        }
    except json.JSONDecodeError as e:
        print(f"JSON parsing error: {e}")
        return {"correctness": 0, "fluency": 0, "relevance": 0, "coverage": 0, "review": f"Parse error: {e}"}
    except Exception as e:
        print(f"Evaluation error: {e}")
        return {"correctness": 0, "fluency": 0, "relevance": 0, "coverage": 0, "review": f"Error: {e}"}

In [None]:
# Evaluate a single answer
eval_print_lock = Lock()

def evaluate_single_answer(answer_data: dict) -> dict:
    """Evaluate a single answer with Gemini."""
    idx = answer_data["idx"]
    question = answer_data["Question"]
    answer = answer_data["Answer"]
    topic = answer_data["Topic"]
    reference_answer = answer_data.get("Reference_Answer", "")
    
    with eval_print_lock:
        print(f"[Q{idx + 1}] Evaluating...")
    
    # Evaluate quality using Gemini 2.5 Flash
    quality_scores = evaluate_answer_quality(
        question=question,
        answer=answer,
        topic=topic,
        reference_answer=reference_answer
    )
    
    with eval_print_lock:
        print(f"[Q{idx + 1}] Done - Scores: C={quality_scores['correctness']}, F={quality_scores['fluency']}, "
              f"R={quality_scores['relevance']}, Cov={quality_scores['coverage']}")
    
    return {
        "idx": idx,
        "Topic": topic,
        "Complex (yes,no)": answer_data["Complex (yes,no)"],
        "Question": question,
        "Answer": answer,
        "Correctness (1-5)": quality_scores["correctness"],
        "Fluency (1-5)": quality_scores["fluency"],
        "Relevance (1-5)": quality_scores["relevance"],
        "Coverage(1-5)": quality_scores["coverage"],
        "Short review": quality_scores["review"]
    }

In [None]:
# Evaluate all answers in PARALLEL
print(f"Evaluating {len(answers_results)} answers with Gemini 2.5 Flash in PARALLEL...\n")
print("="*60)

# Use ThreadPoolExecutor for parallel evaluation
MAX_EVAL_WORKERS = min(len(answers_results), 10)  # Process up to 10 evaluations simultaneously

final_results = []
with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_EVAL_WORKERS) as executor:
    # Submit all evaluation tasks
    future_to_answer = {executor.submit(evaluate_single_answer, answer_data): answer_data for answer_data in answers_results}
    
    # Collect results as they complete
    for future in tqdm(concurrent.futures.as_completed(future_to_answer), total=len(answers_results), desc="Evaluating"):
        try:
            result = future.result()
            final_results.append(result)
        except Exception as e:
            answer_data = future_to_answer[future]
            print(f"Error evaluating question {answer_data['idx']}: {e}")

# Sort results by original index to maintain order
final_results = sorted(final_results, key=lambda x: x["idx"])

# Remove the idx field from results
for r in final_results:
    del r["idx"]

print("\n" + "="*60)
print(f"Completed evaluating {len(final_results)} answers!")

Evaluating 15 answers with Gemini 2.5 Flash in PARALLEL...

[Q1] Evaluating...
[Q2] Evaluating...
[Q3] Evaluating...
[Q4] Evaluating...
[Q5] Evaluating...
[Q6] Evaluating...
[Q7] Evaluating...
[Q8] Evaluating...
[Q9] Evaluating...
[Q10] Evaluating...


Evaluating:   0%|          | 0/15 [00:00<?, ?it/s]

JSON parsing error: Expecting value: line 2 column 17 (char 18)
[Q5] Done - Scores: C=0, F=0, R=0, Cov=0
[Q11] Evaluating...
JSON parsing error: Unterminated string starting at: line 1 column 20 (char 19)
[Q8] Done - Scores: C=0, F=0, R=0, Cov=0
[Q12] Evaluating...
JSON parsing error: Expecting value: line 2 column 17 (char 18)
[Q3] Done - Scores: C=0, F=0, R=0, Cov=0
[Q13] Evaluating...
JSON parsing error: Expecting value: line 2 column 17 (char 18)
[Q4] Done - Scores: C=0, F=0, R=0, Cov=0
[Q14] Evaluating...
JSON parsing error: Expecting value: line 2 column 17 (char 18)
[Q9] Done - Scores: C=0, F=0, R=0, Cov=0
[Q15] Evaluating...
JSON parsing error: Expecting value: line 2 column 17 (char 18)
[Q2] Done - Scores: C=0, F=0, R=0, Cov=0
JSON parsing error: Expecting value: line 2 column 17 (char 18)
[Q10] Done - Scores: C=0, F=0, R=0, Cov=0
JSON parsing error: Unterminated string starting at: line 2 column 3 (char 4)
[Q1] Done - Scores: C=0, F=0, R=0, Cov=0
JSON parsing error: Expecting

## Step 3: Save Results and Display Summary

In [None]:
# Create results dataframe and save to CSV
results_df = pd.DataFrame(final_results)

# Create output directory if it doesn't exist
output_dir = Path("output")
output_dir.mkdir(exist_ok=True)

# Save to CSV
output_file = output_dir / "out_quality.csv"
results_df.to_csv(output_file, sep=";", index=False)

print(f"Results saved to {output_file}")
print(f"\nTotal questions processed: {len(results_df)}")

Results saved to output/out_quality.csv

Total questions processed: 15


In [None]:
# Display summary statistics
print("\n" + "="*60)
print("QUALITY METRICS SUMMARY")
print("="*60)

metrics = ["Correctness (1-5)", "Fluency (1-5)", "Relevance (1-5)", "Coverage(1-5)"]

for metric in metrics:
    if metric in results_df.columns:
        values = results_df[metric][results_df[metric] > 0]  # Exclude errors (0 values)
        if len(values) > 0:
            print(f"\n{metric}:")
            print(f"  Mean: {values.mean():.2f}")
            print(f"  Std:  {values.std():.2f}")
            print(f"  Min:  {values.min()}")
            print(f"  Max:  {values.max()}")

# Overall average
print("\n" + "-"*60)
all_scores = []
for metric in metrics:
    if metric in results_df.columns:
        all_scores.extend(results_df[metric][results_df[metric] > 0].tolist())

if all_scores:
    print(f"Overall Average Score: {sum(all_scores)/len(all_scores):.2f}")


QUALITY METRICS SUMMARY

------------------------------------------------------------


In [None]:
# Breakdown by complexity
print("\n" + "="*60)
print("BREAKDOWN BY COMPLEXITY")
print("="*60)

for complexity in ["yes", "no"]:
    subset = results_df[results_df["Complex (yes,no)"].str.lower() == complexity]
    if len(subset) > 0:
        print(f"\nComplex = {complexity} ({len(subset)} questions):")
        for metric in metrics:
            if metric in subset.columns:
                values = subset[metric][subset[metric] > 0]
                if len(values) > 0:
                    print(f"  {metric}: Mean={values.mean():.2f}, Std={values.std():.2f}")


BREAKDOWN BY COMPLEXITY

Complex = yes (9 questions):

Complex = no (6 questions):


In [None]:
# Breakdown by topic
print("\n" + "="*60)
print("BREAKDOWN BY TOPIC")
print("="*60)

topics = results_df["Topic"].unique()
topic_summary = []

for topic in topics:
    if pd.notna(topic) and topic.strip():
        subset = results_df[results_df["Topic"] == topic]
        if len(subset) > 0:
            avg_scores = {}
            for metric in metrics:
                if metric in subset.columns:
                    values = subset[metric][subset[metric] > 0]
                    if len(values) > 0:
                        avg_scores[metric] = values.mean()
            
            if avg_scores:
                overall_avg = sum(avg_scores.values()) / len(avg_scores)
                topic_summary.append({
                    "Topic": topic,
                    "Count": len(subset),
                    "Avg Score": overall_avg,
                    **avg_scores
                })

if topic_summary:
    topic_df = pd.DataFrame(topic_summary).sort_values("Avg Score", ascending=False)
    display(topic_df)


BREAKDOWN BY TOPIC


In [None]:
# Display the full results table
print("\n" + "="*60)
print("FULL RESULTS")
print("="*60)

# Display with truncated question and answer for readability
display_df = results_df.copy()
display_df["Question"] = display_df["Question"].str[:80] + "..."
display_df["Answer"] = display_df["Answer"].str[:100] + "..."
display_df["Short review"] = display_df["Short review"].str[:80] + "..."

display(display_df)


FULL RESULTS


Unnamed: 0,Topic,"Complex (yes,no)",Question,Answer,Correctness (1-5),Fluency (1-5),Relevance (1-5),Coverage(1-5),Short review
0,Criminal Law,no,What is the principle of legality according to...,The principle of legality according to Section...,0,0,0,0,Parse error: Unterminated string starting at: ...
1,Criminal Law,yes,Explain the difference between intentional and...,The difference between intentional and neglige...,0,0,0,0,Error: 429 RESOURCE_EXHAUSTED. {'error': {'cod...
2,Civil Law,no,"According to the Civil Code, when does a perso...",The Civil Code does not explicitly state when ...,0,0,0,0,Error: 429 RESOURCE_EXHAUSTED. {'error': {'cod...
3,Civil Law,yes,Distinguish between legal capacity and capacit...,The Hungarian Civil Code distinguishes between...,0,0,0,0,Error: 429 RESOURCE_EXHAUSTED. {'error': {'cod...
4,Public Procurement,no,What language requirements apply to public pro...,The language requirements for public procureme...,0,0,0,0,Parse error: Expecting value: line 2 column 17...
5,Public Procurement,yes,How is 'innovation' defined in the Public Proc...,The Public Procurement Act does not provide a ...,0,0,0,0,Parse error: Expecting value: line 2 column 17...
6,Labor Law,no,"According to Section 2 of the Labor Code, what...",The Hungarian Labor Code covers specific group...,0,0,0,0,Parse error: Expecting value: line 2 column 17...
7,Labor Law,yes,Under what conditions can an employer restrict...,The Hungarian Labor Code allows employers to r...,0,0,0,0,Parse error: Unterminated string starting at: ...
8,Fundamental Law,yes,Describe the core principles of Hungary's stat...,The core principles of Hungary's statehood are...,0,0,0,0,Parse error: Unterminated string starting at: ...
9,Legal Principles,yes,Compare how the principle of 'good faith and f...,The principle of 'good faith and fair dealing'...,0,0,0,0,Error: 429 RESOURCE_EXHAUSTED. {'error': {'cod...
