# AI vs Human Evaluation

In [2]:
import sys
sys.path.append('../../')
from src.clients.postgres_client import PostgresClient

db = PostgresClient()
evaluations = db.get_evaluations()
print(evaluations)


[{'id': '1d40f1b8-05e3-484c-9215-f60bdf691848', 'call_id': '8f18f353-4b4b-4f98-8b02-d5d9cc5a62e4', 'evaluator_type': 'agentic', 'evaluator_version': '0.1', 'overall_score': 1, 'category_scores': {'empathy_and_tone': {'score': 2, 'evidence': '"53.0 Okay. Wonderful."', 'explanation': 'Tone is neutral; minimal expressions of empathy are present.'}, 'problem_resolution': {'score': 1, 'evidence': '"458.0 I have it downloaded already."', 'explanation': "The call does not resolve the customer's issue or provide a clear solution."}, 'call_closure_quality': {'score': 2, 'evidence': '"687.0 Thank you so much."', 'explanation': 'Closure is brief but lacks a proper summary or confirmation.'}, 'compliance_statements': {'score': 1, 'evidence': 'Not present', 'explanation': 'No compliance or regulatory statements were made.'}, 'customer_satisfaction': {'score': 1, 'evidence': 'Not present', 'explanation': 'No explicit check for satisfaction; customer responses are minimal.'}, 'greeting_and_introducti

In [2]:
from typing import Dict, Any
import json

def compare_category_scores(human_output: Dict[str, Any], raw_output: Dict[str, Any]) -> Dict[str, Any]:
    """
    Compare category scores between human_output and raw_output.
    
    Args:
        human_output: Dictionary containing human evaluation output.
        raw_output: Dictionary containing raw (automated) evaluation output.
    
    Returns:
        Dictionary summarizing differences in category scores.
    """
    # Extract category scores
    human_scores = human_output.get("category_scores", {})
    raw_scores = raw_output.get("category_scores", {})
    
    # Initialize result
    comparison = {
        "matching_categories": [],
        "similar_categories": [],
        "differing_categories": [],
        "missing_in_human": [],
        "missing_in_raw": []
    }
    
    # Get all unique category names
    all_categories = set(human_scores.keys()).union(set(raw_scores.keys()))
    
    for category in all_categories:
        human_data = human_scores.get(category)
        raw_data = raw_scores.get(category)
        
        # Check for missing categories
        if human_data is None:
            comparison["missing_in_human"].append(category)
            continue
        if raw_data is None:
            comparison["missing_in_raw"].append(category)
            continue
        
        # Compare scores
        human_score = human_data.get("score")
        raw_score = raw_data.get("score")
        
        if human_score == raw_score:
            comparison["matching_categories"].append({
                "category": category,
                "score": human_score
            })
        elif abs(human_score - raw_score) >= 2:
            comparison["differing_categories"].append({
                "category": category,
                "human_score": human_score,
                "human_explanation": human_data.get("explanation"),
                "human_evidence": human_data.get("evidence"),
                "raw_score": raw_score,
                "raw_explanation": raw_data.get("explanation"),
                "raw_evidence": raw_data.get("evidence")
            })
        else:
            comparison["similar_categories"].append({
                "category": category,
                "human_score": human_score,
                "human_explanation": human_data.get("explanation"),
                "human_evidence": human_data.get("evidence"),
                "raw_score": raw_score,
                "raw_explanation": raw_data.get("explanation"),
                "raw_evidence": raw_data.get("evidence")
            })
    
    return comparison

for evaluation in evaluations:
    # Compare category scores
    result = compare_category_scores(evaluation["human_output"], evaluation["raw_output"])

    # Pretty print the result
    print(json.dumps(result, indent=2))

{
  "matching_categories": [
    {
      "category": "problem_resolution",
      "score": 1
    },
    {
      "category": "call_closure_quality",
      "score": 2
    },
    {
      "category": "empathy_and_tone",
      "score": 2
    },
    {
      "category": "compliance_statements",
      "score": 1
    }
  ],
  "similar_categories": [
    {
      "category": "greeting_and_introduction",
      "human_score": 3,
      "human_explanation": "Agent introduces themselves and purpose but lacks warmth and clarity.",
      "human_evidence": "00:00:08Hello. My name is Steven. I'm calling you from finance department of ______, but online company.",
      "raw_score": 2,
      "raw_explanation": "Agent repeats greetings but does not clearly introduce themselves or the purpose.",
      "raw_evidence": "\"4.0 Hello, my name is Rick and I'm here.\""
    },
    {
      "category": "product_information_accuracy",
      "human_score": 2,
      "human_explanation": "Instructions for AnyDesk and app 

In [3]:
from typing import Dict, Any, List
import numpy as np
import pandas as pd
from scipy.stats import spearmanr
from sklearn.metrics import cohen_kappa_score


def extract_scores(evaluations: List[Dict[str, Any]]) -> pd.DataFrame:
    """
    Flatten human vs AI scores into a dataframe.
    """
    rows = []

    for ev in evaluations:
        human_scores = ev["human_output"]["category_scores"]
        ai_scores = ev["raw_output"]["category_scores"]

        for category, h_data in human_scores.items():
            if category not in ai_scores:
                continue

            rows.append({
                "call_id": ev.get("call_id"),
                "category": category,
                "human_score": h_data["score"],
                "ai_score": ai_scores[category]["score"]
            })

    return pd.DataFrame(rows)


def calculate_alignment_metrics(df: pd.DataFrame) -> Dict[str, Any]:
    """
    Calculate alignment metrics overall and per category.
    """
    report = {}

    # --- Overall metrics ---
    abs_diff = np.abs(df["human_score"] - df["ai_score"])

    overall = {
        "mean_absolute_error": abs_diff.mean(),
        "accuracy_within_1": (abs_diff <= 1).mean(),
        "accuracy_exact_match": (abs_diff == 0).mean(),
        "spearman_correlation": spearmanr(
            df["human_score"], df["ai_score"]
        ).correlation,
        "cohen_kappa": cohen_kappa_score(
            df["human_score"], df["ai_score"], weights="quadratic"
        )
    }

    report["overall"] = overall

    # --- Per-category metrics ---
    per_category = {}

    for category, group in df.groupby("category"):
        diff = np.abs(group["human_score"] - group["ai_score"])

        per_category[category] = {
            "mean_absolute_error": diff.mean(),
            "accuracy_within_1": (diff <= 1).mean(),
            "spearman_correlation": spearmanr(
                group["human_score"], group["ai_score"]
            ).correlation if len(group) > 1 else None,
            "cohen_kappa": cohen_kappa_score(
                group["human_score"], group["ai_score"], weights="quadratic"
            ) if len(group) > 1 else None,
            "num_samples": len(group)
        }

    report["per_category"] = per_category

    return report


In [4]:
df = extract_scores(evaluations)
report = calculate_alignment_metrics(df)

import json
print(json.dumps(report, indent=2))


{
  "overall": {
    "mean_absolute_error": 0.9285714285714286,
    "accuracy_within_1": 0.7857142857142857,
    "accuracy_exact_match": 0.2857142857142857,
    "spearman_correlation": 0.4970674233862171,
    "cohen_kappa": 0.23121387283236972
  },
  "per_category": {
    "call_closure_quality": {
      "mean_absolute_error": 1.0,
      "accuracy_within_1": 0.5,
      "spearman_correlation": -0.9999999999999999,
      "cohen_kappa": -0.33333333333333326,
      "num_samples": 2
    },
    "compliance_statements": {
      "mean_absolute_error": 1.0,
      "accuracy_within_1": 0.5,
      "spearman_correlation": NaN,
      "cohen_kappa": 0.0,
      "num_samples": 2
    },
    "customer_satisfaction": {
      "mean_absolute_error": 1.0,
      "accuracy_within_1": 1.0,
      "spearman_correlation": 0.9999999999999999,
      "cohen_kappa": 0.33333333333333337,
      "num_samples": 2
    },
    "empathy_and_tone": {
      "mean_absolute_error": 0.5,
      "accuracy_within_1": 1.0,
      "spear

  "spearman_correlation": spearmanr(
  "spearman_correlation": spearmanr(
  "spearman_correlation": spearmanr(


# Transcription accuracy

In [1]:
import sys
sys.path.append('../../')
from src.clients.postgres_client import PostgresClient

db = PostgresClient()
transcripts = db.get_transcripts()
print(transcripts)


[{'id': '864e752d-6697-435e-b145-8ea20ffdc93e', 'call_id': '9b065473-bc1d-46bc-a1ae-15b09afd2a3b', 'transcript_text': " I think it was not even released for money necessarily for atomic content at different organizations. Hello? Here's a man. Hello? Here's a man. Yes, Hi, sir. And hello, I called the... That's a lot to go. Yeah, sorry. We just came back. Sorry about that. Oh, really? I don't know. Yeah, sorry, we taste your corn and then it just connects. Sorry about that. Oh really? I don't have a sword now. I got the chalacious now. Okay. Well, at least we got each other right here right now. Listen, you're dead. As I was told by my finance team, they looked into this. Now, I personally do with atomic, but I should be honest, I never cashed out. I stocked by a monster all in atomic. Can I remove anything out of atomic? Right? Let's put a disc lane. So my knowledge, personal knowledge, maybe limited. But I'm checking everything up with my finance team and they're telling me that the t

In [14]:
# jiwer library
import jiwer
import re

def normalize_text(text: str) -> str:
    text = text.lower()
    text = re.sub(r"\[.*?\]", "", text)          # remove annotations
    text = re.sub(r"\d+:\d+:\d+", "", text)      # remove timestamps
    text = re.sub(r"[^a-z0-9\s']", "", text)     # remove punctuation
    text = re.sub(r"\s+", " ", text).strip()
    return text

transformation = jiwer.Compose([
    jiwer.ToLowerCase(),
    jiwer.RemovePunctuation(),
    jiwer.RemoveMultipleSpaces(),
    jiwer.Strip(),
    jiwer.ReduceToListOfListOfWords(word_delimiter=" ")
])

for transcript in transcripts:
    print(jiwer.wer(normalize_text(transcript['human_transcript']), normalize_text(transcript['timestamped_text']), reference_transform=transformation,
    hypothesis_transform=transformation))


0.3017312448474856
0.4485887096774194
