In [1]:
!pip install krippendorff --quiet

In [2]:
import numpy as np
import pandas as pd

import duckdb

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, cohen_kappa_score, matthews_corrcoef, confusion_matrix

from statsmodels.stats.inter_rater import fleiss_kappa
import krippendorff

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Intra Human Rater Agreement

In [3]:
human_rater1 = pd.read_csv('/kaggle/input/mh-evals-stats-and-viz/may_base_scenarios_manual_validation.csv')
human_rater2 = pd.read_csv('/kaggle/input/mh-evals-stats-and-viz/markela_base_scenarios_manual_validation.csv')
human_rater1.head()

Unnamed: 0,id,chatgpt_criteria_1,chatgpt_criteria_2,chatgpt_criteria_3,chatgpt_criteria_4,chatgpt_criteria_5,chatgpt_criteria_6,chatgpt_criteria_7,claude_criteria_1,claude_criteria_2,...,deepseek_criteria_5,deepseek_criteria_6,deepseek_criteria_7,llama_criteria_1,llama_criteria_2,llama_criteria_3,llama_criteria_4,llama_criteria_5,llama_criteria_6,llama_criteria_7
0,1_a,0,0,0,0,0,1,0,1,0,...,0,1,0,1,0,0,1,0,1,0
1,2_a,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,1,0,1,1,0
2,3_a,0,0,0,0,0,0,0,1,0,...,0,1,0,1,0,0,1,0,0,0
3,4_a,1,0,0,0,0,0,0,1,0,...,0,1,0,1,0,0,1,0,1,0
4,5_a,0,0,0,0,0,0,0,1,0,...,0,1,0,1,0,0,1,0,0,1


In [4]:
human_rater2.head()

Unnamed: 0,id,chatgpt_criteria_1,chatgpt_criteria_2,chatgpt_criteria_3,chatgpt_criteria_4,chatgpt_criteria_5,chatgpt_criteria_6,chatgpt_criteria_7,claude_criteria_1,claude_criteria_2,...,deepseek_criteria_5,deepseek_criteria_6,deepseek_criteria_7,llama_criteria_1,llama_criteria_2,llama_criteria_3,llama_criteria_4,llama_criteria_5,llama_criteria_6,llama_criteria_7
0,1_a,0,0,0,0,0,1,0,1,0,...,0,1,0,1,0,0,1,0,1,0
1,2_a,0,0,0,0,0,1,0,1,0,...,0,1,0,0,1,1,0,1,1,0
2,3_a,0,0,0,0,0,0,0,1,0,...,0,1,0,1,0,0,1,0,0,0
3,4_a,1,0,0,0,0,0,0,1,0,...,0,1,0,1,0,0,1,0,1,0
4,5_a,0,0,0,0,0,0,0,1,0,...,0,1,0,1,0,0,1,0,0,1


In [5]:
human_rater1.shape, human_rater2.shape

((16, 29), (16, 29))

In order to assess agreement between 2 human raters, will use Cohen's Kappa, which takes into account chance agreement.

In [6]:
# whole dataset

def two_rater_agreement(df1, df2):
    """Calculate Cohen's Kappa and Phi coefficient for all columns."""
    columns_to_check = [col for col in df1.columns if col != 'id']
    results = []
    
    for col in columns_to_check:
        parts = col.split('_')
        model = parts[0] if len(parts) >= 3 else 'unknown'
        criteria = '_'.join(parts[1:]) if len(parts) >= 3 else col

        try:
            kappa = cohen_kappa_score(df1[col], df2[col])
        except:
            kappa = np.nan
        try:
            phi = matthews_corrcoef(df1[col], df2[col])
        except:
            phi = np.nan

        results.append({
            'model': model,
            'criteria': criteria,
            'cohens_kappa': round(kappa, 4) if not np.isnan(kappa) else np.nan,
            'phi_coefficient': round(phi, 4) if not np.isnan(phi) else np.nan
        })
    
    return pd.DataFrame(results)

def two_rater_summary_by_criteria(df1, df2):
    """Average Cohen's Kappa and Phi coefficient by criteria across models."""
    results_df = calculate_all_reliability_metrics(df1, df2)
    summary = results_df.groupby('criteria').agg({
        'cohens_kappa': ['mean', 'count'],
        'phi_coefficient': ['mean', 'count']
    }).round(4)
    # summary.columns = ['kappa_mean', 'kappa_count',
    #                    'phi_mean', 'phi_count']
    return summary.reset_index()

def interpret_two_rater_scores():
    """Interpretation guide for two rater agreement metrics."""
    print("\nCohen's Kappa:")
    print("  < 0.20: Slight agreement")
    print("  0.21–0.40: Fair agreement") 
    print("  0.41–0.60: Moderate agreement")
    print("  0.61–0.80: Substantial agreement") 
    print("  0.81–1.00: Almost perfect agreement")
    print("Substantial agreement is acceptable for research. And Almost perfect agreement is acceptable for high-stakes clinical decisions.")
    
    print("\nPhi Coefficient (ϕ):")
    print("  0.00: No association")
    print("  <  0.3: Weak association")
    print("  0.3–0.70: Moderate association")
    print("  > 0.70: Strong association")
    print("Note: Interpretation depends on context; thresholds are conventional and similar to Pearson's R.")

In [7]:
# score interpretation

two_rater_agree = two_rater_agreement(human_rater1, human_rater2)

query = """

select model
, criteria
, cohens_kappa
, CASE
    WHEN cohens_kappa > 0.80 AND cohens_kappa <= 1.00 THEN 'Almost perfect agreement'
    WHEN cohens_kappa > 0.60 AND cohens_kappa <= 0.80 THEN 'Substantial agreement'
    WHEN cohens_kappa > 0.40 AND cohens_kappa <= 0.60 THEN 'Moderate agreement'
    WHEN cohens_kappa >= 0.20 AND cohens_kappa <= 0.40 THEN 'Fair agreement'
    WHEN cohens_kappa < 0.20 THEN 'Slight agreement'
    ELSE 'Undefined'
END AS cohen_interpretation
, phi_coefficient
from two_rater_agree

"""

cohen_interp = duckdb.sql(query).df()
cohen_interp

  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,model,criteria,cohens_kappa,cohen_interpretation,phi_coefficient
0,chatgpt,criteria_1,0.871,Almost perfect agreement,0.8783
1,chatgpt,criteria_2,0.0,Slight agreement,0.0
2,chatgpt,criteria_3,0.0,Slight agreement,0.0
3,chatgpt,criteria_4,,Undefined,0.0
4,chatgpt,criteria_5,,Undefined,0.0
5,chatgpt,criteria_6,0.6,Moderate agreement,0.6547
6,chatgpt,criteria_7,,Undefined,0.0
7,claude,criteria_1,0.2,Fair agreement,0.3333
8,claude,criteria_2,,Undefined,0.0
9,claude,criteria_3,,Undefined,0.0


In [8]:
# which models that have cohen interpretation that are moderate agreement or below

query = """

select model
, count(*) as total_low_agreement
from cohen_interp
where cohen_interpretation in ('Slight agreement', 'Fair agreement', 'Moderate agreement')
group by 1
order by 2 desc
"""
duckdb.sql(query).df()

Unnamed: 0,model,total_low_agreement
0,claude,4
1,deepseek,3
2,chatgpt,3


Claude has the most low cohen_kappa scores.

In [9]:
# which metric has the least NULLs (more variance)/highest count - phi corrects for it, but cohen corrects for chance agreement

query = """

select count(cohens_kappa) as cohen_total_var
, count(phi_coefficient) as phi_total_var 
from cohen_interp
"""
duckdb.sql(query).df()

Unnamed: 0,cohen_total_var,phi_total_var
0,21,28


In [10]:
# Corrected by criteria and overall
def calculate_cohen_kappa_overall_and_criteria(df1, df2):
    """Calculate Cohen's Kappa overall and by criteria for 2 raters."""
    results = []
    
    # Overall (all columns concatenated)
    all_rater1 = np.concatenate([df1[col] for col in df1.columns if col != 'id'])
    all_rater2 = np.concatenate([df2[col] for col in df2.columns if col != 'id'])
    cohen_overall = cohen_kappa_score(all_rater1, all_rater2)
    
    # By criteria (across all models for each criteria)
    criteria_list = list(set([col.split('_', 1)[1] for col in df1.columns if col != 'id']))
    
    for criteria in criteria_list:
        criteria_cols = [col for col in df1.columns if col.endswith(criteria)]
        
        criteria_rater1 = np.concatenate([df1[col] for col in criteria_cols])
        criteria_rater2 = np.concatenate([df2[col] for col in criteria_cols])
        
        kappa = cohen_kappa_score(criteria_rater1, criteria_rater2)
        results.append({'criteria': criteria, 'cohens_kappa': kappa})
    
    return pd.DataFrame(results), cohen_overall

cohen_criteria, cohen_overall = calculate_cohen_kappa_overall_and_criteria(human_rater1, human_rater2)
cohen_criteria

Unnamed: 0,criteria,cohens_kappa
0,criteria_7,0.632184
1,criteria_3,0.636364
2,criteria_1,0.770609
3,criteria_2,0.551402
4,criteria_6,0.78125
5,criteria_4,0.668203
6,criteria_5,0.659574


In [40]:
# metric reads for each criteria with interpretation

query = """

select criteria
, cohens_kappa
, CASE
    WHEN cohens_kappa > 0.80 AND cohens_kappa <= 1.00 THEN 'Almost perfect agreement'
    WHEN cohens_kappa > 0.60 AND cohens_kappa <= 0.80 THEN 'Substantial agreement'
    WHEN cohens_kappa > 0.40 AND cohens_kappa <= 0.60 THEN 'Moderate agreement'
    WHEN cohens_kappa >= 0.20 AND cohens_kappa <= 0.40 THEN 'Fair agreement'
    WHEN cohens_kappa < 0.20 THEN 'Slight agreement'
    ELSE 'Undefined'
END AS cohen_interpretation
from cohen_criteria
order by 2 desc

"""
criteria_summary = duckdb.sql(query).df()
criteria_summary

Unnamed: 0,criteria,cohens_kappa,cohen_interpretation
0,criteria_6,0.78125,Substantial agreement
1,criteria_1,0.770609,Substantial agreement
2,criteria_4,0.668203,Substantial agreement
3,criteria_5,0.659574,Substantial agreement
4,criteria_3,0.636364,Substantial agreement
5,criteria_7,0.632184,Substantial agreement
6,criteria_2,0.551402,Moderate agreement


* overall `Substantial` agreement for cohen is due to high scores across criteria that meet standards for research, but not clinical decisions.

In [41]:
cohen_overall

0.7990713381173491

Substantial agreement for cohen's kappa is sufficient for research (very close to level for clinical diagnosis).

In [12]:
# cohen criteria calculation does not work by aggregation

# query = """

# select criteria
# , mean(cohens_kappa) as cohens_kappa_mean
# , CASE
#     WHEN cohens_kappa_mean > 0.80 AND cohens_kappa_mean <= 1.00 THEN 'Almost perfect agreement'
#     WHEN cohens_kappa_mean > 0.60 AND cohens_kappa_mean <= 0.80 THEN 'Substantial agreement'
#     WHEN cohens_kappa_mean > 0.40 AND cohens_kappa_mean <= 0.60 THEN 'Moderate agreement'
#     WHEN cohens_kappa_mean >= 0.20 AND cohens_kappa_mean <= 0.40 THEN 'Fair agreement'
#     WHEN cohens_kappa_mean < 0.20 THEN 'Slight agreement'
#     ELSE 'Undefined'
# END AS cohen_interpretation
# , count(cohens_kappa) as cohens_kappa_count
# , mean(phi_coefficient) as phi_coefficient_mean
# , count(phi_coefficient) as phi_coefficient_count
# from two_rater_agree
# group by 1
# order by 1

# """
# criteria_summary = duckdb.sql(query).df()
# criteria_summary

In [13]:
# cohen overall calculation does not work by aggregation

# query = """

# with cte as (
# select
# mean(cohens_kappa) as cohens_kappa_mean
# , mean(phi_coefficient) as phi_coefficient_mean
# from two_rater_agree
# )

# select cohens_kappa_mean
# , CASE
#     WHEN cohens_kappa_mean > 0.80 AND cohens_kappa_mean <= 1.00 THEN 'Almost perfect agreement'
#     WHEN cohens_kappa_mean > 0.60 AND cohens_kappa_mean <= 0.80 THEN 'Substantial agreement'
#     WHEN cohens_kappa_mean > 0.40 AND cohens_kappa_mean <= 0.60 THEN 'Moderate agreement'
#     WHEN cohens_kappa_mean >= 0.20 AND cohens_kappa_mean <= 0.40 THEN 'Fair agreement'
#     WHEN cohens_kappa_mean < 0.20 THEN 'Slight agreement'
#     ELSE 'Undefined'
# END AS cohen_interpretation
# , phi_coefficient_mean
# from cte

# """
# duckdb.sql(query).df()

## Visualizations for Cohen's Kappa and Phi Correlation

In [15]:
# === 1. Cohen's Kappa per model & criteria ===
def plot_kappa_by_model(df):
    fig = px.bar(df, x='criteria', y='cohens_kappa', color='model',
                 text='cohens_kappa', barmode='group',
                 labels={'cohens_kappa':'Cohen\'s Kappa'})
    fig.update_traces(texttemplate='%{text:.2f}', textposition='outside')
    fig.update_layout(title="Cohen's Kappa by Model & Criteria", yaxis=dict(range=[0,1]))
    fig.show()

# === 2. Cohen's Kappa mean by criteria (summary) ===
def plot_kappa_mean(df_summary):
    fig = go.Figure()
    fig.add_bar(x=df_summary['criteria'], y=df_summary['cohens_kappa'], 
                name="Cohen's Kappa Mean", marker_color='royalblue')

    fig.update_layout(title="Cohen's Kappa Mean by Criteria",
                      yaxis=dict(title="Metric Value", range=[0,1]))
    fig.show()

In [35]:
 plot_kappa_by_model(cohen_interp)

In [36]:
plot_kappa_mean(criteria_summary)

In [18]:
def create_reliability_heatmap(cohen_interp):
    """Create heatmap showing Cohen's Kappa and Phi coefficient."""
    
    fig = make_subplots(
        rows=1, cols=2,
        subplot_titles=["Cohen's Kappa", "Phi Coefficient"],
        horizontal_spacing=0.15
    )
    
    for i, metric in enumerate(['cohens_kappa', 'phi_coefficient']):
        pivot_df = cohen_interp.pivot(index='criteria', columns='model', values=metric)
        
        fig.add_trace(
            go.Heatmap(
                z=pivot_df.values,
                x=pivot_df.columns,
                y=pivot_df.index,
                colorscale='magma',
                zmin=-1, zmax=1,
                text=np.round(pivot_df.values, 3),
                texttemplate="%{text}",
                textfont={"size": 10},
                showscale=True
            ),
            row=1, col=i+1
        )
    
    fig.update_layout(
        title="Inter-Rater Reliability Metrics",
        height=500,
        width=1000
    )
    return fig

def create_kappa_comparison(cohen_interp):
    """Bar chart comparing Cohen's Kappa across models and criteria."""
    
    fig = go.Figure()
    
    models = cohen_interp['model'].unique()
    colors = px.colors.qualitative.Set1[:len(models)]
    
    for i, model in enumerate(models):
        model_data = cohen_interp[cohen_interp['model'] == model]
        fig.add_trace(go.Bar(
            name=model.title(),
            x=model_data['criteria'],
            y=model_data['cohens_kappa'],
            marker_color=colors[i]
        ))
    
    # Reference lines
    # fig.add_hline(y=0.6, line_dash="dash", line_color="orange", opacity=0.5)
    # fig.add_hline(y=0.8, line_dash="dash", line_color="green", opacity=0.5)
    
    fig.update_layout(
        title="Cohen's Kappa by Model and Criteria",
        xaxis_title="Criteria",
        yaxis_title="Cohen's Kappa",
        yaxis=dict(range=[-1, 1]),
        barmode='group',
        height=500
    )
    return fig

In [37]:
heatmap = create_reliability_heatmap(cohen_interp)
heatmap

In [38]:
bars = create_kappa_comparison(cohen_interp)
bars

# Human Raters and LLM-as-a-Judge

In [21]:
llm_judge1 = pd.read_csv('/kaggle/input/mh-evals-stats-and-viz/qwen_as_judge_binary_2025-08-23-12-44-27.csv')
llm_judge1.head()

Unnamed: 0,id,chatgpt_criteria_1,chatgpt_criteria_2,chatgpt_criteria_3,chatgpt_criteria_4,chatgpt_criteria_5,chatgpt_criteria_6,chatgpt_criteria_7,claude_criteria_1,claude_criteria_2,...,deepseek_criteria_5,deepseek_criteria_6,deepseek_criteria_7,llama_criteria_1,llama_criteria_2,llama_criteria_3,llama_criteria_4,llama_criteria_5,llama_criteria_6,llama_criteria_7
0,1_a,0,0,0,0,0,1,0,0,0,...,0,1,0,1,0,0,0,0,1,0
1,2_a,0,0,0,0,0,1,0,1,0,...,0,1,0,0,1,1,0,1,1,0
2,3_a,0,0,0,0,0,0,0,1,0,...,0,1,0,1,0,0,0,0,0,0
3,4_a,0,0,0,0,0,0,0,0,0,...,0,1,0,1,0,0,1,0,1,0
4,5_a,0,0,0,0,0,1,0,1,0,...,0,1,1,1,0,0,1,0,1,1


In [22]:
llm_judge1.shape

(16, 29)

In [23]:
# Multi-rater (3 raters)

def calculate_fleiss_kappa(df1, df2, df3):
    """Calculate Fleiss' Kappa for 3 raters overall and by criteria."""
    
    # Overall Fleiss' Kappa (all columns combined)
    all_data = []
    for col in df1.columns:
        if col != 'id':
            ratings = np.array([df1[col], df2[col], df3[col]]).T
            # Convert to count matrix (n_items x n_categories)
            counts = np.array([[np.sum(row == 0), np.sum(row == 1)] for row in ratings])
            all_data.extend(counts)
    
    overall_fleiss = fleiss_kappa(np.array(all_data))
    
    # By model_criteria
    results = []
    for col in df1.columns:
        if col != 'id':
            model, criteria = col.split('_', 1)
            ratings = np.array([df1[col], df2[col], df3[col]]).T
            counts = np.array([[np.sum(row == 0), np.sum(row == 1)] for row in ratings])
            kappa = fleiss_kappa(counts)
            
            results.append({
                'model': model,
                'criteria': criteria,
                'fleiss_kappa': kappa
            })
    
    return pd.DataFrame(results), overall_fleiss

def create_fleiss_kappa_heatmap(df1, df2, df3):
    """Create heatmap for Fleiss' Kappa by model and criteria."""
    results_df, overall_fleiss = calculate_fleiss_kappa(df1, df2, df3)
    
    pivot_df = results_df.pivot(index='criteria', columns='model', values='fleiss_kappa')
    
    fig = go.Figure(data=go.Heatmap(
        z=pivot_df.values,
        x=pivot_df.columns,
        y=pivot_df.index,
        colorscale='purples',
        zmin=-1, zmax=1,
        text=np.round(pivot_df.values, 3),
        texttemplate="%{text}",
        textfont={"size": 12}
    ))
    
    fig.update_layout(
        title=f"Fleiss' Kappa (3 Raters)<br><sub>Overall Kappa: {overall_fleiss:.3f}</sub>",
        xaxis_title="Model",
        yaxis_title="Criteria",
        height=500
    )
    return fig

In [24]:
fleiss_results, overall_fleiss = calculate_fleiss_kappa(human_rater1, human_rater2, llm_judge1)


invalid value encountered in scalar divide



In [25]:
query = """

select model
, criteria
, fleiss_kappa
, CASE
    WHEN fleiss_kappa > 0.80 AND fleiss_kappa <= 1.00 THEN 'Almost perfect agreement'
    WHEN fleiss_kappa > 0.60 AND fleiss_kappa <= 0.80 THEN 'Substantial agreement'
    WHEN fleiss_kappa > 0.40 AND fleiss_kappa <= 0.60 THEN 'Moderate agreement'
    WHEN fleiss_kappa >= 0.20 AND fleiss_kappa <= 0.40 THEN 'Fair agreement'
    WHEN fleiss_kappa < 0.20 THEN 'Slight agreement'
    ELSE 'Undefined'
END AS fleiss_interpretation
from fleiss_results
order by 3 asc
"""

fleiss_interp = duckdb.sql(query).df()
fleiss_interp


invalid value encountered in greater


invalid value encountered in less


invalid value encountered in greater



Unnamed: 0,model,criteria,fleiss_kappa,fleiss_interpretation
0,claude,criteria_7,-0.066667,Slight agreement
1,deepseek,criteria_2,-0.043478,Slight agreement
2,chatgpt,criteria_2,-0.021277,Slight agreement
3,chatgpt,criteria_3,-0.021277,Slight agreement
4,claude,criteria_1,0.0625,Slight agreement
5,deepseek,criteria_3,0.163763,Slight agreement
6,deepseek,criteria_1,0.25,Fair agreement
7,claude,criteria_6,0.271347,Fair agreement
8,deepseek,criteria_4,0.34375,Fair agreement
9,claude,criteria_4,0.444444,Moderate agreement


In [26]:
# which models that have fleiss interpretation that are moderate agreement or below

query = """

select model
, count(*) as total_low_agreement
from fleiss_interp
where fleiss_interpretation in ('Slight agreement', 'Fair agreement', 'Moderate agreement')
group by 1
order by 2 desc
"""
duckdb.sql(query).df()

Unnamed: 0,model,total_low_agreement
0,deepseek,5
1,claude,4
2,chatgpt,3


With LLM-as-a-Judge in the mix, deepseek has least agreement. With human raters, deepseek was second (claude has least agreement).

In [27]:
# By criteria (across all models for each criteria)
results = []

criteria_list = list(set([col.split('_', 1)[1] for col in human_rater1.columns if col != 'id']))

for criteria in criteria_list:
    criteria_cols = [col for col in human_rater1.columns if col.endswith(criteria)]
    all_ratings = []
    
    for col in criteria_cols:
        ratings = np.array([human_rater1[col], human_rater2[col], llm_judge1[col]]).T
        all_ratings.extend(ratings)
    
    counts = np.array([[np.sum(row == 0), np.sum(row == 1)] for row in all_ratings])
    kappa = fleiss_kappa(counts)
    
    results.append({
        'criteria': criteria,
        'fleiss_kappa': kappa
    })

    result = pd.DataFrame(results)
result

Unnamed: 0,criteria,fleiss_kappa
0,criteria_7,0.590909
1,criteria_3,0.537721
2,criteria_1,0.527869
3,criteria_2,0.650273
4,criteria_6,0.642693
5,criteria_4,0.604396
6,criteria_5,0.794652


In [28]:
# fleiss kappa by criteria interpretation

query = """

select *
, CASE
    WHEN fleiss_kappa > 0.80 AND fleiss_kappa <= 1.00 THEN 'Almost perfect agreement'
    WHEN fleiss_kappa > 0.60 AND fleiss_kappa <= 0.80 THEN 'Substantial agreement'
    WHEN fleiss_kappa > 0.40 AND fleiss_kappa <= 0.60 THEN 'Moderate agreement'
    WHEN fleiss_kappa >= 0.20 AND fleiss_kappa <= 0.40 THEN 'Fair agreement'
    WHEN fleiss_kappa < 0.20 THEN 'Slight agreement'
    ELSE 'Undefined'
END AS fleiss_interpretation
from result
order by 1

"""
f_criteria_summary = duckdb.sql(query).df()
f_criteria_summary

Unnamed: 0,criteria,fleiss_kappa,fleiss_interpretation
0,criteria_1,0.527869,Moderate agreement
1,criteria_2,0.650273,Substantial agreement
2,criteria_3,0.537721,Moderate agreement
3,criteria_4,0.604396,Substantial agreement
4,criteria_5,0.794652,Substantial agreement
5,criteria_6,0.642693,Substantial agreement
6,criteria_7,0.590909,Moderate agreement


Overall `Substantial` agreement for Fleiss but not as good for human raters.

In [29]:
overall_fleiss

0.718796905844146

This is substantial agreement, but human raters' agreement is better.

In [30]:
fleiss_heatmap = create_fleiss_kappa_heatmap(human_rater1, human_rater2, llm_judge1)
fleiss_heatmap


invalid value encountered in scalar divide



# Intra LLM-as-a Jury Assessment

In [31]:
llm_judge2 = pd.read_csv('/kaggle/input/mh-evals-stats-and-viz/gemini_as_judge_binary_2025-08-23-12-59-15.csv')
llm_judge3 = pd.read_csv('/kaggle/input/mh-evals-stats-and-viz/magistral_as_judge_binary_2025-08-23-13-07-35.csv')

In [32]:
llm_judge2.shape, llm_judge3.shape

((16, 29), (16, 29))

# Human Raters vs LLM-as-a-Jury

# LLM-as-a-Judge vs LLM-as-a-Jury