In [1]:
!pip install krippendorff --quiet

In [2]:
import numpy as np
import pandas as pd

import duckdb

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, cohen_kappa_score, matthews_corrcoef, confusion_matrix

from statsmodels.stats.inter_rater import fleiss_kappa
import krippendorff

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Inter Human Rater Agreement

In [3]:
human_rater1 = pd.read_csv('/kaggle/input/mh-evals-stats-and-viz/may_base_scenarios_manual_validation.csv')
human_rater2 = pd.read_csv('/kaggle/input/mh-evals-stats-and-viz/markela_base_scenarios_manual_validation.csv')
human_rater1.head()

Unnamed: 0,id,chatgpt_criteria_1,chatgpt_criteria_2,chatgpt_criteria_3,chatgpt_criteria_4,chatgpt_criteria_5,chatgpt_criteria_6,chatgpt_criteria_7,claude_criteria_1,claude_criteria_2,...,deepseek_criteria_5,deepseek_criteria_6,deepseek_criteria_7,llama_criteria_1,llama_criteria_2,llama_criteria_3,llama_criteria_4,llama_criteria_5,llama_criteria_6,llama_criteria_7
0,1_a,0,0,0,0,0,1,0,1,0,...,0,1,0,1,0,0,1,0,1,0
1,2_a,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,1,0,1,1,0
2,3_a,0,0,0,0,0,0,0,1,0,...,0,1,0,1,0,0,1,0,0,0
3,4_a,1,0,0,0,0,0,0,1,0,...,0,1,0,1,0,0,1,0,1,0
4,5_a,0,0,0,0,0,0,0,1,0,...,0,1,0,1,0,0,1,0,0,1


In [4]:
human_rater2.head()

Unnamed: 0,id,chatgpt_criteria_1,chatgpt_criteria_2,chatgpt_criteria_3,chatgpt_criteria_4,chatgpt_criteria_5,chatgpt_criteria_6,chatgpt_criteria_7,claude_criteria_1,claude_criteria_2,...,deepseek_criteria_5,deepseek_criteria_6,deepseek_criteria_7,llama_criteria_1,llama_criteria_2,llama_criteria_3,llama_criteria_4,llama_criteria_5,llama_criteria_6,llama_criteria_7
0,1_a,0,0,0,0,0,1,0,1,0,...,0,1,0,1,0,0,1,0,1,0
1,2_a,0,0,0,0,0,1,0,1,0,...,0,1,0,0,1,1,0,1,1,0
2,3_a,0,0,0,0,0,0,0,1,0,...,0,1,0,1,0,0,1,0,0,0
3,4_a,1,0,0,0,0,0,0,1,0,...,0,1,0,1,0,0,1,0,1,0
4,5_a,0,0,0,0,0,0,0,1,0,...,0,1,0,1,0,0,1,0,0,1


In [5]:
human_rater1.shape, human_rater2.shape

((16, 29), (16, 29))

In order to assess agreement between 2 human raters, will use Cohen's Kappa, which takes into account chance agreement.

In [6]:
# whole dataset

def two_rater_agreement(df1, df2):
    """Calculate Cohen's Kappa and Phi coefficient for all columns."""
    columns_to_check = [col for col in df1.columns if col != 'id']
    results = []
    
    for col in columns_to_check:
        parts = col.split('_')
        model = parts[0] if len(parts) >= 3 else 'unknown'
        criteria = '_'.join(parts[1:]) if len(parts) >= 3 else col

        try:
            kappa = cohen_kappa_score(df1[col], df2[col])
        except:
            kappa = np.nan
        try:
            phi = matthews_corrcoef(df1[col], df2[col])
        except:
            phi = np.nan

        results.append({
            'model': model,
            'criteria': criteria,
            'cohens_kappa': round(kappa, 4) if not np.isnan(kappa) else np.nan,
            'phi_coefficient': round(phi, 4) if not np.isnan(phi) else np.nan
        })
    
    return pd.DataFrame(results)

def two_rater_summary_by_criteria(df1, df2):
    """Average Cohen's Kappa and Phi coefficient by criteria across models."""
    results_df = calculate_all_reliability_metrics(df1, df2)
    summary = results_df.groupby('criteria').agg({
        'cohens_kappa': ['mean', 'count'],
        'phi_coefficient': ['mean', 'count']
    }).round(4)
    # summary.columns = ['kappa_mean', 'kappa_count',
    #                    'phi_mean', 'phi_count']
    return summary.reset_index()

def interpret_two_rater_scores():
    """Interpretation guide for two rater agreement metrics."""
    print("\nCohen's Kappa:")
    print("  < 0.20: Slight agreement")
    print("  0.21–0.40: Fair agreement") 
    print("  0.41–0.60: Moderate agreement")
    print("  0.61–0.80: Substantial agreement") 
    print("  0.81–1.00: Almost perfect agreement")
    print("Substantial agreement is acceptable for research. And Almost perfect agreement is acceptable for high-stakes clinical decisions.")
    
    print("\nPhi Coefficient (ϕ):")
    print("  0.00: No association")
    print("  <  0.3: Weak association")
    print("  0.3–0.70: Moderate association")
    print("  > 0.70: Strong association")
    print("Note: Interpretation depends on context; thresholds are conventional and similar to Pearson's R.")

In [7]:
# score interpretation

two_rater_agree = two_rater_agreement(human_rater1, human_rater2)

query = """

select model
, criteria
, cohens_kappa
, CASE
    WHEN cohens_kappa > 0.80 AND cohens_kappa <= 1.00 THEN 'Almost perfect agreement'
    WHEN cohens_kappa > 0.60 AND cohens_kappa <= 0.80 THEN 'Substantial agreement'
    WHEN cohens_kappa > 0.40 AND cohens_kappa <= 0.60 THEN 'Moderate agreement'
    WHEN cohens_kappa >= 0.20 AND cohens_kappa <= 0.40 THEN 'Fair agreement'
    WHEN cohens_kappa < 0.20 THEN 'Slight agreement'
    ELSE 'Undefined'
END AS cohen_interpretation
, phi_coefficient
from two_rater_agree

"""

cohen_interp = duckdb.sql(query).df()
cohen_interp

  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,model,criteria,cohens_kappa,cohen_interpretation,phi_coefficient
0,chatgpt,criteria_1,0.871,Almost perfect agreement,0.8783
1,chatgpt,criteria_2,0.0,Slight agreement,0.0
2,chatgpt,criteria_3,0.0,Slight agreement,0.0
3,chatgpt,criteria_4,,Undefined,0.0
4,chatgpt,criteria_5,,Undefined,0.0
5,chatgpt,criteria_6,0.6,Moderate agreement,0.6547
6,chatgpt,criteria_7,,Undefined,0.0
7,claude,criteria_1,0.2,Fair agreement,0.3333
8,claude,criteria_2,,Undefined,0.0
9,claude,criteria_3,,Undefined,0.0


In [8]:
# which models that have cohen interpretation that are moderate agreement or below

query = """

select model
, count(*) as total_low_agreement
from cohen_interp
where cohen_interpretation in ('Slight agreement', 'Fair agreement', 'Moderate agreement')
group by 1
order by 2 desc
"""
duckdb.sql(query).df()

Unnamed: 0,model,total_low_agreement
0,claude,4
1,deepseek,3
2,chatgpt,3


Claude has the most low cohen_kappa scores.

In [9]:
# which metric has the least NULLs (more variance)/highest count - phi corrects for it, but cohen corrects for chance agreement

query = """

select count(cohens_kappa) as cohen_total_var
, count(phi_coefficient) as phi_total_var 
from cohen_interp
"""
duckdb.sql(query).df()

Unnamed: 0,cohen_total_var,phi_total_var
0,21,28


In [10]:
# Corrected by criteria and overall
def calculate_cohen_kappa_overall_and_criteria(df1, df2):
    """Calculate Cohen's Kappa overall and by criteria for 2 raters."""
    results = []
    
    # Overall (all columns concatenated)
    all_rater1 = np.concatenate([df1[col] for col in df1.columns if col != 'id'])
    all_rater2 = np.concatenate([df2[col] for col in df2.columns if col != 'id'])
    cohen_overall = cohen_kappa_score(all_rater1, all_rater2)
    
    # By criteria (across all models for each criteria)
    criteria_list = list(set([col.split('_', 1)[1] for col in df1.columns if col != 'id']))
    
    for criteria in criteria_list:
        criteria_cols = [col for col in df1.columns if col.endswith(criteria)]
        
        criteria_rater1 = np.concatenate([df1[col] for col in criteria_cols])
        criteria_rater2 = np.concatenate([df2[col] for col in criteria_cols])
        
        kappa = cohen_kappa_score(criteria_rater1, criteria_rater2)
        results.append({'criteria': criteria, 'cohens_kappa': kappa})
    
    return pd.DataFrame(results), cohen_overall

cohen_criteria, cohen_overall = calculate_cohen_kappa_overall_and_criteria(human_rater1, human_rater2)
cohen_criteria

Unnamed: 0,criteria,cohens_kappa
0,criteria_7,0.632184
1,criteria_3,0.636364
2,criteria_1,0.770609
3,criteria_2,0.551402
4,criteria_6,0.78125
5,criteria_4,0.668203
6,criteria_5,0.659574


In [40]:
# metric reads for each criteria with interpretation

query = """

select criteria
, cohens_kappa
, CASE
    WHEN cohens_kappa > 0.80 AND cohens_kappa <= 1.00 THEN 'Almost perfect agreement'
    WHEN cohens_kappa > 0.60 AND cohens_kappa <= 0.80 THEN 'Substantial agreement'
    WHEN cohens_kappa > 0.40 AND cohens_kappa <= 0.60 THEN 'Moderate agreement'
    WHEN cohens_kappa >= 0.20 AND cohens_kappa <= 0.40 THEN 'Fair agreement'
    WHEN cohens_kappa < 0.20 THEN 'Slight agreement'
    ELSE 'Undefined'
END AS cohen_interpretation
from cohen_criteria
order by 2 desc

"""
criteria_summary = duckdb.sql(query).df()
criteria_summary

Unnamed: 0,criteria,cohens_kappa,cohen_interpretation
0,criteria_6,0.78125,Substantial agreement
1,criteria_1,0.770609,Substantial agreement
2,criteria_4,0.668203,Substantial agreement
3,criteria_5,0.659574,Substantial agreement
4,criteria_3,0.636364,Substantial agreement
5,criteria_7,0.632184,Substantial agreement
6,criteria_2,0.551402,Moderate agreement


In [41]:
cohen_overall

0.7990713381173491

Overall `Substantial` agreement for cohen is due to high scores across criteria that meet standards for research, but not clinical decisions.

In [12]:
# cohen criteria calculation does not work by aggregation

# query = """

# select criteria
# , mean(cohens_kappa) as cohens_kappa_mean
# , CASE
#     WHEN cohens_kappa_mean > 0.80 AND cohens_kappa_mean <= 1.00 THEN 'Almost perfect agreement'
#     WHEN cohens_kappa_mean > 0.60 AND cohens_kappa_mean <= 0.80 THEN 'Substantial agreement'
#     WHEN cohens_kappa_mean > 0.40 AND cohens_kappa_mean <= 0.60 THEN 'Moderate agreement'
#     WHEN cohens_kappa_mean >= 0.20 AND cohens_kappa_mean <= 0.40 THEN 'Fair agreement'
#     WHEN cohens_kappa_mean < 0.20 THEN 'Slight agreement'
#     ELSE 'Undefined'
# END AS cohen_interpretation
# , count(cohens_kappa) as cohens_kappa_count
# , mean(phi_coefficient) as phi_coefficient_mean
# , count(phi_coefficient) as phi_coefficient_count
# from two_rater_agree
# group by 1
# order by 1

# """
# criteria_summary = duckdb.sql(query).df()
# criteria_summary

In [13]:
# cohen overall calculation does not work by aggregation

# query = """

# with cte as (
# select
# mean(cohens_kappa) as cohens_kappa_mean
# , mean(phi_coefficient) as phi_coefficient_mean
# from two_rater_agree
# )

# select cohens_kappa_mean
# , CASE
#     WHEN cohens_kappa_mean > 0.80 AND cohens_kappa_mean <= 1.00 THEN 'Almost perfect agreement'
#     WHEN cohens_kappa_mean > 0.60 AND cohens_kappa_mean <= 0.80 THEN 'Substantial agreement'
#     WHEN cohens_kappa_mean > 0.40 AND cohens_kappa_mean <= 0.60 THEN 'Moderate agreement'
#     WHEN cohens_kappa_mean >= 0.20 AND cohens_kappa_mean <= 0.40 THEN 'Fair agreement'
#     WHEN cohens_kappa_mean < 0.20 THEN 'Slight agreement'
#     ELSE 'Undefined'
# END AS cohen_interpretation
# , phi_coefficient_mean
# from cte

# """
# duckdb.sql(query).df()

## Visualizations for Cohen's Kappa and Phi Coefficient

In [15]:
# === 1. Cohen's Kappa per model & criteria ===
def plot_kappa_by_model(df):
    fig = px.bar(df, x='criteria', y='cohens_kappa', color='model',
                 text='cohens_kappa', barmode='group',
                 labels={'cohens_kappa':'Cohen\'s Kappa'})
    fig.update_traces(texttemplate='%{text:.2f}', textposition='outside')
    fig.update_layout(title="Cohen's Kappa by Model & Criteria", yaxis=dict(range=[0,1]))
    fig.show()

# === 2. Cohen's Kappa mean by criteria (summary) ===
def plot_kappa_mean(df_summary):
    fig = go.Figure()
    fig.add_bar(x=df_summary['criteria'], y=df_summary['cohens_kappa'], 
                name="Cohen's Kappa Mean", marker_color='royalblue')

    fig.update_layout(title="Cohen's Kappa Mean by Criteria",
                      yaxis=dict(title="Metric Value", range=[0,1]))
    fig.show()

In [35]:
 plot_kappa_by_model(cohen_interp)

In [36]:
plot_kappa_mean(criteria_summary)

In [18]:
def create_reliability_heatmap(cohen_interp):
    """Create heatmap showing Cohen's Kappa and Phi coefficient."""
    
    fig = make_subplots(
        rows=1, cols=2,
        subplot_titles=["Cohen's Kappa", "Phi Coefficient"],
        horizontal_spacing=0.15
    )
    
    for i, metric in enumerate(['cohens_kappa', 'phi_coefficient']):
        pivot_df = cohen_interp.pivot(index='criteria', columns='model', values=metric)
        
        fig.add_trace(
            go.Heatmap(
                z=pivot_df.values,
                x=pivot_df.columns,
                y=pivot_df.index,
                colorscale='magma',
                zmin=-1, zmax=1,
                text=np.round(pivot_df.values, 3),
                texttemplate="%{text}",
                textfont={"size": 10},
                showscale=True
            ),
            row=1, col=i+1
        )
    
    fig.update_layout(
        title="Inter-Rater Reliability Metrics",
        height=500,
        width=1000
    )
    return fig

def create_kappa_comparison(cohen_interp):
    """Bar chart comparing Cohen's Kappa across models and criteria."""
    
    fig = go.Figure()
    
    models = cohen_interp['model'].unique()
    colors = px.colors.qualitative.Set1[:len(models)]
    
    for i, model in enumerate(models):
        model_data = cohen_interp[cohen_interp['model'] == model]
        fig.add_trace(go.Bar(
            name=model.title(),
            x=model_data['criteria'],
            y=model_data['cohens_kappa'],
            marker_color=colors[i]
        ))
    
    # Reference lines
    # fig.add_hline(y=0.6, line_dash="dash", line_color="orange", opacity=0.5)
    # fig.add_hline(y=0.8, line_dash="dash", line_color="green", opacity=0.5)
    
    fig.update_layout(
        title="Cohen's Kappa by Model and Criteria",
        xaxis_title="Criteria",
        yaxis_title="Cohen's Kappa",
        yaxis=dict(range=[-1, 1]),
        barmode='group',
        height=500
    )
    return fig

In [37]:
heatmap = create_reliability_heatmap(cohen_interp)
heatmap

In [38]:
bars = create_kappa_comparison(cohen_interp)
bars

# Human Raters and LLM-as-a-Judge

In [78]:
qwen_rater = pd.read_csv('/kaggle/input/mh-evals-stats-and-viz/qwen_as_judge_binary_2025-08-23-12-44-27.csv')
gemini_rater = pd.read_csv('/kaggle/input/mh-evals-stats-and-viz/gemini_as_judge_binary_2025-08-23-12-59-15.csv')
magistral_rater = pd.read_csv('/kaggle/input/mh-evals-stats-and-viz/magistral_as_judge_binary_2025-08-23-13-07-35.csv')
qwen_rater.head()

Unnamed: 0,id,chatgpt_criteria_1,chatgpt_criteria_2,chatgpt_criteria_3,chatgpt_criteria_4,chatgpt_criteria_5,chatgpt_criteria_6,chatgpt_criteria_7,claude_criteria_1,claude_criteria_2,...,deepseek_criteria_5,deepseek_criteria_6,deepseek_criteria_7,llama_criteria_1,llama_criteria_2,llama_criteria_3,llama_criteria_4,llama_criteria_5,llama_criteria_6,llama_criteria_7
0,1_a,0,0,0,0,0,1,0,0,0,...,0,1,0,1,0,0,0,0,1,0
1,2_a,0,0,0,0,0,1,0,1,0,...,0,1,0,0,1,1,0,1,1,0
2,3_a,0,0,0,0,0,0,0,1,0,...,0,1,0,1,0,0,0,0,0,0
3,4_a,0,0,0,0,0,0,0,0,0,...,0,1,0,1,0,0,1,0,1,0
4,5_a,0,0,0,0,0,1,0,1,0,...,0,1,1,1,0,0,1,0,1,1


In [79]:
gemini_rater.head()

Unnamed: 0,id,chatgpt_criteria_1,chatgpt_criteria_2,chatgpt_criteria_3,chatgpt_criteria_4,chatgpt_criteria_5,chatgpt_criteria_6,chatgpt_criteria_7,claude_criteria_1,claude_criteria_2,...,deepseek_criteria_5,deepseek_criteria_6,deepseek_criteria_7,llama_criteria_1,llama_criteria_2,llama_criteria_3,llama_criteria_4,llama_criteria_5,llama_criteria_6,llama_criteria_7
0,1_a,0,0,0,0,0,1,0,1,0,...,0,1,0,1,0,0,1,0,1,1
1,2_a,0,0,0,0,0,1,0,1,0,...,0,1,0,0,1,1,0,1,1,1
2,3_a,0,0,0,0,0,0,0,1,0,...,0,1,0,1,0,0,1,0,0,0
3,4_a,1,0,0,0,0,0,0,1,0,...,0,1,0,1,0,0,1,0,1,0
4,5_a,0,0,0,0,0,1,0,1,0,...,0,1,1,1,0,0,1,0,1,1


In [81]:
magistral_rater.head()

Unnamed: 0,id,chatgpt_criteria_1,chatgpt_criteria_2,chatgpt_criteria_3,chatgpt_criteria_4,chatgpt_criteria_5,chatgpt_criteria_6,chatgpt_criteria_7,claude_criteria_1,claude_criteria_2,...,deepseek_criteria_5,deepseek_criteria_6,deepseek_criteria_7,llama_criteria_1,llama_criteria_2,llama_criteria_3,llama_criteria_4,llama_criteria_5,llama_criteria_6,llama_criteria_7
0,1_a,0,0,0,0,0,0,0,0,0,...,1,0,0,1,0,0,1,0,1,0
1,2_a,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,1,0,1,1,0
2,3_a,0,0,0,0,1,0,0,0,0,...,0,1,0,1,0,0,0,1,0,0
3,4_a,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,5_a,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0


In [82]:
qwen_rater.shape, gemini_rater.shape, magistral_rater.shape

((16, 29), (16, 29), (16, 29))

## Qwen

In [89]:
def calculate_fleiss_kappa_alpha(df1, df2, df3, rater3_name=None):
    """Calculate Fleiss' Kappa and Krippendorff's Alpha for 3 raters."""
    
    # Auto-detect rater name from dataframe if not provided
    if rater3_name is None:
        rater3_name = getattr(df3, 'name', 'rater3')
    
    # Overall metrics (all columns combined)
    all_fleiss_data = []
    all_ratings_combined = []
    
    for col in df1.columns:
        if col != 'id':
            ratings = np.array([df1[col], df2[col], df3[col]])
            all_ratings_combined.extend(ratings.T)  # Transpose to get observations x raters
            
            # For Fleiss
            counts = np.array([[np.sum(row == 0), np.sum(row == 1)] for row in ratings.T])
            all_fleiss_data.extend(counts)
    
    overall_fleiss = fleiss_kappa(np.array(all_fleiss_data))
    # Convert back to raters x observations format for Krippendorff
    overall_kripp = krippendorff.alpha(np.array(all_ratings_combined).T, level_of_measurement='nominal')
    
    # By model_criteria
    results = []
    for col in df1.columns:
        if col != 'id':
            model, criteria = col.split('_', 1)
            ratings = np.array([df1[col], df2[col], df3[col]])
            
            # Check for variation (need both 0s and 1s)
            unique_vals = len(np.unique(ratings))
            
            if unique_vals > 1:
                # Fleiss Kappa
                counts = np.array([[np.sum(row == 0), np.sum(row == 1)] for row in ratings.T])
                fleiss = fleiss_kappa(counts)
                
                # Krippendorff's Alpha
                kripp = krippendorff.alpha(ratings, level_of_measurement='nominal')
            else:
                fleiss = np.nan
                kripp = np.nan
            
            results.append({
                'model': model,
                'criteria': criteria,
                f'fleiss_kappa_{rater3_name}': fleiss,
                f'krippendorff_alpha_{rater3_name}': kripp
            })
    
    return pd.DataFrame(results), overall_fleiss, overall_kripp

def create_dual_reliability_heatmap(df1, df2, df3, rater3_name=None):
    """Create side-by-side heatmaps for Fleiss' Kappa and Krippendorff's Alpha."""
    results_df, overall_fleiss, overall_kripp = calculate_fleiss_kappa_alpha(df1, df2, df3, rater3_name)
    
    # Auto-detect rater name if not provided
    if rater3_name is None:
        rater3_name = getattr(df3, 'name', 'rater3')
    
    fig = make_subplots(
        rows=1, cols=2,
        subplot_titles=["Fleiss' Kappa", "Krippendorff's Alpha"],
        horizontal_spacing=0.15
    )
    
    metric_cols = [f'fleiss_kappa_{rater3_name}', f'krippendorff_alpha_{rater3_name}']
    
    for i, metric in enumerate(metric_cols):
        pivot_df = results_df.pivot(index='criteria', columns='model', values=metric)
        
        fig.add_trace(
            go.Heatmap(
                z=pivot_df.values,
                x=pivot_df.columns,
                y=pivot_df.index,
                colorscale='oranges',
                zmin=-1, zmax=1,
                text=np.where(np.isnan(pivot_df.values), 'N/A', np.round(pivot_df.values, 3)),
                texttemplate="%{text}",
                textfont={"size": 10},
                showscale=True
            ),
            row=1, col=i+1
        )
    
    fig.update_layout(
        title=f"Multi-Rater Reliability with {rater3_name.title()} (3 Raters)<br><sub>Overall Fleiss: {overall_fleiss:.5f} | Overall Krippendorff: {overall_kripp:.5f}</sub>",
        height=500,
        width=1000
    )
    return fig

In [85]:
results_qwen, fleiss_qwen, kripp_qwen = calculate_fleiss_kappa_alpha(human_rater1, human_rater2, qwen_rater, 'qwen')
results_qwen


invalid value encountered in greater


invalid value encountered in less


invalid value encountered in greater



Unnamed: 0,model,criteria,fleiss_kappa_qwen,krippendorff_alpha_qwen
0,chatgpt,criteria_1,0.625,0.632812
1,chatgpt,criteria_2,-0.021277,0.0
2,chatgpt,criteria_3,-0.021277,0.0
3,chatgpt,criteria_4,,
4,chatgpt,criteria_5,,
5,chatgpt,criteria_6,0.555556,0.564815
6,chatgpt,criteria_7,,
7,claude,criteria_1,0.0625,0.082031
8,claude,criteria_2,,
9,claude,criteria_3,,


In [86]:
fleiss_qwen

0.718796905844146

In [87]:
kripp_qwen

0.7190061343368215

In [90]:
heatmap_qwen = create_dual_reliability_heatmap(human_rater1, human_rater2, qwen_rater, 'qwen')
heatmap_qwen

## Gemini

In [91]:
results_gemini, fleiss_gemini, kripp_gemini = calculate_fleiss_kappa_alpha(human_rater1, human_rater2, gemini_rater, 'gemini')
results_gemini


invalid value encountered in greater


invalid value encountered in less


invalid value encountered in greater



Unnamed: 0,model,criteria,fleiss_kappa_gemini,krippendorff_alpha_gemini
0,chatgpt,criteria_1,0.825771,0.829401
1,chatgpt,criteria_2,0.478261,0.48913
2,chatgpt,criteria_3,0.478261,0.48913
3,chatgpt,criteria_4,-0.021277,0.0
4,chatgpt,criteria_5,,
5,chatgpt,criteria_6,0.394958,0.407563
6,chatgpt,criteria_7,-0.021277,0.0
7,claude,criteria_1,0.042735,0.062678
8,claude,criteria_2,,
9,claude,criteria_3,,


In [92]:
fleiss_gemini

0.7505478097306518

In [93]:
kripp_gemini

0.7507334140388883

In [95]:
heatmap_gemini = create_dual_reliability_heatmap(human_rater1, human_rater2, gemini_rater, 'gemini')
heatmap_gemini

## Magistral

In [96]:
results_magistral, fleiss_magistral, kripp_magistral = calculate_fleiss_kappa_alpha(human_rater1, human_rater2, magistral_rater, 'magistral')
results_magistral


invalid value encountered in greater


invalid value encountered in less


invalid value encountered in greater



Unnamed: 0,model,criteria,fleiss_kappa_magistral,krippendorff_alpha_magistral
0,chatgpt,criteria_1,0.261538,0.276923
1,chatgpt,criteria_2,-0.021277,0.0
2,chatgpt,criteria_3,-0.021277,0.0
3,chatgpt,criteria_4,,
4,chatgpt,criteria_5,-0.142857,-0.119048
5,chatgpt,criteria_6,0.238095,0.253968
6,chatgpt,criteria_7,,
7,claude,criteria_1,-0.174825,-0.15035
8,claude,criteria_2,,
9,claude,criteria_3,,


In [97]:
fleiss_magistral

0.48972733903278143

In [98]:
kripp_magistral

0.49010700619123904

In [99]:
heatmap_magistral = create_dual_reliability_heatmap(human_rater1, human_rater2, magistral_rater, 'magistral')
heatmap_magistral

# All LLM-as-a-Judge Raters (qwen, gemini and magistral)

In [100]:
results_all_llms, fleiss_all_llms, kripp_all_llms = calculate_fleiss_kappa_alpha(qwen_rater, gemini_rater, magistral_rater, 'all_llms')
results_all_llms


invalid value encountered in greater


invalid value encountered in less


invalid value encountered in greater



Unnamed: 0,model,criteria,fleiss_kappa_all_llms,krippendorff_alpha_all_llms
0,chatgpt,criteria_1,0.179487,0.196581
1,chatgpt,criteria_2,-0.021277,0.0
2,chatgpt,criteria_3,-0.021277,0.0
3,chatgpt,criteria_4,-0.021277,0.0
4,chatgpt,criteria_5,-0.142857,-0.119048
5,chatgpt,criteria_6,0.193277,0.210084
6,chatgpt,criteria_7,-0.021277,0.0
7,claude,criteria_1,-0.090909,-0.068182
8,claude,criteria_2,,
9,claude,criteria_3,,


In [101]:
fleiss_all_llms

0.4537197356419325

In [102]:
kripp_all_llms

0.45412619417196076

In [106]:
heatmap_all_llms = create_dual_reliability_heatmap(qwen_rater, gemini_rater, magistral_rater, 'all llms')
heatmap_all_llms

# All Human Raters and LLM-as-a-Judges (human rater 1, human rater 2, qwen, gemini, magistral)

In [120]:
def calculate_fleiss_kappa_alpha_five_raters(df1, df2, df3, df4, df5):
    """Calculate Fleiss' Kappa and Krippendorff's Alpha for 5 raters."""
    
    # Overall metrics (all columns combined)
    all_fleiss_data = []
    all_ratings_combined = []
    
    for col in df1.columns:
        if col != 'id':
            ratings = np.array([df1[col], df2[col], df3[col], df4[col], df5[col]])
            all_ratings_combined.extend(ratings.T)  # Transpose to get observations x raters
            
            # For Fleiss
            counts = np.array([[np.sum(row == 0), np.sum(row == 1)] for row in ratings.T])
            all_fleiss_data.extend(counts)
    
    overall_fleiss = fleiss_kappa(np.array(all_fleiss_data))
    # Convert back to raters x observations format for Krippendorff
    overall_kripp = krippendorff.alpha(np.array(all_ratings_combined).T, level_of_measurement='nominal')
    
    # By model_criteria
    results = []
    for col in df1.columns:
        if col != 'id':
            model, criteria = col.split('_', 1)
            ratings = np.array([df1[col], df2[col], df3[col], df4[col], df5[col]])
            
            # Check for variation (need both 0s and 1s)
            unique_vals = len(np.unique(ratings))
            
            if unique_vals > 1:
                # Fleiss Kappa
                counts = np.array([[np.sum(row == 0), np.sum(row == 1)] for row in ratings.T])
                fleiss = fleiss_kappa(counts)
                
                # Krippendorff's Alpha
                kripp = krippendorff.alpha(ratings, level_of_measurement='nominal')
            else:
                fleiss = np.nan
                kripp = np.nan
            
            results.append({
                'model': model,
                'criteria': criteria,
                'fleiss_kappa_5raters': fleiss,
                'krippendorff_alpha_5raters': kripp
            })
    
    return pd.DataFrame(results), overall_fleiss, overall_kripp

def create_five_rater_heatmap(df1, df2, df3, df4, df5):
    """Create side-by-side heatmaps for 5-rater reliability."""
    results_df, overall_fleiss, overall_kripp = calculate_fleiss_kappa_alpha_five_raters(df1, df2, df3, df4, df5)
    
    fig = make_subplots(
        rows=1, cols=2,
        subplot_titles=["Fleiss' Kappa", "Krippendorff's Alpha"],
        horizontal_spacing=0.15
    )
    
    metric_cols = ['fleiss_kappa_5raters', 'krippendorff_alpha_5raters']
    
    for i, metric in enumerate(metric_cols):
        pivot_df = results_df.pivot(index='criteria', columns='model', values=metric)
        
        fig.add_trace(
            go.Heatmap(
                z=pivot_df.values,
                x=pivot_df.columns,
                y=pivot_df.index,
                colorscale='armyrose',
                zmin=-1, zmax=1,
                text=np.where(np.isnan(pivot_df.values), 'N/A', np.round(pivot_df.values, 3)),
                texttemplate="%{text}",
                textfont={"size": 10},
                showscale=True
            ),
            row=1, col=i+1
        )
    
    fig.update_layout(
        title=f"Five-Rater Reliability (Human1, Human2, Qwen, Gemini, Magistral)<br><sub>Overall Fleiss: {overall_fleiss:.5f} | Overall Krippendorff: {overall_kripp:.5f}</sub>",
        height=500,
        width=1000
    )
    return fig

In [108]:
results_5raters, fleiss_5raters, alpha_5raters = calculate_fleiss_kappa_alpha_five_raters(human_rater1, human_rater2, qwen_rater, gemini_rater, magistral_rater)
results_5raters


invalid value encountered in greater


invalid value encountered in less


invalid value encountered in greater



Unnamed: 0,model,criteria,fleiss_kappa_5raters,krippendorff_alpha_5raters
0,chatgpt,criteria_1,0.435737,0.44279
1,chatgpt,criteria_2,0.230769,0.240385
2,chatgpt,criteria_3,0.230769,0.240385
3,chatgpt,criteria_4,-0.012658,0.0
4,chatgpt,criteria_5,-0.081081,-0.067568
5,chatgpt,criteria_6,0.333333,0.341667
6,chatgpt,criteria_7,-0.012658,0.0
7,claude,criteria_1,-0.005803,0.00677
8,claude,criteria_2,,
9,claude,criteria_3,,


In [109]:
fleiss_5raters

0.5733333333333331

In [110]:
alpha_5raters

0.5735238095238095

In [121]:
heatmap_5raters = create_five_rater_heatmap(human_rater1, human_rater2, qwen_rater, gemini_rater, magistral_rater)
heatmap_5raters

# Overall Fleiss Kappa and Krippendorff Alpha Winner's Criteria Scores and Interpretation

In [134]:
def calculate_reliability_by_criteria_only(df1, df2, df3, rater3_name=None):
    """Calculate Fleiss' Kappa and Krippendorff's Alpha by criteria only (pooling across all models)."""
    
    # Auto-detect rater name from dataframe if not provided
    if rater3_name is None:
        rater3_name = getattr(df3, 'name', 'rater3')
    
    # Extract unique criteria
    criteria_list = list(set([col.split('_', 1)[1] for col in df1.columns if col != 'id']))
    
    # Overall metrics (all columns combined)
    all_fleiss_data = []
    all_ratings_combined = []
    
    for col in df1.columns:
        if col != 'id':
            ratings = np.array([df1[col], df2[col], df3[col]])
            all_ratings_combined.extend(ratings.T)
            
            counts = np.array([[np.sum(row == 0), np.sum(row == 1)] for row in ratings.T])
            all_fleiss_data.extend(counts)
    
    overall_fleiss = fleiss_kappa(np.array(all_fleiss_data))
    overall_kripp = krippendorff.alpha(np.array(all_ratings_combined).T, level_of_measurement='nominal')
    
    # By criteria only (pool across all models for each criteria)
    results = []
    for criteria in criteria_list:
        criteria_cols = [col for col in df1.columns if col.endswith(criteria)]
        
        # Pool all ratings for this criteria across all models
        all_criteria_ratings = []
        for col in criteria_cols:
            ratings = np.array([df1[col], df2[col], df3[col]])
            all_criteria_ratings.extend(ratings.T)
        
        # Check for variation
        all_criteria_array = np.array(all_criteria_ratings)
        unique_vals = len(np.unique(all_criteria_array))
        
        if unique_vals > 1:
            # Fleiss Kappa
            counts = np.array([[np.sum(row == 0), np.sum(row == 1)] for row in all_criteria_ratings])
            fleiss = fleiss_kappa(counts)
            
            # Krippendorff's Alpha (raters x observations format)
            criteria_ratings_matrix = np.array(all_criteria_ratings).T  # 3 raters x N observations
            kripp = krippendorff.alpha(criteria_ratings_matrix, level_of_measurement='nominal')
        else:
            fleiss = np.nan
            kripp = np.nan
        
        results.append({
            'criteria': criteria,
            f'fleiss_kappa_{rater3_name}': fleiss,
            f'krippendorff_alpha_{rater3_name}': kripp
        })
    
    return pd.DataFrame(results), overall_fleiss, overall_kripp

def create_criteria_reliability_chart(df1, df2, df3, rater3_name=None):
    """Create bar chart showing reliability by criteria only."""
    results_df, overall_fleiss, overall_kripp = calculate_reliability_by_criteria_only(df1, df2, df3, rater3_name)
    
    if rater3_name is None:
        rater3_name = getattr(df3, 'name', 'rater3')
    
    # Sort criteria in proper order (criteria_1, criteria_2, etc.)
    results_df['criteria_num'] = results_df['criteria'].str.extract('(\d+)').astype(int)
    results_df = results_df.sort_values('criteria_num').reset_index(drop=True)
    
    fig = make_subplots(
        rows=1, cols=2,
        subplot_titles=["Fleiss' Kappa by Criteria", "Krippendorff's Alpha by Criteria"],
        horizontal_spacing=0.15
    )
    
    # Fleiss Kappa bars
    fleiss_col = f'fleiss_kappa_{rater3_name}'
    fig.add_trace(
        go.Bar(
            x=results_df['criteria'],
            y=results_df[fleiss_col],
            text=np.where(np.isnan(results_df[fleiss_col]), 'N/A', np.round(results_df[fleiss_col], 3)),
            textposition='auto',
            marker_color='steelblue',
            name="Fleiss' Kappa"
        ),
        row=1, col=1
    )
    
    # Krippendorff Alpha bars
    kripp_col = f'krippendorff_alpha_{rater3_name}'
    fig.add_trace(
        go.Bar(
            x=results_df['criteria'],
            y=results_df[kripp_col],
            text=np.where(np.isnan(results_df[kripp_col]), 'N/A', np.round(results_df[kripp_col], 3)),
            textposition='auto',
            marker_color='darkorange',
            name="Krippendorff's Alpha"
        ),
        row=1, col=2
    )
    
    # Add reference lines
    for col in [1, 2]:
        fig.add_hline(y=0.6, line_dash="dash", line_color="orange", opacity=0.5, row=1, col=col)
        fig.add_hline(y=0.8, line_dash="dash", line_color="green", opacity=0.5, row=1, col=col)
    
    fig.update_layout(
        title=f"Reliability by Criteria Only (Human1, Human2, {rater3_name.title()})<br><sub>Overall Fleiss: {overall_fleiss:.5f} | Overall Krippendorff: {overall_kripp:.5f}</sub>",
        height=500,
        width=1000,
        showlegend=False
    )
    
    # Update y-axis range to start at 0
    fig.update_yaxes(range=[0, 1])
    
    return fig

In [126]:
hum_gem_results_by_criteria, overall_fleiss, overall_kripp = calculate_reliability_by_criteria_only(human_rater1, human_rater2, gemini_rater, 'gemini')
hum_gem_results_by_criteria

Unnamed: 0,criteria,fleiss_kappa_gemini,krippendorff_alpha_gemini
0,criteria_7,0.52381,0.52629
1,criteria_3,0.522727,0.525213
2,criteria_1,0.76544,0.766662
3,criteria_2,0.614264,0.616273
4,criteria_6,0.531707,0.534146
5,criteria_4,0.676768,0.678451
6,criteria_5,0.794652,0.795722


In [135]:
hum_gem_criteria_chart = create_criteria_reliability_chart(human_rater1, human_rater2, gemini_rater, 'gemini')
hum_gem_criteria_chart

# Fleiss kappa and Krippendorff alpha Score Interpretation

In [131]:
query = """

select criteria
, fleiss_kappa_gemini
, CASE
    WHEN fleiss_kappa_gemini > 0.80 AND fleiss_kappa_gemini <= 1.00 THEN 'Almost perfect agreement'
    WHEN fleiss_kappa_gemini > 0.60 AND fleiss_kappa_gemini <= 0.80 THEN 'Substantial agreement'
    WHEN fleiss_kappa_gemini > 0.40 AND fleiss_kappa_gemini <= 0.60 THEN 'Moderate agreement'
    WHEN fleiss_kappa_gemini >= 0.20 AND fleiss_kappa_gemini <= 0.40 THEN 'Fair agreement'
    WHEN fleiss_kappa_gemini < 0.20 THEN 'Slight agreement'
    ELSE 'Undefined'
    END AS fleiss_interpretation
, krippendorff_alpha_gemini
, CASE
    WHEN krippendorff_alpha_gemini >= 0.80 THEN 'Acceptable'
    WHEN krippendorff_alpha_gemini >= 0.67 AND krippendorff_alpha_gemini < 0.80 THEN 'Acceptable'
    WHEN krippendorff_alpha_gemini < 0.67 THEN 'Unacceptable'
    ELSE 'Undefined'
    END AS kripp_interpretation

from hum_gem_results_by_criteria
order by 1 asc
"""

fleiss_kripp_interp = duckdb.sql(query).df()
fleiss_kripp_interp

Unnamed: 0,criteria,fleiss_kappa_gemini,fleiss_interpretation,krippendorff_alpha_gemini,kripp_interpretation
0,criteria_1,0.76544,Substantial agreement,0.766662,Acceptable
1,criteria_2,0.614264,Substantial agreement,0.616273,Unacceptable
2,criteria_3,0.522727,Moderate agreement,0.525213,Unacceptable
3,criteria_4,0.676768,Substantial agreement,0.678451,Acceptable
4,criteria_5,0.794652,Substantial agreement,0.795722,Acceptable
5,criteria_6,0.531707,Moderate agreement,0.534146,Unacceptable
6,criteria_7,0.52381,Moderate agreement,0.52629,Unacceptable
