In [2]:
!pip install krippendorff --quiet

In [9]:
import numpy as np
import pandas as pd

import duckdb

from sklearn.metrics import cohen_kappa_score, matthews_corrcoef
import krippendorff 

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [4]:
human_rater1 = pd.read_csv('/kaggle/input/mh-evals-stats-and-viz/may_base_scenarios_manual_validation.csv')
human_rater2 = pd.read_csv('/kaggle/input/mh-evals-stats-and-viz/markela_base_scenarios_manual_validation.csv')
human_rater1.head()

Unnamed: 0,id,chatgpt_criteria_1,chatgpt_criteria_2,chatgpt_criteria_3,chatgpt_criteria_4,chatgpt_criteria_5,chatgpt_criteria_6,chatgpt_criteria_7,claude_criteria_1,claude_criteria_2,...,deepseek_criteria_5,deepseek_criteria_6,deepseek_criteria_7,llama_criteria_1,llama_criteria_2,llama_criteria_3,llama_criteria_4,llama_criteria_5,llama_criteria_6,llama_criteria_7
0,1_a,0,0,0,0,0,1,0,1,0,...,0,1,0,1,0,0,1,0,1,0
1,2_a,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,1,0,1,1,0
2,3_a,0,0,0,0,0,0,0,1,0,...,0,1,0,1,0,0,1,0,0,0
3,4_a,1,0,0,0,0,0,0,1,0,...,0,1,0,1,0,0,1,0,1,0
4,5_a,0,0,0,0,0,0,0,1,0,...,0,1,0,1,0,0,1,0,0,1


In [5]:
human_rater2.head()

Unnamed: 0,id,chatgpt_criteria_1,chatgpt_criteria_2,chatgpt_criteria_3,chatgpt_criteria_4,chatgpt_criteria_5,chatgpt_criteria_6,chatgpt_criteria_7,claude_criteria_1,claude_criteria_2,...,deepseek_criteria_5,deepseek_criteria_6,deepseek_criteria_7,llama_criteria_1,llama_criteria_2,llama_criteria_3,llama_criteria_4,llama_criteria_5,llama_criteria_6,llama_criteria_7
0,1_a,0,0,0,0,0,1,0,1,0,...,0,1,0,1,0,0,1,0,1,0
1,2_a,0,0,0,0,0,1,0,1,0,...,0,1,0,0,1,1,0,1,1,0
2,3_a,0,0,0,0,0,0,0,1,0,...,0,1,0,1,0,0,1,0,0,0
3,4_a,1,0,0,0,0,0,0,1,0,...,0,1,0,1,0,0,1,0,1,0
4,5_a,0,0,0,0,0,0,0,1,0,...,0,1,0,1,0,0,1,0,0,1


In [6]:
human_rater1.shape, human_rater2.shape

((16, 29), (16, 29))

Pearson's correlation is not well suited to comparing two raters' binary scores. It is designed mainly for continuous, normally distributed data (integer data is OK).

So will use instead:
* Percent Agreement (quick, never produces NaN's)
* After checking both raters have variance in each column, use:
    *  Cohen's Kappa (agreement between two raters corrected for chance, most widely used for categorical including binary)
    *  Krippendorff’s Alpha (generalization of agreement for any number of raters; can be used for two raters as well, can work with different data types)

First, check if any columns have the same value across all rows.
This is a test of variance.
Some statistics will fail due to no variance (produce NaN's) in scored criteria.

In [7]:
def check_no_variance_columns(df1, df2):
    """Find rating columns where at least one rater gave the same value on every row.

    Args:
        df1: Ratings from the first rater; every column except 'id' is checked.
        df2: Ratings from the second rater; must contain the same columns as df1.

    Returns:
        DataFrame with one row per flagged column: the column name, each rater's
        sample variance, the constant value for a rater whose variance is zero
        (None otherwise), and whether both raters are constant. The frame is
        empty when no column is flagged.
    """
    flagged = []

    # Every column except the 'id' column is treated as a rating column.
    rating_columns = [col for col in df1.columns if col != 'id']

    for col in rating_columns:
        df1_var = df1[col].var()
        df2_var = df2[col].var()
        df1_is_constant = df1_var == 0
        df2_is_constant = df2_var == 0

        if df1_is_constant or df2_is_constant:
            flagged.append({
                'column': col,
                'df1_variance': df1_var,
                'df2_variance': df2_var,
                # With zero variance every row holds the same value, so the
                # first row is representative.
                'df1_constant_value': df1[col].iloc[0] if df1_is_constant else None,
                'df2_constant_value': df2[col].iloc[0] if df2_is_constant else None,
                'both_no_variance': df1_is_constant and df2_is_constant
            })

    return pd.DataFrame(flagged)

def variance_summary(df1, df2):
    """Print how many rating columns lack variance, split by which rater is constant.

    Returns the per-column detail frame from check_no_variance_columns when at
    least one column is flagged; otherwise prints a confirmation and returns None.
    """
    flagged = check_no_variance_columns(df1, df2)
    flagged_count = len(flagged)

    if flagged_count == 0:
        print("✅ All columns have variance - no statistical issues expected")
        return

    rater1_constant = flagged['df1_variance'] == 0
    rater2_constant = flagged['df2_variance'] == 0
    rater1_varies = flagged['df1_variance'] > 0
    rater2_varies = flagged['df2_variance'] > 0

    rater1_only = (rater1_constant & rater2_varies).sum()
    rater2_only = (rater1_varies & rater2_constant).sum()
    both_raters = flagged['both_no_variance'].sum()

    print(f"⚠️  Found {flagged_count} columns with no variance:")
    print(f"   - Rater 1 only: {rater1_only}")
    print(f"   - Rater 2 only: {rater2_only}")
    print(f"   - Both raters: {both_raters}")

    return flagged

def parse_column_name(col_name):
    """Split a '<model>_<criteria...>' column name into a (model, criteria) pair.

    The model is the text before the first underscore; the criteria is everything
    after it. Names with fewer than three underscore-separated parts give
    (None, None).
    """
    # Fewer than two underscores means fewer than three parts.
    if col_name.count('_') < 2:
        return None, None

    model, criteria = col_name.split('_', 1)
    return model, criteria

def variance_by_criteria(df1, df2):
    """Summarise the no-variance columns per criteria.

    Returns a frame indexed by criteria listing the affected models, how many of
    the flagged columns are constant for both raters, and the total number of
    flagged columns. An empty DataFrame is returned when nothing is flagged.
    """
    flagged = check_no_variance_columns(df1, df2)

    if len(flagged) == 0:
        return pd.DataFrame()

    # Split each flagged column name into its model and criteria parts.
    name_parts = flagged['column'].apply(
        lambda name: pd.Series(parse_column_name(name))
    )
    flagged[['model', 'criteria']] = name_parts

    aggregations = {
        'model': lambda models: list(models),
        'both_no_variance': 'sum',
        'column': 'count'
    }
    output_names = {
        'model': 'affected_models',
        'both_no_variance': 'perfect_agreement_count',
        'column': 'total_variance_issues'
    }

    per_criteria = flagged.groupby('criteria').agg(aggregations)
    return per_criteria.rename(columns=output_names)


# Quick check
variance_summary(human_rater1, human_rater2)

⚠️  Found 10 columns with no variance:
   - Rater 1 only: 3
   - Rater 2 only: 0
   - Both raters: 7


  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,column,df1_variance,df2_variance,df1_constant_value,df2_constant_value,both_no_variance
0,chatgpt_criteria_2,0.0,0.0625,0,,False
1,chatgpt_criteria_3,0.0,0.0625,0,,False
2,chatgpt_criteria_4,0.0,0.0,0,0.0,True
3,chatgpt_criteria_5,0.0,0.0,0,0.0,True
4,chatgpt_criteria_7,0.0,0.0,0,0.0,True
5,claude_criteria_2,0.0,0.0,0,0.0,True
6,claude_criteria_3,0.0,0.0,0,0.0,True
7,claude_criteria_5,0.0,0.0,0,0.0,True
8,deepseek_criteria_2,0.0,0.116667,0,,False
9,deepseek_criteria_5,0.0,0.0,0,0.0,True


* Rater 1 had more consistent marks across scenarios per model_criteria
* "consistent" here means the column is 0 for every scenario, i.e. the criterion was marked absent across all scenarios
* strong agreement between raters with criteria 5 (absence of it) across models

In [10]:
# Group by criteria
criteria_issues = variance_by_criteria(human_rater1, human_rater2)
criteria_issues

Unnamed: 0_level_0,affected_models,perfect_agreement_count,total_issues
criteria,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
criteria_2,"[chatgpt, claude, deepseek]",1,3
criteria_3,"[chatgpt, claude]",1,2
criteria_4,[chatgpt],1,1
criteria_5,"[chatgpt, claude, deepseek]",3,3
criteria_7,[chatgpt],1,1


Criteria 2 and 5 have the most consistent scores, both within each rater and between raters.
The low variance for criterion 5 makes sense, since it is the most objective of the 7 criteria.

In [8]:
# One-liner checks
# def has_variance(df, col):
#     """Check if single column has variance."""
#     return df[col].nunique() > 1

# def problematic_columns(df1, df2):
#     """Return list of column names that will cause NaN in statistics."""
#     return [col for col in df1.columns if col != 'id' 
#             and (not has_variance(df1, col) or not has_variance(df2, col))]
# problematic_columns(human_rater1, human_rater2)

Given that there is lack of variance in some columns, will calculate in this order:
1. Percent Agreement
2. Cohen's Kappa
3. Krippendorff’s Alpha 

Watch out for NaN results at granular level. 
NaNs are skipped when averaging, so they do not appear in the aggregate statistics; each aggregate reflects only the columns with a defined score.
Aggregate statistics will be reported.

Cohen's Kappa will be reported first, then Krippendorff’s Alpha and lastly Percent Agreement (baseline).

# Percent Agreement

In [19]:
# Percent Agreement calculation
def calculate_percent_agreement(df1, df2):
    """Compute the percentage of rows on which the two raters match, per rating column.

    Returns a frame with one row per non-'id' column of df1, holding the parsed
    model and criteria plus the agreement percentage rounded to two decimals.
    """
    def _agreement_row(col):
        # Share of rows with identical ratings, expressed as a percentage.
        matching_share = np.mean(df1[col] == df2[col]) * 100
        model, criteria = parse_column_name(col)
        return {
            'model': model,
            'criteria': criteria,
            'percent_agreement': round(matching_share, 2)
        }

    rating_columns = [col for col in df1.columns if col != 'id']
    return pd.DataFrame([_agreement_row(col) for col in rating_columns])

def agreement_summary_by_criteria(df1=None, df2=None):
    """Summarise percent agreement per criteria across models.

    Args:
        df1: Ratings from the first rater. Defaults to the notebook-level
            `human_rater1` so existing zero-argument calls keep working.
        df2: Ratings from the second rater. Defaults to the notebook-level
            `human_rater2`.

    Returns:
        DataFrame with one row per criteria and the mean, min, max and standard
        deviation of percent agreement, each rounded to two decimals.
    """
    # Fall back to the notebook globals only when the caller passes nothing,
    # so the function can also be used on other rater frames.
    if df1 is None:
        df1 = human_rater1
    if df2 is None:
        df2 = human_rater2

    agreement_df = calculate_percent_agreement(df1, df2)

    summary = agreement_df.groupby('criteria')['percent_agreement'].agg([
        'mean', 'min', 'max', 'std'
    ]).round(2)

    summary.columns = ['avg_agreement', 'min_agreement', 'max_agreement', 'std_agreement']
    return summary.reset_index()

def overall_agreement(df1, df2):
    """Report percent agreement pooled over every rating in every non-'id' column.

    Prints the total number of comparisons and agreements, and returns the
    overall percentage as a formatted string.
    """
    rating_columns = [col for col in df1.columns if col != 'id']

    # Count matching ratings and compared ratings column by column.
    total_agreements = sum((df1[col] == df2[col]).sum() for col in rating_columns)
    total_comparisons = sum(len(df1[col]) for col in rating_columns)

    overall_percent = (total_agreements / total_comparisons) * 100

    print(f"Total Comparisons: {total_comparisons}")
    print(f"Total Agreements: {total_agreements}")

    return f"Overall Percent Agreement: {overall_percent:.2f}%"

In [24]:
model_criteria_agree = calculate_percent_agreement(human_rater1, human_rater2)
model_criteria_agree

Unnamed: 0,model,criteria,percent_agreement
0,chatgpt,criteria_1,93.75
1,chatgpt,criteria_2,93.75
2,chatgpt,criteria_3,93.75
3,chatgpt,criteria_4,100.0
4,chatgpt,criteria_5,100.0
5,chatgpt,criteria_6,87.5
6,chatgpt,criteria_7,100.0
7,claude,criteria_1,68.75
8,claude,criteria_2,100.0
9,claude,criteria_3,100.0


In [28]:
# Sort by percent_agreement (3rd column of model_criteria_agree) in descending
# order, so the most-agreed model/criteria pairs come first. The query has no
# secondary sort key, so the order among tied rows is not specified.
query = """

select *
from model_criteria_agree
order by 3 desc

"""
duckdb.sql(query).df()

Unnamed: 0,model,criteria,percent_agreement
0,chatgpt,criteria_4,100.0
1,chatgpt,criteria_5,100.0
2,chatgpt,criteria_7,100.0
3,claude,criteria_2,100.0
4,claude,criteria_3,100.0
5,claude,criteria_5,100.0
6,deepseek,criteria_1,100.0
7,deepseek,criteria_5,100.0
8,llama,criteria_1,100.0
9,llama,criteria_2,100.0


A little under half of the model_criteria combos have perfect (100%) agreement.

In [29]:
criteria_agreement = agreement_summary_by_criteria()
criteria_agreement

Unnamed: 0,criteria,avg_agreement,min_agreement,max_agreement,std_agreement
0,criteria_1,90.62,68.75,100.0,14.88
1,criteria_2,95.31,87.5,100.0,5.98
2,criteria_3,93.75,81.25,100.0,8.84
3,criteria_4,85.94,62.5,100.0,17.95
4,criteria_5,98.44,93.75,100.0,3.12
5,criteria_6,89.06,81.25,93.75,5.98
6,criteria_7,93.75,81.25,100.0,8.84


In [30]:
query = """

select *
from criteria_agreement
order by 2 desc

"""
duckdb.sql(query).df()

Unnamed: 0,criteria,avg_agreement,min_agreement,max_agreement,std_agreement
0,criteria_5,98.44,93.75,100.0,3.12
1,criteria_2,95.31,87.5,100.0,5.98
2,criteria_3,93.75,81.25,100.0,8.84
3,criteria_7,93.75,81.25,100.0,8.84
4,criteria_1,90.62,68.75,100.0,14.88
5,criteria_6,89.06,81.25,93.75,5.98
6,criteria_4,85.94,62.5,100.0,17.95


Criteria 5, 2, and 3 are at the top (3 is tied with 7). Criteria 2 and 3 are somewhat straightforward to detect, but not as much as 5.

In [23]:
overall_agreement(human_rater1, human_rater2)

Total Comparisons: 448
Total Agreements: 414


'Overall Percent Agreement: 92.41%'

# Cohen's Kappa & Krippendorff’s Alpha

In [38]:
def calculate_all_reliability_metrics(df1, df2):
    """Compute Cohen's Kappa, Krippendorff's Alpha and percent agreement per rating column.

    Args:
        df1: Ratings from the first rater; every column except 'id' is scored.
        df2: Ratings from the second rater; must contain the same columns as df1.

    Returns:
        DataFrame with one row per rating column holding the model, the criteria,
        both coefficients rounded to four decimals (NaN when a coefficient is
        undefined or could not be computed) and percent agreement rounded to two
        decimals.
    """
    columns_to_check = [col for col in df1.columns if col != 'id']
    results = []

    for col in columns_to_check:
        # Column names look like '<model>_<criteria...>'; anything shorter is
        # kept whole as the criteria with an 'unknown' model.
        parts = col.split('_')
        model = parts[0] if len(parts) >= 3 else 'unknown'
        criteria = '_'.join(parts[1:]) if len(parts) >= 3 else col

        # Cohen's Kappa
        try:
            # A 0/0 inside the kappa computation is expected for some columns
            # and is mapped to NaN below; silence numpy's floating-point
            # warning instead of printing it once per affected column.
            with np.errstate(divide='ignore', invalid='ignore'):
                kappa = cohen_kappa_score(df1[col], df2[col])
            kappa = kappa if np.isfinite(kappa) else np.nan
        except Exception:
            # Best effort: an uncomputable coefficient is reported as NaN.
            # Catching Exception (not a bare except) lets KeyboardInterrupt
            # and SystemExit propagate.
            kappa = np.nan

        # Krippendorff's Alpha
        try:
            # Format data as 2D array: [rater1_values, rater2_values]
            reliability_data = np.array([df1[col].values, df2[col].values])
            alpha = krippendorff.alpha(reliability_data, level_of_measurement='nominal')
            alpha = alpha if np.isfinite(alpha) else np.nan
        except Exception:
            alpha = np.nan

        # Percent Agreement
        percent_agreement = np.mean(df1[col] == df2[col]) * 100

        results.append({
            'model': model,
            'criteria': criteria,
            'cohens_kappa': round(kappa, 4) if not np.isnan(kappa) else np.nan,
            'krippendorff_alpha': round(alpha, 4) if not np.isnan(alpha) else np.nan,
            'percent_agreement': round(percent_agreement, 2)
        })

    return pd.DataFrame(results)

def reliability_summary_by_criteria(df1, df2):
    """Aggregate the per-column reliability metrics to one row per criteria.

    For kappa and alpha the mean and the count of non-NaN values are reported;
    for percent agreement the mean and standard deviation. Values are rounded to
    four decimals.
    """
    per_column = calculate_all_reliability_metrics(df1, df2)

    aggregations = {
        'cohens_kappa': ['mean', 'count'],
        'krippendorff_alpha': ['mean', 'count'],
        'percent_agreement': ['mean', 'std']
    }
    # Flat names replacing the two-level (metric, statistic) column index,
    # listed in the same order as the aggregations above.
    flat_names = [
        'kappa_mean', 'kappa_valid',
        'alpha_mean', 'alpha_valid',
        'agreement_mean', 'agreement_std'
    ]

    summary = per_column.groupby('criteria').agg(aggregations).round(4)
    summary.columns = flat_names

    return summary.reset_index()

def quick_reliability_check(df1, df2, column):
    """Print percent agreement, Cohen's Kappa and Krippendorff's Alpha for one column.

    Args:
        df1: Ratings from the first rater.
        df2: Ratings from the second rater.
        column: Name of the rating column to score in both frames.

    A coefficient that raises while being computed is reported as
    "Unable to calculate" instead of stopping the check.
    """
    print(f"Reliability metrics for {column}:")
    print(f"Percent Agreement: {np.mean(df1[column] == df2[column]) * 100:.2f}%")

    try:
        kappa = cohen_kappa_score(df1[column], df2[column])
        print(f"Cohen's Kappa: {kappa:.4f}")
    except Exception:
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt and SystemExit are not swallowed.
        print("Cohen's Kappa: Unable to calculate")

    try:
        # Format data as 2D array: [rater1_values, rater2_values]
        reliability_data = np.array([df1[column].values, df2[column].values])
        alpha = krippendorff.alpha(reliability_data, level_of_measurement='nominal')
        print(f"Krippendorff's Alpha: {alpha:.4f}")
    except Exception:
        print("Krippendorff's Alpha: Unable to calculate")

def interpret_reliability_scores():
    """Print the interpretation bands used for each reliability metric."""
    guide_lines = (
        "Reliability Score Interpretation:",
        "\nPercent Agreement:",
        "  < 70%: Poor agreement",
        "  70-79%: Fair agreement",
        "  80-89%: Good agreement",
        "  90-95%: Excellent agreement",
        "  > 95%: Near-perfect agreement",
        "  Note: It does NOT adjust for chance agreement, so its reliability in high-stakes decisions is limited.",
        "\nCohen's Kappa:",
        "  < 0.20: Slight agreement",
        "  0.21-0.40: Fair agreement",
        "  0.41-0.60: Moderate agreement",
        "  0.61-0.80: Substantial agreement",
        "  0.81-1.00: Almost perfect agreement",
        "Criticized for issues with unbalanced data or prevalence, but still the most common interpretation.",
        "\nKrippendorff's Alpha:",
        "  α < 0.67: Not sufficient for conclusions.",
        "  0.67 ≤ α < 0.80: Tentative conclusions",
        "  α ≥ 0.80: Reliable (high-stakes/decisive conclusions)",
        "Some flexibility exists depending on discipline, but the above are widely cited thresholds.",
    )

    # One print per entry; entries starting with a newline open a new section.
    for line in guide_lines:
        print(line)


# One-liner functions for specific metrics
def cohens_kappa_only(df1, df2, column):
    """Return Cohen's Kappa between the two raters for one column.

    Returns NaN when the computation raises.
    """
    try:
        return cohen_kappa_score(df1[column], df2[column])
    except Exception:
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt and SystemExit are not swallowed.
        return np.nan

def krippendorff_only(df1, df2, column):
    """Return Krippendorff's Alpha (nominal level) between the two raters for one column.

    Returns NaN when the computation raises.
    """
    try:
        # Format data as 2D array: [rater1_values, rater2_values]
        reliability_data = np.array([df1[column].values, df2[column].values])
        return krippendorff.alpha(reliability_data, level_of_measurement='nominal')
    except Exception:
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt and SystemExit are not swallowed.
        return np.nan

In [33]:
# Complete analysis
all_metrics = calculate_all_reliability_metrics(human_rater1, human_rater2)
all_metrics

  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,model,criteria,cohens_kappa,krippendorff_alpha,percent_agreement
0,chatgpt,criteria_1,0.871,0.8745,93.75
1,chatgpt,criteria_2,0.0,0.0,93.75
2,chatgpt,criteria_3,0.0,0.0,93.75
3,chatgpt,criteria_4,,,100.0
4,chatgpt,criteria_5,,,100.0
5,chatgpt,criteria_6,0.6,0.6026,87.5
6,chatgpt,criteria_7,,,100.0
7,claude,criteria_1,0.2,0.1143,68.75
8,claude,criteria_2,,,100.0
9,claude,criteria_3,,,100.0


In [34]:
# Sort model/criteria pairs by Cohen's Kappa (3rd column of all_metrics) in
# descending order. The query has no LIMIT and no secondary sort key, so every
# row is returned and the order among tied rows is not specified.

query = """

select *
from all_metrics
order by 3 desc

"""
duckdb.sql(query).df()

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,model,criteria,cohens_kappa,krippendorff_alpha,percent_agreement
0,deepseek,criteria_1,1.0,1.0,100.0
1,llama,criteria_1,1.0,1.0,100.0
2,llama,criteria_2,1.0,1.0,100.0
3,llama,criteria_3,1.0,1.0,100.0
4,llama,criteria_4,1.0,1.0,100.0
5,llama,criteria_7,1.0,1.0,100.0
6,chatgpt,criteria_1,0.871,0.8745,93.75
7,llama,criteria_6,0.8621,0.8658,93.75
8,deepseek,criteria_6,0.7647,0.7704,93.75
9,deepseek,criteria_7,0.7647,0.7704,93.75


Interesting that there is highest agreement in llama models across all criteria except 5 and 6.

In [35]:
# Summary by criteria
criteria_summary = reliability_summary_by_criteria(human_rater1, human_rater2)
criteria_summary

  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)


Unnamed: 0,criteria,kappa_mean,kappa_valid,alpha_mean,alpha_valid,agreement_mean,agreement_std
0,criteria_1,0.7677,4,0.7472,4,90.625,14.878
1,criteria_2,0.3333,3,0.3222,3,95.3125,5.9839
2,criteria_3,0.4928,3,0.4895,3,93.75,8.8388
3,criteria_4,0.5795,3,0.5919,3,85.9375,17.9518
4,criteria_5,0.6364,1,0.6437,1,98.4375,3.125
5,criteria_6,0.6703,4,0.6768,4,89.0625,5.9839
6,criteria_7,0.5579,3,0.5671,3,93.75,8.8388


`valid` is the number of non-NaN scores that went into the mean. A score is NaN when the column had zero variance across scenarios.

In [36]:
# Sort criteria by kappa_mean (2nd column) in descending order, most agreed at
# the top, breaking ties by alpha_mean (4th column) in descending order.
query = """

select *
from criteria_summary
order by 2 desc, 4 desc

"""
duckdb.sql(query).df()

Unnamed: 0,criteria,kappa_mean,kappa_valid,alpha_mean,alpha_valid,agreement_mean,agreement_std
0,criteria_1,0.7677,4,0.7472,4,90.625,14.878
1,criteria_6,0.6703,4,0.6768,4,89.0625,5.9839
2,criteria_5,0.6364,1,0.6437,1,98.4375,3.125
3,criteria_4,0.5795,3,0.5919,3,85.9375,17.9518
4,criteria_7,0.5579,3,0.5671,3,93.75,8.8388
5,criteria_3,0.4928,3,0.4895,3,93.75,8.8388
6,criteria_2,0.3333,3,0.3222,3,95.3125,5.9839


kappa_mean and alpha_mean line up in rank, but neither lines up with agreement_mean. This is explained by the correction for chance agreement that kappa and alpha apply and percent agreement does not.

In [40]:
# Overall kappa, alpha and percent agreement, each computed as the unweighted
# mean of the per-criteria means in criteria_summary. Every criteria carries the
# same weight regardless of how many valid (non-NaN) columns went into its mean.
query = """

select mean(kappa_mean) as overall_kappa
, mean(alpha_mean) as overall_alpha
, mean(agreement_mean) as overall_agreement
from criteria_summary

"""
duckdb.sql(query).df()

Unnamed: 0,overall_kappa,overall_alpha,overall_agreement
0,0.576843,0.576914,92.410714


In [None]:
# Quick check for one column
# quick_reliability_check(human_rater1, human_rater2, 'chatgpt_criteria_1')

In [None]:
# One-liners
# kappa = cohens_kappa_only(human_rater1, human_rater2, 'claude_criteria_2')
# mcc = mcc_only(human_rater1, human_rater2, 'deepseek_criteria_3')
# alpha = krippendorff_only(human_rater1, human_rater2, 'llama_criteria_4')

In [39]:
# Interpretation guide
interpret_reliability_scores()

Reliability Score Interpretation:

Percent Agreement:
  < 70%: Poor agreement
  70-79%: Fair agreement
  80-89%: Good agreement
  90-95%: Excellent agreement
  > 95%: Near-perfect agreement
  Note: It does NOT adjust for chance agreement, so its reliability in high-stakes decisions is limited.

Cohen's Kappa:
  < 0.20: Slight agreement
  0.21-0.40: Fair agreement
  0.41-0.60: Moderate agreement
  0.61-0.80: Substantial agreement
  0.81-1.00: Almost perfect agreement
Criticized for issues with unbalanced data or prevalence, but still the most common interpretation.

Krippendorff's Alpha:
  α < 0.67: Not sufficient for conclusions.
  0.67 ≤ α < 0.80: Tentative conclusions
  α ≥ 0.80: Reliable (high-stakes/decisive conclusions)
Some flexibility exists depending on discipline, but the above are widely cited thresholds.


For overall:
* kappa = moderate
* alpha = not sufficient
* percentage agreement = excellent

For criteria's kappa mean:
* 1 = substantial
* 2 = fair
* 3 = moderate
* 4 = moderate
* 5 = substantial
* 6 = substantial
* 7 = moderate

# Visualizations

In [50]:
def create_reliability_heatmap(df1, df2):
    """Build one heatmap per reliability metric, with criteria as rows and models as columns.

    Percent agreement is coloured on a 0-100 scale; the two coefficients share a
    -1 to 1 diverging scale. Returns the plotly figure.
    """
    scores = calculate_all_reliability_metrics(df1, df2)

    # Subplot title -> column of the metrics frame.
    metric_columns = {
        'Percent Agreement': 'percent_agreement',
        "Cohen's Kappa": 'cohens_kappa',
        "Krippendorff's Alpha": 'krippendorff_alpha'
    }

    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=list(metric_columns),
        vertical_spacing=0.15,
        horizontal_spacing=0.1
    )

    # Fill the 2x2 grid left-to-right, top-to-bottom; with three metrics the
    # last cell stays empty.
    grid_cells = [(1, 1), (1, 2), (2, 1), (2, 2)]

    for (row, col), metric in zip(grid_cells, metric_columns.values()):
        table = scores.pivot(index='criteria', columns='model', values=metric)

        is_percent = metric == 'percent_agreement'
        heatmap = go.Heatmap(
            z=table.values,
            x=table.columns,
            y=table.index,
            colorscale='Viridis' if is_percent else 'RdBu',
            zmin=0 if is_percent else -1,
            zmax=100 if is_percent else 1,
            text=np.round(table.values, 3),
            texttemplate="%{text}",
            textfont={"size": 10},
            showscale=True
        )
        fig.add_trace(heatmap, row=row, col=col)

    fig.update_layout(
        title="Inter-Rater Reliability Heatmaps by Metric",
        height=700,
        width=1000,
        font=dict(size=12)
    )

    return fig

def create_criteria_comparison_lines(df1, df2):
    """Plot percent agreement across criteria as one line per model.

    A dashed reference line marks the 80% level. Returns the plotly figure.
    """
    scores = calculate_all_reliability_metrics(df1, df2)

    # Model name -> line colour, in plotting order.
    line_colors = {
        'chatgpt': '#FF6B6B',
        'claude': '#4ECDC4',
        'deepseek': '#45B7D1',
        'llama': '#FFA07A'
    }

    fig = go.Figure()

    for model, color in line_colors.items():
        model_rows = scores.loc[scores['model'] == model].sort_values('criteria')
        trace = go.Scatter(
            x=model_rows['criteria'],
            y=model_rows['percent_agreement'],
            mode='lines+markers',
            name=model.title(),
            line=dict(width=3, color=color),
            marker=dict(size=8)
        )
        fig.add_trace(trace)

    # Reference line for the "good agreement" threshold.
    fig.add_hline(y=80, line_dash="dash", line_color="green",
                  opacity=0.5, annotation_text="Good Agreement (80%)")

    fig.update_layout(
        title="Percent Agreement Across Criteria by Model",
        xaxis_title="Criteria",
        yaxis_title="Percent Agreement (%)",
        yaxis=dict(range=[0, 105]),
        height=500,
        width=900,
        font=dict(size=12),
        legend=dict(x=0.02, y=0.98)
    )

    return fig

def create_kappa_comparison_bars(df1, df2):
    """Grouped bar chart of Cohen's Kappa by criteria, one bar group per model.

    Args:
        df1: Ratings from the first rater.
        df2: Ratings from the second rater.

    Returns:
        Plotly figure with one bar trace per model and dashed reference lines at
        0.6 and 0.8. An undefined (NaN) kappa, or a model/criteria pair with no
        row in the metrics frame, is drawn as 0.
    """
    results_df = calculate_all_reliability_metrics(df1, df2)

    fig = go.Figure()

    models = ['chatgpt', 'claude', 'deepseek', 'llama']
    colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728']

    criteria_list = sorted(results_df['criteria'].unique())

    # (model, criteria) -> kappa. setdefault keeps the first value seen for a
    # pair. Looking values up here avoids the IndexError that `.iloc[0]` on an
    # empty selection raises when a model has no row for a criteria.
    kappa_lookup = {}
    for model, criteria, kappa in zip(results_df['model'],
                                      results_df['criteria'],
                                      results_df['cohens_kappa']):
        kappa_lookup.setdefault((model, criteria), kappa)

    for model, color in zip(models, colors):
        kappa_values = []

        for criteria in criteria_list:
            kappa = kappa_lookup.get((model, criteria), np.nan)
            # NaN is plotted as 0 so every criteria keeps a slot in the group.
            kappa_values.append(kappa if not pd.isna(kappa) else 0)

        fig.add_trace(go.Bar(
            name=model.title(),
            x=criteria_list,
            y=kappa_values,
            marker_color=color
        ))

    # Add reference lines
    fig.add_hline(y=0.6, line_dash="dash", line_color="orange",
                  opacity=0.5, annotation_text="Substantial (0.6)")
    fig.add_hline(y=0.8, line_dash="dash", line_color="green",
                  opacity=0.5, annotation_text="Almost Perfect (0.8)")

    fig.update_layout(
        title="Cohen's Kappa by Criteria and Model",
        xaxis_title="Criteria",
        yaxis_title="Cohen's Kappa",
        yaxis=dict(range=[-1, 1]),
        barmode='group',
        height=500,
        width=900,
        font=dict(size=12)
    )

    return fig

def create_averaged_reliability_by_criteria(df1, df2):
    """Grouped bar chart of the average reliability metrics per criteria.

    Args:
        df1: Ratings from the first rater.
        df2: Ratings from the second rater.

    Returns:
        Plotly figure with one bar trace per metric. Kappa and alpha are drawn
        against the left axis; percent agreement is drawn against a secondary
        right-hand axis, because its 0-100 scale would otherwise dwarf the
        coefficient bars.
    """
    summary = reliability_summary_by_criteria(df1, df2)

    fig = go.Figure()

    # (summary column, legend name, bar colour)
    metrics_info = [
        ('kappa_mean', "Cohen's Kappa", '#ff7f0e'),
        ('alpha_mean', "Krippendorff's Alpha", '#d62728'),
        ('agreement_mean', '% Agreement', '#1f77b4')
    ]

    for position, (metric, name, color) in enumerate(metrics_info):
        values = summary[metric].fillna(0)  # Replace NaN with 0 for visualization

        fig.add_trace(go.Bar(
            x=summary['criteria'],
            y=values,
            name=name,
            marker_color=color,
            text=np.round(values, 3),
            textposition='auto',
            # Bars on different y-axes are not grouped with each other by
            # default; a distinct offsetgroup per trace keeps them side by side.
            offsetgroup=position,
            yaxis='y2' if metric == 'agreement_mean' else 'y'
        ))

    # Lowest coefficient shown (never above 0). The percent axis uses the same
    # range multiplied by 100, so both axes share a zero line and a coefficient
    # of 1 is as tall as 100% agreement.
    coefficient_floor = min(
        0.0, float(summary[['kappa_mean', 'alpha_mean']].fillna(0).min().min())
    )

    fig.update_layout(
        title="Average Reliability Metrics by Criteria",
        xaxis_title="Criteria",
        yaxis=dict(
            title="Reliability Score (Kappa / Alpha)",
            range=[coefficient_floor, 1.1]
        ),
        yaxis2=dict(
            title="Percent Agreement (%)",
            range=[coefficient_floor * 100, 110],
            overlaying='y',
            side='right',
            showgrid=False
        ),
        barmode='group',
        height=500,
        width=900,
        font=dict(size=12),
        legend=dict(x=0.02, y=0.98)
    )

    return fig

def create_metric_correlation_scatter(df1, df2):
    """Scatter Cohen's Kappa against percent agreement, one labelled point per column.

    Rows with a NaN in any column are left out. Each point is labelled with the
    first three letters of the model and the last character of the criteria.
    Returns the plotly figure.
    """
    scores = calculate_all_reliability_metrics(df1, df2)
    complete_rows = scores.dropna()

    point_labels = [
        f"{model[:3]}-{criteria[-1]}"
        for model, criteria in zip(complete_rows['model'], complete_rows['criteria'])
    ]

    points = go.Scatter(
        x=complete_rows['percent_agreement'],
        y=complete_rows['cohens_kappa'],
        mode='markers+text',
        text=point_labels,
        textposition="top center",
        name='Model-Criteria Points'
    )

    fig = go.Figure()
    fig.add_trace(points)

    fig.update_layout(
        title="Reliability Metrics Correlation: Kappa vs Percent Agreement",
        xaxis_title="Percent Agreement (%)",
        yaxis_title="Cohen's Kappa",
        height=500,
        width=800,
        font=dict(size=12)
    )

    return fig

def create_reliability_distribution(df1, df2):
    """Overlaid histograms showing the spread of each reliability metric.

    Args:
        df1: Ratings from the first rater.
        df2: Ratings from the second rater.

    Returns:
        Plotly figure with one histogram per metric that has at least one
        non-NaN value. Percent agreement is rescaled from 0-100 to a 0-1
        proportion so it is comparable with the kappa and alpha histograms on
        the shared x-axis.
    """
    results_df = calculate_all_reliability_metrics(df1, df2)

    fig = go.Figure()

    # (metrics column, legend name, bar colour)
    metrics_info = [
        ('cohens_kappa', "Cohen's Kappa", '#ff7f0e'),
        ('krippendorff_alpha', "Krippendorff's Alpha", '#d62728'),
        ('percent_agreement', 'Percent Agreement (as proportion)', '#1f77b4')
    ]

    for metric, name, color in metrics_info:
        valid_values = results_df[metric].dropna()
        if metric == 'percent_agreement':
            # Bring the 0-100 percentage onto the same scale as the
            # coefficients; otherwise their histograms collapse into a sliver
            # near zero on the shared axis.
            valid_values = valid_values / 100
        if len(valid_values) > 0:
            fig.add_trace(go.Histogram(
                x=valid_values,
                name=name,
                opacity=0.7,
                nbinsx=15,
                marker_color=color
            ))

    fig.update_layout(
        title="Distribution of Reliability Scores",
        xaxis_title="Reliability Score",
        yaxis_title="Frequency",
        barmode='overlay',
        height=500,
        width=800,
        font=dict(size=12),
        legend=dict(x=0.7, y=0.9)
    )

    return fig

In [None]:
# 1. Heatmaps of all metrics
# fig1 = create_reliability_heatmap(human_rater1, human_rater2)
# fig1.show()

In [42]:
# 2. Line plot of percent agreement
# fig2 = create_criteria_comparison_lines(human_rater1, human_rater2)
# fig2.show()


invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide



In [43]:
# # 3. Grouped bars for Cohen's Kappa
# fig3 = create_kappa_comparison_bars(human_rater1, human_rater2)
# fig3.show()


invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide



In [45]:
# # 4. Average reliability by criteria
# fig4 = create_averaged_reliability_by_criteria(human_rater1, human_rater2)
# fig4.show()

In [48]:
# # 5. Correlation scatter plot
# fig5 = create_metric_correlation_scatter(human_rater1, human_rater2)
# fig5.show()


invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide



In [49]:
# # 6. Distribution histogram
# fig6 = create_reliability_distribution(human_rater1, human_rater2)
# fig6.show()


invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide

