In [2]:
import pandas as pd

In [3]:
# Load the two manual-validation sheets that the cells below compare cell by cell.
# NOTE(review): `may` and `kela` look like the two human annotators — confirm.
# Both files are compared positionally later on, so they are presumably expected
# to share the same row and column order — TODO confirm.
may = pd.read_csv('../pipeline/src/data/may_base_scenarios_manual_validation.csv')
kela = pd.read_csv('../pipeline/src/data/base_scenarios_manual_validation.csv')

In [4]:
# Element-wise mismatch mask over every column after the first, counted in a
# single pass over the underlying array.
diff_count = (may.iloc[:, 1:] != kela.iloc[:, 1:]).to_numpy().sum()
print("Number of different cells (excluding the first column):", diff_count)
# Total compared cells = rows x (columns after the first).
print("Total number of cells (excluding the first column):", len(may) * len(may.columns[1:]))

Number of different cells (excluding the first column): 34
Total number of cells (excluding the first column): 448


In [5]:
from sklearn.metrics import cohen_kappa_score

# Pool every rating cell (all columns after the leading 'id' column) into one
# 1-D vector per rater, so kappa is computed over all cells at once.
may_ratings = may.iloc[:, 1:].to_numpy().ravel()
kela_ratings = kela.iloc[:, 1:].to_numpy().ravel()

# Chance-corrected agreement between the two manual validation sheets.
kappa = cohen_kappa_score(may_ratings, kela_ratings)
print("Cohen's kappa:", kappa)

Cohen's kappa: 0.7990713381173491


In [6]:
from scipy.stats import pearsonr

# One flat vector of ratings per rater (every column after the leading 'id').
may_flat = may.iloc[:, 1:].to_numpy().ravel()
kela_flat = kela.iloc[:, 1:].to_numpy().ravel()

# Linear correlation between the two pooled rating vectors, with its p-value.
corr, p_value = pearsonr(may_flat, kela_flat)
print("Pearson's correlation coefficient:", corr)
print("p-value:", p_value)

Pearson's correlation coefficient: 0.8018203347496713
p-value: 8.714720238929512e-102


In [7]:
from scipy.stats import chi2_contingency
import numpy as np

# Flatten the ratings (excluding the 'id' column) into one vector per rater.
may_bin = may.iloc[:, 1:].values.flatten()
kela_bin = kela.iloc[:, 1:].values.flatten()

# Build the 2x2 contingency table: table[a, b] counts the cells that `may`
# rated `a` and `kela` rated `b`. The ratings are used directly as array
# indices, so this assumes they are coded as the integers 0 and 1.
table = np.zeros((2, 2), dtype=int)
for a, b in zip(may_bin, kela_bin):
    table[a, b] += 1

# Calculate phi coefficient.
# sqrt(chi2 / n) is only the magnitude |phi|: chi-square is non-negative, so on
# its own it cannot distinguish agreement from systematic disagreement. The
# sign is taken from the determinant of the 2x2 table (agreeing diagonal minus
# disagreeing diagonal), which makes phi comparable to Pearson's r on the same
# binary vectors. Python ints are used for the products to avoid any overflow.
chi2, _, _, _ = chi2_contingency(table, correction=False)
n = table.sum()
determinant = int(table[0, 0]) * int(table[1, 1]) - int(table[0, 1]) * int(table[1, 0])
phi = np.sign(determinant) * np.sqrt(chi2 / n)
print("Phi coefficient:", phi)

Phi coefficient: 0.8018203347496715


In [13]:
# Load the three "*_as_judge_binary_*" rating files plus the manual validation
# sheet they are scored against in the cells below.
# NOTE(review): `mistral` is read from a "magistral_as_judge..." file — confirm
# which model name is intended.
# `kela` is re-read from the same path as the earlier load, so this section does
# not rely on that earlier cell having been run.
qwen = pd.read_csv('../pipeline/src/data/qwen_as_judge_binary_2025-08-23-12-44-27.csv')
gemini = pd.read_csv('../pipeline/src/data/gemini_as_judge_binary_2025-08-23-12-59-15.csv')
mistral = pd.read_csv('../pipeline/src/data/magistral_as_judge_binary_2025-08-23-13-07-35.csv')
kela = pd.read_csv('../pipeline/src/data/base_scenarios_manual_validation.csv')

In [12]:
from sklearn.metrics import cohen_kappa_score

# Pool every rating cell (all columns after the leading 'id' column) into one
# 1-D vector each for the qwen judge and the manual sheet.
qwen_ratings = qwen.iloc[:, 1:].to_numpy().ravel()
kela_ratings = kela.iloc[:, 1:].to_numpy().ravel()

# Chance-corrected agreement between the qwen judge and the manual ratings.
kappa = cohen_kappa_score(qwen_ratings, kela_ratings)
print("Cohen's kappa:", kappa)

Cohen's kappa: 0.6667388011252975


In [14]:
# Pool every rating cell (all columns after the leading 'id' column) into one
# 1-D vector each for the gemini judge and the manual sheet.
gemini_ratings = gemini.iloc[:, 1:].to_numpy().ravel()
kela_ratings = kela.iloc[:, 1:].to_numpy().ravel()

# Chance-corrected agreement between the gemini judge and the manual ratings.
kappa = cohen_kappa_score(gemini_ratings, kela_ratings)
print("Cohen's kappa:", kappa)

Cohen's kappa: 0.7490943755958055


In [15]:
# Pool every rating cell (all columns after the leading 'id' column) into one
# 1-D vector each for the mistral judge and the manual sheet.
mistral_ratings = mistral.iloc[:, 1:].to_numpy().ravel()
kela_ratings = kela.iloc[:, 1:].to_numpy().ravel()

# Chance-corrected agreement between the mistral judge and the manual ratings.
kappa = cohen_kappa_score(mistral_ratings, kela_ratings)
print("Cohen's kappa:", kappa)

Cohen's kappa: 0.2831016825164594


In [16]:
def parse_column_name(col_name):
    """Split a ``<model>_<criteria>`` column name at its first underscore.

    Args:
        col_name: Column name such as ``"qwen_criteria_1"``.

    Returns:
        A ``(model, criteria)`` tuple: ``model`` is the text before the first
        underscore and ``criteria`` is everything after it (so it may itself
        contain underscores, e.g. ``"criteria_1"``). Names with fewer than two
        underscores are not treated as model-criteria columns and yield
        ``(None, None)``.
    """
    # Guard clause: at least three underscore-separated parts are required.
    if col_name.count('_') < 2:
        return None, None

    model, criteria = col_name.split('_', 1)
    return model, criteria

In [18]:


# Percent Agreement calculation
def calculate_percent_agreement(df1, df2):
    """Compute the percentage of matching values for each column of ``df1``.

    Every column of ``df1`` except ``'id'`` is compared element-wise with the
    column of the same name in ``df2``.

    Args:
        df1: DataFrame whose columns (other than ``'id'``) drive the comparison.
        df2: DataFrame that must contain the same column names.

    Returns:
        A DataFrame with one row per compared column and the columns
        ``model``, ``criteria`` (both from ``parse_column_name``) and
        ``percent_agreement`` (0-100, rounded to two decimals).
    """
    records = []
    for col in df1.columns:
        if col == 'id':
            continue

        # Share of rows on which both frames hold the same value.
        matches = df1[col] == df2[col]
        model, criteria = parse_column_name(col)

        records.append({
            'model': model,
            'criteria': criteria,
            'percent_agreement': round(matches.mean() * 100, 2),
        })

    return pd.DataFrame(records)

def agreement_summary_by_criteria(df1, df2):
    """Summarise percent agreement per criteria across all models.

    Args:
        df1: First ratings DataFrame, passed to ``calculate_percent_agreement``.
        df2: Second ratings DataFrame, passed to ``calculate_percent_agreement``.

    Returns:
        A DataFrame with one row per ``criteria`` value and the columns
        ``criteria``, ``avg_agreement``, ``min_agreement``, ``max_agreement``
        and ``std_agreement``, all rounded to two decimals.
    """
    per_column = calculate_percent_agreement(df1, df2)

    # Named aggregation yields the final column names directly.
    return (
        per_column
        .groupby('criteria')['percent_agreement']
        .agg(
            avg_agreement='mean',
            min_agreement='min',
            max_agreement='max',
            std_agreement='std',
        )
        .round(2)
        .reset_index()
    )

# Per-criteria agreement summary of each judge against the manual ratings,
# printed in the order qwen, gemini, mistral.
for judge_ratings in (qwen, gemini, mistral):
    agreement_df = agreement_summary_by_criteria(judge_ratings, kela)
    print(agreement_df)

     criteria  avg_agreement  min_agreement  max_agreement  std_agreement
0  criteria_1          70.31          50.00           87.5          15.63
1  criteria_2          95.31          87.50          100.0           5.98
2  criteria_3          90.62          68.75          100.0          14.88
3  criteria_4          84.38          75.00          100.0          10.83
4  criteria_5          98.44          93.75          100.0           3.12
5  criteria_6          81.25          62.50           87.5          12.50
6  criteria_7          93.75          87.50          100.0           7.22
     criteria  avg_agreement  min_agreement  max_agreement  std_agreement
0  criteria_1          93.75          81.25         100.00           8.84
1  criteria_2          95.31          87.50         100.00           5.98
2  criteria_3          90.62          68.75         100.00          14.88
3  criteria_4          87.50          68.75          93.75          12.50
4  criteria_5          98.44          