In [25]:
import os
import glob
import json
import pandas as pd


def load_reviews(folder_path):
    rows = []
    # find all JSON files in the folder
    for file_path in glob.glob(os.path.join(folder_path, '*.json')):
        with open(file_path, 'r') as f:
            data = json.load(f)
        
        paper_id = data.get('paper_id')
        assessor = data.get('assessor')
        metrics = data.get('metrics', {})
        
        # group metrics by reviewer name
        reviewer_metrics = {}
        for key, value in metrics.items():
            # only process keys that start with "review_"
            if not key.startswith('review_'):
                continue
            parts = key.split('_')
            reviewer = parts[1]                          # e.g. "Palwinder-Singh"
            metric_name = '_'.join(parts[2:])            # e.g. "Comprehensiveness"
            
            reviewer_metrics.setdefault(reviewer, {})
            reviewer_metrics[reviewer][metric_name] = value
        
        # turn each reviewer’s metrics into a row
        for reviewer, mdict in reviewer_metrics.items():
            row = {
                'paper_id': paper_id,
                'assessor': assessor,
                'reviewer': reviewer
            }
            row.update(mdict)
            rows.append(row)
    
    # build the final DataFrame
    df = pd.DataFrame(rows)
    return df

# Example usage:
folder = 'Human_Annotation_Data'
df = load_reviews(folder)

# show the first few rows
df = df[df['Overall_Quality'] > 5]
df

Unnamed: 0,paper_id,assessor,reviewer,Comprehensiveness,Usage_of_Technical_Terms,Factuality,Sentiment_Polarity,Politeness,Vagueness,Objectivity,Fairness,Actionability,Constructiveness,Relevance_Alignment,Clarity_and_Readability,Overall_Quality
0,166,Sajad-Ebrahimi,Reviewer-7mFW,2,4,factual,neutral,polite,high,4,4,4,3,4,4,67
1,166,Sajad-Ebrahimi,Reviewer-FAWm,4,4,factual,neutral,polite,none,4,4,5,5,5,4,86
2,166,Sajad-Ebrahimi,Reviewer-kjkr,3,4,factual,neutral,polite,low,4,4,5,5,4,5,75
3,100,Seyed,Enrico-Daga,3,2,factual,positive,polite,none,4,5,4,4,4,4,80
4,100,Seyed,Julia-Bosque,5,4,factual,positive,polite,low,4,4,4,4,5,4,87
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
483,75,Ali-Ghorbanpour,Reviewer-s437,3,3,partially factual,neutral,polite,low,2,3,2,3,3,3,55
484,75,Ali-Ghorbanpour,Reviewer-mMGf,4,4,factual,negative,polite,none,4,4,4,5,4,4,80
485,75,Ali-Ghorbanpour,Reviewer-AtQ2,5,4,factual,positive,polite,none,4,4,3,4,4,5,90
486,75,Ali-Ghorbanpour,Reviewer-v6cq,2,3,partially factual,positive,polite,moderate,3,3,2,3,3,3,50


In [26]:
df_llama = pd.read_json('final_data/HA_ALL_llama.json', orient='records', lines=True)
df_qwen = pd.read_json('final_data/HA_ALL_qwen.json')

In [29]:
# remove 'llm_' from all columns prefix in df_llama and df_qwen
df_llama.columns = df_llama.columns.str.replace(' ', '_', regex=False)
df_qwen.columns = df_qwen.columns.str.replace(' ', '_', regex=False)

In [31]:
# print column names of df_llama and df_qwen and df
# print("Columns in df_llama:")
# print(df_llama.columns.tolist())
# print("Columns in df_qwen:")
# print(df_qwen.columns.tolist())
# print("Columns in df:")
# print(df.columns.tolist())

# print shared columns between df_llama and df
shared_columns_llama = set(df_llama.columns) & set(df.columns)
print("Shared columns between df_llama and df:")
print(shared_columns_llama)
# print shared columns between df_qwen and df
shared_columns_qwen = set(df_qwen.columns) & set(df.columns)
print("Shared columns between df_qwen and df:")
print(shared_columns_qwen)

Shared columns between df_llama and df:
{'Comprehensiveness', 'Politeness', 'Usage_of_Technical_Terms', 'paper_id', 'Relevance_Alignment', 'Objectivity', 'reviewer', 'Sentiment_Polarity', 'Vagueness', 'Factuality', 'Fairness', 'Actionability', 'Overall_Quality', 'Constructiveness', 'Clarity_and_Readability'}
Shared columns between df_qwen and df:
{'Comprehensiveness', 'Politeness', 'Usage_of_Technical_Terms', 'paper_id', 'Relevance_Alignment', 'Objectivity', 'reviewer', 'Sentiment_Polarity', 'Vagueness', 'Factuality', 'Fairness', 'Actionability', 'Overall_Quality', 'Constructiveness', 'Clarity_and_Readability'}


In [65]:
# Convert paper_id and reviewer to string in all DataFrames
for dframe in [df, df_qwen, df_llama]:
    dframe['paper_id'] = dframe['paper_id'].astype(str)
    dframe['reviewer'] = dframe['reviewer'].astype(str)

# Define shared metrics (excluding paper_id and reviewer)
shared_metrics = [
    'Comprehensiveness', 'Usage_of_Technical_Terms', 'Relevance_Alignment',
    'Objectivity', 'Sentiment_Polarity', 'Vagueness', 'Factuality',
    'Fairness', 'Actionability', 'Overall_Quality', 'Constructiveness',
    'Clarity_and_Readability', 'Politeness'
]

# Rename columns with prefixes
df_human_renamed = df.rename(columns={col: f'Human_{col}' for col in shared_metrics})
df_qwen_renamed = df_qwen.rename(columns={col: f'Qwen_{col}' for col in shared_metrics})
df_llama_renamed = df_llama.rename(columns={col: f'Llama_{col}' for col in shared_metrics})

# in df_llama and df_qwen just for column reviewer, replace '_' and ' ' with '-'
df_llama_renamed['reviewer'] = df_llama_renamed['reviewer'].str.replace('_', '-', regex=False)
df_qwen_renamed['reviewer'] = df_qwen_renamed['reviewer'].str.replace('_', '-', regex=False)
df_llama_renamed['reviewer'] = df_llama_renamed['reviewer'].str.replace(' ', '-', regex=False)
df_qwen_renamed['reviewer'] = df_qwen_renamed['reviewer'].str.replace(' ', '-', regex=False)


# Merge Qwen and Llama first, then merge with Human
df_human_vs_llm = (
    df_qwen_renamed
    .merge(df_llama_renamed, on=['paper_id', 'reviewer'], how='inner')
    .merge(df_human_renamed, on=['paper_id', 'reviewer'], how='inner')
)

# Handle missing values and sort columns
# df_human_vs_llm = df_human_vs_llm.fillna('N/A')

# Create ordered column list (Human first, then Qwen, then Llama)
column_order = (
    ['paper_id', 'reviewer'] + 
    sorted([col for col in df_human_vs_llm if col.startswith('Human_')]) +
    sorted([col for col in df_human_vs_llm if col.startswith('Qwen_')]) +
    sorted([col for col in df_human_vs_llm if col.startswith('Llama_')])
)

df_human_vs_llm = df_human_vs_llm[column_order]
df_human_vs_llm

Unnamed: 0,paper_id,reviewer,Human_Actionability,Human_Clarity_and_Readability,Human_Comprehensiveness,Human_Constructiveness,Human_Factuality,Human_Fairness,Human_Objectivity,Human_Overall_Quality,...,Llama_Constructiveness,Llama_Factuality,Llama_Fairness,Llama_Objectivity,Llama_Overall_Quality,Llama_Politeness,Llama_Relevance_Alignment,Llama_Sentiment_Polarity,Llama_Usage_of_Technical_Terms,Llama_Vagueness
0,123,Reviewer-EGJf,3,4,3,2,partially factual,3,2,30,...,3.0,partially factual,3.0,4.0,60.0,polite,4.0,neutral,4.0,low
1,123,Reviewer-DWom,2,4,2,2,partially factual,3,3,50,...,4.0,factual,4.0,4.0,80.0,polite,5.0,neutral,3.0,low
2,123,Reviewer-PnHf,4,4,4,4,factual,3,4,70,...,4.0,factual,4.0,4.0,80.0,polite,5.0,positive,4.0,none
3,123,Reviewer-ekPo,3,4,4,3,factual,3,3,65,...,4.0,factual,4.0,4.0,80.0,polite,5.0,neutral,5.0,none
4,0,Reviewer-HFRa,4,4,4,4,factual,4,4,80,...,3.0,factual,4.0,4.0,70.0,polite,4.0,neutral,3.0,low
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
552,118,Anonymous,4,4,4,4,factual,4,4,82,...,2.0,unfactual,3.0,2.0,60.0,neutral,4.0,negative,4.0,moderate
553,118,Anonymous,4,4,4,4,factual,4,4,80,...,2.0,unfactual,3.0,2.0,60.0,neutral,4.0,negative,4.0,moderate
554,76,Anonymous,3,3,5,4,factual,4,4,90,...,2.0,factual,3.0,4.0,60.0,polite,4.0,neutral,4.0,none
555,76,Ghislain-Hachey,4,3,4,4,factual,4,3,80,...,2.0,partially factual,3.0,2.0,60.0,polite,4.0,neutral,4.0,low


In [69]:
df_human_vs_llm = df_human_vs_llm.dropna()
df_human_vs_llm

Unnamed: 0,paper_id,reviewer,Human_Actionability,Human_Clarity_and_Readability,Human_Comprehensiveness,Human_Constructiveness,Human_Factuality,Human_Fairness,Human_Objectivity,Human_Overall_Quality,...,Llama_Constructiveness,Llama_Factuality,Llama_Fairness,Llama_Objectivity,Llama_Overall_Quality,Llama_Politeness,Llama_Relevance_Alignment,Llama_Sentiment_Polarity,Llama_Usage_of_Technical_Terms,Llama_Vagueness
0,123,Reviewer-EGJf,3,4,3,2,partially factual,3,2,30,...,3.0,partially factual,3.0,4.0,60.0,polite,4.0,neutral,4.0,low
1,123,Reviewer-DWom,2,4,2,2,partially factual,3,3,50,...,4.0,factual,4.0,4.0,80.0,polite,5.0,neutral,3.0,low
2,123,Reviewer-PnHf,4,4,4,4,factual,3,4,70,...,4.0,factual,4.0,4.0,80.0,polite,5.0,positive,4.0,none
3,123,Reviewer-ekPo,3,4,4,3,factual,3,3,65,...,4.0,factual,4.0,4.0,80.0,polite,5.0,neutral,5.0,none
4,0,Reviewer-HFRa,4,4,4,4,factual,4,4,80,...,3.0,factual,4.0,4.0,70.0,polite,4.0,neutral,3.0,low
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
552,118,Anonymous,4,4,4,4,factual,4,4,82,...,2.0,unfactual,3.0,2.0,60.0,neutral,4.0,negative,4.0,moderate
553,118,Anonymous,4,4,4,4,factual,4,4,80,...,2.0,unfactual,3.0,2.0,60.0,neutral,4.0,negative,4.0,moderate
554,76,Anonymous,3,3,5,4,factual,4,4,90,...,2.0,factual,3.0,4.0,60.0,polite,4.0,neutral,4.0,none
555,76,Ghislain-Hachey,4,3,4,4,factual,4,3,80,...,2.0,partially factual,3.0,2.0,60.0,polite,4.0,neutral,4.0,low


In [70]:
print("Unique values per column:")
for col in df_human_vs_llm.columns:
    if col not in ['paper_id', 'reviewer']:
        unique_vals = df_human_vs_llm[col].unique()
        try:
            # Try numerical sorting
            sorted_vals = sorted(unique_vals)
        except TypeError:
            # Fallback to string sorting for mixed types
            sorted_vals = sorted(unique_vals, key=lambda x: str(x))
        
        print(f"\n{col} ({len(sorted_vals)} unique values):")
        print(*sorted_vals[:10], sep=', ')  # Show first 10 values
        if len(sorted_vals) > 10:
            print(f"... plus {len(sorted_vals)-10} more values")

Unique values per column:

Human_Actionability (6 unique values):
0, 1, 2, 3, 4, 5

Human_Clarity_and_Readability (6 unique values):
0, 1, 2, 3, 4, 5

Human_Comprehensiveness (6 unique values):
0, 1, 2, 3, 4, 5

Human_Constructiveness (6 unique values):
0, 1, 2, 3, 4, 5

Human_Factuality (3 unique values):
factual, partially factual, unfactual

Human_Fairness (6 unique values):
0, 1, 2, 3, 4, 5

Human_Objectivity (6 unique values):
0, 1, 2, 3, 4, 5

Human_Overall_Quality (50 unique values):
10, 12, 19, 20, 25, 29, 30, 33, 35, 37
... plus 40 more values

Human_Politeness (3 unique values):
impolite, neutral, polite

Human_Relevance_Alignment (6 unique values):
0, 1, 2, 3, 4, 5

Human_Sentiment_Polarity (3 unique values):
negative, neutral, positive

Human_Usage_of_Technical_Terms (6 unique values):
0, 1, 2, 3, 4, 5

Human_Vagueness (5 unique values):
extreme, high, low, moderate, none

Qwen_Actionability (6 unique values):
0, 1, 2, 3, 4, 5

Qwen_Clarity_and_Readability (5 unique values)

In [71]:
df_human_vs_llm.columns

Index(['paper_id', 'reviewer', 'Human_Actionability',
       'Human_Clarity_and_Readability', 'Human_Comprehensiveness',
       'Human_Constructiveness', 'Human_Factuality', 'Human_Fairness',
       'Human_Objectivity', 'Human_Overall_Quality', 'Human_Politeness',
       'Human_Relevance_Alignment', 'Human_Sentiment_Polarity',
       'Human_Usage_of_Technical_Terms', 'Human_Vagueness',
       'Qwen_Actionability', 'Qwen_Clarity_and_Readability',
       'Qwen_Comprehensiveness', 'Qwen_Constructiveness', 'Qwen_Factuality',
       'Qwen_Fairness', 'Qwen_Objectivity', 'Qwen_Overall_Quality',
       'Qwen_Politeness', 'Qwen_Relevance_Alignment',
       'Qwen_Sentiment_Polarity', 'Qwen_Usage_of_Technical_Terms',
       'Qwen_Vagueness', 'Llama_Actionability',
       'Llama_Clarity_and_Readability', 'Llama_Comprehensiveness',
       'Llama_Constructiveness', 'Llama_Factuality', 'Llama_Fairness',
       'Llama_Objectivity', 'Llama_Overall_Quality', 'Llama_Politeness',
       'Llama_Relevance_A

In [80]:
import pandas as pd
import numpy as np
from sklearn.metrics import cohen_kappa_score, mean_absolute_error, mean_squared_error
from scipy.stats import pearsonr, spearmanr
import krippendorff

# Define valid categories for categorical metrics
CATEGORY_MAP = {
    'Factuality': ['factual', 'partially factual', 'unfactual'],
    'Politeness': ['polite', 'neutral', 'impolite'],
    'Sentiment_Polarity': ['positive', 'neutral', 'negative'],
    'Vagueness': ['low', 'moderate', 'high', 'extreme', 'none']
}

# Define metric types
numerical_metrics = [
    'Actionability', 'Clarity_and_Readability', 'Comprehensiveness',
    'Constructiveness', 'Fairness', 'Objectivity', 'Overall_Quality',
    'Relevance_Alignment', 'Usage_of_Technical_Terms'
]

categorical_metrics = list(CATEGORY_MAP.keys())

def preprocess_data(df):
    """Handle data type conversions and invalid values"""
    # Process numerical metrics
    for metric in numerical_metrics:
        for prefix in ['Human', 'Qwen', 'Llama']:
            col = f"{prefix}_{metric}"
            # Convert to numeric, coerce errors to NaN
            df[col] = pd.to_numeric(df[col], errors='coerce')
    
    # Process categorical metrics
    for metric in categorical_metrics:
        valid_categories = CATEGORY_MAP[metric]
        for prefix in ['Human', 'Qwen', 'Llama']:
            col = f"{prefix}_{metric}"
            # Convert to string and lowercase for consistency
            df[col] = df[col].astype(str).str.lower().str.strip()
            # Mark invalid values as NaN
            df[col] = df[col].where(df[col].isin(valid_categories))
    
    return df.dropna()

# Preprocess the entire dataframe
processed_df = preprocess_data(df_human_vs_llm.copy())

results = []

for metric in numerical_metrics + categorical_metrics:
    for pair in [('Human', 'Qwen'), ('Human', 'Llama')]:  # , ('Qwen', 'Llama')
        col1 = f"{pair[0]}_{metric}"
        col2 = f"{pair[1]}_{metric}"
        
        # Filter valid pairs specifically for this metric pair
        valid_data = processed_df[[col1, col2]].dropna()
        
        if len(valid_data) < 2:
            continue
            
        entry = {
            'metric': metric,
            'pair': f"{pair[0]}-{pair[1]}",
            'n_samples': len(valid_data)
        }
        
        if metric in numerical_metrics:
            # Ensure numerical types
            valid_data = valid_data.apply(pd.to_numeric, errors='coerce').dropna()
            
            entry.update({
                'pearson': pearsonr(valid_data[col1], valid_data[col2])[0],
                'spearman': spearmanr(valid_data[col1], valid_data[col2])[0],
                'MAE': mean_absolute_error(valid_data[col1], valid_data[col2]),
                'RMSE': np.sqrt(mean_squared_error(valid_data[col1], valid_data[col2]))
            })
            
            try:
                entry['krippendorff_alpha'] = krippendorff.alpha(
                    reliability_data=[valid_data[col1].values, 
                    valid_data[col2].values],
                    level_of_measurement='interval'
                )
            except:
                entry['krippendorff_alpha'] = np.nan
                
        if metric in categorical_metrics:
            # Additional check for categorical consistency
            valid_data = valid_data[
                valid_data[col1].isin(CATEGORY_MAP[metric]) & 
                valid_data[col2].isin(CATEGORY_MAP[metric])
            ]
            
            if len(valid_data) < 2:
                continue
                
            entry['n_samples'] = len(valid_data)
            entry['cohen_kappa'] = cohen_kappa_score(
                valid_data[col1], 
                valid_data[col2]
            )
            
            try:
                entry['krippendorff_alpha'] = krippendorff.alpha(
                    reliability_data=[valid_data[col1].values, 
                    valid_data[col2].values],
                    level_of_measurement='nominal'
                )
            except:
                entry['krippendorff_alpha'] = np.nan
                
        results.append(entry)

agreement_df = pd.DataFrame(results)

# Add significance indicators
def add_significance(row):
    if row['n_samples'] < 30:
        return '*' * (3 - (row['n_samples'] // 10))
    return ''

agreement_df['sig'] = agreement_df.apply(add_significance, axis=1)

column_order = [
    'metric', 'pair', 'n_samples', 'sig',
    'pearson', 'spearman', 'MAE', 'RMSE',
    'cohen_kappa', 'krippendorff_alpha'
]
agreement_df = agreement_df[column_order]

print(agreement_df.sort_values(['metric', 'pair']).to_string())

# save agreement_df.sort_values(['metric', 'pair']) to csv
agreement_df.sort_values(['metric', 'pair']).to_csv('final_data/human_vs_llm.csv', index=False)

                      metric         pair  n_samples sig   pearson  spearman        MAE       RMSE  cohen_kappa  krippendorff_alpha
1              Actionability  Human-Llama        487     -0.163561 -0.122997   1.685832   1.980915          NaN           -0.299412
0              Actionability   Human-Qwen        487      0.391953  0.322687   1.078029   1.514475          NaN            0.246886
3    Clarity_and_Readability  Human-Llama        487      0.002318  0.055896   0.669405   1.047143          NaN           -0.007293
2    Clarity_and_Readability   Human-Qwen        487      0.145476  0.158779   0.735113   1.066573          NaN            0.031274
5          Comprehensiveness  Human-Llama        487     -0.082886 -0.103371   1.098563   1.529317          NaN           -0.135884
4          Comprehensiveness   Human-Qwen        487      0.425355  0.383133   0.946612   1.314115          NaN            0.249033
7           Constructiveness  Human-Llama        487     -0.100116 -0.090724

In [81]:
tmp = pd.read_csv('final_data/human_vs_llm.csv')
tmp

Unnamed: 0,metric,pair,n_samples,sig,pearson,spearman,MAE,RMSE,cohen_kappa,krippendorff_alpha
0,Actionability,Human-Llama,487,,-0.163561,-0.122997,1.685832,1.980915,,-0.299412
1,Actionability,Human-Qwen,487,,0.391953,0.322687,1.078029,1.514475,,0.246886
2,Clarity_and_Readability,Human-Llama,487,,0.002318,0.055896,0.669405,1.047143,,-0.007293
3,Clarity_and_Readability,Human-Qwen,487,,0.145476,0.158779,0.735113,1.066573,,0.031274
4,Comprehensiveness,Human-Llama,487,,-0.082886,-0.103371,1.098563,1.529317,,-0.135884
5,Comprehensiveness,Human-Qwen,487,,0.425355,0.383133,0.946612,1.314115,,0.249033
6,Constructiveness,Human-Llama,487,,-0.100116,-0.090724,1.297741,1.689441,,-0.08413
7,Constructiveness,Human-Qwen,487,,0.270373,0.22993,1.227926,1.711178,,0.108562
8,Factuality,Human-Llama,487,,,,,,-0.051325,
9,Factuality,Human-Qwen,487,,,,,,0.085174,
