In [1]:
import pandas as pd
from scipy import stats

In [4]:
df1 = pd.read_csv("evaluation_scores_1.csv")
df2 = pd.read_csv("evaluation_scores_2.csv")
df3 = pd.read_csv("evaluation_scores_3.csv")
df4 = pd.read_csv("evaluation_scores_4.csv")
df5 = pd.read_csv("evaluation_scores_5.csv")

In [5]:
df1.describe()

Unnamed: 0,patient_id,agentic_factual_accuracy,agentic_clinical_relevance,agentic_consistency,agentic_average,nonagentic_factual_accuracy,nonagentic_clinical_relevance,nonagentic_consistency,nonagentic_average
count,29.0,29.0,29.0,29.0,29.0,29.0,29.0,29.0,29.0
mean,292537.344828,6.758621,7.448276,7.827586,7.344138,2.137931,1.586207,3.0,2.241379
std,257137.313777,0.786274,1.020721,1.582696,0.93225,3.259318,2.73231,4.035556,3.085921
min,129391.0,5.0,6.0,3.0,4.67,0.0,0.0,0.0,0.0
25%,140376.0,7.0,6.0,7.0,7.0,0.0,0.0,0.0,0.0
50%,160310.0,7.0,8.0,8.0,7.33,0.0,0.0,0.0,0.0
75%,302154.0,7.0,8.0,9.0,8.0,5.0,3.0,7.0,5.0
max,962227.0,8.0,9.0,10.0,9.0,10.0,10.0,10.0,10.0


In [6]:
runs = [df1, df2, df3, df4, df5]

for i, df in enumerate(runs, start=1):
    df['run'] = i
all_runs = pd.concat(runs, ignore_index=True)

In [7]:
# 1. Aggregate across runs: per-patient mean & SD
metrics = [
    'agentic_factual_accuracy', 'agentic_clinical_relevance', 'agentic_consistency',
    'nonagentic_factual_accuracy', 'nonagentic_clinical_relevance', 'nonagentic_consistency'
]
agg = all_runs.groupby('patient_id')[metrics] \
              .agg(['mean', 'std']) \
              .reset_index()
# Flatten columns
agg.columns = ['_'.join(col).rstrip('_') for col in agg.columns.values]

In [9]:
# 2. Descriptive statistics across patients
# Compute descriptive stats (mean, median, std_dev)
desc = agg.loc[:, [f"{m}_mean" for m in metrics]].describe().transpose()[['mean', '50%', 'std']].rename(
    columns={'50%': 'median', 'std': 'std_dev'}
)

# Compute IQR manually and convert to DataFrame for join
iqr = (
    agg.loc[:, [f"{m}_mean" for m in metrics]]
    .quantile(0.75) - agg.loc[:, [f"{m}_mean" for m in metrics]].quantile(0.25)
)
iqr.index = iqr.index.str.replace('_mean', '_IQR')
iqr = iqr.to_frame(name='IQR')

# Join IQR into desc
desc = desc.join(iqr)

In [10]:
summary = pd.DataFrame({
    'metric': ['Factual Accuracy', 'Clinical Relevance', 'Consistency'],
    'agentic_mean': [
        agg['agentic_factual_accuracy_mean'].mean(),
        agg['agentic_clinical_relevance_mean'].mean(),
        agg['agentic_consistency_mean'].mean()
    ],
    'agentic_sd': [
        agg['agentic_factual_accuracy_std'].mean(),
        agg['agentic_clinical_relevance_std'].mean(),
        agg['agentic_consistency_std'].mean()
    ],
    'nonagentic_mean': [
        agg['nonagentic_factual_accuracy_mean'].mean(),
        agg['nonagentic_clinical_relevance_mean'].mean(),
        agg['nonagentic_consistency_mean'].mean()
    ],
    'nonagentic_sd': [
        agg['nonagentic_factual_accuracy_std'].mean(),
        agg['nonagentic_clinical_relevance_std'].mean(),
        agg['nonagentic_consistency_std'].mean()
    ]
})
# Compute mean difference and paired t-test p-values
summary['mean_diff'] = summary['agentic_mean'] - summary['nonagentic_mean']

In [11]:
summary

Unnamed: 0,metric,agentic_mean,agentic_sd,nonagentic_mean,nonagentic_sd,mean_diff
0,Factual Accuracy,6.772414,0.018887,1.965517,0.120013,4.806897
1,Clinical Relevance,7.462069,0.142256,1.551724,0.149188,5.910345
2,Consistency,7.827586,0.061685,2.703448,0.200584,5.124138
