In [26]:
import os
import pandas as pd
import numpy as np

import requests
import json
import logging
import time

from typing import List, Dict, Any, Optional
from enum import Enum
from dataclasses import dataclass

from sklearn.metrics import cohen_kappa_score, confusion_matrix

# Working directory for this stage's data files; created on first use.
output_dir = '03_data_ai'
os.makedirs(output_dir, exist_ok=True)

# Load the covariates table stored as parquet inside that directory.
covariates_path = f'{output_dir}/part_3_covariates_final.parquet'
df_stats = pd.read_parquet(covariates_path)

print("Dataframe loaded. Shape:", df_stats.shape)
df_stats.head(3)

Dataframe loaded. Shape: (64, 68)


Unnamed: 0,Index,NCTId,LeadSponsorClass,LeadSponsorName,Condition,OfficialTitle,BriefTitle,Acronym,StudyType,InterventionType,...,relevant_quotes_anthropic_claude_3.5_sonnet,brain_stimulation_used_meta_llama_llama_3_70b_instruct,stimulation_details_primary_type_meta_llama_llama_3_70b_instruct,stimulation_details_is_noninvasive_meta_llama_llama_3_70b_instruct,stimulation_details_primary_target_meta_llama_llama_3_70b_instruct,stimulation_details_secondary_targets_meta_llama_llama_3_70b_instruct,stimulation_details_parameters_intensity_meta_llama_llama_3_70b_instruct,stimulation_details_parameters_duration_meta_llama_llama_3_70b_instruct,confidence_level_meta_llama_llama_3_70b_instruct,relevant_quotes_meta_llama_llama_3_70b_instruct
0,,NCT06145776,OTHER,Federal University of Paraíba,"Parkinson Disease, Cognition Disorder, EEG Wit...",Effects of Cognitive-motor Dual-Task Training ...,The Role of EEG in Identifying Cognitive Chang...,,INTERVENTIONAL,",",...,"[""It is possible that strategies such as Trans...",Yes,tDCS,True,,[],,,High,"[""Transcranial Direct Current Stimulation can ..."
1,,NCT04174573,OTHER,Maharishi Markendeswar University (Deemed to b...,Parkinson Disease,Efficacy of Group Therapy With Transcranial Di...,Group Therapy With Transcranial Direct Current...,GTBQ-PD,INTERVENTIONAL,",",...,"[""Transcranial direct current stimulation (tDC...",Yes,tDCS,True,Primary motor cortex M1,[],,20 minutes,High,"[""Anodal tDCS can increase M1 excitability, re..."
2,,NCT03191916,OTHER,Sanford Health,Parkinson Disease,Transcranial Direct Current Stimulation (tDCS)...,Transcranial Direct Current Stimulation for Co...,tDCS,INTERVENTIONAL,",",...,"[""Transcranial direct current stimulation (tDC...",Yes,tDCS,True,LDLPFC,[],2 milliamps (mA),20 minutes,High,"[""During tDCS, low-voltage, low amplitude curr..."


In [27]:
# The three per-model columns holding the "brain_stimulation_used" answer.
brain_stim_cols = [
    "brain_stimulation_used_gpt_4",
    "brain_stimulation_used_anthropic_claude_3.5_sonnet",
    "brain_stimulation_used_meta_llama_llama_3_70b_instruct"
]

# Map Yes -> 1, No -> 0. Values that are already 1/0 map to themselves, so
# re-running this cell is a no-op. Without those two entries a second run
# would turn every value into NaN and the dropna below would discard all rows.
yes_no_to_int = {"Yes": 1, "No": 0, 1: 1, 0: 0}

# Build a new frame instead of mutating in place:
#   1. recode the three columns (anything not in the mapping becomes NaN),
#   2. drop rows where any of the three columns could not be recoded,
#   3. cast the recoded columns to int (safe once the NaNs are gone).
df_stats = (
    df_stats
    .assign(**{col: df_stats[col].map(yes_no_to_int) for col in brain_stim_cols})
    .dropna(subset=brain_stim_cols)
    .astype({col: int for col in brain_stim_cols})
)

print("After converting Yes/No -> 1/0, dataframe shape:", df_stats.shape)
print(df_stats[brain_stim_cols].head(5))

After converting Yes/No -> 1/0, dataframe shape: (63, 68)
   brain_stimulation_used_gpt_4  \
0                             1   
1                             1   
2                             1   
3                             1   
4                             1   

   brain_stimulation_used_anthropic_claude_3.5_sonnet  \
0                                                  1    
1                                                  1    
2                                                  1    
3                                                  1    
4                                                  1    

   brain_stimulation_used_meta_llama_llama_3_70b_instruct  
0                                                  1       
1                                                  1       
2                                                  1       
3                                                  1       
4                                                  1       


In [28]:
def compute_agreement_metrics(df, col_model_a, col_model_b):
    """
    Compute pairwise agreement statistics for two binary (0/1) columns.

    Parameters
    ----------
    df : object indexable by column name (e.g. a pandas DataFrame)
    col_model_a : name of the column used as the row axis of the matrix
    col_model_b : name of the column used as the column axis of the matrix

    Returns
    -------
    dict with keys
        "ConfusionMatrix"   : result of ``confusion_matrix(a, b, labels=[0, 1])``,
                              unpacked here as [[TN, FP], [FN, TP]]
        "ObservedAgreement" : p_o = (TP + TN) / total
        "ExpectedAgreement" : p_e = chance agreement from the marginal totals
        "CohensKappa"       : (p_o - p_e) / (1 - p_e); defined as 1.0 when
                              p_e == 1 (both columns constant and identical)

    When the matrix is empty (no rows counted) the three scalar metrics are
    NaN; this is returned explicitly instead of being produced by a 0/0
    division.
    """
    # confusion_matrix(labels=[0,1]) => [[TN, FP],
    #                                    [FN, TP]]
    cm = confusion_matrix(df[col_model_a], df[col_model_b], labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()
    total = tn + fp + fn + tp

    # Nothing to compare: every ratio below would be 0/0. Return the same NaN
    # values that division would yield, without triggering the invalid
    # floating-point operation (and its RuntimeWarning).
    if total == 0:
        undefined = float("nan")
        return {
            "ConfusionMatrix": cm,
            "ObservedAgreement": undefined,
            "ExpectedAgreement": undefined,
            "CohensKappa": undefined
        }

    # Observed agreement: share of rows on the matrix diagonal.
    p_o = (tp + tn) / total

    # Expected agreement from the marginals:
    # p_e = [(#1 in A * #1 in B) + (#0 in A * #0 in B)] / total^2
    ones_in_a = tp + fn
    zeros_in_a = fp + tn
    ones_in_b = tp + fp
    zeros_in_b = fn + tn
    p_e = ((ones_in_a * ones_in_b) + (zeros_in_a * zeros_in_b)) / (total ** 2)

    # Cohen's kappa. With p_e == 1 the formula is 0/0; this function's
    # convention is to report perfect agreement (1.0) in that case.
    if 1 - p_e == 0:
        kappa = 1.0
    else:
        kappa = (p_o - p_e) / (1 - p_e)

    return {
        "ConfusionMatrix": cm,
        "ObservedAgreement": p_o,
        "ExpectedAgreement": p_e,
        "CohensKappa": kappa
    }

In [29]:
def compute_multiclass_confusion_kappa(df, col_a, col_b):
    """
    Build a multi-class confusion matrix over every label seen in col_a or
    col_b and compute Cohen's kappa for the two columns.

    Only rows where BOTH columns are non-null are compared.

    Parameters
    ----------
    df : pandas DataFrame containing both columns
    col_a : name of the column passed first to the sklearn metrics
    col_b : name of the column passed second to the sklearn metrics

    Returns
    -------
    dict with keys
        "Labels"          : sorted list of the distinct values found in
                            either column (among the compared rows)
        "ConfusionMatrix" : ``confusion_matrix(a, b, labels=Labels)``,
                            or None when there is nothing to compare
        "CohensKappa"     : ``cohen_kappa_score(a, b)``, or None when there
                            is nothing to compare
        "NumRows"         : number of rows that were compared
    """
    # Keep only rows that have a value in both columns. (The previous
    # per-column dropna() results were overwritten before use and are gone.)
    valid_mask = df[col_a].notna() & df[col_b].notna()
    series_a = df.loc[valid_mask, col_a]
    series_b = df.loc[valid_mask, col_b]

    # Union of labels from both columns, sorted for a stable matrix layout.
    # NOTE(review): sorted() requires mutually comparable labels; a column
    # pair mixing e.g. str and bool values would raise TypeError here.
    unique_labels = sorted(set(series_a.unique()) | set(series_b.unique()))

    # No comparable rows: nothing to compute.
    if len(unique_labels) == 0:
        return {
            "Labels": [],
            "ConfusionMatrix": None,
            "CohensKappa": None,
            "NumRows": 0
        }

    # Confusion matrix with rows/columns ordered as in unique_labels.
    cm = confusion_matrix(series_a, series_b, labels=unique_labels)

    # Cohen's kappa (works for multi-class or binary).
    kappa_val = cohen_kappa_score(series_a, series_b)

    return {
        "Labels": unique_labels,
        "ConfusionMatrix": cm,
        "CohensKappa": kappa_val,
        "NumRows": len(series_a)  # how many rows were compared
    }

In [30]:
# Column-name suffix of each model, in the order (GPT-4, Claude, Llama).
_MODEL_SUFFIXES = (
    "gpt_4",
    "anthropic_claude_3.5_sonnet",
    "meta_llama_llama_3_70b_instruct",
)

# Extracted features that every model reports, in display order.
_FEATURE_NAMES = (
    "brain_stimulation_used",
    "stimulation_details_primary_type",
    "stimulation_details_is_noninvasive",
    "stimulation_details_primary_target",
    "stimulation_details_secondary_targets",
    "stimulation_details_parameters_intensity",
    "stimulation_details_parameters_duration",
    "confidence_level",
)

# feature name -> (GPT-4 column, Claude column, Llama column);
# each column is named "<feature>_<model suffix>".
features_map = {
    feature: tuple(f"{feature}_{suffix}" for suffix in _MODEL_SUFFIXES)
    for feature in _FEATURE_NAMES
}

In [31]:
# all_results = []

# for feature_name, (col_gpt4, col_claude, col_llama) in features_map.items():
#     # Pairwise sets
#     model_pairs = [
#         ("GPT-4",  "Claude", col_gpt4,   col_claude),
#         ("GPT-4",  "Llama",  col_gpt4,   col_llama),
#         ("Claude", "Llama",  col_claude, col_llama)
#     ]
    
#     for (model_a_name, model_b_name, col_a, col_b) in model_pairs:
#         # For binary columns
#         metrics = compute_agreement_metrics(df_stats, col_a, col_b)

#         all_results.append({
#             "Feature": feature_name,
#             "Model A": model_a_name,
#             "Model B": model_b_name,
#             "Column A": col_a,
#             "Column B": col_b,
#             "Confusion Matrix": metrics["ConfusionMatrix"],
#             "p_o (Observed)": round(metrics["ObservedAgreement"], 3),
#             "p_e (Expected)": round(metrics["ExpectedAgreement"], 3),
#             "Cohen's Kappa": round(metrics["CohensKappa"], 3)
#         })

# df_agreement = pd.DataFrame(all_results)

# print("Pairwise Agreement Results:")
# df_agreement

In [32]:
# One record per (feature, model pair); turned into a DataFrame at the end.
all_results = []

for feature_name, model_columns in features_map.items():
    col_gpt4, col_claude, col_llama = model_columns

    # The three unordered model pairs, always in this order.
    for model_a_name, model_b_name, col_a, col_b in (
        ("GPT-4",  "Claude", col_gpt4,   col_claude),
        ("GPT-4",  "Llama",  col_gpt4,   col_llama),
        ("Claude", "Llama",  col_claude, col_llama),
    ):
        # Multi-class confusion matrix + kappa for this pair of columns
        metrics = compute_multiclass_confusion_kappa(df_stats, col_a, col_b)
        cm = metrics["ConfusionMatrix"]
        kappa_val = metrics["CohensKappa"]

        all_results.append({
            "Feature": feature_name,
            "Model A": model_a_name,
            "Model B": model_b_name,
            "Column A": col_a,
            "Column B": col_b,
            "Labels": metrics["Labels"],
            "NumRows Compared": metrics["NumRows"],
            # Stored as a nested list so it fits in a DataFrame cell;
            # None when the pair had nothing to compare.
            "Confusion Matrix": None if cm is None else cm.tolist(),
            "Cohen's Kappa": None if kappa_val is None else round(kappa_val, 3),
        })

df_agreement = pd.DataFrame(all_results)
df_agreement

Unnamed: 0,Feature,Model A,Model B,Column A,Column B,Labels,NumRows Compared,Confusion Matrix,Cohen's Kappa
0,brain_stimulation_used,GPT-4,Claude,brain_stimulation_used_gpt_4,brain_stimulation_used_anthropic_claude_3.5_so...,"[0, 1]",63,"[[1, 0], [1, 61]]",0.659
1,brain_stimulation_used,GPT-4,Llama,brain_stimulation_used_gpt_4,brain_stimulation_used_meta_llama_llama_3_70b_...,"[0, 1]",63,"[[1, 0], [1, 61]]",0.659
2,brain_stimulation_used,Claude,Llama,brain_stimulation_used_anthropic_claude_3.5_so...,brain_stimulation_used_meta_llama_llama_3_70b_...,"[0, 1]",63,"[[1, 1], [1, 60]]",0.484
3,stimulation_details_primary_type,GPT-4,Claude,stimulation_details_primary_type_gpt_4,stimulation_details_primary_type_anthropic_cla...,"[, ETCC, HD-tDCS, TDCS, TDCs, TMS, TMS, tDCS, ...",63,"[[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",0.513
4,stimulation_details_primary_type,GPT-4,Llama,stimulation_details_primary_type_gpt_4,stimulation_details_primary_type_meta_llama_ll...,"[, ETCC, HD-tDCS, TDCS, TDCs, TMS, TMS, tDCS, ...",63,"[[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]...",0.57
5,stimulation_details_primary_type,Claude,Llama,stimulation_details_primary_type_anthropic_cla...,stimulation_details_primary_type_meta_llama_ll...,"[, HD-tDCS, TDCS, TMS, dual-mode NIBS (rTMS + ...",63,"[[1, 0, 0, 0, 0, 0, 0, 1, 0, 0], [0, 2, 0, 0, ...",0.598
6,stimulation_details_is_noninvasive,GPT-4,Claude,stimulation_details_is_noninvasive_gpt_4,stimulation_details_is_noninvasive_anthropic_c...,"[False, True]",63,"[[2, 0], [1, 60]]",0.792
7,stimulation_details_is_noninvasive,GPT-4,Llama,stimulation_details_is_noninvasive_gpt_4,stimulation_details_is_noninvasive_meta_llama_...,"[False, True]",63,"[[1, 1], [2, 59]]",0.376
8,stimulation_details_is_noninvasive,Claude,Llama,stimulation_details_is_noninvasive_anthropic_c...,stimulation_details_is_noninvasive_meta_llama_...,"[False, True]",63,"[[2, 1], [1, 59]]",0.65
9,stimulation_details_primary_target,GPT-4,Claude,stimulation_details_primary_target_gpt_4,stimulation_details_primary_target_anthropic_c...,"[, Basal ganglia, Bihemispheric, Cerebellum, D...",63,"[[28, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...",0.606


In [34]:
# Persist the pairwise agreement table (row index omitted) and preview it.
# NOTE(review): "Labels" and "Confusion Matrix" hold Python lists, so they
# are presumably written as their string representation - confirm if the
# CSV is meant to be read back programmatically.
agreement_csv_path = "multi_class_confusion_results.csv"
df_agreement.to_csv(agreement_csv_path, index=False)
df_agreement.head(20)

Unnamed: 0,Feature,Model A,Model B,Column A,Column B,Labels,NumRows Compared,Confusion Matrix,Cohen's Kappa
0,brain_stimulation_used,GPT-4,Claude,brain_stimulation_used_gpt_4,brain_stimulation_used_anthropic_claude_3.5_so...,"[0, 1]",63,"[[1, 0], [1, 61]]",0.659
1,brain_stimulation_used,GPT-4,Llama,brain_stimulation_used_gpt_4,brain_stimulation_used_meta_llama_llama_3_70b_...,"[0, 1]",63,"[[1, 0], [1, 61]]",0.659
2,brain_stimulation_used,Claude,Llama,brain_stimulation_used_anthropic_claude_3.5_so...,brain_stimulation_used_meta_llama_llama_3_70b_...,"[0, 1]",63,"[[1, 1], [1, 60]]",0.484
3,stimulation_details_primary_type,GPT-4,Claude,stimulation_details_primary_type_gpt_4,stimulation_details_primary_type_anthropic_cla...,"[, ETCC, HD-tDCS, TDCS, TDCs, TMS, TMS, tDCS, ...",63,"[[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",0.513
4,stimulation_details_primary_type,GPT-4,Llama,stimulation_details_primary_type_gpt_4,stimulation_details_primary_type_meta_llama_ll...,"[, ETCC, HD-tDCS, TDCS, TDCs, TMS, TMS, tDCS, ...",63,"[[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]...",0.57
5,stimulation_details_primary_type,Claude,Llama,stimulation_details_primary_type_anthropic_cla...,stimulation_details_primary_type_meta_llama_ll...,"[, HD-tDCS, TDCS, TMS, dual-mode NIBS (rTMS + ...",63,"[[1, 0, 0, 0, 0, 0, 0, 1, 0, 0], [0, 2, 0, 0, ...",0.598
6,stimulation_details_is_noninvasive,GPT-4,Claude,stimulation_details_is_noninvasive_gpt_4,stimulation_details_is_noninvasive_anthropic_c...,"[False, True]",63,"[[2, 0], [1, 60]]",0.792
7,stimulation_details_is_noninvasive,GPT-4,Llama,stimulation_details_is_noninvasive_gpt_4,stimulation_details_is_noninvasive_meta_llama_...,"[False, True]",63,"[[1, 1], [2, 59]]",0.376
8,stimulation_details_is_noninvasive,Claude,Llama,stimulation_details_is_noninvasive_anthropic_c...,stimulation_details_is_noninvasive_meta_llama_...,"[False, True]",63,"[[2, 1], [1, 59]]",0.65
9,stimulation_details_primary_target,GPT-4,Claude,stimulation_details_primary_target_gpt_4,stimulation_details_primary_target_anthropic_c...,"[, Basal ganglia, Bihemispheric, Cerebellum, D...",63,"[[28, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...",0.606
