In [65]:
import ast
import re
import warnings

import numpy as np
import pandas as pd
from fuzzywuzzy import fuzz

# Define categories
categories = [
    "gender", "age", "disability status", "race", "country", "state", "region", 
    "languages spoken", "education level", "social media usage", "religion", 
    "marital status", "profession", "household income classification", "housing situation"
]

SIMILARITY_THRESHOLD = 80

def is_match(pred, true):
    """Check if predicted label matches ground truth with fuzzy matching for longer strings"""
    if pd.isna(pred) or pd.isna(true):
        return False
    
    pred = str(pred).strip().lower()
    true = str(true).strip().lower()
    
    # Handle exact matches first
    if pred == true:
        return True
    
    # For longer strings, use fuzzy matching
    if len(pred) >= 5 and len(true) >= 5:
        return fuzz.ratio(pred, true) >= SIMILARITY_THRESHOLD
    else:
        return pred == true

def clean_label(label):
    """Clean label by removing QA suffix and extra whitespace"""
    if pd.isna(label):
        return label
    # Remove both (GPT) and (qa) suffixes
    cleaned = str(label).replace(" (GPT)", "").replace(" (qa)", "").strip()
    return cleaned

def safe_eval_labels(labels_str):
    """Safely convert string representation of list to actual list"""
    try:
        if isinstance(labels_str, str):
            return eval(labels_str)
        elif isinstance(labels_str, list):
            return labels_str
        else:
            return [str(labels_str)] * len(categories)  # fallback
    except:
        return ['n/a'] * len(categories)

def load_and_prepare_data(file_path):
    """Load CSV and prepare data with proper label parsing"""
    try:
        df = pd.read_csv(file_path)
        
        # Convert string representations to lists if needed
        if 'labels' in df.columns:
            df['labels'] = df['labels'].apply(safe_eval_labels)
        if 'ground_truth_labels' in df.columns:
            df['ground_truth_labels'] = df['ground_truth_labels'].apply(safe_eval_labels)
        
        return df
    except Exception as e:
        return None

def evaluate_pipeline_qa_method(df):
    """Evaluate accuracy for Pipeline + QA method, separating Pipeline vs QA performance"""
    if df is None or df.empty:
        return None, None
    
    # Separate tracking for Pipeline vs QA
    category_results = {
        cat: {
            'pipeline_correct': 0, 'pipeline_total': 0,
            'qa_correct': 0, 'qa_total': 0
        } for cat in categories
    }
    
    for idx, row in df.iterrows():
        if 'labels' not in row or 'ground_truth_labels' not in row:
            continue
        
        predicted = row['labels']
        groundtruth = row['ground_truth_labels']
        
        # Ensure both are lists and same length
        if not isinstance(predicted, list) or not isinstance(groundtruth, list):
            continue
        if len(predicted) != len(categories) or len(groundtruth) != len(categories):
            continue
        
        for i, cat in enumerate(categories):
            if i < len(predicted) and i < len(groundtruth):
                pred_label = predicted[i]
                true_label = groundtruth[i]
                
                # Check if this label came from QA agent
                is_qa_label = "(qa)" in str(pred_label).lower()
                
                # Clean the label for comparison
                pred_clean = clean_label(pred_label)
                
                # Track totals
                if is_qa_label:
                    category_results[cat]['qa_total'] += 1
                else:
                    category_results[cat]['pipeline_total'] += 1
                
                # Check if match
                if is_match(pred_clean, true_label):
                    if is_qa_label:
                        category_results[cat]['qa_correct'] += 1
                    else:
                        category_results[cat]['pipeline_correct'] += 1
    
    # Create accuracy table with separate Pipeline and QA columns
    accuracy_data = {}
    for cat in categories:
        # Pipeline accuracy
        if category_results[cat]['pipeline_total'] > 0:
            pipeline_acc = (category_results[cat]['pipeline_correct'] / category_results[cat]['pipeline_total']) * 100
        else:
            pipeline_acc = 0.0
        
        # QA accuracy  
        if category_results[cat]['qa_total'] > 0:
            qa_acc = (category_results[cat]['qa_correct'] / category_results[cat]['qa_total']) * 100
        else:
            qa_acc = 0.0
        
        accuracy_data[cat] = {
            'Pipeline Accuracy (%)': pipeline_acc,
            'QA Accuracy (%)': qa_acc
        }
    
    accuracy_table = pd.DataFrame(accuracy_data).T.round(1)
    return None, accuracy_table

def evaluate_single_method(df, method_name):
    """Evaluate accuracy for a single labeling method (non-Pipeline+QA methods)"""
    if df is None or df.empty:
        return None, None
    
    category_results = {cat: {'correct': 0, 'total': 0} for cat in categories}
    
    for idx, row in df.iterrows():
        if 'labels' not in row or 'ground_truth_labels' not in row:
            continue
            
        predicted = row['labels']
        groundtruth = row['ground_truth_labels']
        
        # Ensure both are lists and same length
        if not isinstance(predicted, list) or not isinstance(groundtruth, list):
            continue
        if len(predicted) != len(categories) or len(groundtruth) != len(categories):
            continue
        
        for i, cat in enumerate(categories):
            if i < len(predicted) and i < len(groundtruth):
                pred_label = clean_label(predicted[i])
                true_label = groundtruth[i]
                
                category_results[cat]['total'] += 1
                
                if is_match(pred_label, true_label):
                    category_results[cat]['correct'] += 1
    
    # Create accuracy data
    accuracy_data = {}
    for cat in categories:
        if category_results[cat]['total'] > 0:
            accuracy = (category_results[cat]['correct'] / category_results[cat]['total']) * 100
        else:
            accuracy = 0
        accuracy_data[cat] = accuracy
    
    return None, accuracy_data

def create_combined_table(pipeline_table, chatgpt_accuracy_data):
    """Create a combined table with Pipeline, QA, and ChatGPT Matching Agent columns"""
    if pipeline_table is None or chatgpt_accuracy_data is None:
        return None
    
    # Start with the pipeline table
    combined_table = pipeline_table.copy()
    
    # Add ChatGPT Matching Agent column
    chatgpt_column = []
    for cat in categories:
        if cat in chatgpt_accuracy_data:
            chatgpt_column.append(chatgpt_accuracy_data[cat])
        else:
            chatgpt_column.append(0.0)
    
    combined_table['ChatGPT Matching Agent (%)'] = chatgpt_column
    
    return combined_table.round(1)

def main():
    """Main execution function"""
    
    # File paths
    ground_truth_file = "synthetic_patient_descriptions_and_ground_truth.csv"
    pipeline_file = "synthetic_patient_descriptions_labels_reroute.csv" 
    matching_agent_file = "labeled_output_matching_agent.csv"
    
    # Load all datasets
    pipeline_df = load_and_prepare_data(pipeline_file)
    matching_agent_df = load_and_prepare_data(matching_agent_file)
    
    # Evaluate Pipeline + QA Agent
    _, pipeline_table = evaluate_pipeline_qa_method(pipeline_df)
    
    # Evaluate ChatGPT Matching Agent
    _, chatgpt_data = evaluate_single_method(matching_agent_df, "ChatGPT")
    
    # Create and print combined table
    combined_table = create_combined_table(pipeline_table, chatgpt_data)
    if combined_table is not None:
        print(combined_table)

if __name__ == "__main__":
    main()

                                 Pipeline Accuracy (%)  QA Accuracy (%)  \
gender                                            88.0              0.0   
age                                               83.0              0.0   
disability status                                 74.3             40.0   
race                                              54.5              0.0   
country                                           48.0              0.0   
state                                             89.2            100.0   
region                                            86.6              3.0   
languages spoken                                  70.0              0.0   
education level                                   95.2             86.8   
social media usage                                90.9             27.0   
religion                                          88.9              0.0   
marital status                                    96.8              0.0   
profession               