In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so later cells can read and write files there.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd

# Load the evaluation table from a pickle file.
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
df = pd.read_pickle('DATA')
# Bare last expression: the notebook renders the whole frame as the cell output.
df

# Leaf Level Metrics

**Micro-Metrics**

In [None]:
import numpy as np

# --- Step 2: column names ---
gold_standard_col = '21. CID de Alta'
model_cols = [
    'API Maritalk', 'API Deep-Seek', 'API modelo Tunado',
    'API GPT-Mini', 'API GPT‑4o', 'API Gemini'
]


def parse_and_clean_codes(cell_content):
    """Parse one table cell into a set of normalized ICD codes.

    Accepts either a collection of codes (list, tuple, set, NumPy array or
    pandas Series) or a string such as ``"['O80.0', 'Z37.0']"`` or
    ``"O80.0, Z37.0"``. Codes are stripped and upper-cased. Missing values and
    blank entries are discarded so they are never counted as labels.

    Args:
        cell_content: Raw cell value. Anything that is neither a supported
            collection nor a string (e.g. ``None`` or ``NaN``) gives an empty set.

    Returns:
        set[str]: The distinct, non-empty codes found in the cell.
    """
    if isinstance(cell_content, (list, tuple, set, frozenset, np.ndarray, pd.Series)):
        # Null items (None/NaN) inside the collection are skipped.
        raw_codes = [str(code) for code in cell_content if not pd.isna(code)]
    elif isinstance(cell_content, str):
        # Remove brackets and quotes left over from stringified Python lists.
        cleaned_text = cell_content
        for unwanted in ('[', ']', "'", '"'):
            cleaned_text = cleaned_text.replace(unwanted, '')
        raw_codes = cleaned_text.split(',')
    else:
        return set()

    # A trailing or doubled comma produces an empty piece; dropping it keeps
    # the empty string from being scored as a code.
    normalized = (code.strip().upper() for code in raw_codes)
    return {code for code in normalized if code}

# --- Step 4: micro-averaged metrics (counts pooled over all rows) ---
results = {}
print("Iniciando o cálculo das métricas.")

for model in model_cols:
    if model not in df.columns:
        print(f"Aviso: A coluna '{model}' não foi encontrada. Pulando.")
        continue

    # Accumulate true positives, false positives and false negatives over every row.
    total_tp = total_fp = total_fn = 0
    for gold_cell, predicted_cell in zip(df[gold_standard_col], df[model]):
        true_labels = parse_and_clean_codes(gold_cell)
        predicted_labels = parse_and_clean_codes(predicted_cell)
        total_tp += len(true_labels & predicted_labels)
        total_fp += len(predicted_labels - true_labels)
        total_fn += len(true_labels - predicted_labels)

    # Ratios fall back to 0 when their denominator is empty.
    predicted_count = total_tp + total_fp
    gold_count = total_tp + total_fn
    micro_precision = total_tp / predicted_count if predicted_count > 0 else 0
    micro_recall = total_tp / gold_count if gold_count > 0 else 0
    precision_plus_recall = micro_precision + micro_recall
    if precision_plus_recall > 0:
        micro_f1 = 2 * (micro_precision * micro_recall) / precision_plus_recall
    else:
        micro_f1 = 0

    results[model] = {
        'Micro-Precision': micro_precision,
        'Micro-Recall': micro_recall,
        'Micro-F1': micro_f1,
    }
    print(f"Métricas para '{model}' calculadas.")

# --- Step 5: show the models ranked by Micro-F1 ---
results_df = pd.DataFrame(results).T.sort_values(by='Micro-F1', ascending=False)
banner = "=" * 50
print("\n" + banner)
print("  DESEMPENHO FINAL DOS MODELOS (Correspondência Exata)")
print(banner)
print(results_df)

**Macro-Metrics**

In [None]:
# --- Step 2: column names ---
gold_standard_col = '21. CID de Alta'
model_cols = [
    'API Maritalk', 'API Deep-Seek', 'API modelo Tunado',
    'API GPT-Mini', 'API GPT‑4o', 'API Gemini'
]


# --- Step 3: cell parsing helper ---
def parse_and_clean_codes(cell_content):
    """Parse one table cell into a set of normalized ICD codes.

    Accepts either a collection of codes (list, tuple, set, NumPy array or
    pandas Series) or a string such as ``"['O80.0', 'Z37.0']"`` or
    ``"O80.0, Z37.0"``. Codes are stripped and upper-cased. Missing values and
    blank entries are discarded so they are never counted as labels.

    Args:
        cell_content: Raw cell value. Anything that is neither a supported
            collection nor a string (e.g. ``None`` or ``NaN``) gives an empty set.

    Returns:
        set[str]: The distinct, non-empty codes found in the cell.
    """
    if isinstance(cell_content, (list, tuple, set, frozenset, np.ndarray, pd.Series)):
        # Null items (None/NaN) inside the collection are skipped.
        raw_codes = [str(code) for code in cell_content if not pd.isna(code)]
    elif isinstance(cell_content, str):
        # Remove brackets and quotes left over from stringified Python lists.
        cleaned_text = cell_content
        for unwanted in ('[', ']', "'", '"'):
            cleaned_text = cleaned_text.replace(unwanted, '')
        raw_codes = cleaned_text.split(',')
    else:
        return set()

    # A trailing or doubled comma produces an empty piece; dropping it keeps
    # the empty string from being scored as a code.
    normalized = (code.strip().upper() for code in raw_codes)
    return {code for code in normalized if code}


# --- Step 4: per-row metrics averaged over rows ---
# NOTE(review): "macro" here means the mean of per-row (per-patient) scores,
# not an average over ICD classes - confirm this is the intended definition.
results_macro = {}
print("Iniciando o cálculo das métricas Macro-Averaged...")

for model in model_cols:
    if model not in df.columns:
        print(f"Aviso: A coluna '{model}' não foi encontrada. Pulando.")
        continue

    # One precision / recall / F1 value per row.
    list_precision, list_recall, list_f1 = [], [], []

    for gold_cell, predicted_cell in zip(df[gold_standard_col], df[model]):
        true_labels = parse_and_clean_codes(gold_cell)
        predicted_labels = parse_and_clean_codes(predicted_cell)

        tp = len(true_labels & predicted_labels)
        fp = len(predicted_labels - true_labels)
        fn = len(true_labels - predicted_labels)

        # Each ratio is 0 when its denominator is empty.
        if (tp + fp) > 0:
            precision_row = tp / (tp + fp)
        else:
            precision_row = 0
        if (tp + fn) > 0:
            recall_row = tp / (tp + fn)
        else:
            recall_row = 0
        if (precision_row + recall_row) > 0:
            f1_row = 2 * (precision_row * recall_row) / (precision_row + recall_row)
        else:
            f1_row = 0

        list_precision.append(precision_row)
        list_recall.append(recall_row)
        list_f1.append(f1_row)

    # Average the per-row scores.
    macro_precision = np.mean(list_precision)
    macro_recall = np.mean(list_recall)
    macro_f1 = np.mean(list_f1)

    results_macro[model] = {
        'Macro-Precision': macro_precision,
        'Macro-Recall': macro_recall,
        'Macro-F1': macro_f1,
    }
    print(f"Métricas Macro para '{model}' calculadas.")


# --- Step 5: show the ranking and save it to CSV ---
results_macro_df = pd.DataFrame(results_macro).T.sort_values(by='Macro-F1', ascending=False)
banner = "=" * 50
print("\n" + banner)
print("  DESEMPENHO FINAL DOS MODELOS (Métricas Macro-Averaged)")
print(banner)
print(results_macro_df)

try:
    results_macro_df.to_csv("resultados_finais_macro.csv")
except Exception as e:
    # Saving is best-effort: report the failure and carry on.
    print(f"\n[X] Ocorreu um erro ao salvar o arquivo: {e}")
else:
    print("\n[✓] Resultados salvos com sucesso no arquivo 'resultados_finais_macro.csv'")


# Three-Character Metrics

**Data Cleansing**

In [None]:
# --- Step 3: cleaning and transformation helpers ---

# Work on a copy so the column replacements applied to df2 later in this cell leave df untouched.
df2 = df.copy()

def parse_and_clean_codes(cell_content):
    """Parse one table cell into a set of normalized ICD codes.

    Accepts either a collection of codes (list, tuple, set, NumPy array or
    pandas Series) or a string such as ``"['O80.0', 'Z37.0']"`` or
    ``"O80.0, Z37.0"``. Codes are stripped and upper-cased. Missing values and
    blank entries are discarded so they are never counted as labels.

    Args:
        cell_content: Raw cell value. Anything that is neither a supported
            collection nor a string (e.g. ``None`` or ``NaN``) gives an empty set.

    Returns:
        set[str]: The distinct, non-empty codes found in the cell.
    """
    if isinstance(cell_content, (list, tuple, set, frozenset, np.ndarray, pd.Series)):
        # Null items (None/NaN) inside the collection are skipped.
        raw_codes = [str(code) for code in cell_content if not pd.isna(code)]
    elif isinstance(cell_content, str):
        # Remove brackets and quotes left over from stringified Python lists.
        cleaned_text = cell_content
        for unwanted in ('[', ']', "'", '"'):
            cleaned_text = cleaned_text.replace(unwanted, '')
        raw_codes = cleaned_text.split(',')
    else:
        return set()

    # A trailing or doubled comma produces an empty piece; dropping it keeps
    # the empty string from being scored as a code.
    normalized = (code.strip().upper() for code in raw_codes)
    return {code for code in normalized if code}


def truncate_codes_in_cell(cell_content):
    """Reduce every code in a cell to its first three characters.

    Args:
        cell_content: Raw cell value, parsed with ``parse_and_clean_codes``.

    Returns:
        list[str]: Distinct three-character prefixes in sorted order. Sorting
        and de-duplicating makes the output deterministic; the previous version
        iterated a set, so order varied and codes sharing a prefix were repeated.
    """
    return sorted({code[:3] for code in parse_and_clean_codes(cell_content)})


# --- Step 4: apply the truncation to df2 ---

# Every column that holds ICD codes (gold standard plus one column per model).
columns_to_transform = [
    '21. CID de Alta', 'API Maritalk', 'API Deep-Seek', 'API modelo Tunado',
    'API GPT-Mini', 'API GPT‑4o', 'API Gemini'
]

print("Iniciando a transformação para remover a especificidade (3 caracteres)...")

# Columns missing from df2 are skipped silently.
for col in [name for name in columns_to_transform if name in df2.columns]:
    df2[col] = df2[col].apply(truncate_codes_in_cell)
    print(f"Coluna '{col}' transformada.")

# Print both frames under a banner for a before/after comparison.
banner = "=" * 50
for title, frame in (
    ("  DATA FRAME ORIGINAL (df)", df),
    ("  NOVO DATAFRAME (df2) - SEM ESPECIFICIDADE (3 CARACTERES)", df2),
):
    print("\n" + banner)
    print(title)
    print(banner)
    print(frame)

**Micro-Metrics**

In [None]:
import numpy as np

# --- Step 2: column names ---
gold_standard_col = '21. CID de Alta'
model_cols = [
    'API Maritalk', 'API Deep-Seek', 'API modelo Tunado',
    'API GPT-Mini', 'API GPT‑4o', 'API Gemini'
]


def parse_and_clean_codes(cell_content):
    """Parse one table cell into a set of normalized ICD codes.

    Accepts either a collection of codes (list, tuple, set, NumPy array or
    pandas Series) or a string such as ``"['O80.0', 'Z37.0']"`` or
    ``"O80.0, Z37.0"``. Codes are stripped and upper-cased. Missing values and
    blank entries are discarded so they are never counted as labels.

    Args:
        cell_content: Raw cell value. Anything that is neither a supported
            collection nor a string (e.g. ``None`` or ``NaN``) gives an empty set.

    Returns:
        set[str]: The distinct, non-empty codes found in the cell.
    """
    if isinstance(cell_content, (list, tuple, set, frozenset, np.ndarray, pd.Series)):
        # Null items (None/NaN) inside the collection are skipped.
        raw_codes = [str(code) for code in cell_content if not pd.isna(code)]
    elif isinstance(cell_content, str):
        # Remove brackets and quotes left over from stringified Python lists.
        cleaned_text = cell_content
        for unwanted in ('[', ']', "'", '"'):
            cleaned_text = cleaned_text.replace(unwanted, '')
        raw_codes = cleaned_text.split(',')
    else:
        return set()

    # A trailing or doubled comma produces an empty piece; dropping it keeps
    # the empty string from being scored as a code.
    normalized = (code.strip().upper() for code in raw_codes)
    return {code for code in normalized if code}

# --- Step 4: micro-averaged metrics on the three-character frame (df2) ---
results = {}
print("Iniciando o cálculo das métricas.")

for model in model_cols:
    if model not in df2.columns:
        print(f"Aviso: A coluna '{model}' não foi encontrada. Pulando.")
        continue

    # Accumulate true positives, false positives and false negatives over every row.
    total_tp, total_fp, total_fn = 0, 0, 0

    for index, row in df2.iterrows():
        true_labels = parse_and_clean_codes(row[gold_standard_col])
        predicted_labels = parse_and_clean_codes(row.get(model))

        total_tp += len(true_labels.intersection(predicted_labels))
        total_fp += len(predicted_labels.difference(true_labels))
        total_fn += len(true_labels.difference(predicted_labels))

    # Ratios fall back to 0 when their denominator is empty.
    micro_precision = total_tp / (total_tp + total_fp) if (total_tp + total_fp) > 0 else 0
    micro_recall = total_tp / (total_tp + total_fn) if (total_tp + total_fn) > 0 else 0
    micro_f1 = 2 * (micro_precision * micro_recall) / (micro_precision + micro_recall) if (micro_precision + micro_recall) > 0 else 0

    results[model] = {
        'Micro-Precision': micro_precision,
        'Micro-Recall': micro_recall,
        'Micro-F1': micro_f1
    }
    print(f"Métricas para '{model}' calculadas.")

# --- Step 5: show the models ranked by Micro-F1 ---
results_df2 = pd.DataFrame(results).T.sort_values(by='Micro-F1', ascending=False)
print("\n" + "="*50)
# This cell scores the truncated codes in df2, so the banner names the
# three-character level instead of the copied "exact match" label.
print("  DESEMPENHO FINAL DOS MODELOS (Categoria de 3 Caracteres)")
print("="*50)
print(results_df2)

**Macro-Metrics**

In [None]:
# --- Step 2: column names ---
gold_standard_col = '21. CID de Alta'
model_cols = [
    'API Maritalk', 'API Deep-Seek', 'API modelo Tunado',
    'API GPT-Mini', 'API GPT‑4o', 'API Gemini'
]


# --- Step 3: cell parsing helper ---
def parse_and_clean_codes(cell_content):
    """Parse one table cell into a set of normalized ICD codes.

    Accepts either a collection of codes (list, tuple, set, NumPy array or
    pandas Series) or a string such as ``"['O80.0', 'Z37.0']"`` or
    ``"O80.0, Z37.0"``. Codes are stripped and upper-cased. Missing values and
    blank entries are discarded so they are never counted as labels.

    Args:
        cell_content: Raw cell value. Anything that is neither a supported
            collection nor a string (e.g. ``None`` or ``NaN``) gives an empty set.

    Returns:
        set[str]: The distinct, non-empty codes found in the cell.
    """
    if isinstance(cell_content, (list, tuple, set, frozenset, np.ndarray, pd.Series)):
        # Null items (None/NaN) inside the collection are skipped.
        raw_codes = [str(code) for code in cell_content if not pd.isna(code)]
    elif isinstance(cell_content, str):
        # Remove brackets and quotes left over from stringified Python lists.
        cleaned_text = cell_content
        for unwanted in ('[', ']', "'", '"'):
            cleaned_text = cleaned_text.replace(unwanted, '')
        raw_codes = cleaned_text.split(',')
    else:
        return set()

    # A trailing or doubled comma produces an empty piece; dropping it keeps
    # the empty string from being scored as a code.
    normalized = (code.strip().upper() for code in raw_codes)
    return {code for code in normalized if code}


# --- Step 4: per-row metrics averaged over rows, on the three-character frame (df2) ---
# NOTE(review): "macro" here means the mean of per-row (per-patient) scores,
# not an average over ICD classes - confirm this is the intended definition.
results_macro = {}
print("Iniciando o cálculo das métricas Macro-Averaged...")

for model in model_cols:
    if model not in df2.columns:
        print(f"Aviso: A coluna '{model}' não foi encontrada. Pulando.")
        continue

    # One precision / recall / F1 value per row.
    list_precision, list_recall, list_f1 = [], [], []

    for gold_cell, predicted_cell in zip(df2[gold_standard_col], df2[model]):
        true_labels = parse_and_clean_codes(gold_cell)
        predicted_labels = parse_and_clean_codes(predicted_cell)

        tp = len(true_labels & predicted_labels)
        fp = len(predicted_labels - true_labels)
        fn = len(true_labels - predicted_labels)

        # Each ratio is 0 when its denominator is empty.
        if (tp + fp) > 0:
            precision_row = tp / (tp + fp)
        else:
            precision_row = 0
        if (tp + fn) > 0:
            recall_row = tp / (tp + fn)
        else:
            recall_row = 0
        if (precision_row + recall_row) > 0:
            f1_row = 2 * (precision_row * recall_row) / (precision_row + recall_row)
        else:
            f1_row = 0

        list_precision.append(precision_row)
        list_recall.append(recall_row)
        list_f1.append(f1_row)

    # Average the per-row scores.
    macro_precision = np.mean(list_precision)
    macro_recall = np.mean(list_recall)
    macro_f1 = np.mean(list_f1)

    results_macro[model] = {
        'Macro-Precision': macro_precision,
        'Macro-Recall': macro_recall,
        'Macro-F1': macro_f1,
    }
    print(f"Métricas Macro para '{model}' calculadas.")


# --- Step 5: show the models ranked by Macro-F1 ---
results_macro_df2 = pd.DataFrame(results_macro).T.sort_values(by='Macro-F1', ascending=False)
banner = "=" * 50
print("\n" + banner)
print("  DESEMPENHO FINAL DOS MODELOS (Métricas Macro-Averaged)")
print(banner)
print(results_macro_df2)


# Bootstrapping

In [None]:
import pandas as pd

# Load the audit spreadsheet from Google Drive.
# NOTE(review): this rebinds `df`, replacing the frame loaded from the pickle earlier in the notebook.
df = pd.read_excel('/content/drive/MyDrive/Doutorado - Ricardo- Após a Qualificação/tabela_auditoria_llms_11_06.xlsx', engine='openpyxl')

# --- Keep only the ICD codes from the "API Deep-Seek - Sem Prompt" column ---

import re

# Column whose cells are reduced to lists of ICD codes.
col = "API Deep-Seek - Sem Prompt"
# One letter, two digits and an optional ".xxx" suffix (1-3 digits or capital letters).
cid_re = re.compile(r"[A-Z][0-9]{2}(?:\.[0-9A-Z]{1,3})?")

def extrair_cids(x):
    """Return the ICD-like codes found in ``x``, de-duplicated in first-seen order.

    Args:
        x: A list (its items are joined with spaces) or any other value, which
            is converted with ``str`` before searching.

    Returns:
        list[str]: Every distinct match of ``cid_re``, in order of appearance.
    """
    txt = " ".join(str(item) for item in x) if isinstance(x, list) else str(x)

    seen = set()
    ordered_codes = []
    for match in cid_re.finditer(txt):
        code = match.group(0)
        if code not in seen:
            seen.add(code)
            ordered_codes.append(code)
    return ordered_codes

df[col] = df[col].apply(extrair_cids)


def _split_gold_codes(cell):
    """Split a gold-standard cell into a list of whitespace-separated codes.

    Missing cells (``None``/``NaN``) give an empty list. Casting the column
    with ``astype(str)`` first turned them into the text ``"nan"``, which was
    then scored as a gold label and cost every model a false negative.
    """
    if not isinstance(cell, str):
        if pd.isna(cell):
            return []
        cell = str(cell)
    # str.split() with no arguments strips each piece and drops empty ones.
    return cell.split()


df["21. CID de Alta"] = df["21. CID de Alta"].apply(_split_gold_codes)
colunas_cid = ["API Maritalk", "API Deep-Seek - Sem Prompt", "API modelo Tunado", "API GPT-Mini", "API GPT‑4o", "API Gemini"]

import ast

def _parse_list_literal(value):
    """Evaluate a string that starts with "[" as a Python literal; return anything else unchanged."""
    if isinstance(value, str) and value.startswith("["):
        return ast.literal_eval(value)
    return value


for col in colunas_cid:
    df[col] = df[col].apply(_parse_list_literal)

import re

# Whole-string pattern: one letter, two digits and an optional ".xxx" suffix.
padrao_cid = re.compile(r"^[A-Z][0-9]{2}(\.[0-9A-Z]{1,3})?$")

def filtrar_cids(lista):
    """Keep only the string items of ``lista`` that match ``padrao_cid``.

    Args:
        lista: Expected to be a list; any other type gives an empty list.

    Returns:
        list[str]: The matching items, in their original order.
    """
    if not isinstance(lista, list):
        return []

    valid_codes = []
    for item in lista:
        if isinstance(item, str) and padrao_cid.match(item):
            valid_codes.append(item)
    return valid_codes

# Drop every item that is not a well-formed ICD code from each model column.
for col in colunas_cid:
    df[col] = df[col].map(filtrar_cids)

# Model columns plus the gold-standard column "21. CID de Alta".
todas_colunas = [*colunas_cid, "21. CID de Alta"]

# Give the Deep-Seek column the name used by the metric cells.
df = df.rename(columns={'API Deep-Seek - Sem Prompt': 'API Deep-Seek'})


df

**Leaf Level Metrics**

In [None]:
# ============================================
# Leaf-level: cálculo de métricas, bootstrap, ICs,
# plots com nomes padronizados e salvamento no Drive
# ============================================

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm

# -----------------------------
# INITIAL CONFIGURATION
# -----------------------------
# >>>> Make sure 'df' is already loaded with the columns below <<<<
gold_standard_col = '21. CID de Alta'
model_cols = [
    'API Maritalk',
    'API Deep-Seek',
    'API modelo Tunado',
    'API GPT-Mini',
    'API GPT‑4o',
    'API Gemini'
]
# Metric names, used as column labels of the result tables and as keys of the bootstrap score dict.
metrics_to_calculate = [
    'Micro-Precision', 'Micro-Recall', 'Micro-F1',
    'Macro-Precision', 'Macro-Recall', 'Macro-F1'
]

# Output directory (leaf level); created below if it does not exist yet.
output_dir = "/content/drive/MyDrive/Doutorado - Ricardo- Após a Qualificação/Paper: EVALUATING LARGE LANGUAGE MODELS FOR AUTOMATED ICD-10 CODING OF OBSTETRIC CLINICAL NOTES IN PORTUGUESE: A COMPARATIVE STUDY/Bootstrap/Leaf level"
os.makedirs(output_dir, exist_ok=True)

# -----------------------------
# FUNÇÕES AUXILIARES
# -----------------------------
def parse_and_clean_codes(cell_content):
    """Parse one table cell into a set of normalized ICD codes.

    Accepts either a collection of codes (list, tuple, set, NumPy array or
    pandas Series) or a string such as ``"['O80.0', 'Z37.0']"`` or
    ``"O80.0, Z37.0"``. Codes are stripped and upper-cased. Missing values and
    blank entries are discarded so they are never counted as labels.

    Args:
        cell_content: Raw cell value. Anything that is neither a supported
            collection nor a string (e.g. ``None`` or ``NaN``) gives an empty set.

    Returns:
        set[str]: The distinct, non-empty codes found in the cell.
    """
    if isinstance(cell_content, (list, tuple, set, frozenset, np.ndarray, pd.Series)):
        # Null items (None/NaN) inside the collection are skipped.
        raw_codes = [str(code) for code in cell_content if not pd.isna(code)]
    elif isinstance(cell_content, str):
        # Remove brackets and quotes left over from stringified Python lists.
        cleaned_text = cell_content
        for unwanted in ('[', ']', "'", '"'):
            cleaned_text = cleaned_text.replace(unwanted, '')
        raw_codes = cleaned_text.split(',')
    else:
        return set()

    # A trailing or doubled comma produces an empty piece; dropping it keeps
    # the empty string from being scored as a code.
    normalized = (code.strip().upper() for code in raw_codes)
    return {code for code in normalized if code}


def calculate_all_metrics_micro_and_macro(dataframe, models, gold_standard):
    """Compute micro and per-row-averaged precision, recall and F1 for each model.

    Micro metrics pool TP/FP/FN over all rows before taking the ratios.
    "Macro" metrics are the mean of the per-row precision, recall and F1.
    NOTE(review): that is an average over rows, not over ICD classes - confirm
    it is the intended definition of "macro".

    Args:
        dataframe: Table with one row per case.
        models: Names of the prediction columns. A name missing from
            ``dataframe`` is scored as if it predicted nothing.
        gold_standard: Name of the column holding the reference codes.

    Returns:
        pandas.DataFrame: One row per model and one column per metric
        ('Micro-Precision', 'Micro-Recall', 'Micro-F1', 'Macro-Precision',
        'Macro-Recall', 'Macro-F1'). Ratios with an empty denominator are 0.0.
    """
    # The reference labels are the same for every model, so parse them once.
    gold_sets = [parse_and_clean_codes(cell) for cell in dataframe[gold_standard]]

    final_results = {}
    for model in models:
        if model in dataframe.columns:
            predicted_cells = dataframe[model]
        else:
            predicted_cells = [None] * len(gold_sets)

        total_tp, total_fp, total_fn = 0, 0, 0
        list_precision, list_recall, list_f1 = [], [], []

        for true_labels, predicted_cell in zip(gold_sets, predicted_cells):
            predicted_labels = parse_and_clean_codes(predicted_cell)

            tp = len(true_labels & predicted_labels)
            fp = len(predicted_labels - true_labels)
            fn = len(true_labels - predicted_labels)

            total_tp += tp
            total_fp += fp
            total_fn += fn

            precision_row = tp / (tp + fp) if (tp + fp) > 0 else 0.0
            recall_row = tp / (tp + fn) if (tp + fn) > 0 else 0.0
            f1_row = (2 * precision_row * recall_row / (precision_row + recall_row)
                      if (precision_row + recall_row) > 0 else 0.0)
            list_precision.append(precision_row)
            list_recall.append(recall_row)
            list_f1.append(f1_row)

        results = {}
        results['Micro-Precision'] = total_tp / (total_tp + total_fp) if (total_tp + total_fp) > 0 else 0.0
        results['Micro-Recall']    = total_tp / (total_tp + total_fn) if (total_tp + total_fn) > 0 else 0.0
        mp, mr = results['Micro-Precision'], results['Micro-Recall']
        results['Micro-F1']        = 2 * mp * mr / (mp + mr) if (mp + mr) > 0 else 0.0
        # The empty-list guards avoid NaN from np.mean on an empty frame.
        results['Macro-Precision'] = float(np.mean(list_precision)) if list_precision else 0.0
        results['Macro-Recall']    = float(np.mean(list_recall)) if list_recall else 0.0
        results['Macro-F1']        = float(np.mean(list_f1)) if list_f1 else 0.0

        final_results[model] = results

    return pd.DataFrame(final_results).T

# -----------------------------
# ESTIMATIVAS PONTUAIS
# -----------------------------
print("Calculando as estimativas pontuais (6 métricas no dataset completo) [Leaf level]...")
point_estimates_df = calculate_all_metrics_micro_and_macro(df, model_cols, gold_standard_col)

# -----------------------------
# BOOTSTRAP
# -----------------------------
n_bootstraps = 10000
bootstrap_seed = 42  # fixed seed so the confidence intervals can be reproduced
print(f"\nIniciando o bootstrapping com {n_bootstraps} iterações para todas as 6 métricas [Leaf level]...")
bootstrap_scores = {model: {metric: [] for metric in metrics_to_calculate} for model in model_cols}

# Per-row counts and scores do not change between resamples, so every cell is
# parsed once here rather than in each of the n_bootstraps iterations.
# A resample is then just an array of row positions drawn with replacement.
gold_sets = [parse_and_clean_codes(cell) for cell in df[gold_standard_col]]
n_rows = len(gold_sets)
row_stats = {}
for model in model_cols:
    # A missing model column is scored as "no predictions".
    predicted_cells = df[model] if model in df.columns else [None] * n_rows
    tp = np.zeros(n_rows)
    fp = np.zeros(n_rows)
    fn = np.zeros(n_rows)
    for pos, (true_labels, predicted_cell) in enumerate(zip(gold_sets, predicted_cells)):
        predicted_labels = parse_and_clean_codes(predicted_cell)
        tp[pos] = len(true_labels & predicted_labels)
        fp[pos] = len(predicted_labels - true_labels)
        fn[pos] = len(true_labels - predicted_labels)
    # np.where evaluates both branches, so silence the 0/0 warnings; the
    # affected positions are replaced by 0.0.
    with np.errstate(divide='ignore', invalid='ignore'):
        precision = np.where(tp + fp > 0, tp / (tp + fp), 0.0)
        recall = np.where(tp + fn > 0, tp / (tp + fn), 0.0)
        f1 = np.where(precision + recall > 0,
                      2 * precision * recall / (precision + recall), 0.0)
    row_stats[model] = {'tp': tp, 'fp': fp, 'fn': fn,
                        'precision': precision, 'recall': recall, 'f1': f1}


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator as a float, or 0.0 when the denominator is not positive."""
    return float(numerator / denominator) if denominator > 0 else 0.0


rng = np.random.default_rng(bootstrap_seed)
for _ in tqdm(range(n_bootstraps)):
    # The same resampled rows are used for every model within an iteration.
    if n_rows:
        sample_idx = rng.integers(0, n_rows, size=n_rows)
    else:
        sample_idx = np.empty(0, dtype=int)

    for model in model_cols:
        stats = row_stats[model]
        tp_sum = stats['tp'][sample_idx].sum()
        fp_sum = stats['fp'][sample_idx].sum()
        fn_sum = stats['fn'][sample_idx].sum()
        micro_precision = _safe_ratio(tp_sum, tp_sum + fp_sum)
        micro_recall = _safe_ratio(tp_sum, tp_sum + fn_sum)
        resampled = {
            'Micro-Precision': micro_precision,
            'Micro-Recall': micro_recall,
            'Micro-F1': _safe_ratio(2 * micro_precision * micro_recall,
                                    micro_precision + micro_recall),
            'Macro-Precision': float(stats['precision'][sample_idx].mean()) if n_rows else 0.0,
            'Macro-Recall': float(stats['recall'][sample_idx].mean()) if n_rows else 0.0,
            'Macro-F1': float(stats['f1'][sample_idx].mean()) if n_rows else 0.0,
        }
        for metric in metrics_to_calculate:
            bootstrap_scores[model][metric].append(resampled[metric])

print("Cálculo do bootstrapping concluído.")

# -----------------------------
# CIs E TABELA FINAL
# -----------------------------
final_results_with_ci = {}
alpha = (1.0 - 0.95) / 2.0
# Lower and upper percentile of the two-sided 95% interval.
percentile_bounds = (alpha * 100, (1 - alpha) * 100)

for model in model_cols:
    model_results = {}
    for metric in metrics_to_calculate:
        lower_bound, upper_bound = (
            float(np.percentile(bootstrap_scores[model][metric], q))
            for q in percentile_bounds
        )
        # Point estimate from the full data set, then the interval as text.
        model_results[metric] = float(point_estimates_df.loc[model, metric])
        model_results[f'{metric} IC 95%'] = f"[{lower_bound:.4f}, {upper_bound:.4f}]"
    final_results_with_ci[model] = model_results

# One row per model, best Micro-F1 first.
final_df = pd.DataFrame(final_results_with_ci).T.sort_values(by='Micro-F1', ascending=False)

separator = "=" * 80
print("\n" + separator)
print("  DESEMPENHO FINAL DOS MODELOS (MICRO E MACRO) COM INTERVALO DE CONFIANÇA (IC) — Leaf level")
print(separator)
print(final_df)

# Save the table (point estimates and CI strings in separate columns) to Drive.
final_csv_path = os.path.join(output_dir, "final_results_with_CI.csv")
final_df.to_csv(final_csv_path, index=True)
print(f"\nTabela final com IC salva: {final_csv_path}")

# -----------------------------
# MAPEAMENTO DE NOMES PARA PLOTS
# -----------------------------
# Display names for the plots, keyed by DataFrame column name.
# The GPT-4o column in model_cols is spelled with U+2011 (non-breaking hyphen),
# so the ASCII-hyphen key alone never matched it. The Unicode variants are
# written as escapes to keep them visible in the source.
name_map = {
    'API Maritalk': 'Sabiá-3.1',
    'API Deep-Seek': 'DeepSeek-V3',
    'API modelo Tunado': 'Fine-Tuned GPT-4o-Mini',
    'API GPT-Mini': 'GPT-4o-Mini',
    'API GPT-4o': 'GPT-4o',
    'API GPT\u20114o': 'GPT-4o',   # U+2011 non-breaking hyphen (spelling used in model_cols)
    'API GPT\u202f4o': 'GPT-4o',   # U+202F in place of the hyphen
    'API\u202fGPT-4o': 'GPT-4o',   # U+202F in place of the space
    'API Gemini': 'Gemini-1.5 Flash',
}
# Axis and title labels for each metric key.
metric_label_map = {
    'Micro-Precision': 'Micro Precision',
    'Micro-Recall': 'Micro Recall',
    'Micro-F1': 'Micro F1-Score',
    'Macro-Precision': 'Macro Precision',
    'Macro-Recall': 'Macro Recall',
    'Macro-F1': 'Macro F1-Score',
}

# -----------------------------
# PLOTS + SALVAMENTO
# -----------------------------
print("\nGenerating distribution plots for each metric (Leaf level) and saving to Drive...")
for metric in metrics_to_calculate:
    # squeeze=False always returns a 2-D array of Axes, so indexing also works
    # when there is a single model (plt.subplots would otherwise return a bare Axes).
    fig, axes_grid = plt.subplots(1, len(model_cols), figsize=(len(model_cols) * 6, 5),
                                  sharey=True, squeeze=False)
    axes = axes_grid[0]

    suptitle_txt = metric_label_map.get(metric, metric)
    fig.suptitle(f'Bootstrap Distribution for {suptitle_txt}', fontsize=16)

    # Panels follow final_df's row order (sorted by Micro-F1).
    for i, model in enumerate(final_df.index):
        ax = axes[i]
        scores = bootstrap_scores[model][metric]
        sns.histplot(scores, kde=True, ax=ax, bins=30, stat="density")

        # Solid black line: value computed on the full data set.
        point_estimate = float(point_estimates_df.loc[model, metric])
        ax.axvline(point_estimate, color='black', linestyle='-', linewidth=2,
                   label=f'Point Estimate ({point_estimate:.4f})')

        # Dashed red line: mean of the bootstrap replicates.
        mean_bootstrap = float(np.mean(scores))
        ax.axvline(mean_bootstrap, color='red', linestyle='--', linewidth=2,
                   label=f'Bootstrap Mean ({mean_bootstrap:.4f})')

        # Dotted green lines: 2.5th and 97.5th percentiles.
        lower_bound = float(np.percentile(scores, 2.5))
        upper_bound = float(np.percentile(scores, 97.5))
        ax.axvline(lower_bound, color='g', linestyle=':', linewidth=2, label='95% CI')
        ax.axvline(upper_bound, color='g', linestyle=':', linewidth=2)

        ax.set_title(name_map.get(model, model), fontsize=12)
        ax.set_xlabel(metric_label_map.get(metric, metric))
        if i == 0:
            ax.set_ylabel('Density')
        ax.legend()

    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    safe_metric = metric.replace(' ', '_').replace('-', '')
    fig_path = os.path.join(output_dir, f'distribution_bootstrap_{safe_metric}.png')
    plt.savefig(fig_path, dpi=200, bbox_inches='tight')
    # Close each figure so looping over metrics does not accumulate open figures.
    plt.close(fig)
    print(f"Chart saved: {fig_path}")

# -----------------------------
# PE vs BOOTSTRAP MEAN + SALVAR
# -----------------------------
# Compare each point estimate with the mean of its bootstrap replicates.
# point_estimates_df is indexed by model_cols and bootstrap_scores is keyed by
# them, so every model name is a valid key. The former "fallback" renaming
# branches could never fire and were removed.
rows = []
for model in model_cols:
    for metric in metrics_to_calculate:
        pe = float(point_estimates_df.loc[model, metric])
        bm = float(np.mean(bootstrap_scores[model][metric]))
        diff = bm - pe
        rows.append({
            'Model': name_map.get(model, model),
            'Metric': metric_label_map.get(metric, metric),
            'Point Estimate': f"{pe:.4f}",
            'Bootstrap Mean': f"{bm:.4f}",
            'Diff (BM-PE)': f"{diff:.6f}",
        })

# Explicit columns keep the sort below valid even when there are no rows.
pe_vs_bm_df = pd.DataFrame(
    rows,
    columns=['Model', 'Metric', 'Point Estimate', 'Bootstrap Mean', 'Diff (BM-PE)'],
)
# Largest absolute gap first; the helper column is dropped before saving.
pe_vs_bm_df['abs_diff'] = pe_vs_bm_df['Diff (BM-PE)'].astype(float).abs()
pe_vs_bm_df = pe_vs_bm_df.sort_values(by='abs_diff', ascending=False).drop(columns='abs_diff')

pebm_csv_path = os.path.join(output_dir, "point_estimate_vs_bootstrap_mean.csv")
pe_vs_bm_df.to_csv(pebm_csv_path, index=False)
print(f"\nComparação PE vs BM salva: {pebm_csv_path}")


Calculando as estimativas pontuais (6 métricas no dataset completo) [Leaf level]...

Iniciando o bootstrapping com 10000 iterações para todas as 6 métricas [Leaf level]...


  0%|          | 0/10000 [00:00<?, ?it/s]

KeyboardInterrupt: 

**Three-Character Category - Data Cleansing**

In [None]:
# --- Step 3: cleaning and transformation helpers ---

# Work on a copy so the column replacements applied to df2 later in this cell leave df untouched.
df2 = df.copy()

def parse_and_clean_codes(cell_content):
    """Parse one table cell into a set of normalized ICD codes.

    Accepts either a collection of codes (list, tuple, set, NumPy array or
    pandas Series) or a string such as ``"['O80.0', 'Z37.0']"`` or
    ``"O80.0, Z37.0"``. Codes are stripped and upper-cased. Missing values and
    blank entries are discarded so they are never counted as labels.

    Args:
        cell_content: Raw cell value. Anything that is neither a supported
            collection nor a string (e.g. ``None`` or ``NaN``) gives an empty set.

    Returns:
        set[str]: The distinct, non-empty codes found in the cell.
    """
    if isinstance(cell_content, (list, tuple, set, frozenset, np.ndarray, pd.Series)):
        # Null items (None/NaN) inside the collection are skipped.
        raw_codes = [str(code) for code in cell_content if not pd.isna(code)]
    elif isinstance(cell_content, str):
        # Remove brackets and quotes left over from stringified Python lists.
        cleaned_text = cell_content
        for unwanted in ('[', ']', "'", '"'):
            cleaned_text = cleaned_text.replace(unwanted, '')
        raw_codes = cleaned_text.split(',')
    else:
        return set()

    # A trailing or doubled comma produces an empty piece; dropping it keeps
    # the empty string from being scored as a code.
    normalized = (code.strip().upper() for code in raw_codes)
    return {code for code in normalized if code}


def truncate_codes_in_cell(cell_content):
    """Reduce every code in a cell to its first three characters.

    Args:
        cell_content: Raw cell value, parsed with ``parse_and_clean_codes``.

    Returns:
        list[str]: Distinct three-character prefixes in sorted order. Sorting
        and de-duplicating makes the output deterministic; the previous version
        iterated a set, so order varied and codes sharing a prefix were repeated.
    """
    return sorted({code[:3] for code in parse_and_clean_codes(cell_content)})


# --- Step 4: apply the truncation to df2 ---

# Every column that holds ICD codes (gold standard plus one column per model).
columns_to_transform = [
    '21. CID de Alta', 'API Maritalk', 'API Deep-Seek', 'API modelo Tunado',
    'API GPT-Mini', 'API GPT‑4o', 'API Gemini'
]

print("Iniciando a transformação para remover a especificidade (3 caracteres)...")

# Columns missing from df2 are skipped silently.
for col in [name for name in columns_to_transform if name in df2.columns]:
    df2[col] = df2[col].apply(truncate_codes_in_cell)
    print(f"Coluna '{col}' transformada.")

# Print both frames under a banner for a before/after comparison.
banner = "=" * 50
for title, frame in (
    ("  DATA FRAME ORIGINAL (df)", df),
    ("  NOVO DATAFRAME (df2) - SEM ESPECIFICIDADE (3 CARACTERES)", df2),
):
    print("\n" + banner)
    print(title)
    print(banner)
    print(frame)

**Three-Character Category Metrics Bootstrap**

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm

# -----------------------------
# INITIAL CONFIGURATION
# -----------------------------
# >>>> Make sure 'df2' is already loaded with the columns below <<<<
gold_standard_col = '21. CID de Alta'
model_cols = [
    'API Maritalk',
    'API Deep-Seek',
    'API modelo Tunado',
    'API GPT-Mini',
    'API GPT‑4o',
    'API Gemini'
]
# Metric names computed for every model.
metrics_to_calculate = [
    'Micro-Precision', 'Micro-Recall', 'Micro-F1',
    'Macro-Precision', 'Macro-Recall', 'Macro-F1'
]

# Output directory (three-character category level); created below if it does not exist yet.
output_dir = "/content/drive/MyDrive/Doutorado - Ricardo- Após a Qualificação/Paper: EVALUATING LARGE LANGUAGE MODELS FOR AUTOMATED ICD-10 CODING OF OBSTETRIC CLINICAL NOTES IN PORTUGUESE: A COMPARATIVE STUDY/Bootstrap/Three-character category"
os.makedirs(output_dir, exist_ok=True)

# -----------------------------
# FUNÇÕES AUXILIARES
# -----------------------------
def parse_and_clean_codes(cell_content):
    if isinstance(cell_content, (list, np.ndarray)):
        return {str(code).strip().upper() for code in cell_content if not pd.isna(code)}
    if pd.isna(cell_content) or not isinstance(cell_content, str) or not cell_content.strip():
        return set()
    cleaned_text = (cell_content.strip()
                               .replace('[', '')
                               .replace(']', '')
                               .replace("'", "")
                               .replace('"', ''))
    if not cleaned_text:
        return set()
    codes = {code.strip().upper() for code in cleaned_text.split(',')}
    return codes

def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is not positive."""
    return numerator / denominator if denominator > 0 else 0.0


def _f1_from(precision, recall):
    """Return the harmonic mean of precision and recall, or 0.0 when both are zero."""
    total = precision + recall
    return 2 * precision * recall / total if total > 0 else 0.0


def calculate_all_metrics_micro_and_macro(dataframe, models, gold_standard):
    """Score every model column against the gold-standard column.

    Each cell is parsed with ``parse_and_clean_codes`` and the two resulting
    sets are compared row by row (TP = intersection, FP = predicted only,
    FN = gold only).

    * ``Micro-*`` metrics are computed from the TP/FP/FN counts pooled over
      all rows.
    * ``Macro-*`` metrics are the mean of the per-row precision / recall / F1
      (i.e. an average over rows, not over individual codes). A row whose
      score is undefined (empty prediction and/or empty gold set)
      contributes 0.0.

    Args:
        dataframe: Frame holding the gold-standard and model columns.
        models: Iterable of model column names to score. A name that is not
            a column of ``dataframe`` is scored as if it predicted nothing.
        gold_standard: Name of the gold-standard column.

    Returns:
        pandas.DataFrame: One row per model, with the columns
        Micro-Precision, Micro-Recall, Micro-F1, Macro-Precision,
        Macro-Recall and Macro-F1.
    """
    # Gold labels are identical for every model: parse them once up front
    # instead of once per (model, row) pair. Iterating the column directly
    # also avoids building a Series per row as ``iterrows`` does.
    gold_sets = [parse_and_clean_codes(cell) for cell in dataframe[gold_standard]]

    final_results = {}
    for model in models:
        if model in dataframe.columns:
            predicted_cells = dataframe[model]
        else:
            # Absent column: every row is parsed from None, i.e. an empty set.
            predicted_cells = [None] * len(gold_sets)

        total_tp, total_fp, total_fn = 0, 0, 0
        list_precision, list_recall, list_f1 = [], [], []

        for true_labels, predicted_cell in zip(gold_sets, predicted_cells):
            predicted_labels = parse_and_clean_codes(predicted_cell)

            tp = len(true_labels & predicted_labels)
            fp = len(predicted_labels - true_labels)
            fn = len(true_labels - predicted_labels)

            total_tp += tp
            total_fp += fp
            total_fn += fn

            precision_row = _safe_ratio(tp, tp + fp)
            recall_row = _safe_ratio(tp, tp + fn)
            list_precision.append(precision_row)
            list_recall.append(recall_row)
            list_f1.append(_f1_from(precision_row, recall_row))

        micro_precision = _safe_ratio(total_tp, total_tp + total_fp)
        micro_recall = _safe_ratio(total_tp, total_tp + total_fn)

        final_results[model] = {
            'Micro-Precision': micro_precision,
            'Micro-Recall': micro_recall,
            'Micro-F1': _f1_from(micro_precision, micro_recall),
            'Macro-Precision': float(np.mean(list_precision)) if list_precision else 0.0,
            'Macro-Recall': float(np.mean(list_recall)) if list_recall else 0.0,
            'Macro-F1': float(np.mean(list_f1)) if list_f1 else 0.0,
        }

    return pd.DataFrame(final_results).T

# -----------------------------
# POINT ESTIMATES
# -----------------------------
# This cell scores df2 (the three-character category frame), so the log
# label names that level rather than the leaf level.
print("Calculando as estimativas pontuais (6 métricas no dataset completo) [Three-character category level]...")
point_estimates_df2 = calculate_all_metrics_micro_and_macro(df2, model_cols, gold_standard_col)

# -----------------------------
# BOOTSTRAP
# -----------------------------
n_bootstraps = 10000
# Fixed seed so the resamples, and therefore the reported confidence
# intervals, are identical on every run of the notebook.
bootstrap_seed = 42
bootstrap_rng = np.random.RandomState(bootstrap_seed)

print(f"\nIniciando o bootstrapping com {n_bootstraps} iterações para todas as 6 métricas [Three-character category level]...")
# bootstrap_scores[model][metric] -> list with one score per bootstrap iteration.
bootstrap_scores = {model: {metric: [] for metric in metrics_to_calculate} for model in model_cols}

for _ in tqdm(range(n_bootstraps)):
    # Draw len(df2) rows with replacement; the shared RandomState advances on
    # every call, so each iteration gets a different (but reproducible) sample.
    df2_resampled = df2.sample(n=len(df2), replace=True, random_state=bootstrap_rng)
    resampled_results = calculate_all_metrics_micro_and_macro(df2_resampled, model_cols, gold_standard_col)
    for model in model_cols:
        for metric in metrics_to_calculate:
            bootstrap_scores[model][metric].append(float(resampled_results.loc[model, metric]))

print("Cálculo do bootstrapping concluído.")

# -----------------------------
# CONFIDENCE INTERVALS AND FINAL TABLE
# -----------------------------
final_results_with_ci = {}
# Probability mass left in each tail of the 95% percentile interval.
alpha = (1.0 - 0.95) / 2.0

for model in model_cols:
    model_results = {}
    for metric in metrics_to_calculate:
        scores = bootstrap_scores[model][metric]
        # Percentile interval taken directly from the bootstrap distribution.
        lower_bound = float(np.percentile(scores, alpha * 100))
        upper_bound = float(np.percentile(scores, (1 - alpha) * 100))
        # The reported value is the full-dataset point estimate; the interval
        # is stored next to it as a preformatted string.
        model_results[metric] = float(point_estimates_df2.loc[model, metric])
        model_results[f'{metric} IC 95%'] = f"[{lower_bound:.4f}, {upper_bound:.4f}]"
    final_results_with_ci[model] = model_results

# One row per model, best Micro-F1 first.
final_df2 = pd.DataFrame(final_results_with_ci).T
final_df2 = final_df2.sort_values(by='Micro-F1', ascending=False)

print("\n" + "="*80)
# This table is built from the df2 (three-character category) estimates,
# so the header names that level.
print("  DESEMPENHO FINAL DOS MODELOS (MICRO E MACRO) COM INTERVALO DE CONFIANÇA (IC) — Three-character category level")
print("="*80)
print(final_df2)

# Save the table (point estimates and CI columns) to the output directory.
final_csv_path = os.path.join(output_dir, "final_results_with_CI.csv")
final_df2.to_csv(final_csv_path, index=True)
print(f"\nTabela final com IC salva: {final_csv_path}")

# -----------------------------
# DISPLAY-NAME MAPPINGS FOR PLOTS
# -----------------------------
# Maps each model column name to the label shown in figures and tables.
name_map = {
    'API Maritalk': 'Sabiá-3.1',
    'API Deep-Seek': 'DeepSeek-V3',
    'API modelo Tunado': 'Fine-Tuned GPT-4o-Mini',
    'API GPT-Mini': 'GPT-4o-Mini',
    'API GPT-4o': 'GPT-4o',
    'API Gemini': 'Gemini-1.5 Flash',
}
# model_cols spells the GPT-4o column with a non-ASCII hyphen, so the plain
# ASCII key alone never matches and the raw column name would be displayed.
# Register the common look-alike spellings (hyphen variants and no-break
# spaces) so the lookup succeeds whichever one the column actually uses.
_gpt4o_spaces = (' ', '\u00a0', '\u202f')
_gpt4o_dashes = ('-', '\u2010', '\u2011', '\u2012', '\u2013', '\u2212')
for _space in _gpt4o_spaces:
    for _dash in _gpt4o_dashes:
        name_map[f'API{_space}GPT{_dash}4o'] = 'GPT-4o'

# Maps each metric key to its axis/title label.
metric_label_map = {
    'Micro-Precision': 'Micro Precision',
    'Micro-Recall': 'Micro Recall',
    'Micro-F1': 'Micro F1-Score',
    'Macro-Precision': 'Macro Precision',
    'Macro-Recall': 'Macro Recall',
    'Macro-F1': 'Macro F1-Score',
}

# -----------------------------
# PLOTS + SAVING
# -----------------------------
# One figure per metric, one panel per model; the label names the
# three-character category level because the scores come from df2.
print("\nGenerating distribution plots for each metric (Three-character category level) and saving to Drive...")
for metric in metrics_to_calculate:
    # squeeze=False keeps `axes` two-dimensional even with a single model,
    # so the axes[0, i] lookup below works for any number of panels.
    fig, axes = plt.subplots(1, len(model_cols), figsize=(len(model_cols) * 6, 5),
                             sharey=True, squeeze=False)

    suptitle_txt = metric_label_map.get(metric, metric)
    fig.suptitle(f'Bootstrap Distribution for {suptitle_txt}', fontsize=16)

    # Panels follow final_df2's row order (sorted by Micro-F1, descending).
    for i, model in enumerate(final_df2.index):
        ax = axes[0, i]
        scores = bootstrap_scores[model][metric]
        sns.histplot(scores, kde=True, ax=ax, bins=30, stat="density")

        # Solid black line: metric on the full dataset.
        point_estimate = float(point_estimates_df2.loc[model, metric])
        ax.axvline(point_estimate, color='black', linestyle='-', linewidth=2,
                   label=f'Point Estimate ({point_estimate:.4f})')

        # Dashed red line: mean of the bootstrap replicates.
        mean_bootstrap = float(np.mean(scores))
        ax.axvline(mean_bootstrap, color='red', linestyle='--', linewidth=2,
                   label=f'Bootstrap Mean ({mean_bootstrap:.4f})')

        # Dotted green lines: 2.5th and 97.5th percentiles (95% interval);
        # only the first carries a label so the legend shows a single entry.
        lower_bound = float(np.percentile(scores, 2.5))
        upper_bound = float(np.percentile(scores, 97.5))
        ax.axvline(lower_bound, color='g', linestyle=':', linewidth=2, label='95% CI')
        ax.axvline(upper_bound, color='g', linestyle=':', linewidth=2)

        ax.set_title(name_map.get(model, model), fontsize=12)
        ax.set_xlabel(metric_label_map.get(metric, metric))
        if i == 0:
            ax.set_ylabel('Density')
        ax.legend()

    # Leave room at the top for the suptitle.
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    safe_metric = metric.replace(' ', '_').replace('-', '')
    fig_path = os.path.join(output_dir, f'distribution_bootstrap_{safe_metric}.png')
    plt.savefig(fig_path, dpi=200, bbox_inches='tight')
    # Close the figure so figures do not accumulate across the loop.
    plt.close(fig)
    print(f"Chart saved: {fig_path}")

# -----------------------------
# POINT ESTIMATE vs BOOTSTRAP MEAN + SAVE
# -----------------------------
# point_estimates_df2 and bootstrap_scores are both keyed by the entries of
# model_cols, so each model name can be used for the lookups directly.
rows = []
for model in model_cols:
    for metric in metrics_to_calculate:
        pe = float(point_estimates_df2.loc[model, metric])
        bm = float(np.mean(bootstrap_scores[model][metric]))
        diff = bm - pe
        rows.append({
            'Model': name_map.get(model, model),
            'Metric': metric_label_map.get(metric, metric),
            'Point Estimate': f"{pe:.4f}",
            'Bootstrap Mean': f"{bm:.4f}",
            'Diff (BM-PE)': f"{diff:.6f}",
        })

pe_vs_bm_df2 = pd.DataFrame(rows)
# Order rows by the magnitude of the difference (largest first); the helper
# column is dropped again before saving.
pe_vs_bm_df2['abs_diff'] = pe_vs_bm_df2['Diff (BM-PE)'].astype(float).abs()
pe_vs_bm_df2 = pe_vs_bm_df2.sort_values(by='abs_diff', ascending=False).drop(columns='abs_diff')

pebm_csv_path = os.path.join(output_dir, "point_estimate_vs_bootstrap_mean.csv")
pe_vs_bm_df2.to_csv(pebm_csv_path, index=False)
print(f"\nComparação PE vs BM salva: {pebm_csv_path}")
