In [None]:
# ==========================================================
# PROJETO: Sistema de Triagem Emergencial Assistido por IA
# DISCIPLINA: Inteligência Artificial
# INSTITUIÇÃO: Universidade Presbiteriana Mackenzie
#
# AUTOR: Pedro Mastandrea
# RA: 10389910
# E-MAIL: pedro.mastandrea@mackenzista.com.br
#
# SÍNTESE DO CONTEÚDO DO ARQUIVO:
# Este notebook contém a análise exploratória, preparação dos dados e
# desenvolvimento do modelo preditivo V4 (XGBoost balanceado e calibrado)
# aplicado ao dataset "Emergency Service – Triage Application" (Kaggle).
# As etapas incluem: limpeza e transformação dos dados, engenharia de atributos,
# consolidação de variáveis categóricas, treinamento supervisionado e avaliação
# por métricas de desempenho e calibração probabilística.
#
# HISTÓRICO DE ALTERAÇÕES:
# 2025-09-17 – Pedro Mastandrea - Criação do notebook e análise exploratória inicial.
# 2025-09-20 – Pedro Mastandrea - Limpeza, normalização e seleção de atributos.
# 2025-09-26 – Pedro Mastandrea - Implementação do One-Hot Encoding e engenharia de atributos.
# 2025-10-05 – Pedro Mastandrea - Treinamento do modelo Random Forest (versões V1–V3).
# 2025-10-27 – Pedro Mastandrea - Implementação do modelo XGBoost (V4) com balanceamento de classes.
# 2025-10-31 – Pedro Mastandrea - Inclusão da calibração isotônica e cálculo do Brier Score.
# 2025-11-13 – Pedro Mastandrea - Unificação das etapas de análise exploratória e modelo final (versão consolidada).
# ==========================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, brier_score_loss
from xgboost import XGBClassifier
from sklearn.utils.class_weight import compute_class_weight
from sklearn.calibration import calibration_curve, CalibratedClassifierCV
from sklearn.preprocessing import label_binarize
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

# ==========================================================
# 1. CARREGAMENTO, LIMPEZA E ANÁLISE EXPLORATÓRIA DOS DADOS
# ==========================================================

def load_and_explore_dataset(file_path):
    """Load the triage CSV, clean it, and plot the KTAS class distribution.

    The file is read as a semicolon-separated, latin-1 encoded CSV. Rows
    without a ``KTAS_expert`` label are dropped, the ``Saturation`` column is
    removed when present, and the numeric columns that exist in the file are
    converted from comma-decimal text to numbers (unparseable values become
    NaN). Summary statistics are printed and a count plot of ``KTAS_expert``
    is shown.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The cleaned ``DataFrame``, or ``None`` when the file is missing or
        cannot be read (a message is printed in that case).
    """
    # Keep the try body limited to the call that can actually fail.
    try:
        df = pd.read_csv(file_path, encoding='latin1', sep=';')
    except FileNotFoundError:
        print("Arquivo n√£o encontrado.")
        return None
    except Exception as e:
        # Best-effort loader: report the problem and signal failure with None.
        print(f"Erro ao carregar o arquivo: {e}")
        return None
    print("Dataset carregado com sucesso!")

    # Initial cleaning: rows without a target label cannot be used.
    df.dropna(subset=['KTAS_expert'], inplace=True)
    if 'Saturation' in df.columns:
        df.drop('Saturation', axis=1, inplace=True)

    # Numeric conversion. Only columns actually present are touched, and the
    # same list drives the summary below, so a missing column no longer
    # raises KeyError in describe().
    num_cols = ['Age', 'SBP', 'DBP', 'HR', 'RR', 'BT', 'NRS_pain']
    present_cols = [col for col in num_cols if col in df.columns]
    for col in present_cols:
        # Values may use a decimal comma; normalise to a dot before parsing.
        as_text = df[col].astype(str).str.replace(',', '.', regex=False)
        df[col] = pd.to_numeric(as_text, errors='coerce')

    print("üîπ Dados limpos e num√©ricos convertidos com sucesso.")
    if present_cols:
        print(df[present_cols].describe())

    # Simple exploratory view: distribution of the triage levels.
    plt.figure(figsize=(8,4))
    sns.countplot(x='KTAS_expert', data=df, palette='viridis')
    plt.title("Distribui√ß√£o dos N√≠veis de Triagem (KTAS Expert)")
    plt.xlabel("N√≠vel de Triagem")
    plt.ylabel("Contagem")
    plt.show()

    return df


# ==========================================================
# 2. PRÉ-PROCESSAMENTO E ENGENHARIA DE ATRIBUTOS
# ==========================================================

def preprocess_and_feature_engineer(df, low_freq=10):
    """Build the feature matrix and target vector from the cleaned frame.

    Steps: lower-case and merge synonymous chief complaints, group complaints
    seen fewer than ``low_freq`` times under ``'OUTROS'``, fill missing
    numeric values with the column median, and one-hot encode ``Sex`` and the
    grouped complaint (first level dropped).

    The input frame is not modified; all work happens on a copy.

    Args:
        df: Frame containing at least ``Chief_complain``, ``Sex`` and
            ``KTAS_expert``.
        low_freq: Complaints occurring fewer than this many times are
            grouped into ``'OUTROS'``. Defaults to 10.

    Returns:
        A tuple ``(X, y)`` where ``X`` is the encoded frame without
        ``KTAS_expert`` and ``y`` is the ``KTAS_expert`` column.
    """
    print("\nüîπ Iniciando pr√©-processamento e engenharia de atributos...")

    # Work on a copy so the caller's DataFrame is left untouched.
    df = df.copy()

    complaint_mapping = {
        'abd pain': 'abdo pain',
        'abd. pain': 'abdo pain',
        'abdomen pain': 'abdo pain',
    }
    # Assign the result instead of calling replace(inplace=True) on a column
    # selection: the chained in-place form is a no-op under pandas
    # Copy-on-Write and would silently skip the synonym merge.
    complaints = df['Chief_complain'].str.lower().replace(complaint_mapping)

    # Group rare complaints. Missing complaints are not in `rare`, so they
    # stay missing rather than becoming 'OUTROS'.
    counts = complaints.value_counts()
    rare = counts[counts < low_freq].index
    df['Chief_complain_Grouped'] = complaints.mask(complaints.isin(rare), 'OUTROS')
    df = df.drop(columns=['Chief_complain'])

    # Fill missing numeric values with each column's median.
    df = df.fillna(df.median(numeric_only=True))

    # One-hot encoding.
    # NOTE(review): only the listed columns are encoded; any other
    # non-numeric column in `df` is passed through to X unchanged — confirm
    # the input only carries the intended feature columns.
    cat_features = ['Sex', 'Chief_complain_Grouped']
    df_encoded = pd.get_dummies(df, columns=cat_features, drop_first=True)

    X = df_encoded.drop(columns=['KTAS_expert'])
    y = df_encoded['KTAS_expert']

    print("‚úÖ Pr√©-processamento conclu√≠do. Total de features:", X.shape[1])
    return X, y


# ==========================================================
# 3. MODELO PREDITIVO V4 (XGBOOST BALANCEADO E CALIBRADO)
# ==========================================================

def train_and_evaluate_model_v4(X, y):
    """Train the class-weighted XGBoost model, calibrate it, and print metrics.

    The data is split 80/20 into train/test (stratified). The training part
    is split again 75/25 into a fitting set and a calibration set, so the
    isotonic calibrator is learned on rows the booster never saw. Calibrating
    a prefit model on its own training rows would fit the calibration map to
    overfit scores and bias the reported Brier score.

    Accuracy, the classification report and the confusion matrix come from
    the uncalibrated model's predictions; the multiclass Brier score comes
    from the calibrated probabilities.

    Args:
        X: Feature matrix.
        y: Target labels; the code shifts them by -1 and trains with
            ``num_class=5``, i.e. it expects KTAS levels 1-5.

    Returns:
        None. Results are printed.

    Raises:
        ValueError: Propagated from ``train_test_split`` when a class has too
            few members for either stratified split.
    """
    # Shift labels from 1-5 to 0-4, the contiguous range XGBoost requires.
    y_adjusted = y - 1
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_adjusted, test_size=0.2, random_state=42, stratify=y_adjusted
    )

    # Hold out a disjoint calibration set from the training portion.
    X_fit, X_calib, y_fit, y_calib = train_test_split(
        X_train, y_train, test_size=0.25, random_state=42, stratify=y_train
    )

    # Class weights -> per-sample weights, computed on the fitting set only.
    classes = np.unique(y_fit)
    weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_fit)
    class_weights = dict(zip(classes, weights))
    sample_weights = np.array([class_weights[label] for label in y_fit])

    model = XGBClassifier(
        objective='multi:softprob',
        num_class=5,
        use_label_encoder=False,
        eval_metric='mlogloss',
        n_estimators=100,
        random_state=42
    )

    model.fit(X_fit, y_fit, sample_weight=sample_weights)
    y_pred = model.predict(X_test)

    # Isotonic calibration of the already-fitted model on the held-out rows.
    # NOTE(review): cv='prefit' is deprecated in recent scikit-learn releases
    # in favour of wrapping the model in FrozenEstimator — confirm against
    # the installed version before upgrading.
    calib = CalibratedClassifierCV(model, method='isotonic', cv='prefit')
    calib.fit(X_calib, y_calib)
    y_proba_calib = calib.predict_proba(X_test)

    # Metrics
    accuracy = accuracy_score(y_test, y_pred)
    y_test_onehot = label_binarize(y_test, classes=np.arange(5))
    # Multiclass Brier score: mean over samples of the squared distance
    # between the predicted probability vector and the one-hot truth.
    brier = np.mean(np.sum((y_proba_calib - y_test_onehot) ** 2, axis=1))

    print("\nüìä RESULTADOS FINAIS (Modelo V4 - XGBoost Balanceado)")
    print(f"Acur√°cia: {accuracy:.4f}")
    print(f"Brier Score (Multiclasse): {brier:.4f}")
    print("\nRelat√≥rio de Classifica√ß√£o:\n", classification_report(y_test, y_pred, zero_division=0))
    print("\nMatriz de Confus√£o:\n", confusion_matrix(y_test, y_pred))


# ==========================================================
# 4. EXECUÇÃO PRINCIPAL
# ==========================================================

# Script entry point: load -> preprocess -> train/evaluate.
if __name__ == "__main__":
    # The loader returns None when the CSV is missing or unreadable.
    df = load_and_explore_dataset('data.csv')
    if df is not None:
        # X: encoded feature matrix; y: KTAS_expert target column.
        X, y = preprocess_and_feature_engineer(df)
        train_and_evaluate_model_v4(X, y)


Atributos selecionados para atributos:


Unnamed: 0,Chief_complain,Age,Sex,NRS_pain,SBP,DBP,HR,RR,BT
0,right ocular pain,71,2,2.0,160.0,100.0,84.0,18.0,36.6
1,right forearm burn,56,1,2.0,137.0,75.0,60.0,20.0,36.5
2,"arm pain, Lt",68,1,2.0,130.0,80.0,102.0,20.0,36.6
3,ascites tapping,71,2,3.0,139.0,94.0,88.0,20.0,36.5
4,"distension, abd",58,2,3.0,91.0,67.0,93.0,18.0,36.5



Atributos selecionados para alvos:


Unnamed: 0,Diagnosis in ED,KTAS_expert
0,Corneal abrasion,4
1,"Burn of hand, firts degree dorsum",5
2,"Fracture of surgical neck of humerus, closed",5
3,Alcoholic liver cirrhosis with ascites,5
4,Ascites,5



atributos após One-Hot Encoding:


Unnamed: 0,Age,Sex,NRS_pain,SBP,DBP,HR,RR,BT,Chief_complain_??,Chief_complain_?? ??,...,Chief_complain_vaginal spotting,Chief_complain_visual disturbance,Chief_complain_voiding difficulty,Chief_complain_vomiting,Chief_complain_watery diarrhea,Chief_complain_weakness,Chief_complain_whirling type vertigo,Chief_complain_wrist laceration,"Chief_complain_wrist pain, Lt","Chief_complain_wrist pain, Rt"
0,71,2,2.0,160.0,100.0,84.0,18.0,36.6,False,False,...,False,False,False,False,False,False,False,False,False,False
1,56,1,2.0,137.0,75.0,60.0,20.0,36.5,False,False,...,False,False,False,False,False,False,False,False,False,False
2,68,1,2.0,130.0,80.0,102.0,20.0,36.6,False,False,...,False,False,False,False,False,False,False,False,False,False
3,71,2,3.0,139.0,94.0,88.0,20.0,36.5,False,False,...,False,False,False,False,False,False,False,False,False,False
4,58,2,3.0,91.0,67.0,93.0,18.0,36.5,False,False,...,False,False,False,False,False,False,False,False,False,False



Shape das atributos após One-Hot Encoding: (1265, 425)

alvos após One-Hot Encoding:


Unnamed: 0,KTAS_expert,Diagnosis in ED_(Seizure) Convulsive NOS,"Diagnosis in ED_Abdomen pain, generalized","Diagnosis in ED_Abdomen pain, right upper quadrant","Diagnosis in ED_Abdominal aortic aneurysm, ruptured",Diagnosis in ED_Abdominal pain,Diagnosis in ED_Abnormal finding on abdominal and retroperitoneal imaging,Diagnosis in ED_Abnormal involuntary movement,"Diagnosis in ED_Abnormal uterine and vaginal bleeding, unspecified",Diagnosis in ED_Abscess of abdominopelvic,...,Diagnosis in ED_Ventricular tachycardia,"Diagnosis in ED_Vertebra compression fracture, NOS, Lumbar region",Diagnosis in ED_Vestibular neuritis,Diagnosis in ED_Vestibular neuronitis,Diagnosis in ED_Viral infection,"Diagnosis in ED_Viral infection, unspecified",Diagnosis in ED_Vomiting,Diagnosis in ED_Whiplash injury,Diagnosis in ED_drug eruption,Diagnosis in ED_fever of unknown origin
0,4,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,5,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,5,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,5,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,5,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False



Shape dos alvos após One-Hot Encoding: (1265, 584)
