In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GroupShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    roc_auc_score, roc_curve,
    precision_recall_curve, average_precision_score,
    accuracy_score, confusion_matrix, classification_report
)
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.utils import shuffle
import joblib
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', category=UserWarning)

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

caminho = "/Users/renanmoura/Documents/mestrado/PE-AI/data/dados.xlsx"
df = pd.read_excel(caminho)
print("Shape original:", df.shape)

target_col = "PreEclampsia"

display(df.head())

Shape original: (571, 59)


Unnamed: 0,PacienteId,Data,peso,imc,diabetes,hipertensao,dataNascimento,dataProvavelParto,pesoPrimeiroTrimestre,origemRacial,...,ComprimentoNascimento,PerimetroCefalico,Apgar1Minuto,Apgar5Minutos,DiasHospital,Intercorrencias,TipoDiabetes,PrimeiroPesoGravidez,DataParto,DataPesoCartao
0,44,25/11/2024,81.0,31.64,Sim,N√£o,1989-06-16,,,,...,50.0,36.0,8.0,9.0,2.0,,Diabetes Gestacional,72.0,45634.0,45629.0
1,44,11/06/2024,76.0,29.69,,N√£o,,,,Pardo,...,,,,,,,,,,
2,44,07/05/2024,,,,,,2024-12-19,,,...,,,,,,,,,,
3,44,12/05/2023,,,,,,,,,...,,,,,,,,,,
4,168,11/12/2024,105.0,38.57,Sim,N√£o,1990-07-26,,,,...,46.0,34.5,8.0,9.0,1.0,,Diabetes Gestacional,105.0,45641.0,45637.0


In [2]:
# MAPAS
map_raca = {"Branco": 1, "Pardo": 2, "Preto": 3}
map_boolean = {
    "Sim": 1, "YES": 1, "SIM": 1, "TRUE": 1,
    "Nao": 0, "NAO": 0, "N√£o": 0, "N√ÉO": 0, "FALSE": 0
}
map_hist_diabetes = {
    "N√£o": 0, "NAO": 0, "N√ÉO": 0, "Nao": 0,
    "1¬∫ grau": 3, "1¬∞ GRAU": 3, "1 GRAU": 3,
    "2¬∫ grau": 2, "2¬∞ GRAU": 2, "2 GRAU": 2,
    "3¬∫ grau": 1, "3¬∞ GRAU": 1, "3 GRAU": 1
}

# FEATURES 
input_features = [
    "idade", "imc", "diabetes", "hipertensao",
    "origemRacial", "historicoFamiliarDiabetes", "TipoDiabetes",
    "mediaIP","perdasGestacionais", "peso",
    "idadeGestacional", "idadeGestacionalCorrigida", "pesoFetal",
    "percentilArteriaUterina", "percentilArtUmbilical",
    "percentilPeso","circunferenciaAbdominal"
]

df_processed = df.copy()

# IDADE POR PACIENTE

if "paciente_id" in df_processed.columns:
    paciente_ids = df_processed["paciente_id"]
else:
    df_processed["paciente_id_temp"] = df_processed[
        ["dataNascimento","origemRacial","imc"]
    ].astype(str).agg("_".join, axis=1)
    paciente_ids = df_processed["paciente_id_temp"]

df_processed["paciente_id_base"] = paciente_ids

data_referencia = pd.to_datetime("2025-12-02")

paciente_to_nasc = {}
for pid in paciente_ids.unique():
    nasc = pd.to_datetime(
        df_processed.loc[paciente_ids == pid, "dataNascimento"],
        errors="coerce"
    ).dropna()
    paciente_to_nasc[pid] = nasc.mode().iloc[0] if len(nasc) else None

def calc_idade(d):
    if pd.isna(d):
        return 28
    return np.clip((data_referencia - d).days / 365.25, 15, 50)

df_processed["idade"] = df_processed["paciente_id_base"].map(
    lambda x: calc_idade(paciente_to_nasc.get(x))
)

# GESTA√á√ïES POR DATA

if "Data" not in df_processed.columns:
    raise ValueError("Coluna 'Data' (data da consulta) √© obrigat√≥ria")

df_processed["Data"] = pd.to_datetime(
    df_processed["Data"], errors="coerce", dayfirst=True
)

df_processed = df_processed.sort_values(
    ["paciente_id_base","Data"]
).reset_index(drop=True)

MAX_GAP = 270   # dias (~9 meses)

episodios = []

for pid, grupo in df_processed.groupby("paciente_id_base"):
    datas = grupo["Data"].values
    ep = 1
    
    for i in range(len(grupo)):
        if i == 0:
            episodios.append(f"{pid}")
            continue
        
        gap = (datas[i] - datas[i-1]).astype('timedelta64[D]').astype(int)
        if gap > MAX_GAP:
            ep += 1
            
        suf = "" if ep == 1 else chr(ord("A") + ep - 2)
        episodios.append(f"{pid}{suf}")

df_processed["PacienteIdEpisodio"] = episodios

# CONVERS√ïES CATEG√ìRICAS

if "origemRacial" in df_processed.columns:
    df_processed["origemRacial"] = (
        df_processed["origemRacial"]
        .astype(str).str.strip()
        .map(map_raca)
        .astype(float)
    )

for col in ["diabetes","hipertensao"]:
    if col in df_processed.columns:
        df_processed[col] = (
            df_processed[col]
            .astype(str).str.strip()
            .map(map_boolean)
            .astype(float)
        )

if "historicoFamiliarDiabetes" in df_processed.columns:
    df_processed["historicoFamiliarDiabetes"] = (
        df_processed["historicoFamiliarDiabetes"]
        .astype(str).str.strip()
        .replace(map_hist_diabetes)
        .astype(float)
    )

if "TipoDiabetes" in df_processed.columns:
    df_processed["TipoDiabetes"] = (
        df_processed["TipoDiabetes"]
        .astype(str).str.strip()
        .replace({
            "Diabetes Gestacional": 1,
            "Tipo 1": 2,
            "Tipo 2": 3
        })
    )
    df_processed["TipoDiabetes"] = pd.to_numeric(
        df_processed["TipoDiabetes"], errors="coerce"
    ).fillna(0)

# DADOS OBST√âTRICOS

if "perdasGestacionais" in df_processed.columns:
    df_processed["perdasGestacionais"] = (
        pd.to_numeric(df_processed["perdasGestacionais"], errors="coerce")
        .fillna(0)
    )

if "mediaIP" in df_processed.columns:
    df_processed["mediaIP"] = np.where(
        df_processed["mediaIP"] >= 1.3,
        df_processed["mediaIP"] * 1.3,
        df_processed["mediaIP"]
    )

# peso = peso - pesoFetal

peso_mae_kg = pd.to_numeric(df_processed["peso"], errors="coerce").fillna(0)
peso_feto_g = pd.to_numeric(df_processed["pesoFetal"], errors="coerce").fillna(0)

peso_feto_kg = peso_feto_g / 1000.0

df_processed["peso"] = (peso_mae_kg - peso_feto_kg).clip(lower=35)

# GARANTIR NUM√âRICO + NAN

for col in input_features:
    if col not in df_processed.columns:
        print(f" Criando {col}=0")
        df_processed[col] = 0
        
    df_processed[col] = (
        pd.to_numeric(df_processed[col], errors="coerce")
        .fillna(0)
        .astype(float)
    )

# LIMITES CL√çNICOS

def aplicar_limites_realistas(X, feature_names):
    limites = {
        "idade": (15, 50),
        "peso": (35, 150),
        "imc": (15, 50),
        "pesoFetal": (0, 5000)
    }
    X = X.copy()
    for f,(lo,hi) in limites.items():
        if f in feature_names:
            X[f] = X[f].clip(lo,hi)
    return X

df_processed[input_features] = aplicar_limites_realistas(
    df_processed[input_features],
    input_features
)

# ALVO

alvo = (
    df_processed[target_col]
    .replace({True:1,False:0})
    .astype(str).str.upper()
    .replace({"TRUE":1,"FALSE":0})
)

alvo = pd.to_numeric(alvo, errors="coerce")
df_processed[target_col] = alvo.astype("Int64")

df_processed = df_processed[~df_processed[target_col].isna()]
df_processed[target_col] = df_processed[target_col].astype(int)

# ESTAT√çSTICAS FINAIS


print("\n=== ESTAT√çSTICAS FINAIS ===")

for c in ["idade","peso","imc"]:
    s = df_processed[c]
    print(f"{c}: min={s.min():.1f}, max={s.max():.1f}, mean={s.mean():.1f}")

print(
    f"\nShape final: {df_processed.shape}"
    f"\nGesta√ß√µes √∫nicas: {df_processed['PacienteIdEpisodio'].nunique()}"
    f"\nSem NaNs nas features"
)

print("Target:", df_processed[target_col].value_counts().to_dict())


392 gesta√ß√µes identificadas

=== ESTAT√çSTICAS FINAIS ===
idade: min=15.0, max=43.6, mean=32.0
peso: min=35.0, max=119.1, mean=73.0
imc: min=15.0, max=43.0, mean=28.3

Shape final: (151, 63)
Gesta√ß√µes √∫nicas: 146
Sem NaNs nas features
Target: {0: 127, 1: 24}


In [3]:
# Preparar dados
X = df_processed[input_features].copy()
y = df_processed[target_col].copy()
groups = df_processed["PacienteIdEpisodio"].values


# Split por paciente
gss = GroupShuffleSplit(n_splits=1, test_size=0.20, random_state=RANDOM_STATE)
train_idx, test_idx = next(gss.split(X, y, groups=groups))

X_train = X.iloc[train_idx].reset_index(drop=True)
X_test = X.iloc[test_idx].reset_index(drop=True)
y_train = y.iloc[train_idx].reset_index(drop=True)
y_test = y.iloc[test_idx].reset_index(drop=True)

print(f"Treino: {X_train.shape} | Teste: {X_test.shape}")
print(f"Pacientes treino: {df_processed['PacienteId'].iloc[train_idx].nunique()}")
print(f"Pacientes teste: {df_processed['PacienteId'].iloc[test_idx].nunique()}")
print(f"\nDistribui√ß√£o y_train:\n{y_train.value_counts()}")
print(f"\nDistribui√ß√£o y_test:\n{y_test.value_counts()}")

Treino: (121, 17) | Teste: (30, 17)
Pacientes treino: 121
Pacientes teste: 30

Distribui√ß√£o y_train:
PreEclampsia
0    100
1     21
Name: count, dtype: int64

Distribui√ß√£o y_test:
PreEclampsia
0    27
1     3
Name: count, dtype: int64


In [4]:
sm = SMOTE(random_state=RANDOM_STATE)
X_train_smote, y_train_smote = sm.fit_resample(X_train, y_train)

print(f"Shape ap√≥s SMOTE: {X_train_smote.shape}")
print(f"Distribui√ß√£o p√≥s-SMOTE:\n{pd.Series(y_train_smote).value_counts()}")

Shape ap√≥s SMOTE: (200, 17)
Distribui√ß√£o p√≥s-SMOTE:
PreEclampsia
0    100
1    100
Name: count, dtype: int64


In [5]:
X_np = X_train_smote.values
y_np = y_train_smote.values.astype(float)

cols_binarias = ["diabetes", "hipertensao"]

cont_cols = [c for c in input_features if c not in cols_binarias]
cont_idx = [input_features.index(c) for c in cont_cols]

# Fun√ß√£o de augmenta√ß√£o com ru√≠do gaussiano
def augment_gaussian_noise(X_np, y_np, factor=0.3, noise_std=0.02, random_state=None):
    rng = np.random.RandomState(random_state)
    n_new = int(len(X_np) * factor)
    if n_new == 0:
        return X_np.copy(), y_np.copy()
    idx = rng.randint(0, len(X_np), size=n_new)
    X_new = X_np[idx].copy()

    # ru√≠do s√≥ nas colunas cont√≠nuas
    noise = rng.normal(0, noise_std, size=X_new[:, cont_idx].shape)
    X_new[:, cont_idx] += noise

    y_new = y_np[idx]
    return X_new, y_new

Xg, yg = augment_gaussian_noise(
    X_np, y_np, factor=0.4, noise_std=0.02, random_state=RANDOM_STATE
)

X_aug = np.vstack([X_np, Xg])
y_aug = np.concatenate([y_np, yg])
X_final, y_final = shuffle(X_aug, y_aug, random_state=RANDOM_STATE)


X_final_df = pd.DataFrame(X_final, columns=input_features)


X_final_df = aplicar_limites_realistas(X_final_df, input_features)


for col in cols_binarias:
    if col in X_final_df.columns:
        X_final_df[col] = X_final_df[col].clip(0, 1)

X_final = X_final_df.values

print(f"Shape final ap√≥s augmentation: {X_final.shape}")


Shape final ap√≥s augmentation: (280, 17)


In [6]:
X_train_val, X_test_final, y_train_val, y_test_final = train_test_split(
    X_final, y_final, test_size=0.15, stratify=np.round(y_final), random_state=RANDOM_STATE
)

X_train_final, X_val, y_train_final, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.15, stratify=np.round(y_train_val), random_state=RANDOM_STATE
)

print(f"Treino final: {X_train_final.shape}")
print(f"Valida√ß√£o: {X_val.shape}")
print(f"Teste final: {X_test_final.shape}")

Treino final: (202, 17)
Valida√ß√£o: (36, 17)
Teste final: (42, 17)


In [7]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_final)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test_final)


X_train_scaled = pd.DataFrame(X_train_scaled, columns=input_features)
X_val_scaled = pd.DataFrame(X_val_scaled, columns=input_features)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=input_features)

# Suprimir warnings
import warnings
warnings.filterwarnings('ignore', category=UserWarning)


lgb_model = lgb.LGBMClassifier(
    objective='binary',
    boosting_type='gbdt',
    n_estimators=500,
    learning_rate=0.1,
    num_leaves=31,
    max_depth=6,
    min_child_samples=20,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=0.1,
    random_state=RANDOM_STATE,
    n_jobs=-1,
    verbosity=-1, 
    force_row_wise=True 
)


lgb_model.fit(
    X_train_scaled,
    y_train_final,
    eval_set=[(X_val_scaled, y_val)],
    eval_metric='binary_logloss',
    callbacks=[
        lgb.early_stopping(stopping_rounds=100, verbose=False),  # verbose=False
        lgb.log_evaluation(period=0, show_stdv=False)  # show_stdv=False
    ]
)

print("LightGBM treinado com sucesso!")

class ManualPipeline:
    def __init__(self, scaler, model, feature_names):
        self.scaler = scaler
        self.model = model
        self.feature_names = feature_names
    
    def predict_proba(self, X):
        X_df = pd.DataFrame(X, columns=self.feature_names)
        X_scaled = self.scaler.transform(X_df)
        return self.model.predict_proba(X_scaled)
    
    def predict(self, X):
        X_df = pd.DataFrame(X, columns=self.feature_names)
        X_scaled = self.scaler.transform(X_df)
        return self.model.predict(X_scaled)

trained_model = ManualPipeline(scaler, lgb_model, input_features)

# import√¢ncia das features
if hasattr(lgb_model, 'feature_importances_'):
    feature_importance = lgb_model.feature_importances_
    importance_df = pd.DataFrame({
        'feature': input_features,
        'importance': feature_importance
    }).sort_values('importance', ascending=False)
    
    print("\nTop 10 features mais importantes:")
    print(importance_df.head(10))

LightGBM treinado com sucesso!

Top 10 features mais importantes:
                    feature  importance
9                      peso         287
1                       imc         256
0                     idade         164
13  percentilArteriaUterina         111
7                   mediaIP         102
4              origemRacial          96
14    percentilArtUmbilical          91
15            percentilPeso          85
3               hipertensao          83
6              TipoDiabetes          79


In [8]:
print("Antes do augmentation:")
print(f"X_train_smote shape: {X_train_smote.shape}")
print(f"Distribui√ß√£o y_train_smote: {pd.Series(y_train_smote).value_counts()}")

print("\nAp√≥s augmentation:")
print(f"X_final shape: {X_final.shape}")
print(f"Distribui√ß√£o y_final: {pd.Series(y_final).value_counts()}")

Antes do augmentation:
X_train_smote shape: (200, 17)
Distribui√ß√£o y_train_smote: PreEclampsia
0    100
1    100
Name: count, dtype: int64

Ap√≥s augmentation:
X_final shape: (280, 17)
Distribui√ß√£o y_final: 0.0    146
1.0    134
Name: count, dtype: int64


In [9]:
from sklearn.metrics import precision_score, recall_score, f1_score

y_proba = lgb_model.predict_proba(X_test_scaled)[:, 1]
y_pred = (y_proba >= 0.5).astype(int)
auc = roc_auc_score(y_test_final, y_proba)
accuracy = accuracy_score(y_test_final, y_pred)
precision = precision_score(y_test_final, y_pred)
recall = recall_score(y_test_final, y_pred)
f1 = f1_score(y_test_final, y_pred)

print(f"üìä M√©tricas calculadas:")
print(f"AUC: {auc:.4f}")
print(f"Acur√°cia: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

bundle_lgbm = {
    "model": lgb_model,
    "scaler": scaler,
    "input_features": input_features,
    "target": target_col,
    "map_raca": map_raca,
    "map_boolean": map_boolean,
    "map_hist_diabetes": map_hist_diabetes,
    "performance": {
        "auc": auc,
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1
    },
    "version": "2.0_otimizado_final"
}

# Salvar
bundle_path = "/Users/renanmoura/Documents/mestrado/PE-AI/models/model_lgbm_bundle.pkl"
joblib.dump(bundle_lgbm, bundle_path)

print("Bundle salvo com sucesso!")
print(f"Features no modelo: {len(input_features)}")

üìä M√©tricas calculadas:
AUC: 0.9886
Acur√°cia: 0.9286
Precision: 0.9048
Recall: 0.9500
F1-Score: 0.9268
Bundle salvo com sucesso!
Features no modelo: 17


In [10]:
import warnings
warnings.filterwarnings('ignore', category=UserWarning)

y_proba = trained_model.predict_proba(X_test_final)[:, 1]
y_pred = (y_proba >= 0.5).astype(int)
y_test_int = np.round(y_test_final).astype(int)

print(f"\nClassification Report:\n{classification_report(y_test_int, y_pred)}")
print(f"\nMatriz de Confus√£o:\n{confusion_matrix(y_test_int, y_pred)}")


Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.91      0.93        22
           1       0.90      0.95      0.93        20

    accuracy                           0.93        42
   macro avg       0.93      0.93      0.93        42
weighted avg       0.93      0.93      0.93        42


Matriz de Confus√£o:
[[20  2]
 [ 1 19]]
