In [1]:
PATH = "/content/models"
mapping_pos = ["Contratado pela Decision", "Aprovado", "Finalista", "Documentação PJ", "Encaminhado ao Cliente com Aprovação"]
mapping_neg = ["Não Aprovado pelo Cliente", "Não Aprovado pelo RH", "Desistiu", "Prospect", "Encaminhado ao Requisitante (sem retorno)"]

In [None]:
import pandas as pd, numpy as np, re, unicodedata, joblib, json
from pathlib import Path
from typing import List, Set
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# =========================
# Helpers
# =========================

def norm_txt(t: str) -> str:
    t = str(t or "")
    t = unicodedata.normalize("NFKD", t).encode("ascii","ignore").decode("ascii")
    t = re.sub(r"\s+"," ", t.lower()).strip()
    return t

def split_areas(s: str) -> List[str]:
    s = norm_txt(s)
    if not s:
        return []
    # separadores comuns do dataset
    tokens = re.split(r"[;,/|\-•–—]+|\s{2,}", s)
    tokens = [tok.strip() for tok in tokens if tok.strip()]
    return tokens

def jaccard(a: List[str], b: List[str]) -> float:
    sa, sb = set(a), set(b)
    if not sa and not sb:
        return 0.0
    return len(sa & sb) / max(1, len(sa | sb))

def map_level(val: str, mapping: dict) -> int:
    v = norm_txt(val)
    return mapping.get(v, 0)

def contains_kw(text: str, kw: str) -> int:
    return int(bool(re.search(rf"\b{re.escape(kw.lower())}\b", norm_txt(text))))

def any_kw(text: str, kws: Set[str]) -> int:
    text = norm_txt(text)
    return int(any(re.search(rf"\b{re.escape(k)}\b", text) for k in kws))

# =========================
# Configs simples
# =========================

# Idiomas (ajuste se quiser granularidade CEFR)
MAP_LVL = {
    "nenhum":0, "basico":1, "básico":1, "intermediario":2, "intermediário":2,
    "avancado":3, "avançado":3, "fluente":4
}

# Senioridade (exemplos do dataset)
MAP_SENIOR = {
    "estagiario":1, "estagiário":1, "junior":2, "jr":2, "analista":3,
    "pleno":3, "senior":4, "sênior":4, "especialista":5
}

# Dicionário inicial de skills (pode crescer depois automaticamente)
SEED_SKILLS = {
    "sap","control-m","controlm","sql","pl/sql","oracle","aws","azure","gcp",
    "linux","windows","vmware","jcl","abap","java","python","etl","bi","powercenter",
    "connect direct","b2b","devops","git","docker","kubernetes"
}

# =========================
# Feature builder
# =========================

def build_features(df: pd.DataFrame, save_dir: str = "../app/model"):
    Path(save_dir).mkdir(parents=True, exist_ok=True)

    # -------- rótulo
    POS = {
        "contratado pela decision",
        "aprovado",
        "finalista",
        "documentação pj",
        "encaminhado ao cliente com aprovação",
        'contratado como hunting',
        'documentação clt',
        'encaminhar proposta',
        'proposta aceita'
    }
    sit = df["situacao_candidado"].fillna("").map(norm_txt)
    y = sit.isin(POS).astype(int).values

    # -------- textos base
    job_text_raw = (df["atividades_vaga"].fillna("") + " " + df["competencias_vaga"].fillna(""))
    cv_text_raw  = df["cv_texto_pt"].fillna("")
    job_text = job_text_raw.map(norm_txt)
    cv_text  = cv_text_raw.map(norm_txt)

    # -------- TF-IDF + similaridade
    # Usamos um único vocabulário para vaga e CV (mesmo espaço vetorial)
    tfidf = TfidfVectorizer(max_features=40000, ngram_range=(1,2), min_df=2)
    X_job = tfidf.fit_transform(job_text)
    X_cv  = tfidf.transform(cv_text)
    sim_tfidf = cosine_similarity(X_job, X_cv).diagonal()

    # -------- Idiomas
    vi = df.get("nivel_ingles_vaga", "").fillna("").map(lambda x: map_level(x, MAP_LVL)).astype(int).values
    ci = df.get("applicant_nivel_ingles", "").fillna("").map(lambda x: map_level(x, MAP_LVL)).astype(int).values
    ingles_ok = (ci >= vi).astype(int)

    ve = df.get("nivel_espanhol_vaga", "").fillna("").map(lambda x: map_level(x, MAP_LVL)).astype(int).values
    ce = df.get("applicant_nivel_espanhol", "").fillna("").map(lambda x: map_level(x, MAP_LVL)).astype(int).values
    espanhol_ok = (ce >= ve).astype(int)

    # -------- Senioridade (diferença vaga - candidato)
    vaga_sen = df.get("nivel_profissional_vaga","").fillna("").map(lambda x: map_level(x, MAP_SENIOR)).astype(int).values
    cand_sen = df.get("applicant_nivel_profissional","").fillna("").map(lambda x: map_level(x, MAP_SENIOR)).astype(int).values
    senior_ok = (cand_sen >= vaga_sen).astype(int)
    senior_gap = np.clip(cand_sen - vaga_sen, -3, 3)

    # -------- SAP / Control-M / SQL / AWS / Oracle (flags no CV + vaga)
    def flag_pair(kw: str):
        cv_f  = np.array([contains_kw(t, kw) for t in cv_text_raw], dtype=int)
        job_f = np.array([contains_kw(t, kw) for t in job_text_raw], dtype=int)
        return cv_f, job_f, (cv_f & job_f)

    cv_sap, job_sap, sap_match = flag_pair("sap")
    cv_ctrlm, job_ctrlm, ctrlm_match = flag_pair("control-m")
    cv_sql, job_sql, sql_match = flag_pair("sql")
    cv_aws, job_aws, aws_match = flag_pair("aws")
    cv_orc, job_orc, orc_match = flag_pair("oracle")

    # -------- Contagem de skills (seed + mining simples a partir das vagas)
    # pega top termos de vaga (unigramas) com maior df para virar skill também
    vocab = tfidf.get_feature_names_out()
    # heurística: unigramas com letras (sem números) e tamanho >=3
    mined = {v for v in vocab if (len(v.split())==1 and re.match(r"^[a-z][a-z0-9\-_\.]+$", v) and len(v)>=3)}
    # mantém apenas termos que parecem "tecnologias" (simples): presentes em muitos CVs também
    # (limita tamanho para não "inundar" o feature space)
    mined = set(list(mined)[:500])  # limite simples
    skills = (SEED_SKILLS | mined)

    def count_skills(text: str, skills: Set[str]) -> int:
        text = norm_txt(text)
        c = 0
        for k in skills:
            if re.search(rf"\b{re.escape(k)}\b", text):
                c += 1
        return c

    skills_in_job = np.array([count_skills(t, skills) for t in job_text_raw], dtype=int)
    skills_in_cv  = np.array([count_skills(t, skills) for t in cv_text_raw], dtype=int)
    # “cobertura”: quantas skills da vaga aparecem no CV (aproximação rápida)
    # como proxy: usa interseção por regex simples
    def overlap_count(jt: str, ct: str) -> int:
        jt = norm_txt(jt); ct = norm_txt(ct)
        sj = {k for k in skills if re.search(rf"\b{re.escape(k)}\b", jt)}
        sc = {k for k in skills if re.search(rf"\b{re.escape(k)}\b", ct)}
        return len(sj & sc)
    skills_overlap = np.array([overlap_count(j, c) for j, c in zip(job_text_raw, cv_text_raw)], dtype=int)

    # -------- Área de atuação (Jaccard)
    areas_vaga = [split_areas(s) for s in df.get("areas_atuacao_vaga","").fillna("").tolist()]
    areas_cand = [split_areas(s) for s in df.get("applicant_area_atuacao","").fillna("").tolist()]
    area_jacc = np.array([jaccard(a, b) for a, b in zip(areas_vaga, areas_cand)], dtype=float)

    # -------- Monta matriz X
    feats = np.c_[
        sim_tfidf,
        ingles_ok, espanhol_ok,
        senior_ok, senior_gap,
        sap_match, ctrlm_match, sql_match, aws_match, orc_match,
        cv_sap, cv_ctrlm, cv_sql, cv_aws, cv_orc,
        job_sap, job_ctrlm, job_sql, job_aws, job_orc,
        skills_in_job, skills_in_cv, skills_overlap,
        area_jacc
    ].astype(float)

    feat_names = [
        "sim_tfidf",
        "ingles_ok","espanhol_ok",
        "senior_ok","senior_gap",
        "sap_match","ctrlm_match","sql_match","aws_match","oracle_match",
        "cv_sap","cv_ctrlm","cv_sql","cv_aws","cv_oracle",
        "job_sap","job_ctrlm","job_sql","job_aws","job_oracle",
        "skills_in_job","skills_in_cv","skills_overlap",
        "area_jacc"
    ]

    # -------- groups por vaga para GroupKFold
    groups_job = df["id_vaga"].values

    # -------- salva artefatos e dados
    np.save(Path(save_dir)/"X.npy", feats)
    np.save(Path(save_dir)/"y.npy", y)
    np.save(Path(save_dir)/"groups_job.npy", groups_job)

    joblib.dump({
        "tfidf": tfidf,
        "feat_names": feat_names,
        "map_lvl": MAP_LVL,
        "map_senior": MAP_SENIOR,
        "skills_seed": sorted(SEED_SKILLS),
        "skills_mined_sample": sorted(list(mined))[:50],  # só para inspecionar
    }, Path(save_dir)/"artifacts.joblib")

    print(f"[ok] features salvas em '{save_dir}': X.npy {feats.shape}, y.npy {y.shape}, groups_job.npy {groups_job.shape}")
    print(f"[ok] exemplos de features: {feat_names[:8]} ... total={len(feat_names)}")

if __name__ == "__main__":
    # ajuste o caminho se necessário
    df = pd.read_pickle('../app/model/df_final.pkl')
    build_features(df, save_dir="../app/model")


[ok] features salvas em '../build': X.npy (53759, 24), y.npy (53759,), groups_job.npy (53759,)
[ok] exemplos de features: ['sim_tfidf', 'ingles_ok', 'espanhol_ok', 'senior_ok', 'senior_gap', 'sap_match', 'ctrlm_match', 'sql_match'] ... total=24


In [None]:

X = np.load("../app/model/X.npy", allow_pickle=True)
y = np.load("../app/model/y.npy", allow_pickle=True)
groups = np.load("../app/model/groups_job.npy", allow_pickle=True)
arts = joblib.load("../app/model/artifacts.joblib",)
print("feat_names:", arts["feat_names"])

feat_names: ['sim_tfidf', 'ingles_ok', 'espanhol_ok', 'senior_ok', 'senior_gap', 'sap_match', 'ctrlm_match', 'sql_match', 'aws_match', 'oracle_match', 'cv_sap', 'cv_ctrlm', 'cv_sql', 'cv_aws', 'cv_oracle', 'job_sap', 'job_ctrlm', 'job_sql', 'job_aws', 'job_oracle', 'skills_in_job', 'skills_in_cv', 'skills_overlap', 'area_jacc']


In [10]:
X

array([[ 0.18971818,  1.        ,  1.        , ..., 31.        ,
         1.        ,  0.        ],
       [ 0.0813253 ,  1.        ,  1.        , ..., 10.        ,
         1.        ,  0.125     ],
       [ 0.19803253,  1.        ,  1.        , ..., 29.        ,
         0.        ,  0.        ],
       ...,
       [ 0.13800335,  0.        ,  1.        , ..., 24.        ,
         4.        ,  0.        ],
       [ 0.29219967,  0.        ,  1.        , ..., 13.        ,
         5.        ,  0.        ],
       [ 0.        ,  0.        ,  1.        , ...,  0.        ,
         0.        ,  0.        ]])

In [None]:
# train.py
import json, os
from pathlib import Path

import joblib
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GroupKFold
from sklearn.metrics import average_precision_score, f1_score, precision_recall_curve
from sklearn.neural_network import MLPClassifier
BUILD_DIR = Path("../app/model")
OUT_DIR = Path("../app/model")  # pode separar se quiser
OUT_DIR.mkdir(parents=True, exist_ok=True)

def precision_at_k_per_job(y_true, y_score, job_ids, k=3):
    """
    Calcula Precision@K por vaga e retorna a média.
    Para cada job_id, ranqueia candidatos por score desc, pega top-K (ou todos se <K).
    """
    assert len(y_true) == len(y_score) == len(job_ids)
    jobs = np.unique(job_ids)
    precs = []
    for j in jobs:
        mask = (job_ids == j)
        if mask.sum() == 0:
            continue
        # ranking desc
        idx = np.argsort(-y_score[mask])
        topk = idx[:min(k, mask.sum())]
        yk = y_true[mask][topk]
        precs.append(yk.mean() if len(yk) > 0 else 0.0)
    return float(np.mean(precs)) if len(precs) > 0 else 0.0

def choose_best_threshold_by_f1(y_true, y_score):
    """
    Encontra o threshold que maximiza F1 usando a curva de Precision-Recall.
    """
    prec, rec, thr = precision_recall_curve(y_true, y_score)
    # evita divisão por zero
    denom = (prec + rec)
    denom[denom == 0] = 1e-9
    f1s = 2 * (prec * rec) / denom
    # últimos pontos de PR podem não ter threshold; alinhar:
    thr_full = np.r_[thr, [1.0]]  # garante mesmo tamanho que f1s
    best_idx = int(np.nanargmax(f1s))
    return float(thr_full[best_idx]), float(f1s[best_idx])

def eval_cv(model_name, model, X, y, groups, k_s=[3,5], n_splits=5):
    gkf = GroupKFold(n_splits=n_splits)
    oof = np.zeros(len(y), dtype=float)

    fold_metrics = []
    for fold, (tr, va) in enumerate(gkf.split(X, y, groups)):
        m = model
        m.fit(X[tr], y[tr])
        p = m.predict_proba(X[va])[:, 1]
        oof[va] = p

        ap = average_precision_score(y[va], p)
        thr, f1_best = choose_best_threshold_by_f1(y[va], p)
        metrics = {"fold": fold, "auc_pr": float(ap), "f1_best": float(f1_best), "thr_best": float(thr)}
        # Precision@K por vaga (usando scores)
        for K in k_s:
            pk = precision_at_k_per_job(y[va], p, groups[va], k=K)
            metrics[f"p@{K}"] = float(pk)
        fold_metrics.append(metrics)

    # agregados
    ap_mean = float(average_precision_score(y, oof))
    thr_global, f1_global = choose_best_threshold_by_f1(y, oof)

    agg = {
        "model": model_name,
        "auc_pr_oof": ap_mean,
        "f1_best_oof": float(f1_global),
        "thr_best_oof": float(thr_global),
    }
    # p@K global (média por vaga no conjunto todo, usando OOF)
    for K in k_s:
        agg[f"p@{K}_oof"] = float(precision_at_k_per_job(y, oof, groups, k=K))

    return oof, fold_metrics, agg

def main():

    X = np.load(BUILD_DIR / "X.npy", allow_pickle=True)
    y = np.load(BUILD_DIR / "y.npy", allow_pickle=True)
    groups = np.load(BUILD_DIR / "groups_job.npy", allow_pickle=True)
    arts = joblib.load(BUILD_DIR / "artifacts.joblib")
    feat_names = arts.get("feat_names", [f"f{i}" for i in range(X.shape[1])])

    # 2) Modelos candidatos
    models = {
        "logreg": LogisticRegression(max_iter=1000, class_weight="balanced"),
        "rf": RandomForestClassifier(
            n_estimators=400,
            max_depth=None,
            min_samples_split=2,
            min_samples_leaf=1,
            n_jobs=-1,
            class_weight="balanced_subsample",
            random_state=42,
        ),
         "mlp": MLPClassifier(
                hidden_layer_sizes=(256,128),
                activation="relu",
                alpha=1e-4,                # L2
                learning_rate_init=1e-3,
                max_iter=300,
                early_stopping=True,
                n_iter_no_change=15,
                validation_fraction=0.1,
                random_state=42
            )
    }

    results = {}
    best_name, best_ap = None, -1.0
    best_oof = None
    for name, mdl in models.items():
        oof, fold_metrics, agg = eval_cv(name, mdl, X, y, groups, k_s=[3,5], n_splits=5)
        results[name] = {"folds": fold_metrics, "agg": agg}
        if agg["auc_pr_oof"] > best_ap:
            best_ap = agg["auc_pr_oof"]
            best_name = name
            best_oof = oof

    # 3) Treina final no conjunto todo com o melhor modelo e salva
    best_model = models[best_name]
    best_model.fit(X, y)

    # threshold ótimo a partir do OOF (guardamos para a API)
    thr_best = results[best_name]["agg"]["thr_best_oof"]

    joblib.dump(best_model, OUT_DIR / "model.joblib")
    with open(OUT_DIR / "report.json", "w", encoding="utf-8") as f:
        json.dump({
            "best_model": best_name,
            "results": results,
            "feat_names": feat_names,
            "threshold_best_f1": thr_best,
        }, f, ensure_ascii=False, indent=2)

    # salva OOF para auditoria/monitoramento (útil no Streamlit depois)
    np.save(OUT_DIR / "oof_scores.npy", best_oof)
    np.save(OUT_DIR / "labels.npy", y)
    np.save(OUT_DIR / "groups_job.npy", groups)  # regrava pra conveniência

    print(f"[ok] melhor modelo: {best_name}")
    print(f"[ok] AUC-PR (OOF): {results[best_name]['agg']['auc_pr_oof']:.4f}")
    print(f"[ok] F1_best (OOF): {results[best_name]['agg']['f1_best_oof']:.4f} @thr={thr_best:.3f}")
    print(f"[ok] P@3 (OOF): {results[best_name]['agg']['p@3_oof']:.4f} | P@5 (OOF): {results[best_name]['agg']['p@5_oof']:.4f}")
    print(f"[ok] artefatos salvos em {OUT_DIR.resolve()}/")

if __name__ == "__main__":
    main()


[ok] melhor modelo: rf
[ok] AUC-PR (OOF): 0.1521
[ok] F1_best (OOF): 0.2085 @thr=0.562
[ok] P@3 (OOF): 0.1388 | P@5 (OOF): 0.1385
[ok] artefatos salvos em C:\Users\kaio-\mlops\FIAP_PROJECTS_05\build/


In [None]:
df3 = pd.read_csv('/content/models/csv_df_final.csv')

In [None]:
df3.info()

In [None]:
# train_mlflow.py
import os, json
from pathlib import Path
import joblib
import numpy as np

import mlflow
import mlflow.sklearn

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GroupKFold
from sklearn.metrics import average_precision_score, f1_score, precision_recall_curve

# --------- Paths
BUILD_DIR = Path("../app/model")
OUT_DIR = Path("../app/model")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# ======================= utils métricas =======================
def precision_at_k_per_job(y_true, y_score, job_ids, k=3):
    assert len(y_true) == len(y_score) == len(job_ids)
    jobs = np.unique(job_ids)
    precs = []
    for j in jobs:
        mask = (job_ids == j)
        if mask.sum() == 0:
            continue
        idx = np.argsort(-y_score[mask])
        topk = idx[:min(k, mask.sum())]
        yk = y_true[mask][topk]
        precs.append(yk.mean() if len(yk) > 0 else 0.0)
    return float(np.mean(precs)) if len(precs) > 0 else 0.0

def choose_best_threshold_by_f1(y_true, y_score):
    prec, rec, thr = precision_recall_curve(y_true, y_score)
    denom = (prec + rec)
    denom[denom == 0] = 1e-9
    f1s = 2 * (prec * rec) / denom
    thr_full = np.r_[thr, [1.0]]
    best_idx = int(np.nanargmax(f1s))
    return float(thr_full[best_idx]), float(f1s[best_idx])

def eval_cv(model_name, model, X, y, groups, k_s=(3,5), n_splits=5, log_to_mlflow=True):
    gkf = GroupKFold(n_splits=n_splits)
    oof = np.zeros(len(y), dtype=float)
    fold_metrics = []

    for fold, (tr, va) in enumerate(gkf.split(X, y, groups)):
        m = model
        m.fit(X[tr], y[tr])
        p = m.predict_proba(X[va])[:, 1]
        oof[va] = p

        ap = average_precision_score(y[va], p)
        thr, f1_best = choose_best_threshold_by_f1(y[va], p)

        metrics = {"fold": fold, "auc_pr": float(ap), "f1_best": float(f1_best), "thr_best": float(thr)}
        for K in k_s:
            pk = precision_at_k_per_job(y[va], p, groups[va], k=K)
            metrics[f"p@{K}"] = float(pk)
        fold_metrics.append(metrics)

        # log por fold (como métricas separadas)
        if log_to_mlflow:
            mlflow.log_metrics({f"{model_name}_fold{fold}_aucpr": ap,
                                f"{model_name}_fold{fold}_f1best": f1_best,
                                f"{model_name}_fold{fold}_thrbest": thr,
                                **{f"{model_name}_fold{fold}_p@{K}": metrics[f"p@{K}"] for K in k_s}
                                }, step=fold)

    # agregados (OOF)
    ap_mean = float(average_precision_score(y, oof))
    thr_global, f1_global = choose_best_threshold_by_f1(y, oof)
    agg = {
        "model": model_name,
        "auc_pr_oof": ap_mean,
        "f1_best_oof": float(f1_global),
        "thr_best_oof": float(thr_global),
    }
    for K in k_s:
        agg[f"p@{K}_oof"] = float(precision_at_k_per_job(y, oof, groups, k=K))

    if log_to_mlflow:
        mlflow.log_metrics({
            f"{model_name}_aucpr_oof": ap_mean,
            f"{model_name}_f1best_oof": f1_global,
            f"{model_name}_thrbest_oof": thr_global,
            **{f"{model_name}_p@{K}_oof": agg[f"p@{K}_oof"] for K in k_s}
        })

    return oof, fold_metrics, agg

# ======================= treino com MLflow =======================
def main():
    # ---------- Config MLflow
    tracking_uri = os.getenv("MLFLOW_TRACKING_URI", "file://" + str((Path.cwd() / "mlruns").resolve()))
    mlflow.set_tracking_uri(tracking_uri)
    experiment_name = os.getenv("MLFLOW_EXPERIMENT_NAME", "datathon-recrutamento")
    mlflow.set_experiment(experiment_name)

    # ---------- Dados
    X = np.load(BUILD_DIR / "X.npy", allow_pickle=True)
    y = np.load(BUILD_DIR / "y.npy", allow_pickle=True)
    groups = np.load(BUILD_DIR / "groups_job.npy", allow_pickle=True)
    arts = joblib.load(BUILD_DIR / "artifacts.joblib")
    feat_names = arts.get("feat_names", [f"f{i}" for i in range(X.shape[1])])

    n_splits = int(os.getenv("CV_SPLITS", "5"))

    models = {
        "logreg": LogisticRegression(max_iter=1000, class_weight="balanced"),
        "rf": RandomForestClassifier(
            n_estimators=400,
            max_depth=None,
            min_samples_split=2,
            min_samples_leaf=1,
            n_jobs=-1,
            class_weight="balanced_subsample",
            random_state=42,
        ),
        "mlp": MLPClassifier(
            hidden_layer_sizes=(256,128),
            activation="relu",
            alpha=1e-4,
            learning_rate_init=1e-3,
            max_iter=300,
            early_stopping=True,
            n_iter_no_change=15,
            validation_fraction=0.1,
            random_state=42
        ),
    }

    with mlflow.start_run(run_name="train_cv") as run:
        # tags/params gerais
        mlflow.set_tags({
            "project": "vaga-match",
            "stage": os.getenv("STAGE", "dev"),
        })
        mlflow.log_params({
            "n_features": X.shape[1],
            "n_samples": X.shape[0],
            "cv_splits": n_splits
        })
        # hiperparâmetros dos modelos
        for name, mdl in models.items():
            # registra os get_params (prefixados pelo nome do modelo)
            params = {f"{name}__{k}": v for k, v in mdl.get_params().items()}
            # cuidado: mlflow só aceita tipos simples
            clean_params = {k: (str(v) if not isinstance(v, (int, float, str, bool)) else v)
                            for k, v in params.items()}
            mlflow.log_params(clean_params)

        # --------- CV e escolha do melhor
        results = {}
        best_name, best_ap = None, -1.0
        best_oof = None

        for name, mdl in models.items():
            oof, fold_metrics, agg = eval_cv(name, mdl, X, y, groups, k_s=(3,5), n_splits=n_splits, log_to_mlflow=True)
            results[name] = {"folds": fold_metrics, "agg": agg}
            if agg["auc_pr_oof"] > best_ap:
                best_ap = agg["auc_pr_oof"]
                best_name = name
                best_oof = oof

        # --------- Treino final e salvamento
        best_model = models[best_name]
        best_model.fit(X, y)
        thr_best = results[best_name]["agg"]["thr_best_oof"]

        # salva local
        joblib.dump(best_model, OUT_DIR / "model.joblib")
        with open(OUT_DIR / "report.json", "w", encoding="utf-8") as f:
            json.dump({
                "best_model": best_name,
                "results": results,
                "feat_names": feat_names,
                "threshold_best_f1": thr_best,
            }, f, ensure_ascii=False, indent=2)
        np.save(OUT_DIR / "oof_scores.npy", best_oof)
        np.save(OUT_DIR / "labels.npy", y)
        np.save(OUT_DIR / "groups_job.npy", groups)

        # --------- Log no MLflow
        mlflow.log_metric("best_aucpr_oof", best_ap)
        mlflow.log_metric("best_thrbest_oof", thr_best)
        mlflow.log_param("best_model_name", best_name)

        # artefatos úteis
        mlflow.log_artifact(OUT_DIR / "report.json", artifact_path="artifacts")
        mlflow.log_artifact(OUT_DIR / "oof_scores.npy", artifact_path="artifacts")
        mlflow.log_artifact(OUT_DIR / "labels.npy", artifact_path="artifacts")
        mlflow.log_artifact(OUT_DIR / "groups_job.npy", artifact_path="artifacts")

        # log do modelo no MLflow (com assinatura simples)
        signature = None
        try:
            import mlflow.models.signature as msign
            from mlflow.types.schema import Schema, ColSpec
            signature = msign.ModelSignature(
                inputs=Schema([ColSpec("double", name) for name in feat_names]),
                outputs=Schema([ColSpec("double", "score")])
            )
        except Exception:
            pass

        # exemplo de entrada (apenas para rastreabilidade)
        input_example = np.zeros((1, X.shape[1]))

        mlflow.sklearn.log_model(
            sk_model=best_model,
            artifact_path="model",
            signature=signature,
            input_example=input_example,
            registered_model_name=os.getenv("MLFLOW_REGISTER_MODEL_NAME") if os.getenv("MLFLOW_REGISTER_MODEL", "0") == "1" else None
        )

        print(f"[ok] melhor modelo: {best_name}")
        print(f"[ok] AUC-PR (OOF): {results[best_name]['agg']['auc_pr_oof']:.4f}")
        print(f"[ok] F1_best (OOF): {results[best_name]['agg']['f1_best_oof']:.4f} @thr={thr_best:.3f}")
        print(f"[ok] P@3 (OOF): {results[best_name]['agg']['p@3_oof']:.4f} | P@5 (OOF): {results[best_name]['agg']['p@5_oof']:.4f}")
        print(f"[ok] artefatos salvos em {OUT_DIR.resolve()}/")
        print(f"[ok] run_id: {run.info.run_id}")

if __name__ == "__main__":
    main()


: 