# Configurar MLflow

In [None]:
import os
import shutil
import mlflow
import subprocess

# Local directory used as the MLflow file-based tracking store, plus the
# ".trash" sub-directory that is created explicitly below.
mlflow_dir = "/home/jupyter/smart_ads/notebooks/mlruns"
trash_dir = os.path.join(mlflow_dir, ".trash")

# Start from a clean slate: wipe the whole tracking directory if it exists.
# ignore_errors=True means a failed or partial removal is not reported.
if os.path.exists(mlflow_dir):
    print(f"Removendo diretório MLflow existente: {mlflow_dir}")
    shutil.rmtree(mlflow_dir, ignore_errors=True)

# Recreate the required directories
os.makedirs(mlflow_dir, exist_ok=True)
os.makedirs(trash_dir, exist_ok=True)  # Create the .trash directory explicitly

# Point MLflow at the local directory through a file:// URI
mlflow.set_tracking_uri(f"file://{mlflow_dir}")
print(f"MLflow configurado para usar: {mlflow.get_tracking_uri()}")

# Experiment name
EXPERIMENT_NAME = "smart-ads-baseline"

# If an experiment with this name is still present, delete it. Any error raised
# during the lookup or the deletion is printed and execution continues.
# NOTE(review): delete_experiment is presumably a soft delete in the file
# store; creating an experiment with the same name right afterwards may then be
# rejected - confirm against the MLflow version in use.
client = mlflow.tracking.MlflowClient()
try:
    existing_exp = client.get_experiment_by_name(EXPERIMENT_NAME)
    if existing_exp:
        print(f"Removendo experimento existente: {EXPERIMENT_NAME}")
        client.delete_experiment(existing_exp.experiment_id)
except Exception as e:
    print(f"Erro ao verificar experimento existente: {e}")

# Create a fresh experiment and keep its ID
experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
print(f"Criado novo experimento: {EXPERIMENT_NAME} (ID: {experiment_id})")

# Baseline model
Train a baseline model

In [None]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
import xgboost as xgb
from sklearn.metrics import (precision_recall_fscore_support, roc_auc_score, 
                             average_precision_score, confusion_matrix, 
                             precision_recall_curve, PrecisionRecallDisplay)
import matplotlib.pyplot as plt
import seaborn as sns
import os
import time
import joblib
import re
import hashlib
import mlflow
import mlflow.sklearn
import mlflow.lightgbm
import mlflow.xgboost
from datetime import datetime
import glob
import pandas as pd

# 1. Minimal configuration
EXPERIMENT_NAME = "smart-ads-baseline"
GENERATE_LEARNING_CURVES = False  # Disabled to save time

# 2. MLflow experiment setup: reuse the experiment with this name when it
#    exists, otherwise create it. If anything in that sequence raises, the
#    error is printed and creation is attempted once more.
try:
    existing_exp = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
    if not existing_exp:
        EXPERIMENT_ID = mlflow.create_experiment(EXPERIMENT_NAME)
        print(f"Criado novo experimento: {EXPERIMENT_NAME} (ID: {EXPERIMENT_ID})")
    else:
        EXPERIMENT_ID = existing_exp.experiment_id
        print(f"Usando experimento existente: {EXPERIMENT_NAME} (ID: {EXPERIMENT_ID})")
except Exception as err:
    print(f"Erro ao configurar experimento: {err}")
    EXPERIMENT_ID = mlflow.create_experiment(EXPERIMENT_NAME)
# 3. Dataset fingerprint (traceability)
def get_data_hash(df):
    """Return an MD5 hex digest that fingerprints the rows of ``df``.

    Each row (index included, per the pandas default) is hashed with
    ``pd.util.hash_pandas_object``; the resulting array of row hashes is then
    fed to MD5.

    Args:
        df: DataFrame to fingerprint.

    Returns:
        The 32-character hexadecimal digest.
    """
    row_hashes = pd.util.hash_pandas_object(df)
    digest = hashlib.md5()
    digest.update(row_hashes.values)
    return digest.hexdigest()

# 4. Load the pre-processed train/validation splits and fingerprint each one
DATA_PATH = "datasets/split/"
print("Carregando datasets...")
train_df = pd.read_csv(DATA_PATH + "train.csv")
val_df = pd.read_csv(DATA_PATH + "validation.csv")
train_hash, val_hash = get_data_hash(train_df), get_data_hash(val_df)

# 5. Sanitize column names
def sanitize_column_names(df):
    """Rename the columns of ``df`` in place to identifier-like names.

    Every character that is neither a word character nor whitespace is
    replaced by ``_``; each run of whitespace then collapses to a single
    ``_``. When the result clashes with a name already assigned to an earlier
    column, the column's position is appended (``<name>_<position>``); if that
    is taken as well, a counter is appended until the name is unique.

    Args:
        df: DataFrame whose columns are renamed in place. Labels are converted
            with ``str`` before sanitizing, so non-string labels are accepted.

    Returns:
        dict mapping each original column label to its new name. Duplicated
        original labels share a single entry (the last one processed).
    """
    sanitized_columns = {}
    # Set of names handed out so far: O(1) membership instead of scanning
    # the dict values for every column.
    taken = set()
    # enumerate() yields the position directly; Index.get_loc returns a slice
    # or boolean mask when a label is duplicated.
    for position, col in enumerate(df.columns):
        new_col = re.sub(r'[^\w\s]', '_', str(col))
        new_col = re.sub(r'\s+', '_', new_col)
        if new_col in taken:
            base = f"{new_col}_{position}"
            new_col = base
            attempt = 0
            # The position suffix alone can collide with another column
            # (e.g. "a_b_2" followed by "a b" and "a-b"), so keep going until
            # the name is actually free.
            while new_col in taken:
                attempt += 1
                new_col = f"{base}_{attempt}"
        sanitized_columns[col] = new_col
        taken.add(new_col)
    df.rename(columns=sanitized_columns, inplace=True)
    return sanitized_columns

# 6. Sanitize both frames and pick the target/feature columns.
#    Each frame is sanitized independently; only the mapping produced for the
#    training frame is kept.
column_mapping = sanitize_column_names(train_df)
sanitize_column_names(val_df)
# Use 'target' when present after sanitizing; otherwise look up what the
# original 'target' label was renamed to, falling back to 'target'.
target_col = 'target' if 'target' in train_df.columns else column_mapping.get('target', 'target')
feature_cols = [col for col in train_df.columns if col != target_col]

# 7. Cast integer columns to float.
#    The loop runs over every column of train_df, so an integer target column
#    is cast as well. The same column is cast in val_df, whatever its dtype
#    there. The names of the cast columns are kept for the MLflow tags.
print("Convertendo colunas inteiras para float...")
integer_columns = []
for col in train_df.columns:
    if pd.api.types.is_integer_dtype(train_df[col].dtype):
        train_df[col] = train_df[col].astype(float)
        val_df[col] = val_df[col].astype(float)
        integer_columns.append(col)

# 8. Create independent copies for X and y
X_train = train_df[feature_cols].copy()
y_train = train_df[target_col].copy()
X_val = val_df[feature_cols].copy()
y_val = val_df[target_col].copy()

# 9. Check whether any integer feature columns remain and cast them too.
#    NOTE(review): step 7 already cast every integer column of train_df, so
#    this looks like a safety net that normally finds nothing - confirm.
int_cols_remaining = [col for col in X_train.columns if pd.api.types.is_integer_dtype(X_train[col].dtype)]
if int_cols_remaining:
    for col in int_cols_remaining:
        X_train[col] = X_train[col].astype(float)
        X_val[col] = X_val[col].astype(float)

# Report the shapes and the mean of the target in each split
print(f"Dados carregados - treino: {X_train.shape}, validação: {X_val.shape}")
print(f"Taxa de conversão - treino: {y_train.mean():.4f}, validação: {y_val.mean():.4f}")

# 10. Directory for temporary artifacts (plots/CSVs written before logging)
os.makedirs("/tmp/mlflow_artifacts", exist_ok=True)

# 11. Models with class re-weighting: 'balanced' class weights for the random
#     forest and a fixed scale_pos_weight of 50 for both boosters.
models = {
    'random_forest': RandomForestClassifier(random_state=42, n_jobs=-1, class_weight='balanced'),
    'lightgbm': lgb.LGBMClassifier(random_state=42, n_jobs=-1, scale_pos_weight=50),
    'xgboost': xgb.XGBClassifier(random_state=42, n_jobs=-1, scale_pos_weight=50)
}

# 12. Visualization helpers
def plot_confusion_matrix(y_true, y_pred, title, filename):
    """Draw the confusion matrix as an annotated heatmap and save it.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        title: Title placed above the heatmap.
        filename: Path the figure is written to.

    Returns:
        The confusion matrix computed from ``y_true`` and ``y_pred``.
    """
    class_names = ['Não Converteu', 'Converteu']
    cm = confusion_matrix(y_true, y_pred)

    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_names,
        yticklabels=class_names,
        ax=ax,
    )
    ax.set_xlabel('Predito')
    ax.set_ylabel('Real')
    ax.set_title(title)
    fig.tight_layout()
    fig.savefig(filename)
    plt.close(fig)
    return cm

def plot_prob_histogram(y_true, y_pred_proba, threshold, title, filename):
    """Save overlaid histograms of predicted probabilities, one per class.

    Args:
        y_true: Ground-truth labels; rows equal to 0 and 1 are plotted
            separately.
        y_pred_proba: Predicted probabilities, indexable by a boolean mask
            built from ``y_true``.
        threshold: Decision threshold drawn as a dashed vertical line.
        title: Figure title.
        filename: Path the figure is written to.
    """
    per_class_style = (
        (0, 'blue', 'Classe 0 (Não converteu)'),
        (1, 'red', 'Classe 1 (Converteu)'),
    )

    fig, ax = plt.subplots(figsize=(10, 6))
    for class_value, color, legend_text in per_class_style:
        ax.hist(
            y_pred_proba[y_true == class_value],
            bins=50,
            alpha=0.5,
            color=color,
            label=legend_text,
        )
    ax.axvline(x=threshold, color='green', linestyle='--', label=f'Threshold: {threshold:.2f}')
    ax.set_title(title)
    ax.set_xlabel('Probabilidade prevista')
    ax.set_ylabel('Frequência')
    ax.legend()
    ax.grid(True, alpha=0.3)
    fig.savefig(filename)
    plt.close(fig)

def plot_precision_recall_curve(y_true, y_pred_proba, title, filename):
    """Save the precision-recall curve for the given scores.

    The axes are created first and handed to
    ``PrecisionRecallDisplay.from_predictions``. Called without ``ax``, that
    constructor draws on a figure of its own, which the previous
    implementation never closed (one leaked figure per call) before drawing
    the same curve a second time on a new figure.

    Args:
        y_true: Ground-truth binary labels.
        y_pred_proba: Scores/probabilities for the positive class.
        title: Axes title.
        filename: Path the figure is written to.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    PrecisionRecallDisplay.from_predictions(y_true, y_pred_proba, name="PR curve", ax=ax)
    ax.set_title(title)
    ax.grid(True, alpha=0.3)
    fig.savefig(filename)
    # Close this specific figure rather than whichever one is current.
    plt.close(fig)

def plot_learning_curve(model, X, y, model_name, filename=None):
    """Save a lightweight learning curve (3 training sizes, 3 folds, F1).

    Args:
        model: Estimator passed to ``sklearn.model_selection.learning_curve``.
        X: Training features.
        y: Training target.
        model_name: Name used in the progress message and the plot title.
        filename: Path the figure is written to. Required despite the ``None``
            default, which is kept only so the signature stays compatible.

    Returns:
        ``filename``, unchanged.

    Raises:
        ValueError: If ``filename`` is ``None``. Checked up front so the
            cross-validated fits are not run only to fail when saving.
    """
    if filename is None:
        raise ValueError(
            "filename é obrigatório: informe o caminho onde salvar a curva de aprendizado"
        )

    from sklearn.model_selection import learning_curve

    print(f"  Gerando curva de aprendizado leve para {model_name}...")
    # Three sizes evenly spaced between 30% and 100% of the training data.
    train_sizes, train_scores, test_scores = learning_curve(
        model, X, y, cv=3, train_sizes=np.linspace(0.3, 1.0, 3), scoring='f1',
        n_jobs=-1, shuffle=True, random_state=42
    )

    # Mean and spread across folds for each training size
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    fig = plt.figure(figsize=(8, 6))
    plt.grid(True, alpha=0.3)
    # Shaded bands show +/- one standard deviation around each mean curve.
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1, color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Treino")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Validação")
    plt.title(f"Curva de Aprendizado - {model_name}")
    plt.xlabel("Tamanho do conjunto de treino")
    plt.ylabel("F1-Score")
    plt.legend(loc="best")
    plt.savefig(filename)
    plt.close(fig)
    return filename

# 13. Train the models.
#     For every entry in `models`: open an MLflow run, fit on the training
#     split, score the validation split, pick the threshold with the best F1,
#     log metrics/plots/model, and store the headline numbers in `results`.
print("\nTreinando modelos base com pesos balanceados...")
results = {}

for name, model in models.items():
    print(f"Treinando {name}...")

    # 14. Start an MLflow run inside the configured experiment
    with mlflow.start_run(experiment_id=EXPERIMENT_ID, run_name=f"{name}_baseline") as run:
        run_id = run.info.run_id

        # 15. Tags describing the run: model, data fingerprints and dataset stats
        mlflow.set_tags({
            "model_type": name,
            "experiment_type": "baseline",
            "class_balance": "weighted",
            "train_data_hash": train_hash,
            "val_data_hash": val_hash,
            "data_path": DATA_PATH,
            "feature_count": len(feature_cols),
            "dataset_size": len(X_train),
            "positive_ratio": float(y_train.mean()),
            "converted_int_cols": ','.join(integer_columns)
        })

        # 16. Log the estimator's parameters
        mlflow.log_params(model.get_params())

        # 17. Fit the model and log the wall-clock training time
        start_time = time.time()
        model.fit(X_train, y_train)
        train_time = time.time() - start_time
        mlflow.log_metric("training_time_seconds", train_time)

        # 18. Validation scores: second column of predict_proba
        y_pred_proba = model.predict_proba(X_val)[:, 1]

        # 19. Sweep thresholds and keep the one with the highest F1.
        #     If no threshold yields F1 > 0, the initial values below remain
        #     (threshold 0.5, which lies outside the tested grid).
        best_f1 = 0
        best_threshold = 0.5
        best_precision = 0
        best_recall = 0

        # Grid from 0.01 in steps of 0.01; the stop value 0.5 is exclusive.
        thresholds_to_test = np.arange(0.01, 0.5, 0.01)
        f1_scores = []
        precisions = []
        recalls = []

        for threshold in thresholds_to_test:
            y_pred_t = (y_pred_proba >= threshold).astype(int)
            precision, recall, f1, _ = precision_recall_fscore_support(y_val, y_pred_t, average='binary')

            # Keep the whole sweep for the threshold plot below
            f1_scores.append(f1)
            precisions.append(precision)
            recalls.append(recall)

            # Strict '>' keeps the lowest threshold on ties
            if f1 > best_f1:
                best_f1 = f1
                best_threshold = threshold
                best_precision = precision
                best_recall = recall

        # 20. Final predictions at the best threshold, plus threshold-free metrics
        y_pred = (y_pred_proba >= best_threshold).astype(int)
        auc_score = roc_auc_score(y_val, y_pred_proba)
        pr_auc = average_precision_score(y_val, y_pred_proba)

        # 21. Count positive predictions and their share of the validation set
        positive_count = y_pred.sum()
        positive_pct = positive_count / len(y_pred) * 100

        # 22. Log metrics
        mlflow.log_metric("precision", best_precision)
        mlflow.log_metric("recall", best_recall)
        mlflow.log_metric("f1", best_f1)
        mlflow.log_metric("threshold", best_threshold)
        mlflow.log_metric("auc", auc_score)
        mlflow.log_metric("pr_auc", pr_auc)
        mlflow.log_metric("positive_predictions", positive_count)
        mlflow.log_metric("positive_pct", positive_pct)

        # 23. Plot threshold vs F1/precision/recall
        plt.figure(figsize=(10, 6))
        plt.plot(thresholds_to_test, f1_scores, label='F1')
        plt.plot(thresholds_to_test, precisions, label='Precision')
        plt.plot(thresholds_to_test, recalls, label='Recall')
        plt.axvline(x=best_threshold, color='r', linestyle='--', label=f'Melhor threshold: {best_threshold:.2f}')
        plt.title(f'Efeito do threshold nas métricas - {name}')
        plt.xlabel('Threshold')
        plt.ylabel('Valor')
        plt.legend()
        plt.grid(True, alpha=0.3)

        threshold_fig_path = f"/tmp/mlflow_artifacts/threshold_plot_{name}.png"
        plt.savefig(threshold_fig_path)
        mlflow.log_artifact(threshold_fig_path)
        plt.close()

        # 24. Confusion matrix at the best threshold
        cm_fig_path = f"/tmp/mlflow_artifacts/confusion_matrix_{name}.png"
        plot_confusion_matrix(y_val, y_pred, f'Matriz de Confusão - {name} (threshold={best_threshold:.2f})', cm_fig_path)
        mlflow.log_artifact(cm_fig_path)

        # 25. Histogram of predicted probabilities per class
        hist_fig_path = f"/tmp/mlflow_artifacts/prob_histogram_{name}.png"
        plot_prob_histogram(y_val, y_pred_proba, best_threshold, f'Distribuição de Probabilidades - {name}', hist_fig_path)
        mlflow.log_artifact(hist_fig_path)

        # 26. Precision-recall curve
        pr_curve_path = f"/tmp/mlflow_artifacts/pr_curve_{name}.png"
        plot_precision_recall_curve(y_val, y_pred_proba, f'Curva Precision-Recall - {name}', pr_curve_path)
        mlflow.log_artifact(pr_curve_path)

        # 27. Learning curve, only when enabled by the flag
        if GENERATE_LEARNING_CURVES:
            learning_curve_path = f"/tmp/mlflow_artifacts/learning_curve_{name}.png"
            plot_learning_curve(model, X_train, y_train, model_name=name.capitalize(), filename=learning_curve_path)
            mlflow.log_artifact(learning_curve_path)

        # 28. Feature importance (when the model exposes it): CSV table plus a
        #     bar plot, both logged at the artifact root of the run
        if hasattr(model, 'feature_importances_'):
            # Importance table, most important feature first
            importance_df = pd.DataFrame({
                'feature': feature_cols,
                'importance': model.feature_importances_
            }).sort_values('importance', ascending=False)

            # Save the importance table
            importance_path = f"/tmp/mlflow_artifacts/feature_importance_{name}.csv"
            importance_df.to_csv(importance_path, index=False)
            mlflow.log_artifact(importance_path)

            # Plot the top 20 features
            plt.figure(figsize=(12, 8))
            top_features = importance_df.head(20)
            sns.barplot(x='importance', y='feature', data=top_features)
            plt.title(f'Top 20 Features - {name}')
            plt.tight_layout()
            importance_fig_path = f"/tmp/mlflow_artifacts/feature_importance_plot_{name}.png"
            plt.savefig(importance_fig_path)
            mlflow.log_artifact(importance_fig_path)
            plt.close()

        # 29. Log the model with a 5-row float input example, using the MLflow
        #     flavor that matches the estimator
        input_example = X_train.iloc[:5].copy().astype(float)

        if name == 'random_forest':
            mlflow.sklearn.log_model(model, name, input_example=input_example)
        elif name == 'lightgbm':
            mlflow.lightgbm.log_model(model, name, input_example=input_example)
        elif name == 'xgboost':
            mlflow.xgboost.log_model(model, name, input_example=input_example, model_format="json")

        # runs:/ URI built from the run ID and the artifact path used above
        model_uri = f"runs:/{run_id}/{name}"
        print(f"  Modelo {name} salvo em: {run.info.artifact_uri}/{name}")
        print(f"  URI do modelo para carregamento: {model_uri}")

        # 30. Store the results in the local dictionary, keyed "<model>_<metric>"
        results[f"{name}_precision"] = float(best_precision)
        results[f"{name}_recall"] = float(best_recall)
        results[f"{name}_f1"] = float(best_f1)
        results[f"{name}_threshold"] = float(best_threshold)
        results[f"{name}_auc"] = float(auc_score)
        results[f"{name}_pr_auc"] = float(pr_auc)
        results[f"{name}_model_uri"] = model_uri

        print(f"  {name} - F1: {best_f1:.4f}, PR-AUC: {pr_auc:.4f}, Threshold: {best_threshold:.4f}")
        print(f"  {name} - Precision: {best_precision:.4f}, Recall: {best_recall:.4f}")
        print(f"  {name} - Predições positivas: {positive_count} ({positive_pct:.2f}%)")

# 31. Show the final results: one block of metrics per model, in the order the
#     models were defined.
print("\nResultados dos modelos com threshold otimizado:")
metric_labels = (
    ("F1", "f1"),
    ("Precisão", "precision"),
    ("Recall", "recall"),
    ("Threshold", "threshold"),
    ("AUC", "auc"),
    ("PR-AUC", "pr_auc"),
)
for model_name in models:
    print(f"\n{model_name.upper()}:")
    for label, key in metric_labels:
        print(f"  {label}: {results[f'{model_name}_{key}']:.4f}")
    print(f"  Model URI: {results[f'{model_name}_model_uri']}")

print("\nModelos treinados e registrados no MLflow com rastreabilidade.")
print("\nPara carregar um modelo específico em código futuro, use:")
# The random forest was logged with the sklearn flavor; the other models use a
# flavor module named after the model itself.
flavor_overrides = {"random_forest": "sklearn"}
for model_name in models:
    model_type = flavor_overrides.get(model_name, model_name)
    uri = results[f"{model_name}_model_uri"]
    snippet_lines = [
        "",
        f"# Carregar modelo {model_name}:",
        f"import mlflow.{model_type}",
        f'model_{model_name} = mlflow.{model_type}.load_model("{uri}")',
        "",
    ]
    print("\n".join(snippet_lines))