In [26]:
import pandas as pd
import numpy as np
import plotly.express
import sklearn
import warnings
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, PowerTransformer

warnings.filterwarnings("ignore")
pd.set_option("display.max_columns",None)




In [27]:
%reload_ext watermark
%watermark -a "Washington Ying Ye Wu" --iversion --python


Author: Washington Ying Ye Wu

Python implementation: CPython
Python version       : 3.13.3
IPython version      : 9.4.0

mlflow  : 3.1.2
numpy   : 2.2.6
lightgbm: 4.6.0
sklearn : 1.5.2
pandas  : 2.2.3
optuna  : 4.4.0
catboost: 1.2.8
plotly  : 6.2.0



In [28]:
database = pd.read_csv("heart.csv")

dbtreino = database #database para treinar
dbteste= database #database como teste

In [29]:
dbtreino.head(5)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [30]:
dbteste.head(5) 

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [31]:
dbtreino.columns #verificando colunas

Index(['Age', 'Sex', 'ChestPainType', 'RestingBP', 'Cholesterol', 'FastingBS',
       'RestingECG', 'MaxHR', 'ExerciseAngina', 'Oldpeak', 'ST_Slope',
       'HeartDisease'],
      dtype='object')

In [32]:
print(f"\n Proporcoes de doenca do treino = {dbtreino['HeartDisease'].value_counts()/len(dbtreino)}")
print(f"\n Proporcoes de doenca do treino = {dbteste['HeartDisease'].value_counts()/len(dbteste)}")

proportions = (dbtreino['HeartDisease'].value_counts()/len(dbtreino)).values


 Proporcoes de doenca do treino = HeartDisease
1    0.553377
0    0.446623
Name: count, dtype: float64

 Proporcoes de doenca do treino = HeartDisease
1    0.553377
0    0.446623
Name: count, dtype: float64


In [33]:
categoria_nume = dbtreino.drop('HeartDisease',axis=1).select_dtypes(include=np.number).columns
print(categoria_nume)
categoria_categ = dbtreino.select_dtypes(include='object').columns
print(categoria_categ)

categ_pipeline = Pipeline([("imputer",SimpleImputer(strategy="constant",fill_value="unknow")),
                           ("encoder", OneHotEncoder(drop="first", handle_unknown="ignore"))])
nume_pipeline = Pipeline([("inputer", SimpleImputer(strategy="median")),
                          ("scaler", PowerTransformer(method="yeo-johnson")),])

transform = ColumnTransformer([("cat", categ_pipeline, categoria_categ),
                            ("num", nume_pipeline, categoria_nume),])
transform

Index(['Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak'], dtype='object')
Index(['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope'], dtype='object')


In [34]:
x_treino = dbtreino.drop('HeartDisease', axis=1)
y_treino = dbtreino["HeartDisease"]
y_treino.replace(to_replace=[1, 0], value=[0, 1], inplace=True)

x_teste = dbteste.drop("HeartDisease", axis=1)
y_teste = dbteste["HeartDisease"]
y_teste.replace(to_replace=[1, 0], value=[0, 1], inplace=True)

In [35]:
import mlflow
import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import LinearSVC

SEED = 15

In [36]:
experimento = "heartdisease_classical_approach"
mlflow.set_experiment(experimento)

def objetivo(trial):
    modelo = trial.suggest_categorical( #escolha de modelos
        "modelo",["RandomForestClassifier","SVC","LGBMClassifier","LogisticRegression","LinearSVC"]
    )
    model = None

    if modelo == "RandomForestClassifier": #arvores de decisao
        estimativa_n_rand = trial.suggest_int("estimativa_n_rand", 2, 100) #quantidade de arvores
        max_profundidade_rand = trial.suggest_int("max_profundidade_rand", 2, 12, log=True) #profundidade da arvore
        model = RandomForestClassifier( #cria o modelo
            n_estimators=estimativa_n_rand,
            max_depth=max_profundidade_rand,
            random_state=SEED,
            class_weight={0: proportions[0], 1: proportions[1]},
        )

    #melhor para dados muito grandesw
    elif modelo == "LinearSVC":
        #valores grandes de c = menores erros de classificacao
        #menores valores = maior regularizacao
        C = trial.suggest_loguniform("C_linear",1e-4,10) #tentativa de regularizar o ajustamento de dados em escala logaritmica
        
        #tipo de regularizacao l1 = modelo esparso, eliminando variaveis irrelevantes e gera coeficientes zerados. 
        # l2 = mantem coeficientes pequenos e nao zera coeficientes
        penalty = trial.suggest_categorical("penalty_linear",["l1","l2"]) 

        #funcao de perda do linearSVC                                                               
        loss = "squared_hinge" 


        if penalty == "l1": 
            dual = False
        else:
            dual = True

        model = LinearSVC(C=C, penalty=penalty, loss=loss,dual=dual,random_state=SEED,
                          class_weight={0: proportions[0], 1: proportions[1]},
                          max_iter=500)
    
    #melhor para dados definidos
    elif modelo == "SVC": 
        C= trial.suggest_loguniform("C",0.1,10) #regularizacao da SVM em escala logaritmica
        kernel = trial.suggest_categorical("kernel", ["linear", "rbf"]) #separacao linear ou nao linear
        gamma = trial.suggest_loguniform("gamma", 0.001, 0.1) #caso nao linear, define influencia dos dados
        model = SVC(
            C=C,
            kernel=kernel,
            gamma=gamma,
            random_state=SEED,
            class_weight={0: proportions[0], 1: proportions[1]},
        )
    
    elif modelo == "LGBMClassifier": #arvore de decisao
        n_estimators = trial.suggest_int("n_estimators", 50, 500) #numero de arvores
        learning_rate = trial.suggest_loguniform("learning_rate", 0.01, 0.3) #taxa de aprendizado
        num_leaves = trial.suggest_int("num_leaves", 2, 64) #numero de folhas por arvore
        max_depth = trial.suggest_int("max_depth", 3, 15) #limite de profundidade

        model = LGBMClassifier(
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            num_leaves=num_leaves,
            max_depth=max_depth,
            random_state=SEED,
            n_jobs=-1,
            class_weight={0: proportions[0], 1: proportions[1]},
        )

    elif modelo == "LogisticRegression": #modelo de classificacao binaria
        #escolha entre dois modelos e ajusta para encontrar os melhores coeficientes
        solver = trial.suggest_categorical("solver", ["liblinear", "lbfgs"]) 
        model = LogisticRegression(
            solver=solver,
            random_state=SEED,
            class_weight={0: proportions[0], 1: proportions[1]},
        )

    modelo_pipeline = Pipeline([("transform",transform),("model",model)]) #trata dados numericos e categoricos e escolhe o modelo

    with mlflow.start_run(nested=True) as child_run:
        mlflow.autolog()

        modelo_pipeline.fit(x_treino, y_treino)
        preds = modelo_pipeline.predict(x_teste)

        #avalia e realiza logging com mlflow
        score_acc = sklearn.metrics.accuracy_score(y_teste, preds)
        score_recall = sklearn.metrics.recall_score(y_teste, preds)
        score_f1 = sklearn.metrics.f1_score(y_teste, preds)
        score_auc = sklearn.metrics.roc_auc_score(y_teste, preds)

        mlflow.log_metric("accuracy", score_acc)
        mlflow.log_metric("recall", score_recall)
        mlflow.log_metric("f1", score_f1)
        mlflow.log_metric("auc", score_auc)
        mlflow.log_params(trial.params)

        return score_f1

    

In [None]:
with mlflow.start_run(run_name="ParentRun Optuna") as parent_run:
    study = optuna.create_study(direction="maximize")
    study.optimize(objetivo, n_trials=15, show_progress_bar=True)
    print("numero da melhor tentativa:", study.best_trial.number)
    print("melhor valor (F1-score):", study.best_trial.value)
    print("melhores hiperparametros encontrados:")
    for key, value in study.best_trial.params.items():
        print(f"  {key}: {value}")


In [38]:
study.trials_dataframe()

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_C,params_C_linear,params_estimativa_n_rand,params_gamma,params_kernel,params_learning_rate,params_max_depth,params_max_profundidade_rand,params_modelo,params_n_estimators,params_num_leaves,params_penalty_linear,params_solver,state
0,0,0.882759,2025-07-08 16:30:32.272110,2025-07-08 16:30:38.238705,0 days 00:00:05.966595,,,,,,,,,LogisticRegression,,,,lbfgs,COMPLETE
1,1,0.978389,2025-07-08 16:30:38.243389,2025-07-08 16:30:44.839071,0 days 00:00:06.595682,,,65.0,,,,,10.0,RandomForestClassifier,,,,,COMPLETE
2,2,0.910521,2025-07-08 16:30:44.849077,2025-07-08 16:30:50.906098,0 days 00:00:06.057021,,,,,,0.018773,13.0,,LGBMClassifier,98.0,21.0,,,COMPLETE
3,3,0.882759,2025-07-08 16:30:50.910523,2025-07-08 16:30:56.991359,0 days 00:00:06.080836,,,,,,,,,LogisticRegression,,,,liblinear,COMPLETE
4,4,0.866928,2025-07-08 16:30:56.994900,2025-07-08 16:31:02.849989,0 days 00:00:05.855089,4.316318,,,0.001546,rbf,,,,SVC,,,,,COMPLETE
5,5,0.877953,2025-07-08 16:31:02.853570,2025-07-08 16:31:09.386931,0 days 00:00:06.533361,7.217739,,,0.014709,linear,,,,SVC,,,,,COMPLETE
6,6,0.85939,2025-07-08 16:31:09.390778,2025-07-08 16:31:15.864474,0 days 00:00:06.473696,0.280895,,,0.008005,rbf,,,,SVC,,,,,COMPLETE
7,7,0.984221,2025-07-08 16:31:15.874807,2025-07-08 16:31:23.754550,0 days 00:00:07.879743,,,,,,0.027241,5.0,,LGBMClassifier,401.0,44.0,,,COMPLETE
8,8,0.944882,2025-07-08 16:31:23.760601,2025-07-08 16:31:30.898362,0 days 00:00:07.137761,,,10.0,,,,,8.0,RandomForestClassifier,,,,,COMPLETE
9,9,0.880234,2025-07-08 16:31:30.904568,2025-07-08 16:31:40.027458,0 days 00:00:09.122890,9.712259,,,0.001599,rbf,,,,SVC,,,,,COMPLETE


In [None]:
def salvarmelhor_modelo (study, x_treino, y_treino, transform, SEED):
    proportions = y_treino.value_counts(normalize=True).to_dict()
    
    #recebe os melhores hiperparametros
    melhores_params = study.best_trial.params
    melhor_modelo = melhores_params["modelo"]

    if melhor_modelo == "RandomForestClassifier":
        model = RandomForestClassifier(
            n_estimators=melhores_params["estimativa_n_rand"],
            max_depth=melhores_params["max_profundidade_rand"],
            random_state=SEED,
            class_weight={0: proportions[0], 1: proportions[1]},
        )

    elif melhor_modelo == "LinearSVC":
        penalty = melhores_params["penalty_linear"]
        if penalty == "l1": 
            dual = False
        else:
            dual = True
        model = LinearSVC(
            C=melhores_params["C_linear"],
            penalty=penalty,
            loss="squared_hinge",
            dual=dual,
            random_state=SEED,
            class_weight={0: proportions[0], 1: proportions[1]},
            max_iter=500,
        )

    elif melhor_modelo == "SVC":
        model = SVC(
            C=melhores_params["C"],
            kernel=melhores_params["kernel"],
            gamma=melhores_params["gamma"],
            random_state=SEED,
            class_weight={0: proportions[0], 1: proportions[1]},
        )

    elif melhor_modelo == "LGBMClassifier":
        model = LGBMClassifier(
            n_estimators=melhores_params["n_estimators"],
            learning_rate=melhores_params["learning_rate"],
            num_leaves=melhores_params["num_leaves"],
            max_depth=melhores_params["max_depth"],
            random_state=SEED,
            n_jobs=-1,
            class_weight={0: proportions[0], 1: proportions[1]},
        )

    elif melhor_modelo == "LogisticRegression":
        model = LogisticRegression(
            solver=melhores_params["solver"],
            random_state=SEED,
            class_weight={0: proportions[0], 1: proportions[1]},
        )

    #pipeline com o modelo final
    modelo_final = Pipeline([
        ("transform", transform),
        ("model", model),
    ])

    #treinar com dados finais
    modelo_final.fit(x_treino, y_treino)

    #salvar com mlflow
    with mlflow.start_run(run_name="Modelo_HeartDisease"):
        mlflow.sklearn.log_model(modelo_final, "modelo_final")
        mlflow.log_params(melhores_params)
        print("modelo salvo com sucesso")