# Treinamento

## Inicialização

In [1]:
# Config Inicial
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import shutil

from scipy import stats

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor

from sklearn.metrics import *

import mlflow
import mlflow.sklearn
from mlflow.models import infer_signature

### MLFlow

In [2]:
mlflow.set_tracking_uri("sqlite:///mlflow.db")

mlflow.set_experiment("ecd15")

<Experiment: artifact_location='/app/mlruns/1', creation_time=1744227466344, experiment_id='1', last_update_time=1744227466344, lifecycle_stage='active', name='ecd15', tags={}>

In [3]:
seeds = [42, 47, 123]

## Dataset

O pré-processamento inicial foi realizado no dataset `brasile-real-state-dataset.csv`, onde as colunas de latitude e longitude foram convertidas em informações de cidade e estado. Esse processo resultou na criação do dataset `brasil_estado_cidade.csv`, que está sendo utilizado para o treinamento dos modelos apresentados a seguir.


### Leitura

In [5]:
# Descomentar, quando quiser carregar para o treinamento, dados originais defasados para primeira execução do treinamento
#shutil.copyfile("dataset/brasil_estado_cidade.csv", "dataset/dados_treinamento.csv")


# Carregar o conjunto de dados
dados = pd.read_csv("dataset/dados_treinamento.csv", encoding="utf-8")

# Eliminando registros com valores null
dados.dropna(inplace=True)

# float64
dados = dados.astype({col: 'float64' for col in dados.select_dtypes(include='int').columns})

def remover_outliers_por_cidade(df):
    # Remove outliers da coluna 'price_brl' agrupando por cidade.

    def remover_outliers_grupo(grupo):
        # Remove outliers de um grupo usando o método Z-score.
        z_scores = np.abs(stats.zscore(grupo["price_brl"]))
        return grupo[(z_scores < 3)]

    df_filtrado = df.groupby("city").apply(remover_outliers_grupo).reset_index(drop=True)
    return df_filtrado

dados = remover_outliers_por_cidade(dados)

dados.sample(10)

  df_filtrado = df.groupby("city").apply(remover_outliers_grupo).reset_index(drop=True)


Unnamed: 0,id,property_type,state,region,lat,lon,area_m2,price_brl,city
8160,6429.0,house,Rio de Janeiro,Southeast,-22.91616,-43.355623,140.0,454317.88,Rio de Janeiro
1416,4024.0,house,Rio de Janeiro,Southeast,-22.862112,-42.021259,230.0,768558.29,Cabo Frio
4908,2230.0,apartment,Rio Grande do Sul,South,-30.032711,-51.211208,109.0,393742.15,Porto Alegre
7566,5544.0,apartment,Rio de Janeiro,Southeast,-22.944958,-43.240059,148.0,908635.8,Rio de Janeiro
5374,2956.0,house,Rio Grande do Sul,South,-30.150503,-51.213573,110.0,548970.21,Porto Alegre
4806,2081.0,apartment,Rio Grande do Sul,South,-30.027777,-51.162804,99.0,648782.97,Porto Alegre
3291,4096.0,apartment,Rio de Janeiro,Southeast,-22.906288,-43.099586,62.0,373550.26,Niterói
5368,2946.0,house,Rio Grande do Sul,South,-30.117683,-51.260231,186.0,1396270.36,Porto Alegre
5533,3217.0,apartment,Rio Grande do Sul,South,-30.031361,-51.15786,125.0,1009595.32,Porto Alegre
7261,5109.0,apartment,Rio de Janeiro,Southeast,-22.956388,-43.375317,174.0,1261994.16,Rio de Janeiro


### Tratamento Features

In [5]:
# Exemplo de pré-processamento
x_features = dados.drop(["price_brl"], axis=1)  # Features
y_target = dados["price_brl"]  # Variável alvo

# Identificando colunas numéricas e categóricas
numeric_features = x_features.select_dtypes(include=['number']).columns
categorical_features = x_features.select_dtypes(include=['object']).columns

# Criando transformadores
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Criando o ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])


### Divisão dos Dados

In [6]:
test_sizes = [0.1, 0.2, 0.3]
#x_train, x_test, y_train, y_test = train_test_split(x_features, y_target, test_size=0.2, random_state=42)

## Modelos

- Árvore de Decisão: `DecisionTreeRegressor`
- Regressão Linear: `LinearRegression`
- Random Forest: `RandomForestRegressor`
- Gradient Boosting: `GradientBoostingRegressor`
- XGBoost: `XGBRegressor`

In [7]:
models = {
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42),
    "XGBoost": XGBRegressor(random_state=42)
}

## Execução

In [8]:
# para cada semente 
for seed in seeds:
    # para cada tamanho de teste
    for test_size in test_sizes:
        # Dividir os dados em conjuntos de treino e teste
        x_train, x_test, y_train, y_test = train_test_split(x_features, y_target, test_size=test_size, random_state=seed)

        # Para cada modelo
        for model_name, model in models.items():
            # Criar o pipeline
            pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                       ('model', model)])

            # Treinar o modelo
            pipeline.fit(x_train, y_train)

            # Fazer previsões
            y_pred = pipeline.predict(x_test)

            # Avaliar o modelo
            mse = mean_squared_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)
            mae = mean_absolute_error(y_test, y_pred)
            
            print(f"Modelo: {model_name}, Test Size: {test_size}, Seed: {seed}")
            print(f"MSE: {mse}, R2: {r2}, MAE: {mae}")
            
            # Logar os resultados no MLflow
            with mlflow.start_run():
                mlflow.log_param("model_type", model_name)
                mlflow.log_param("test_size", test_size)
                mlflow.log_param("random_seed", seed)
                
                # registrar métricas
                mlflow.log_metric("mse", mse)
                mlflow.log_metric("r2", r2)
                mlflow.log_metric("mae", mae)

                # Logar o modelo
                signature = infer_signature(x_test, y_pred)
                
                registered_model_name= model_name.lower().replace(" ", "_") + "_model"
                 
                mlflow.sklearn.log_model(
                    sk_model=pipeline,
                    artifact_path=registered_model_name,
                    signature=signature,
                    registered_model_name= registered_model_name,
                )
                
            mlflow.end_run()
                

Modelo: Decision Tree, Test Size: 0.1, Seed: 42
MSE: 774057741995.8984, R2: 0.7670971810981704, MAE: 349363.0108622449


Registered model 'decision_tree_model' already exists. Creating a new version of this model...
Created version '26' of model 'decision_tree_model'.


Modelo: Linear Regression, Test Size: 0.1, Seed: 42
MSE: 2199784252261.418, R2: 0.33811662162762457, MAE: 892908.60254239


Registered model 'linear_regression_model' already exists. Creating a new version of this model...
Created version '26' of model 'linear_regression_model'.


Modelo: Random Forest, Test Size: 0.1, Seed: 42
MSE: 499186417304.9078, R2: 0.8498020012718455, MAE: 323265.798247449


Registered model 'random_forest_model' already exists. Creating a new version of this model...
Created version '25' of model 'random_forest_model'.


Modelo: Gradient Boosting, Test Size: 0.1, Seed: 42
MSE: 746333363750.0034, R2: 0.7754390469506066, MAE: 455995.7297825873


Registered model 'gradient_boosting_model' already exists. Creating a new version of this model...
Created version '25' of model 'gradient_boosting_model'.


Modelo: XGBoost, Test Size: 0.1, Seed: 42
MSE: 541613091403.2191, R2: 0.837036426485852, MAE: 370490.8937595663


Registered model 'xgboost_model' already exists. Creating a new version of this model...
Created version '25' of model 'xgboost_model'.


Modelo: Decision Tree, Test Size: 0.2, Seed: 42
MSE: 920426371321.241, R2: 0.725827778319228, MAE: 381089.7109693877


Registered model 'decision_tree_model' already exists. Creating a new version of this model...
Created version '27' of model 'decision_tree_model'.


Modelo: Linear Regression, Test Size: 0.2, Seed: 42
MSE: 293062677387203.56, R2: -86.29611390384845, MAE: 1089963.6110702255


Registered model 'linear_regression_model' already exists. Creating a new version of this model...
Created version '27' of model 'linear_regression_model'.


Modelo: Random Forest, Test Size: 0.2, Seed: 42
MSE: 534992008698.6826, R2: 0.8406391296722381, MAE: 337181.9775269388


Registered model 'random_forest_model' already exists. Creating a new version of this model...
Created version '26' of model 'random_forest_model'.


Modelo: Gradient Boosting, Test Size: 0.2, Seed: 42
MSE: 776910009928.753, R2: 0.768577748198987, MAE: 460076.7054672476


Registered model 'gradient_boosting_model' already exists. Creating a new version of this model...
Created version '26' of model 'gradient_boosting_model'.


Modelo: XGBoost, Test Size: 0.2, Seed: 42
MSE: 554225274175.8407, R2: 0.8349100161979262, MAE: 371699.6693422353


Registered model 'xgboost_model' already exists. Creating a new version of this model...
Created version '26' of model 'xgboost_model'.


Modelo: Decision Tree, Test Size: 0.3, Seed: 42
MSE: 1054209920501.4044, R2: 0.7090054436900005, MAE: 418202.5879591836


Registered model 'decision_tree_model' already exists. Creating a new version of this model...
Created version '28' of model 'decision_tree_model'.


Modelo: Linear Regression, Test Size: 0.3, Seed: 42
MSE: 158036335165796.25, R2: -42.622918299380565, MAE: 996560.0153769052


Registered model 'linear_regression_model' already exists. Creating a new version of this model...
Created version '28' of model 'linear_regression_model'.


Modelo: Random Forest, Test Size: 0.3, Seed: 42
MSE: 571602823885.9476, R2: 0.8422199346757024, MAE: 344299.86669278913


Registered model 'random_forest_model' already exists. Creating a new version of this model...
Created version '27' of model 'random_forest_model'.


Modelo: Gradient Boosting, Test Size: 0.3, Seed: 42
MSE: 853788523613.7699, R2: 0.7643279504584899, MAE: 471990.8598665985


Registered model 'gradient_boosting_model' already exists. Creating a new version of this model...
Created version '27' of model 'gradient_boosting_model'.


Modelo: XGBoost, Test Size: 0.3, Seed: 42
MSE: 588103046511.4581, R2: 0.8376653626985738, MAE: 376057.7703492507


Registered model 'xgboost_model' already exists. Creating a new version of this model...
Created version '27' of model 'xgboost_model'.


Modelo: Decision Tree, Test Size: 0.1, Seed: 47
MSE: 733142029677.6044, R2: 0.8001423748708438, MAE: 358420.7369336735


Registered model 'decision_tree_model' already exists. Creating a new version of this model...
Created version '29' of model 'decision_tree_model'.


Modelo: Linear Regression, Test Size: 0.1, Seed: 47
MSE: 2495981184817.1206, R2: 0.3195849483844625, MAE: 917207.7588834051


Registered model 'linear_regression_model' already exists. Creating a new version of this model...
Created version '29' of model 'linear_regression_model'.


Modelo: Random Forest, Test Size: 0.1, Seed: 47
MSE: 524892181699.1084, R2: 0.8569121662150836, MAE: 343183.3890582143


Registered model 'random_forest_model' already exists. Creating a new version of this model...
Created version '28' of model 'random_forest_model'.


Modelo: Gradient Boosting, Test Size: 0.1, Seed: 47
MSE: 881595483721.6643, R2: 0.7596733339941215, MAE: 496099.308104567


Registered model 'gradient_boosting_model' already exists. Creating a new version of this model...
Created version '28' of model 'gradient_boosting_model'.


Modelo: XGBoost, Test Size: 0.1, Seed: 47
MSE: 607372395099.522, R2: 0.8344277104028837, MAE: 386236.6595285395


Registered model 'xgboost_model' already exists. Creating a new version of this model...
Created version '28' of model 'xgboost_model'.


Modelo: Decision Tree, Test Size: 0.2, Seed: 47
MSE: 844574911965.9696, R2: 0.7797718609172126, MAE: 376105.23448469385


Registered model 'decision_tree_model' already exists. Creating a new version of this model...
Created version '30' of model 'decision_tree_model'.


Modelo: Linear Regression, Test Size: 0.2, Seed: 47
MSE: 2568202622824.5166, R2: 0.33032526020028585, MAE: 917809.6937460116


Registered model 'linear_regression_model' already exists. Creating a new version of this model...
Created version '30' of model 'linear_regression_model'.


Modelo: Random Forest, Test Size: 0.2, Seed: 47
MSE: 575303664415.1403, R2: 0.8499860064198073, MAE: 345795.0988277551


Registered model 'random_forest_model' already exists. Creating a new version of this model...
Created version '29' of model 'random_forest_model'.


Modelo: Gradient Boosting, Test Size: 0.2, Seed: 47
MSE: 945576395819.85, R2: 0.753435098460034, MAE: 493743.12648267334


Registered model 'gradient_boosting_model' already exists. Creating a new version of this model...
Created version '29' of model 'gradient_boosting_model'.


Modelo: XGBoost, Test Size: 0.2, Seed: 47
MSE: 629278132883.3668, R2: 0.8359118294814103, MAE: 382576.7249178093


Registered model 'xgboost_model' already exists. Creating a new version of this model...
Created version '29' of model 'xgboost_model'.


Modelo: Decision Tree, Test Size: 0.3, Seed: 47
MSE: 952493946175.1445, R2: 0.745466232475294, MAE: 405923.1434591837


Registered model 'decision_tree_model' already exists. Creating a new version of this model...
Created version '31' of model 'decision_tree_model'.


Modelo: Linear Regression, Test Size: 0.3, Seed: 47
MSE: 2496044591784.3774, R2: 0.3329851214196373, MAE: 915461.5397872302


Registered model 'linear_regression_model' already exists. Creating a new version of this model...
Created version '31' of model 'linear_regression_model'.


Modelo: Random Forest, Test Size: 0.3, Seed: 47
MSE: 596437673109.0566, R2: 0.8406147055949879, MAE: 358219.7650202211


Registered model 'random_forest_model' already exists. Creating a new version of this model...
Created version '30' of model 'random_forest_model'.


Modelo: Gradient Boosting, Test Size: 0.3, Seed: 47
MSE: 892093959196.2308, R2: 0.7616068455531215, MAE: 485936.2467743653


Registered model 'gradient_boosting_model' already exists. Creating a new version of this model...
Created version '30' of model 'gradient_boosting_model'.


Modelo: XGBoost, Test Size: 0.3, Seed: 47
MSE: 692168224208.8716, R2: 0.8150327499967401, MAE: 394015.77263297193


Registered model 'xgboost_model' already exists. Creating a new version of this model...
Created version '30' of model 'xgboost_model'.


Modelo: Decision Tree, Test Size: 0.1, Seed: 123
MSE: 734886382805.2773, R2: 0.8182087518611222, MAE: 345320.0110918367


Registered model 'decision_tree_model' already exists. Creating a new version of this model...
Created version '32' of model 'decision_tree_model'.


Modelo: Linear Regression, Test Size: 0.1, Seed: 123
MSE: 220762129164389.8, R2: -53.61064994753689, MAE: 1202908.6741728361


Registered model 'linear_regression_model' already exists. Creating a new version of this model...
Created version '32' of model 'linear_regression_model'.


Modelo: Random Forest, Test Size: 0.1, Seed: 123
MSE: 442868568939.5728, R2: 0.8904461535922401, MAE: 314652.92124903057


Registered model 'random_forest_model' already exists. Creating a new version of this model...
Created version '31' of model 'random_forest_model'.


Modelo: Gradient Boosting, Test Size: 0.1, Seed: 123
MSE: 863552152979.3734, R2: 0.7863802794605161, MAE: 476542.71941303986


Registered model 'gradient_boosting_model' already exists. Creating a new version of this model...
Created version '31' of model 'gradient_boosting_model'.


Modelo: XGBoost, Test Size: 0.1, Seed: 123
MSE: 550207071814.4164, R2: 0.863893477059456, MAE: 375611.4812433036


Registered model 'xgboost_model' already exists. Creating a new version of this model...
Created version '31' of model 'xgboost_model'.


Modelo: Decision Tree, Test Size: 0.2, Seed: 123
MSE: 870421587535.2987, R2: 0.7667059447759692, MAE: 390590.8498239796


Registered model 'decision_tree_model' already exists. Creating a new version of this model...
Created version '33' of model 'decision_tree_model'.


Modelo: Linear Regression, Test Size: 0.2, Seed: 123
MSE: 284317706155509.3, R2: -75.20402755500736, MAE: 1117247.8268835822


Registered model 'linear_regression_model' already exists. Creating a new version of this model...
Created version '33' of model 'linear_regression_model'.


Modelo: Random Forest, Test Size: 0.2, Seed: 123
MSE: 521556933837.38885, R2: 0.8602101167209372, MAE: 337921.9740347194


Registered model 'random_forest_model' already exists. Creating a new version of this model...
Created version '32' of model 'random_forest_model'.


Modelo: Gradient Boosting, Test Size: 0.2, Seed: 123
MSE: 896667960499.8198, R2: 0.7596712814915327, MAE: 484357.4506445516


Registered model 'gradient_boosting_model' already exists. Creating a new version of this model...
Created version '32' of model 'gradient_boosting_model'.


Modelo: XGBoost, Test Size: 0.2, Seed: 123
MSE: 615934065670.2048, R2: 0.8349147608600651, MAE: 387472.49976610334


Registered model 'xgboost_model' already exists. Creating a new version of this model...
Created version '32' of model 'xgboost_model'.


Modelo: Decision Tree, Test Size: 0.3, Seed: 123
MSE: 923529811398.6324, R2: 0.7569166994450989, MAE: 407163.1649013605


Registered model 'decision_tree_model' already exists. Creating a new version of this model...
Created version '34' of model 'decision_tree_model'.


Modelo: Linear Regression, Test Size: 0.3, Seed: 123
MSE: 151352852065607.75, R2: -38.837751174255274, MAE: 1017715.461141149


Registered model 'linear_regression_model' already exists. Creating a new version of this model...
Created version '34' of model 'linear_regression_model'.


Modelo: Random Forest, Test Size: 0.3, Seed: 123
MSE: 560934000818.8547, R2: 0.8523559427864997, MAE: 354051.0513690986


Registered model 'random_forest_model' already exists. Creating a new version of this model...
Created version '33' of model 'random_forest_model'.


Modelo: Gradient Boosting, Test Size: 0.3, Seed: 123
MSE: 922498727862.6497, R2: 0.7571880920801617, MAE: 493347.57351917034


Registered model 'gradient_boosting_model' already exists. Creating a new version of this model...
Created version '33' of model 'gradient_boosting_model'.


Modelo: XGBoost, Test Size: 0.3, Seed: 123
MSE: 597322441521.3182, R2: 0.8427781011631683, MAE: 390276.3078306229


Registered model 'xgboost_model' already exists. Creating a new version of this model...
Created version '33' of model 'xgboost_model'.
