# Treinamento

## Inicialização

In [5]:
# Config Inicial
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from scipy import stats

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor

from sklearn.metrics import *

import mlflow
import mlflow.sklearn
from mlflow.models import infer_signature

### MLFlow

In [6]:
mlflow.set_tracking_uri("sqlite:///mlflow.db")

mlflow.set_experiment("ecd15")

<Experiment: artifact_location='file:///C:/Estudo/ECD15-MLOps/models/mlruns/0', creation_time=1743297663367, experiment_id='0', last_update_time=1743297701488, lifecycle_stage='active', name='ecd15', tags={}>

In [7]:
seeds = [42, 47, 123]

## Dataset

O pré-processamento inicial foi realizado no dataset `brasile-real-state-dataset.csv`, onde as colunas de latitude e longitude foram convertidas em informações de cidade e estado. Esse processo resultou na criação do dataset `brasil_estado_cidade.csv`, que está sendo utilizado para o treinamento dos modelos apresentados a seguir.


### Leitura

In [8]:
# Carregar o conjunto de dados
dados = pd.read_csv("dataset/brasil_estado_cidade.csv", encoding="utf-8")

# Eliminando registros com valores null
dados.dropna(inplace=True)

# float64
dados = dados.astype({col: 'float64' for col in dados.select_dtypes(include='int').columns})

def remover_outliers_por_cidade(df):
    # Remove outliers da coluna 'price_brl' agrupando por cidade.

    def remover_outliers_grupo(grupo):
        # Remove outliers de um grupo usando o método Z-score.
        z_scores = np.abs(stats.zscore(grupo["price_brl"]))
        return grupo[(z_scores < 3)]

    df_filtrado = df.groupby("city").apply(remover_outliers_grupo).reset_index(drop=True)
    return df_filtrado

dados = remover_outliers_por_cidade(dados)

dados.sample(10)

  df_filtrado = df.groupby("city").apply(remover_outliers_grupo).reset_index(drop=True)


Unnamed: 0.1,Unnamed: 0,property_type,state,region,lat,lon,area_m2,price_brl,city
8974,5809.0,apartment,Rio de Janeiro,Southeast,-22.872278,-43.02898,53.0,329382.13,Sete Pontes
717,8352.0,apartment,Santa Catarina,South,-26.985996,-48.636383,76.0,499063.83,Balneário Camboriú
8108,6339.0,apartment,Rio de Janeiro,Southeast,-22.914837,-43.182472,75.0,538988.95,Rio de Janeiro
5606,3331.0,apartment,Rio Grande do Sul,South,-30.015261,-51.165073,87.0,548970.21,Porto Alegre
4257,1259.0,apartment,Rio Grande do Sul,South,-30.031038,-51.217381,90.0,666332.92,Porto Alegre
2879,230.0,apartment,Pernambuco,Northeast,-8.215346,-34.927231,103.0,469120.01,Jaboatão dos Guararapes
8852,12225.0,house,So Paulo,Southeast,-23.648643,-46.54393,127.0,548970.21,Santo André
9585,10306.0,house,So Paulo,Southeast,-23.501787,-46.681152,200.0,479101.27,São Paulo
8375,1768.0,apartment,Rio Grande do Sul,South,-29.70067,-52.443275,72.0,334572.82,Santa Cruz do Sul
7294,5154.0,apartment,Rio de Janeiro,Southeast,-22.899942,-43.281296,73.0,409232.33,Rio de Janeiro


### Tratamento Features

In [9]:
# Exemplo de pré-processamento
x_features = dados.drop(["price_brl"], axis=1)  # Features
y_target = dados["price_brl"]  # Variável alvo

# Identificando colunas numéricas e categóricas
numeric_features = x_features.select_dtypes(include=['number']).columns
categorical_features = x_features.select_dtypes(include=['object']).columns

# Criando transformadores
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Criando o ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])


### Divisão dos Dados

In [10]:
test_sizes = [0.1, 0.2, 0.3]
#x_train, x_test, y_train, y_test = train_test_split(x_features, y_target, test_size=0.2, random_state=42)

## Modelos

- Árvore de Decisão: `DecisionTreeRegressor`
- Regressão Linear: `LinearRegression`
- Random Forest: `RandomForestRegressor`
- Gradient Boosting: `GradientBoostingRegressor`
- XGBoost: `XGBRegressor`

In [11]:
models = {
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42),
    "XGBoost": XGBRegressor(random_state=42)
}

## Execução

In [12]:
# para cada semente 
for seed in seeds:
    # para cada tamanho de teste
    for test_size in test_sizes:
        # Dividir os dados em conjuntos de treino e teste
        x_train, x_test, y_train, y_test = train_test_split(x_features, y_target, test_size=test_size, random_state=seed)

        # Para cada modelo
        for model_name, model in models.items():
            # Criar o pipeline
            pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                       ('model', model)])

            # Treinar o modelo
            pipeline.fit(x_train, y_train)

            # Fazer previsões
            y_pred = pipeline.predict(x_test)

            # Avaliar o modelo
            mse = mean_squared_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)
            mae = mean_absolute_error(y_test, y_pred)
            
            print(f"Modelo: {model_name}, Test Size: {test_size}, Seed: {seed}")
            print(f"MSE: {mse}, R2: {r2}, MAE: {mae}")
            
            # Logar os resultados no MLflow
            with mlflow.start_run():
                mlflow.log_param("model_type", model_name)
                mlflow.log_param("test_size", test_size)
                mlflow.log_param("random_seed", seed)
                
                # registrar métricas
                mlflow.log_metric("mse", mse)
                mlflow.log_metric("r2", r2)
                mlflow.log_metric("mae", mae)

                # Logar o modelo
                signature = infer_signature(x_test, y_pred)
                
                registered_model_name= model_name.lower().replace(" ", "_") + "_model"
                 
                mlflow.sklearn.log_model(
                    sk_model=pipeline,
                    artifact_path="'sklearn'-model",
                    signature=signature,
                    registered_model_name= registered_model_name,
                )
                
            mlflow.end_run()
                

Modelo: Decision Tree, Test Size: 0.1, Seed: 42
MSE: 61318563319.787544, R2: 0.4643648515387715, MAE: 173409.66728007182


Registered model 'decision_tree_model' already exists. Creating a new version of this model...
Created version '23' of model 'decision_tree_model'.


Modelo: Linear Regression, Test Size: 0.1, Seed: 42
MSE: 53575827203.35227, R2: 0.5319998609827157, MAE: 173412.6447177572


Registered model 'linear_regression_model' already exists. Creating a new version of this model...
Created version '23' of model 'linear_regression_model'.


Modelo: Random Forest, Test Size: 0.1, Seed: 42
MSE: 35609940151.11541, R2: 0.6889370111288469, MAE: 132206.2374586176


Registered model 'random_forest_model' already exists. Creating a new version of this model...
Created version '22' of model 'random_forest_model'.


Modelo: Gradient Boosting, Test Size: 0.1, Seed: 42
MSE: 37160859764.30143, R2: 0.6753892857373056, MAE: 142903.46345812114


Registered model 'gradient_boosting_model' already exists. Creating a new version of this model...
Created version '22' of model 'gradient_boosting_model'.


Modelo: XGBoost, Test Size: 0.1, Seed: 42
MSE: 35437700128.731316, R2: 0.6904415768747783, MAE: 134797.36147834378


Registered model 'xgboost_model' already exists. Creating a new version of this model...
Created version '22' of model 'xgboost_model'.


Modelo: Decision Tree, Test Size: 0.2, Seed: 42
MSE: 60234484268.80455, R2: 0.4839914622045445, MAE: 169972.06397395598


Registered model 'decision_tree_model' already exists. Creating a new version of this model...
Created version '24' of model 'decision_tree_model'.


Modelo: Linear Regression, Test Size: 0.2, Seed: 42
MSE: 53086354959.45636, R2: 0.545227078275053, MAE: 173764.0734761504


Registered model 'linear_regression_model' already exists. Creating a new version of this model...
Created version '24' of model 'linear_regression_model'.


Modelo: Random Forest, Test Size: 0.2, Seed: 42
MSE: 34415400038.12209, R2: 0.7051748601006242, MAE: 129875.12021360575


Registered model 'random_forest_model' already exists. Creating a new version of this model...
Created version '23' of model 'random_forest_model'.


Modelo: Gradient Boosting, Test Size: 0.2, Seed: 42
MSE: 36512699763.93193, R2: 0.6872079997884422, MAE: 141645.96487383224


Registered model 'gradient_boosting_model' already exists. Creating a new version of this model...
Created version '23' of model 'gradient_boosting_model'.


Modelo: XGBoost, Test Size: 0.2, Seed: 42
MSE: 33523428841.29063, R2: 0.7128160768989399, MAE: 130945.1318292546


Registered model 'xgboost_model' already exists. Creating a new version of this model...
Created version '23' of model 'xgboost_model'.


Modelo: Decision Tree, Test Size: 0.3, Seed: 42
MSE: 61209737715.86217, R2: 0.47475884579582217, MAE: 172580.1899281652


Registered model 'decision_tree_model' already exists. Creating a new version of this model...
Created version '25' of model 'decision_tree_model'.


Modelo: Linear Regression, Test Size: 0.3, Seed: 42
MSE: 54685765875.346565, R2: 0.5307410902454659, MAE: 175958.76369341835


Registered model 'linear_regression_model' already exists. Creating a new version of this model...
Created version '25' of model 'linear_regression_model'.


Modelo: Random Forest, Test Size: 0.3, Seed: 42
MSE: 35320779432.26966, R2: 0.6969121638371459, MAE: 131905.3177776115


Registered model 'random_forest_model' already exists. Creating a new version of this model...
Created version '24' of model 'random_forest_model'.


Modelo: Gradient Boosting, Test Size: 0.3, Seed: 42
MSE: 37337443039.665245, R2: 0.6796071604125762, MAE: 142814.49462159694


Registered model 'gradient_boosting_model' already exists. Creating a new version of this model...
Created version '24' of model 'gradient_boosting_model'.


Modelo: XGBoost, Test Size: 0.3, Seed: 42
MSE: 34492060902.08964, R2: 0.7040234028906887, MAE: 132658.0528461024


Registered model 'xgboost_model' already exists. Creating a new version of this model...
Created version '24' of model 'xgboost_model'.


Modelo: Decision Tree, Test Size: 0.1, Seed: 47
MSE: 65322070122.38855, R2: 0.4607357460145043, MAE: 175668.96981149013


Registered model 'decision_tree_model' already exists. Creating a new version of this model...
Created version '26' of model 'decision_tree_model'.


Modelo: Linear Regression, Test Size: 0.1, Seed: 47
MSE: 58307299456.23486, R2: 0.5186459601745141, MAE: 182221.6780128887


Registered model 'linear_regression_model' already exists. Creating a new version of this model...
Created version '26' of model 'linear_regression_model'.


Modelo: Random Forest, Test Size: 0.1, Seed: 47
MSE: 37212141854.77767, R2: 0.6927963568986575, MAE: 136306.39953536805


Registered model 'random_forest_model' already exists. Creating a new version of this model...
Created version '25' of model 'random_forest_model'.


Modelo: Gradient Boosting, Test Size: 0.1, Seed: 47
MSE: 41182027040.48046, R2: 0.6600230971249696, MAE: 146885.3764817992


Registered model 'gradient_boosting_model' already exists. Creating a new version of this model...
Created version '25' of model 'gradient_boosting_model'.


Modelo: XGBoost, Test Size: 0.1, Seed: 47
MSE: 36022027367.79652, R2: 0.7026213088601732, MAE: 135572.6124478232


Registered model 'xgboost_model' already exists. Creating a new version of this model...
Created version '25' of model 'xgboost_model'.


Modelo: Decision Tree, Test Size: 0.2, Seed: 47
MSE: 62908425780.65023, R2: 0.4688968380333659, MAE: 175165.05669959585


Registered model 'decision_tree_model' already exists. Creating a new version of this model...
Created version '27' of model 'decision_tree_model'.


Modelo: Linear Regression, Test Size: 0.2, Seed: 47
MSE: 56839157394.32, R2: 0.5201365184228184, MAE: 179765.89586411332


Registered model 'linear_regression_model' already exists. Creating a new version of this model...
Created version '27' of model 'linear_regression_model'.


Modelo: Random Forest, Test Size: 0.2, Seed: 47
MSE: 37137807103.35725, R2: 0.6864647853393464, MAE: 135326.87215002245


Registered model 'random_forest_model' already exists. Creating a new version of this model...
Created version '26' of model 'random_forest_model'.


Modelo: Gradient Boosting, Test Size: 0.2, Seed: 47
MSE: 40280208826.061806, R2: 0.6599351198710504, MAE: 146408.9990681346


Registered model 'gradient_boosting_model' already exists. Creating a new version of this model...
Created version '26' of model 'gradient_boosting_model'.


Modelo: XGBoost, Test Size: 0.2, Seed: 47
MSE: 35770812093.93399, R2: 0.6980056141375233, MAE: 134917.4975446228


Registered model 'xgboost_model' already exists. Creating a new version of this model...
Created version '26' of model 'xgboost_model'.


Modelo: Decision Tree, Test Size: 0.3, Seed: 47
MSE: 65904808872.47329, R2: 0.4342869398037643, MAE: 180476.29380724335


Registered model 'decision_tree_model' already exists. Creating a new version of this model...
Created version '28' of model 'decision_tree_model'.


Modelo: Linear Regression, Test Size: 0.3, Seed: 47
MSE: 56671956586.253815, R2: 0.5135398078499168, MAE: 179735.95102699214


Registered model 'linear_regression_model' already exists. Creating a new version of this model...
Created version '28' of model 'linear_regression_model'.


Modelo: Random Forest, Test Size: 0.3, Seed: 47
MSE: 37553884415.851776, R2: 0.6776453306122765, MAE: 137225.04068982342


Registered model 'random_forest_model' already exists. Creating a new version of this model...
Created version '27' of model 'random_forest_model'.


Modelo: Gradient Boosting, Test Size: 0.3, Seed: 47
MSE: 39975024885.49525, R2: 0.656862768494582, MAE: 146693.90711630083


Registered model 'gradient_boosting_model' already exists. Creating a new version of this model...
Created version '27' of model 'gradient_boosting_model'.


Modelo: XGBoost, Test Size: 0.3, Seed: 47
MSE: 36307243729.50807, R2: 0.6883462328636021, MAE: 137182.97523776564


Registered model 'xgboost_model' already exists. Creating a new version of this model...
Created version '27' of model 'xgboost_model'.


Modelo: Decision Tree, Test Size: 0.1, Seed: 123
MSE: 63424352270.36406, R2: 0.4662427582353449, MAE: 177855.1455924596


Registered model 'decision_tree_model' already exists. Creating a new version of this model...
Created version '29' of model 'decision_tree_model'.


Modelo: Linear Regression, Test Size: 0.1, Seed: 123
MSE: 57663272081.37626, R2: 0.5147260010464185, MAE: 180383.88009849493


Registered model 'linear_regression_model' already exists. Creating a new version of this model...
Created version '29' of model 'linear_regression_model'.


Modelo: Random Forest, Test Size: 0.1, Seed: 123
MSE: 36330548408.03451, R2: 0.6942547678310177, MAE: 137781.02847576299


Registered model 'random_forest_model' already exists. Creating a new version of this model...
Created version '28' of model 'random_forest_model'.


Modelo: Gradient Boosting, Test Size: 0.1, Seed: 123
MSE: 39101028244.99076, R2: 0.6709393751907473, MAE: 149339.75781635128


Registered model 'gradient_boosting_model' already exists. Creating a new version of this model...
Created version '28' of model 'gradient_boosting_model'.


Modelo: XGBoost, Test Size: 0.1, Seed: 123
MSE: 33675961804.39893, R2: 0.7165948434149507, MAE: 137149.96591113106


Registered model 'xgboost_model' already exists. Creating a new version of this model...
Created version '28' of model 'xgboost_model'.


Modelo: Decision Tree, Test Size: 0.2, Seed: 123
MSE: 63529096083.97432, R2: 0.4501405374386255, MAE: 176293.00179164796


Registered model 'decision_tree_model' already exists. Creating a new version of this model...
Created version '30' of model 'decision_tree_model'.


Modelo: Linear Regression, Test Size: 0.2, Seed: 123
MSE: 57119846991.41373, R2: 0.5056141153532079, MAE: 179498.15173649596


Registered model 'linear_regression_model' already exists. Creating a new version of this model...
Created version '30' of model 'linear_regression_model'.


Modelo: Random Forest, Test Size: 0.2, Seed: 123
MSE: 36634451825.31596, R2: 0.6829200912087237, MAE: 137577.93817831163


Registered model 'random_forest_model' already exists. Creating a new version of this model...
Created version '29' of model 'random_forest_model'.


Modelo: Gradient Boosting, Test Size: 0.2, Seed: 123
MSE: 40520890317.01785, R2: 0.6492820401100539, MAE: 149237.93202080845


Registered model 'gradient_boosting_model' already exists. Creating a new version of this model...
Created version '29' of model 'gradient_boosting_model'.


Modelo: XGBoost, Test Size: 0.2, Seed: 123
MSE: 36277571096.50831, R2: 0.6860089789442159, MAE: 137197.1964214751


Registered model 'xgboost_model' already exists. Creating a new version of this model...
Created version '29' of model 'xgboost_model'.


Modelo: Decision Tree, Test Size: 0.3, Seed: 123
MSE: 63730050991.39421, R2: 0.44289446310468705, MAE: 176694.38042502245


Registered model 'decision_tree_model' already exists. Creating a new version of this model...
Created version '31' of model 'decision_tree_model'.


Modelo: Linear Regression, Test Size: 0.3, Seed: 123
MSE: 56511312174.740974, R2: 0.505998121451072, MAE: 180757.0030505261


Registered model 'linear_regression_model' already exists. Creating a new version of this model...
Created version '31' of model 'linear_regression_model'.


Modelo: Random Forest, Test Size: 0.3, Seed: 123
MSE: 36252042741.875175, R2: 0.6830974803708945, MAE: 137698.2834600419


Registered model 'random_forest_model' already exists. Creating a new version of this model...
Created version '30' of model 'random_forest_model'.


Modelo: Gradient Boosting, Test Size: 0.3, Seed: 123
MSE: 39765471976.7416, R2: 0.6523843262185738, MAE: 148802.22336159798


Registered model 'gradient_boosting_model' already exists. Creating a new version of this model...
Created version '30' of model 'gradient_boosting_model'.


Modelo: XGBoost, Test Size: 0.3, Seed: 123
MSE: 35728929338.27869, R2: 0.6876703524937664, MAE: 137394.49164296337


Registered model 'xgboost_model' already exists. Creating a new version of this model...
Created version '30' of model 'xgboost_model'.


In [None]:

#Avaliar modelos, corrigir valores de acordo com a inflação,  testar drift, 

#df_examples, y = load_new_data()
#drift_score, drift_by_columns = evaluate_model(df_examples, y, None)
#new_data = simulate_drift(df_examples)
#drift_score, drift_by_columns = evaluate_model(df_examples, y, new_data)
#check_for_drift(drift_score, drift_by_columns)

