### 1.0 IMPORTS

In [1]:
import os
import pickle

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_percentage_error


pd.options.display.max_columns = None

### 2.0 CONFIGURATIONS

In [2]:
path_raw = './data/'
path_out='./outputs/'
path_out_images='./images/'

### 3.0 LOADING DATASETS

In [3]:
data_raw = pd.read_csv(os.path.join(path_raw,'FLXDC_2023 (TCT)_20240612.csv'), low_memory=False, sep=';')
cod_base_guru = pd.read_csv(os.path.join(path_raw,'cods_guru_plantas.csv'), dtype=str)

### 4.0 HELPERS

In [4]:
def str_to_float(val_str):
    """Parse a number written with '.' as grouping mark and ',' as decimal mark.

    Every '.' is removed and every ',' becomes '.', then the result is handed
    to ``float``. Example: ``'1.234,56'`` -> ``1234.56``.
    """
    without_grouping = val_str.replace('.', '')
    normalized = without_grouping.replace(",", '.')
    return float(normalized)

def class_to_num(df, col):
    """Build a ``{value: integer code}`` mapping for the distinct values of ``df[col]``.

    Codes are assigned in order of first appearance, starting at 0.
    """
    codes = {}
    for code, label in enumerate(df[col].unique()):
        codes[label] = code
    return codes

### 5.0 ETL

#### 5.1 Transforming

In [5]:
# Work on a copy so the frame loaded from disk stays untouched.
df_geral = data_raw.copy()

# Column -> parser that turns the raw text into a number.
# - The first two columns go through str_to_float; an empty string becomes 0.
# - FRETE additionally drops its first character before parsing
#   (presumably a currency symbol - TODO confirm against the CSV).
# - The last two columns only strip ',' before calling float
#   (presumably ',' is a thousands separator there - TODO confirm).
# NOTE(review): a missing value that pandas loaded as NaN is not equal to ''
# and would reach the parser; verify these columns contain no NaN.
column_parsers = {
    'Peso Liquido Delivery': lambda raw: 0 if raw == '' else str_to_float(raw),
    'FRETE': lambda raw: 0 if raw == '' else str_to_float(raw[1:]),
    'FRETE AJUSTADO': lambda raw: float(raw.replace(',', '')),
    'KM Ajustado': lambda raw: float(raw.replace(',', '')),
}
for column_name, parse in column_parsers.items():
    df_geral[column_name] = df_geral[column_name].map(parse)

#### 5.2 Cleaning

In [6]:
# Row filters, combined into one mask:
#   - drop rows whose 'UF Destino' is the placeholder '0' or '1'
#   - keep rows with FRETE of at least 0.1
#   - drop plant '7958'
# NOTE(review): 'Planta' is compared against strings; if the CSV column was
# parsed as integers these comparisons never match - confirm the dtype.
keep_rows = (
    ~df_geral['UF Destino'].isin(['0', '1'])
    & (df_geral['FRETE'] >= 0.1)
    & (df_geral['Planta'] != '7958')
)
# Explicit copy: the assignment below must write to an independent frame,
# not to a slice of the previous one (avoids SettingWithCopyWarning).
df_geral = df_geral[keep_rows].copy()

# Re-label the origin key for plant '7961' rows recorded as 'CONTAGEMMG'.
relabel_rows = (df_geral['Planta'] == '7961') & (df_geral['CHAVE ORIGEM'] == 'CONTAGEMMG')
df_geral.loc[relabel_rows, 'CHAVE ORIGEM'] = 'SETE LAGOASMG'

#### 5.3 Aggregation

In [7]:
# Categorical columns that, together with 'CHAVE DESTINO', identify one group.
features = [
    'Planta',
    'CHAVE ORIGEM',
    'Movimentação',
    'UF Destino',
    'TIPO',
]

# Numeric columns that are summed when aggregating.
num_cols = ['Quantidade', 'FRETE AJUSTADO']

lane_keys = ['CHAVE DESTINO'] + features
document_keys = ['Nro.Documento', 'Data'] + lane_keys

# Totals of the numeric columns per group.
df_agg = (
    df_geral[lane_keys + num_cols]
    .groupby(lane_keys)
    .sum()
    .reset_index()
)

# Per-document totals, then the freight-per-unit ratio of each document.
# NOTE(review): a document whose summed 'Quantidade' is 0 yields inf/NaN here.
_df_agg_mean = (
    df_geral[document_keys + num_cols]
    .groupby(document_keys)
    .sum()
    .reset_index()
)
_df_agg_mean['RealCaixa'] = _df_agg_mean['FRETE AJUSTADO'] / _df_agg_mean['Quantidade']

# Average the per-document values within each group.
_df_agg_mean = (
    _df_agg_mean[lane_keys + num_cols + ['RealCaixa']]
    .groupby(lane_keys)
    .mean()
    .reset_index()
)

# Attach the averaged ratio to the totals as 'RealCaixaMean'.
lane_mean_cost = _df_agg_mean[lane_keys + ['RealCaixa']].rename(
    columns={'RealCaixa': 'RealCaixaMean'}
)
df_agg = df_agg.merge(lane_mean_cost, on=lane_keys, how='left')

# Distance per origin/destination pair, exposed downstream as 'KM Ajustado'.
# NOTE(review): this reads the raw 'KM' column, not the 'KM Ajustado' column
# parsed in the transforming step - confirm which one is intended.
# NOTE(review): a pair with several distinct KM values produces several rows
# here and therefore duplicates the matching df_agg rows in the merge.
_df_agg_km = df_geral[['CHAVE ORIGEM', 'CHAVE DESTINO', 'KM']].drop_duplicates()
df_agg = df_agg.merge(
    _df_agg_km.rename(columns={'KM': 'KM Ajustado'}),
    on=['CHAVE ORIGEM', 'CHAVE DESTINO'],
    how='left',
)


#### 5.4 Filtering

In [8]:
col_target = 'RealCaixaMean'

### Initial hard filter: keep rows whose target is below 10x the overall mean
target_ceiling = df_agg[col_target].mean() * 10
df_agg = df_agg[df_agg[col_target] < target_ceiling]

### Categorical to Numerical: add an integer-coded '<feature>_Class' column per feature
for feature in features:
    new_class = class_to_num(df_agg, feature)
    df_agg[feature + '_Class'] = df_agg[feature].map(lambda label: new_class[label])

In [9]:
### Filter to get the data in the middle 50% of the target within each group
# This cell is the only place `df_filt` and its VENDAS / TRANSFERENCIA flags are
# created, and the modelling cell reads them, so it must run on a fresh kernel.
# NOTE(review): df_agg was already aggregated by these same keys, so most groups
# likely hold a single row and this filter keeps nearly everything - confirm intent.
group_keys = features + ['CHAVE DESTINO']
target_by_group = df_agg.groupby(group_keys)[col_target]

# Per-group quartiles and row counts of the target.
df_qt = pd.DataFrame({
    '%25': target_by_group.quantile(q=.25),
    '%75': target_by_group.quantile(q=.75),
    'Len': target_by_group.count(),
}).reset_index()

# Merge into a new frame so df_agg is not mutated and the cell can be re-run.
df_agg_qt = pd.merge(
    df_agg,
    df_qt,
    on=group_keys,
    how='left',
)

# Keep rows inside their group's inter-quartile range, plus every row of groups
# with fewer than 3 rows. A single OR-mask is used so that a row satisfying both
# conditions is kept once (concatenating the two selections would duplicate it).
in_iqr = (df_agg_qt[col_target] >= df_agg_qt['%25']) & (df_agg_qt[col_target] <= df_agg_qt['%75'])
small_group = df_agg_qt['Len'] < 3
df_filt = df_agg_qt[in_iqr | small_group].reset_index(drop=True)

# Boolean flags for the two movement types used as model inputs.
df_filt['VENDAS'] = df_filt['Movimentação'] == 'VENDAS'
df_filt['TRANSFERENCIA'] = df_filt['Movimentação'] == 'TRANSFERENCIA'

### 6.0 MODEL

In [10]:
# eng_classes = [f"{feature}_Class" for feature in features if feature != 'CHAVE ORIGEM']
# params = {
#     "n_estimators": 500,
#     "max_depth": 4,
#     "min_samples_split": 5,
#     "learning_rate": 0.01,
#     "loss": "squared_error",
# }

# X_Model = df_filt[['KM Ajustado','VENDAS','TRANSFERENCIA',]+eng_classes]
# y_target = df_filt[col_target].to_numpy()
# X_train, X_test, y_train, y_test = train_test_split(X_Model, y_target, test_size=0.10, random_state=7, shuffle = True)

# rf = RandomForestRegressor(random_state=2)
# rf = rf.fit(X = X_Model, y = y_target)
# y_pred_rf = rf.predict(X_Model)
# r2_rf_test = r2_score(y_target, y_pred_rf)
# mape_rf_test = mean_absolute_percentage_error(y_target, y_pred_rf)

# fig,ax = plt.subplots(layout='constrained')
# ax.plot(df_filt['KM Ajustado'],df_filt[col_target],'.')
# ax.plot(df_filt['KM Ajustado'], rf.predict(X_Model),'.')
# ax.text(df_filt['KM Ajustado'].max()-500,max(y_pred_rf),f'r2={r2_rf_test:0.2}')
# ax.text(df_filt['KM Ajustado'].max()-500,max(y_pred_rf)-10,f'mape={mape_rf_test*100:.1f}')
# fig.legend(['Dado','Modelo'],loc='upper right')
# ax.set_xlabel('Km')
# ax.set_ylabel('R$/CX')
# ax.set_title('Modelo Geral - Random Forest')
# os.makedirs(path_out_images, exist_ok=True)
# os.makedirs(path_out, exist_ok=True)
# fig.savefig(os.path.join(path_out_images,f'Modelo_Geral_RF.png'),dpi=200)

### 7.0 EXPORTS

Exportação concluída!


In [16]:
# 0. Initial data preparation
# Categorical feature columns.
features = [
    'Planta',
    'CHAVE ORIGEM',
    'Movimentação',
    'UF Destino',
    'TIPO',
]

# Integer-coded '<feature>_Class' columns fed to the model; 'CHAVE ORIGEM' is left out.
eng_classes = [feature + '_Class' for feature in features if feature != 'CHAVE ORIGEM']

# Initial design matrix and target vector.
# `df_filt` must already exist: it comes from the quartile-filter step (section 5.4).
model_columns = ['KM Ajustado', 'VENDAS', 'TRANSFERENCIA'] + eng_classes
X_Model = df_filt[model_columns]
y_target = df_filt[col_target].to_numpy()

# 1. Imports necessários
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import (
    VotingRegressor, 
    GradientBoostingRegressor, 
    ExtraTreesRegressor,
    RandomForestRegressor
)
from sklearn.linear_model import HuberRegressor
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import r2_score, mean_absolute_percentage_error, mean_squared_error
from sklearn.exceptions import ConvergenceWarning
import warnings

# 2. Funções auxiliares
def remove_outliers(X, y, z_threshold=2.5):
    """Drop samples whose target is far from the bulk of the distribution.

    The target is rescaled with ``RobustScaler`` (default settings) and a sample
    is kept when the absolute scaled value is strictly below ``z_threshold``.
    Despite the parameter name this is a robust score, not a mean/std z-score.

    Parameters
    ----------
    X : row-indexable container (e.g. DataFrame); filtered with a boolean array.
    y : numpy array of targets (must support ``reshape``).
    z_threshold : cut-off applied to the absolute scaled target.

    Returns
    -------
    (X_kept, y_kept) containing only the rows that pass the cut-off.
    """
    robust_scores = RobustScaler().fit_transform(y.reshape(-1, 1))
    keep = (np.abs(robust_scores) < z_threshold).flatten()
    return X[keep], y[keep]

def post_process_predictions(y_pred, y_train):
    """Clamp predictions to the range [0, mean(y_train) + 3 * std(y_train)].

    Negative predictions are raised to 0 first; the upper cap is applied afterwards.
    """
    ceiling = y_train.mean() + 3 * y_train.std()
    non_negative = np.maximum(y_pred, 0)
    return np.minimum(non_negative, ceiling)

def compute_sample_weights(y):
    """Weight each sample inversely to the population of its target-quantile bin.

    The targets are cut into (up to) 10 quantile bins; a sample's raw weight is
    ``1 / size_of_its_bin``. Weights are then rescaled so that they sum to
    ``len(y)`` (i.e. their mean is 1).

    Parameters
    ----------
    y : 1-D array-like of numeric targets.

    Returns
    -------
    numpy.ndarray of float weights, one per sample, in the order of ``y``.
    """
    # duplicates='drop' merges repeated quantile edges; without it pd.qcut raises
    # ValueError as soon as the targets contain enough ties for two edges to coincide.
    bins = pd.Series(pd.qcut(y, q=10, labels=False, duplicates='drop'))
    # Look up, for every sample, how many samples share its bin.
    bin_sizes = bins.map(bins.value_counts())
    weights = 1 / bin_sizes.to_numpy()
    return weights / weights.sum() * len(weights)

# 3. Feature Engineering
def create_features(X, df_original, target_col=None):
    """Add engineered columns (distance transforms, interactions, group statistics) to X.

    Parameters
    ----------
    X : DataFrame with 'KM Ajustado', 'VENDAS', 'TRANSFERENCIA', 'Planta_Class',
        'UF Destino_Class' and 'TIPO_Class'. Not modified; a copy is returned.
    df_original : DataFrame with 'Planta', 'UF Destino', 'TIPO', 'CHAVE DESTINO' and
        the target column. Group statistics are assigned by index, so it is
        expected to share its index with X.
    target_col : name of the target column in ``df_original``. Defaults to the
        module-level ``col_target``.

    Returns
    -------
    DataFrame: the original columns of X followed by the engineered ones.

    NOTE(review): the group statistics are computed from the target over every row
    of ``df_original``. If this runs before the train/test split, test-row targets
    leak into the training features - consider fitting them on the training rows only.
    """
    target = col_target if target_col is None else target_col
    X_new = X.copy()
    km = X['KM Ajustado']

    # Basic distance transforms
    X_new['km_log'] = np.log1p(km)
    X_new['km_root'] = np.sqrt(km)
    X_new['km_squared'] = km ** 2
    X_new['km_cubic'] = km ** 3

    # Distance x movement-type flags
    X_new['km_vendas'] = km * X['VENDAS']
    X_new['km_transfer'] = km * X['TRANSFERENCIA']

    # Distance x integer-coded categoricals
    X_new['planta_km'] = km * X['Planta_Class']
    X_new['uf_km'] = km * X['UF Destino_Class']
    X_new['tipo_km'] = km * X['TIPO_Class']

    # Per-category statistics of the target
    for feature in ['Planta', 'UF Destino', 'TIPO']:
        target_by_feature = df_original.groupby(feature)[target]
        X_new[f'mean_by_{feature}'] = target_by_feature.transform('mean')
        # The sample std of a single-row group is NaN; use 0 (no observed spread)
        # so the feature matrix stays free of NaN.
        X_new[f'std_by_{feature}'] = target_by_feature.transform('std').fillna(0.0)
        X_new[f'median_by_{feature}'] = target_by_feature.transform('median')

    # Interactions between distance-for-sales and the group means
    X_new['km_vendas_planta'] = X_new['km_vendas'] * X_new['mean_by_Planta']
    X_new['km_vendas_tipo'] = X_new['km_vendas'] * X_new['mean_by_TIPO']

    # Row counts per destination state and per plant
    X_new['densidade_regiao'] = df_original.groupby('UF Destino')['CHAVE DESTINO'].transform('count')
    X_new['freq_planta'] = df_original.groupby('Planta')['CHAVE DESTINO'].transform('count')

    # Target mean per pair of categories
    X_new['planta_tipo_mean'] = df_original.groupby(['Planta', 'TIPO'])[target].transform('mean')
    X_new['uf_tipo_mean'] = df_original.groupby(['UF Destino', 'TIPO'])[target].transform('mean')

    return X_new

# 4. Pipeline e Grid
# def create_ensemble_pipeline():
#     """Cria pipeline com ensemble de modelos"""
#     rf = RandomForestRegressor(
#         n_estimators=500,
#         random_state=42,
#         n_jobs=-1,
#         warm_start=True,
#         max_features='sqrt'
#     )
    
#     gb = GradientBoostingRegressor(
#         n_estimators=500,
#         random_state=42,
#         learning_rate=0.01,
#         warm_start=True,
#         subsample=0.8
#     )
    
#     et = ExtraTreesRegressor(
#         n_estimators=500,
#         random_state=42,
#         n_jobs=-1,
#         warm_start=True,
#         max_features='sqrt'
#     )
    
#     hub = HuberRegressor(
#         max_iter=2000,
#         epsilon=1.35,
#         tol=1e-3,
#         warm_start=True,
#         fit_intercept=True
#     )
    
#     ensemble = VotingRegressor([
#         ('rf', rf),
#         ('gb', gb),
#         ('et', et),
#         ('hub', hub)
#     ], weights=[0.4, 0.3, 0.2, 0.1])
    
#     return Pipeline([
#         ('scaler', RobustScaler()),
#         ('ensemble', ensemble)
#     ])

# 4. Pipeline 
def create_ensemble_pipeline():
    """Build the modelling pipeline: RobustScaler followed by a weighted VotingRegressor.

    The ensemble combines a random forest ('rf'), gradient boosting ('gb'),
    extra trees ('et') and a Huber regressor ('hub') with voting weights
    0.50 / 0.35 / 0.10 / 0.05. The step names 'scaler' and 'ensemble' and the
    member names are what hyper-parameter grids address
    (e.g. ``ensemble__rf__max_depth``).
    """
    forest = RandomForestRegressor(
        n_estimators=1000,
        max_features='sqrt',
        bootstrap=True,
        oob_score=True,
        warm_start=True,
        n_jobs=-1,
        random_state=42,
    )

    # Early stopping: 15% of the training data is held out and boosting stops
    # after 20 iterations without an improvement larger than tol.
    boosting = GradientBoostingRegressor(
        n_estimators=1000,
        subsample=0.8,
        validation_fraction=0.15,
        n_iter_no_change=20,
        tol=1e-5,
        warm_start=True,
        random_state=42,
    )

    extra_trees = ExtraTreesRegressor(
        n_estimators=500,
        max_features='sqrt',
        bootstrap=True,
        warm_start=True,
        n_jobs=-1,
        random_state=42,
    )

    huber = HuberRegressor(
        epsilon=1.35,
        alpha=0.001,
        max_iter=5000,
        tol=1e-4,
        warm_start=True,
        fit_intercept=True,
    )

    members = [
        ('rf', forest),
        ('gb', boosting),
        ('et', extra_trees),
        ('hub', huber),
    ]
    voting = VotingRegressor(members, weights=[0.50, 0.35, 0.10, 0.05])

    steps = [
        ('scaler', RobustScaler()),
        ('ensemble', voting),
    ]
    return Pipeline(steps)

# 5. Grid de hiperparâmetros
# param_grid = {
#     'ensemble__rf__max_depth': [8, 10],
#     'ensemble__rf__min_samples_split': [2, 5],
#     'ensemble__gb__learning_rate': [0.01, 0.05],
#     'ensemble__gb__subsample': [0.8, 0.9]
# }

# Coarse first-stage grid: 3 values for each of 4 parameters (81 combinations).
# Keys address the pipeline as <step>__<ensemble member>__<parameter>.
param_grid_inicial = dict(
    ensemble__rf__max_depth=[6, 8, 10],
    ensemble__rf__min_samples_split=[2, 5, 10],
    ensemble__gb__learning_rate=[0.01, 0.05, 0.1],
    ensemble__gb__subsample=[0.7, 0.8, 0.9],
)

# 6. Treinamento
# def train_ensemble_model(X, y, df_original):
#     """Treina o modelo ensemble"""
#     print("Preparando features...")
#     X_processed = create_features(X, df_original)
#     X_clean, y_clean = remove_outliers(X_processed, y, z_threshold=2.5)
    
#     y_log = np.log1p(y_clean)
#     print("Criando estratificação...")
#     y_bins = pd.qcut(y_log, q=10, labels=False)
    
#     print("Dividindo dados...")
#     X_train, X_test, y_train, y_test = train_test_split(
#         X_clean, y_log,
#         test_size=0.2,
#         random_state=42,
#         stratify=y_bins
#     )
    
#     sample_weights = compute_sample_weights(y_train)
    
#     print("Treinando modelo...")
#     pipeline = create_ensemble_pipeline()
#     grid_search = GridSearchCV(
#         pipeline,
#         param_grid,
#         cv=3,
#         scoring='r2',
#         n_jobs=-1,
#         verbose=1
#     )
    
#     warnings.filterwarnings('ignore', category=ConvergenceWarning)
#     grid_search.fit(X_train, y_train, ensemble__sample_weight=sample_weights)
    
#     return grid_search, X_train, X_test, y_train, y_test

def _unique_in_order(values):
    """Return ``values`` as a list with repeats removed, keeping first-seen order."""
    return list(dict.fromkeys(values))


def train_ensemble_model(X, y, df_original):
    """Train the ensemble with a two-stage (coarse, then fine) grid search.

    Steps: engineer features, drop target outliers, log1p-transform the target,
    make a stratified 80/20 split on target deciles, run a grid search over
    ``param_grid_inicial``, then a second grid search centred on the best values
    of the first.

    Parameters
    ----------
    X : DataFrame of base model inputs (passed to ``create_features``).
    y : numpy array of targets on the original scale.
    df_original : DataFrame handed to ``create_features`` for group statistics.

    Returns
    -------
    (grid_search, X_train, X_test, y_train, y_test) where ``grid_search`` is the
    fitted second-stage GridSearchCV (refit on R2) and the y arrays are on the
    log1p scale.

    NOTE(review): ``create_features`` and ``remove_outliers`` run on all rows
    before the split, so the held-out rows influence the training features and
    are themselves outlier-filtered - the test metrics are likely optimistic.
    """
    print("Preparando features...")
    X_processed = create_features(X, df_original)
    X_clean, y_clean = remove_outliers(X_processed, y, z_threshold=2.5)

    y_log = np.log1p(y_clean)
    print("Criando estratificação...")
    # duplicates='drop' keeps qcut from raising when tied targets make decile edges coincide.
    y_bins = pd.qcut(y_log, q=10, labels=False, duplicates='drop')

    print("Dividindo dados...")
    X_train, X_test, y_train, y_test = train_test_split(
        X_clean, y_log,
        test_size=0.2,
        random_state=42,
        stratify=y_bins
    )

    sample_weights = compute_sample_weights(y_train)

    # Installed before the first fit so both stages are covered in this process.
    warnings.filterwarnings('ignore', category=ConvergenceWarning)

    # Stage 1 - coarse grid
    print("Treinando modelo - Etapa 1 (Grid Inicial)...")
    pipeline = create_ensemble_pipeline()
    grid_search_1 = GridSearchCV(
        pipeline,
        param_grid_inicial,
        cv=3,
        scoring='r2',
        n_jobs=-1,
        verbose=1
    )

    grid_search_1.fit(X_train, y_train, ensemble__sample_weight=sample_weights)

    # Best values found by the coarse search
    best_params = grid_search_1.best_params_
    best_depth = best_params['ensemble__rf__max_depth']
    best_split = best_params['ensemble__rf__min_samples_split']
    best_lr = best_params['ensemble__gb__learning_rate']
    best_subsample = best_params['ensemble__gb__subsample']

    print(f"\nMelhores parâmetros da primeira etapa:")
    print(f"RF max_depth: {best_depth}")
    print(f"RF min_samples_split: {best_split}")
    print(f"GB learning_rate: {best_lr}")
    print(f"GB subsample: {best_subsample}")

    # Stage 2 - fine grid around the stage-1 optimum.
    # Clamping can make neighbours coincide (e.g. min_samples_split=2 gives 2, 2, 3);
    # repeats are removed so identical candidates are not fitted more than once.
    print("\nTreinando modelo - Etapa 2 (Grid Fino)...")
    param_grid_fino = {
        'ensemble__rf__max_depth': _unique_in_order(
            [max(4, best_depth-1), best_depth, best_depth+1]),
        'ensemble__rf__min_samples_split': _unique_in_order(
            [max(2, best_split-1), best_split, min(20, best_split+1)]),
        'ensemble__gb__learning_rate': _unique_in_order(
            [best_lr*0.5, best_lr, best_lr*1.5]),
        'ensemble__gb__subsample': _unique_in_order(
            [max(0.5, best_subsample-0.1), best_subsample, min(1.0, best_subsample+0.1)]),
    }

    grid_search_2 = GridSearchCV(
        pipeline,
        param_grid_fino,
        cv=3,
        scoring=['r2', 'neg_mean_absolute_percentage_error'],
        refit='r2',
        n_jobs=-1,
        verbose=1
    )

    grid_search_2.fit(X_train, y_train, ensemble__sample_weight=sample_weights)

    print("\nMelhores parâmetros finais:")
    print(grid_search_2.best_params_)

    return grid_search_2, X_train, X_test, y_train, y_test

# 7. Avaliação
def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Evaluate the model overall and per distance (KM) band, and draw diagnostic plots.

    Predictions and targets are mapped back with ``np.expm1`` before scoring, so
    ``y_train`` / ``y_test`` and the model output are presumably on the log1p
    scale - verify against the caller.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X_train, X_test : feature frames; must contain the 'KM Ajustado' column.
    y_train, y_test : target arrays aligned row-by-row with the frames.

    Returns
    -------
    (metrics, fig)
        ``metrics`` holds 'train_r2', 'test_r2', 'train_mape', 'test_mape',
        'train_rmse', 'test_rmse', plus 'train_by_range' / 'test_by_range'
        (per-band dicts with 'r2', 'mape', 'rmse'); ``fig`` is the 2x2 figure.
    """
    def evaluate_by_km_range(X, y_true, y_pred, km_ranges):
        """Compute r2 / mape / rmse for each [km_min, km_max) band that has rows."""
        metrics_by_range = {}
        for i, (km_min, km_max) in enumerate(km_ranges):
            # Lower bound inclusive, upper bound exclusive.
            mask = (X['KM Ajustado'] >= km_min) & (X['KM Ajustado'] < km_max)
            # Bands without any row are skipped entirely (no key is created).
            if mask.any():
                metrics_by_range[f'Range {i+1} ({km_min}-{km_max} km)'] = {
                    'r2': r2_score(y_true[mask], y_pred[mask]),
                    'mape': mean_absolute_percentage_error(y_true[mask], y_pred[mask]),
                    'rmse': np.sqrt(mean_squared_error(y_true[mask], y_pred[mask]))
                }
        return metrics_by_range
    
    # Predictions, mapped back to the original scale
    y_pred_train = np.expm1(model.predict(X_train))
    y_pred_test = np.expm1(model.predict(X_test))
    
    y_train_original = np.expm1(y_train)
    y_test_original = np.expm1(y_test)
    
    # Distance bands (km) used for the per-range breakdown
    km_ranges = [(0, 100), (100, 300), (300, 600), (600, 1000), (1000, float('inf'))]
    
    # Overall metrics
    metrics = {
        'train_r2': r2_score(y_train_original, y_pred_train),
        'test_r2': r2_score(y_test_original, y_pred_test),
        'train_mape': mean_absolute_percentage_error(y_train_original, y_pred_train),
        'test_mape': mean_absolute_percentage_error(y_test_original, y_pred_test),
        'train_rmse': np.sqrt(mean_squared_error(y_train_original, y_pred_train)),
        'test_rmse': np.sqrt(mean_squared_error(y_test_original, y_pred_test)),
        'train_by_range': evaluate_by_km_range(X_train, y_train_original, y_pred_train, km_ranges),
        'test_by_range': evaluate_by_km_range(X_test, y_test_original, y_pred_test, km_ranges)
    }
    
    # Plots
    fig = plt.figure(figsize=(20, 10))
    
    # Plot 1: actual vs predicted
    # Disabled alternative: the same plot drawn on the training split.
    # ax1 = plt.subplot(221)
    # ax1.hexbin(y_train_original, y_pred_train, gridsize=30, cmap='YlOrRd')
    # ax1.plot([y_train_original.min(), y_train_original.max()], 
    #          [y_train_original.min(), y_train_original.max()], 'r--')
    # ax1.set_title(f'Treino (R² = {metrics["train_r2"]:.3f}, MAPE = {metrics["train_mape"]*100:.1f}%)')

    ax1 = plt.subplot(221)
    ax1.hexbin(y_test_original, y_pred_test, gridsize=30, cmap='YlOrRd')
    # Dashed identity line: points on it are perfect predictions.
    ax1.plot([y_test_original.min(), y_test_original.max()], 
             [y_test_original.min(), y_test_original.max()], 'r--')
    ax1.set_title(f'Teste (R² = {metrics["test_r2"]:.2f}, MAPE = {metrics["test_mape"]*100:.1f}%)')
    ax1.set_xlabel('Valor Real')
    ax1.set_ylabel('Valor Previsto')
    
    # Plot 2: residuals (actual - predicted) against the prediction
    ax2 = plt.subplot(222)
    residuals = y_test_original - y_pred_test
    ax2.scatter(y_pred_test, residuals, alpha=0.5)
    ax2.axhline(y=0, color='r', linestyle='--')
    ax2.set_title('Análise de Residuais (Teste)')
    
    # Plot 3: distribution of the residuals
    ax3 = plt.subplot(223)
    ax3.hist(residuals, bins=50)
    ax3.set_title('Distribuição dos Erros')
    
    # Plot 4: test MAPE (in percent) per distance band
    ax4 = plt.subplot(224)
    mape_by_range = []
    range_labels = []
    for range_name, metrics_range in metrics['test_by_range'].items():
        mape_by_range.append(metrics_range['mape'] * 100)
        range_labels.append(range_name)
    ax4.bar(range_labels, mape_by_range)
    ax4.set_title('MAPE por Faixa de KM')
    # pyplot call: rotates the tick labels of the current axes (ax4, created last).
    plt.xticks(rotation=45)
    
    plt.tight_layout()
    
    return metrics, fig

# 8. Execution
# Run the two-stage grid search on the prepared inputs, then score the fitted
# search object on the train and test splits it returns.
# X_Model, y_target and df_filt must already be defined by the preparation steps.
print("Iniciando treinamento com ajuste fino...")
model, X_train, X_test, y_train, y_test = train_ensemble_model(X_Model, y_target, df_filt)
metrics, fig = evaluate_model(model, X_train, X_test, y_train, y_test)

# 9. Save artefacts and report metrics
os.makedirs(path_out_images, exist_ok=True)
os.makedirs(path_out, exist_ok=True)

# The evaluation figure and the fitted model are each written under two file
# names (with and without the "_v2" suffix); both are kept so existing
# consumers of either name keep working.
for image_name in ('Ensemble_Model_Evaluation_v2.png', 'Ensemble_Model_Evaluation.png'):
    fig.savefig(os.path.join(path_out_images, image_name), dpi=200)

for model_name in ("pepsico_frete_ensemble_v2.sav", "pepsico_frete_ensemble.sav"):
    # Context manager so the file handle is flushed and closed deterministically.
    with open(os.path.join(path_out, model_name), 'wb') as model_file:
        pickle.dump(model, model_file)

# Overall metrics (printed once).
print("\nMétricas de Avaliação:")
print(f"R² Treino: {metrics['train_r2']:.3f}")
print(f"R² Teste: {metrics['test_r2']:.3f}")
print(f"MAPE Treino: {metrics['train_mape']*100:.1f}%")
print(f"MAPE Teste: {metrics['test_mape']*100:.1f}%")
print(f"RMSE Treino: {metrics['train_rmse']:.2f}")
print(f"RMSE Teste: {metrics['test_rmse']:.2f}")

# 12. Feature importance analysis
# A separate random forest is fitted on the training split only to read its
# impurity-based importances; it is not part of the saved ensemble.
rf_model = RandomForestRegressor(
    n_estimators=500,
    random_state=42,
    n_jobs=-1
)
rf_model.fit(X_train, y_train)

feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance': rf_model.feature_importances_
})

print("\nImportância das Features (top 10):")
print(feature_importance.sort_values('importance', ascending=False).head(10))

# 13. Feature importance plot
# DataFrame.plot opens its own figure unless it is given axes, which would leave
# a pre-created figure empty and ignore its size; plot on explicit axes instead.
fig_importance, ax_importance = plt.subplots(figsize=(12, 6))
feature_importance.sort_values('importance').tail(10).plot(
    x='feature',
    y='importance',
    kind='barh',
    ax=ax_importance
)
ax_importance.set_title('Top 10 Features Mais Importantes')
fig_importance.tight_layout()
fig_importance.savefig(os.path.join(path_out_images, 'Feature_Importance.png'), dpi=200)

Iniciando treinamento com ajuste fino...
Preparando features...
Criando estratificação...
Dividindo dados...
Treinando modelo - Etapa 1 (Grid Inicial)...
Fitting 3 folds for each of 81 candidates, totalling 243 fits


KeyboardInterrupt: 

In [17]:
eng_classes

['Planta_Class', 'Movimentação_Class', 'UF Destino_Class', 'TIPO_Class']