# 0.0 Import library

In [2]:
import pandas            as pd
import numpy             as np

from sklearn import model_selection as ms
from sklearn import tree            as tr
from sklearn import metrics         as mt

# 0.1 Load dataset

In [3]:
# Read the regression dataset from CSV: one features file (X_*) and one
# target file (y_*) for each of the training, validation and test partitions.
X_train, y_train = (
    pd.read_csv( path )
    for path in ( '../../dataset/regression/X_training.csv',
                  '../../dataset/regression/y_training.csv' )
)
X_val, y_val = (
    pd.read_csv( path )
    for path in ( '../../dataset/regression/X_validation.csv',
                  '../../dataset/regression/y_val.csv' )
)
X_test, y_test = (
    pd.read_csv( path )
    for path in ( '../../dataset/regression/X_test.csv',
                  '../../dataset/regression/y_test.csv' )
)

# 1.0 Training model

## Dados de treino

In [45]:
# Sweep the tree depth over 1..40. For every depth a fresh tree is fitted on
# the training data and scored on that same training data; the best value of
# each metric seen across the sweep is reported at the end.
m = np.arange( 1, 41, 1 )

# Training MSE (rounded to 3 decimals) for each depth, in the same order as `m`.
# Later cells look up the depth to use via the position of the minimum here.
mse_list = []

# Running bests. R2 starts at -inf (not 0) so that a sweep whose scores are
# all negative still reports its true maximum instead of a fake 0.
max_r2 = float('-inf')
min_mse = float('inf')
min_rmse = float('inf')
min_mae = float('inf')
min_mape = float('inf')

for i in m:
    # definition
    # DecisionTreeRegressor has no `n_estimators` parameter (that belongs to
    # ensemble estimators); passing it makes the constructor raise TypeError.
    model = tr.DecisionTreeRegressor( max_depth=i )

    # training
    model.fit( X_train, y_train )

    # performance (predictions on the data the model was fitted on)
    y_pred = model.predict( X_train )

    # R squared
    r2_squared = np.round( mt.r2_score( y_train, y_pred ), 3 )
    max_r2 = max( max_r2, r2_squared )

    # MSE
    mse = np.round( mt.mean_squared_error( y_train, y_pred ), 3 )
    mse_list.append( mse )
    min_mse = min( min_mse, mse )

    # RMSE (root of the already-rounded MSE)
    rmse = np.round( np.sqrt( mse ), 3 )
    min_rmse = min( min_rmse, rmse )

    # MAE
    mae = np.round( mt.mean_absolute_error( y_train, y_pred ), 3 )
    min_mae = min( min_mae, mae )

    # MAPE
    mape = np.round( mt.mean_absolute_percentage_error( y_train, y_pred ), 3 )
    min_mape = min( min_mape, mape )

print( 'MAX R2: {}'.format( max_r2 ) )
print('MIN MSE: {}'.format( min_mse ) )
print('MIN RMSE: {}'.format( min_rmse ) )
print('MIN MAE: {}'.format( min_mae ) )
print('MIN MAPE: {}'.format( min_mape ) )

MAX R2: 0.992
MIN MSE: 3.94
MIN RMSE: 1.985
MIN MAE: 0.214
MIN MAPE: 0.083


## Dados de validação

In [50]:
# Evaluate on the validation split: fit a tree on the training data using the
# depth with the lowest recorded MSE, then score its validation predictions.
# NOTE(review): `mse_list` holds MSE measured on the training data, so the
# depth is selected by training error, not validation error - confirm intended.

# Position of the first minimum in mse_list (same order as `m`)
best_m = int( np.argmin( mse_list ) )

# definition + training (fit returns the fitted estimator itself)
model = tr.DecisionTreeRegressor( max_depth=m[best_m] ).fit( X_train, y_train )

# performance
yhat_val = model.predict( X_val )

# Metrics on the validation split, each rounded to 3 decimals
r2_squared = np.round( mt.r2_score( y_val, yhat_val ), decimals=3 )
mse = np.round( mt.mean_squared_error( y_val, yhat_val ), decimals=3 )
rmse = np.round( np.sqrt( mse ), decimals=3 )  # root of the already-rounded MSE
mae = np.round( mt.mean_absolute_error( y_val, yhat_val ), decimals=3 )
mape = np.round( mt.mean_absolute_percentage_error( y_val, yhat_val ), decimals=3 )

# Report one metric per line
for template, value in ( ( 'R2: {}', r2_squared ),
                         ( 'MSE: {}', mse ),
                         ( 'RMSE: {}', rmse ),
                         ( 'MAE: {}', mae ),
                         ( 'MAPE: {}', mape ) ):
    print( template.format( value ) )

R2: -0.284
MSE: 613.312
RMSE: 24.765
MAE: 17.115
MAPE: 6.792


## Dados de teste

In [51]:
# Final evaluation on the test split: refit a tree on training + validation
# data with the selected depth, then score its predictions on the test data.

# Position (in `m`) of the lowest MSE recorded in mse_list
best_m = mse_list.index( min ( mse_list ) )

# definition
model = tr.DecisionTreeRegressor( max_depth=m[best_m] )

# training
# pd.concat (rather than np.concatenate) keeps the inputs as DataFrames, so the
# model is fitted with column names and columns are aligned by label. Fitting
# on a bare ndarray and then predicting on the X_test DataFrame makes
# scikit-learn warn about missing feature names.
X_train_val = pd.concat( [ X_train, X_val ], ignore_index=True )
y_train_val = pd.concat( [ y_train, y_val ], ignore_index=True )
model.fit( X_train_val, y_train_val )

# performance
yhat_test = model.predict( X_test )

# R squared
r2_squared = np.round( mt.r2_score( y_test, yhat_test ), 3 )

# MSE
mse = np.round(mt.mean_squared_error( y_test, yhat_test), 3)

# RMSE (root of the already-rounded MSE)
rmse = np.round(np.sqrt( mse ), 3)

# MAE
mae = np.round(mt.mean_absolute_error( y_test, yhat_test), 3)

# MAPE
mape = np.round(mt.mean_absolute_percentage_error( y_test, yhat_test ), 3)

print( 'R2: {}'.format( r2_squared ) )
print('MSE: {}'.format( mse ) )
print('RMSE: {}'.format( rmse ) )
print('MAE: {}'.format( mae ) )
print('MAPE: {}'.format( mape ) )

R2: -0.171
MSE: 570.296
RMSE: 23.881
MAE: 15.827
MAPE: 6.114


