# Códigos para modelo de regressão
Com diferentes opções de validação (Kfold, leave-one-out e holdout)
* Linear Regression
* Lasso
* Ridge
* Elastic Net
* Árvore de Decisão
* Random Forest
* MLP Regressor

In [1]:
from sklearn import linear_model
import warnings
import pandas as pd
import numpy as np
from sklearn.model_selection import LeaveOneOut
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn import tree
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import KFold, cross_val_score, LeaveOneOut, cross_val_predict
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import ElasticNet
from sklearn import datasets
from CrossValidationRegression import kfold_reg, leave_one_out_reg, th_reg

# Gerando base de dados (Boston housing prices)

In [2]:
warnings.filterwarnings('ignore')

boston = datasets.load_boston()

dados = pd.DataFrame(boston.data)
dados.columns = boston.feature_names
dados['PRICE'] = boston.target

X = dados.drop('PRICE',axis=1)
Y = dados['PRICE']

X.head(5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [3]:
X.isna().sum()

CRIM       0
ZN         0
INDUS      0
CHAS       0
NOX        0
RM         0
AGE        0
DIS        0
RAD        0
TAX        0
PTRATIO    0
B          0
LSTAT      0
dtype: int64

## Linear Regression

In [4]:
LM = linear_model.LinearRegression(fit_intercept=True, 
                                   normalize=False, 
                                   copy_X=True, 
                                   n_jobs=1)

LM.fit(X,Y)
y_pred = LM.predict(X)

RMSE = mean_squared_error(Y, y_pred)
MAE = mean_absolute_error(Y,y_pred)
score = r2_score(Y,y_pred)

print("Coeficientes:",LM.coef_)
print("Intercepto: {:.2f}.".format(LM.intercept_))
print("RMSE: {:.2f}.".format(RMSE))
print("MAE: {:.2f}.".format(MAE))
print("R2: {:.2f}.".format(score))

Coeficientes: [-1.08011358e-01  4.64204584e-02  2.05586264e-02  2.68673382e+00
 -1.77666112e+01  3.80986521e+00  6.92224640e-04 -1.47556685e+00
  3.06049479e-01 -1.23345939e-02 -9.52747232e-01  9.31168327e-03
 -5.24758378e-01]
Intercepto: 36.46.
RMSE: 21.89.
MAE: 3.27.
R2: 0.74.


In [5]:
LM = linear_model.LinearRegression(fit_intercept=True, 
                                   normalize=False, 
                                   copy_X=True, 
                                   n_jobs=1)

In [6]:
# definir k para kfolds
# se k = n, leave one out
# ver o porque R2 pode ser negativo nesse caso https://scikit-learn.org/stable/modules/model_evaluation.html#mean-squared-error
kfold_reg(LM, X, Y, k=10)

Unnamed: 0,MSE: 1. Treino,MSE: 2. Teste,RMSE: 1. Treino,RMSE: 2. Teste,MAE: 1. Treino,MAE: 2. Teste,R2: 1. Treino,R2: 2. Teste
0,23.36,9.29,4.83,3.05,3.41,2.21,0.74,0.73
1,22.88,14.15,4.78,3.76,3.39,2.9,0.75,0.47
2,23.22,14.07,4.82,3.75,3.43,2.79,0.74,-1.01
3,20.77,35.21,4.56,5.93,3.11,4.6,0.72,0.64
4,21.34,31.89,4.62,5.65,3.27,4.11,0.74,0.55
5,22.36,19.84,4.73,4.45,3.28,3.56,0.7,0.74
6,23.33,9.95,4.83,3.15,3.4,2.67,0.75,0.38
7,11.96,168.38,3.46,12.98,2.61,9.66,0.84,-0.13
8,21.59,33.33,4.65,5.77,3.19,5.02,0.74,-0.77
9,23.19,10.96,4.82,3.31,3.38,2.54,0.74,0.42


In [7]:
leave_one_out_reg(LM, X, Y)

O MSE da base de treino:  21.89307139577634
O MSE da base de teste:  23.725745519476096
O RMSE da base de treino:  4.678984415776224
O RMSE da base de teste:  3.3827965268794684


In [8]:
th_reg(LM, X, Y, corte = 253)

MSE treino: 9.99
MSE teste: 303.44
RMSE treino: 3.16
RMSE teste: 17.42
MAE treino: 2.44
MAE treino: 9.91
R2 treino: 0.86
R2 treino: -2.25


# Lasso

In [9]:
LASSO = linear_model.Lasso(alpha=1.2, 
                        fit_intercept=True, 
                        normalize=False, 
                        precompute=False, 
                        copy_X=True, 
                        max_iter=1000, 
                        tol=0.0001, 
                        warm_start=False, 
                        positive=False, 
                        random_state=42, 
                        selection='cyclic')
LASSO.fit(X,Y)
y_pred= LASSO.predict(X)

RMSE = mean_squared_error(Y, y_pred)
MAE = mean_absolute_error(Y,y_pred)
score = r2_score(Y,y_pred)

print("Coeficientes:",LASSO.coef_)
print("Intercepto: {:.2f}.".format(LASSO.intercept_))
print("RMSE: {:.2f}.".format(RMSE))
print("MAE: {:.2f}.".format(MAE))
print("R2: {:.2f}.".format(score))

Coeficientes: [-0.05556828  0.04895423 -0.          0.         -0.          0.32869825
  0.02786944 -0.5638014   0.25834277 -0.01505702 -0.70850093  0.00775368
 -0.80287711]
Intercepto: 44.49.
RMSE: 28.35.
MAE: 3.72.
R2: 0.66.


In [10]:
LASSO = linear_model.Lasso(alpha=1.2, 
                        fit_intercept=True, 
                        normalize=False, 
                        precompute=False, 
                        copy_X=True, 
                        max_iter=1000, 
                        tol=0.0001, 
                        warm_start=False, 
                        positive=False, 
                        random_state=42, 
                        selection='cyclic')

In [11]:
kfold_reg(LASSO, X, Y, k=10)

Unnamed: 0,MSE: 1. Treino,MSE: 2. Teste,RMSE: 1. Treino,RMSE: 2. Teste,MAE: 1. Treino,MAE: 2. Teste,R2: 1. Treino,R2: 2. Teste
0,29.75,12.8,5.45,3.58,3.83,2.87,0.67,0.63
1,29.61,19.82,5.44,4.45,3.8,3.45,0.67,0.26
2,29.73,11.24,5.45,3.35,3.87,2.33,0.67,-0.6
3,26.19,62.88,5.12,7.93,3.48,6.01,0.64,0.36
4,27.41,47.56,5.24,6.9,3.64,4.64,0.67,0.33
5,28.97,45.38,5.38,6.74,3.68,5.01,0.62,0.4
6,30.01,19.99,5.48,4.47,3.85,4.0,0.67,-0.25
7,18.53,95.28,4.3,9.76,3.11,6.74,0.75,0.36
8,27.85,22.13,5.28,4.7,3.71,3.47,0.66,-0.17
9,29.21,13.5,5.4,3.67,3.81,2.7,0.68,0.28


In [12]:
leave_one_out_reg(LASSO, X, Y)

O MSE da base de treino:  28.349137100181203
O MSE da base de teste:  30.035834490628222
O RMSE da base de treino:  5.3243763241527
O RMSE da base de teste:  3.818839270545992


In [13]:
th_reg(LASSO, X, Y, corte = 253)

MSE treino: 17.11
MSE teste: 43.49
RMSE treino: 4.14
RMSE teste: 6.59
MAE treino: 2.98
MAE treino: 4.48
R2 treino: 0.75
R2 treino: 0.53


# Ridge

In [14]:
RIDGE = linear_model.Ridge(alpha=1.2, 
                        fit_intercept=True, 
                        normalize=False, 
                        copy_X=True,
                        max_iter=None, 
                        tol=0.001, 
                        solver='auto', 
                        random_state=42)
RIDGE.fit(X,Y)
y_pred= RIDGE.predict(X)

RMSE = mean_squared_error(Y, y_pred)
MAE = mean_absolute_error(Y,y_pred)
score = r2_score(Y,y_pred)

print("Coeficientes:",RIDGE.coef_)
print("Intercepto: {:.2f}.".format(RIDGE.intercept_))
print("RMSE: {:.2f}.".format(RMSE))
print("MAE: {:.2f}.".format(MAE))
print("R2: {:.2f}.".format(score))

Coeficientes: [-1.04224650e-01  4.75695230e-02 -1.20862761e-02  2.53099146e+00
 -9.99078680e+00  3.85652922e+00 -6.07626954e-03 -1.36109191e+00
  2.88464252e-01 -1.29817384e-02 -8.67636863e-01  9.71326542e-03
 -5.34500118e-01]
Intercepto: 31.07.
RMSE: 22.08.
MAE: 3.27.
R2: 0.74.


In [15]:
RIDGE = linear_model.Ridge(alpha=1.2, 
                        fit_intercept=True, 
                        normalize=False, 
                        copy_X=True,
                        max_iter=None, 
                        tol=0.001, 
                        solver='auto', 
                        random_state=42)

In [16]:
kfold_reg(RIDGE, X, Y, k=10)

Unnamed: 0,MSE: 1. Treino,MSE: 2. Teste,RMSE: 1. Treino,RMSE: 2. Teste,MAE: 1. Treino,MAE: 2. Teste,R2: 1. Treino,R2: 2. Teste
0,23.58,9.33,4.86,3.06,3.41,2.13,0.74,0.73
1,23.17,12.46,4.81,3.53,3.37,2.67,0.74,0.54
2,23.55,10.62,4.85,3.26,3.42,2.33,0.74,-0.51
3,20.94,36.79,4.58,6.07,3.11,4.65,0.71,0.62
4,21.62,29.41,4.65,5.42,3.25,3.9,0.74,0.58
5,22.63,18.52,4.76,4.3,3.26,3.46,0.7,0.75
6,23.56,9.45,4.85,3.07,3.39,2.71,0.74,0.41
7,12.06,168.37,3.47,12.98,2.6,9.61,0.84,-0.13
8,21.72,33.86,4.66,5.82,3.18,5.07,0.73,-0.8
9,23.39,11.53,4.84,3.39,3.37,2.61,0.74,0.39


In [17]:
leave_one_out_reg(RIDGE, X, Y)

O MSE da base de treino:  22.078780911997246
O MSE da base de teste:  23.897245520611108
O RMSE da base de treino:  4.698787228391143
O RMSE da base de teste:  3.376151532081191


In [18]:
th_reg(RIDGE, X, Y, corte = 253)

MSE treino: 10.07
MSE teste: 180.03
RMSE treino: 3.17
RMSE teste: 13.42
MAE treino: 2.42
MAE treino: 7.99
R2 treino: 0.85
R2 treino: -0.93


# Elastic Net

In [19]:
EN = ElasticNet(alpha=1.0, 
                l1_ratio=0.5, 
                fit_intercept=True, 
                normalize=False, 
                precompute=False, 
                max_iter=1000, 
                copy_X=True, 
                tol=0.0001, 
                warm_start=False, 
                positive=False, 
                random_state=42, 
                selection='cyclic')

EN.fit(X,Y)
y_pred= EN.predict(X)

RMSE = mean_squared_error(Y, y_pred)
MAE = mean_absolute_error(Y,y_pred)
score = r2_score(Y,y_pred)

print("Coeficientes:",EN.coef_)
print("Intercepto: {:.2f}.".format(EN.intercept_))
print("RMSE: {:.2f}.".format(RMSE))
print("MAE: {:.2f}.".format(MAE))
print("R2: {:.2f}.".format(score))

Coeficientes: [-0.08037077  0.05323951 -0.0126571   0.         -0.          0.93393555
  0.0205792  -0.76204391  0.30156906 -0.01643916 -0.7480458   0.00833878
 -0.75842612]
Intercepto: 42.23.
RMSE: 26.50.
MAE: 3.59.
R2: 0.69.


In [20]:
EN = ElasticNet(alpha=1.0, 
                l1_ratio=0.5, 
                fit_intercept=True, 
                normalize=False, 
                precompute=False, 
                max_iter=1000, 
                copy_X=True, 
                tol=0.0001, 
                warm_start=False, 
                positive=False, 
                random_state=42, 
                selection='cyclic')

In [21]:
kfold_reg(EN, X, Y, k=10)

Unnamed: 0,MSE: 1. Treino,MSE: 2. Teste,RMSE: 1. Treino,RMSE: 2. Teste,MAE: 1. Treino,MAE: 2. Teste,R2: 1. Treino,R2: 2. Teste
0,28.04,11.68,5.3,3.42,3.72,2.65,0.69,0.67
1,27.61,17.63,5.25,4.2,3.68,3.23,0.69,0.34
2,28.04,10.73,5.3,3.28,3.75,2.28,0.69,-0.53
3,24.43,56.4,4.94,7.51,3.36,5.67,0.67,0.43
4,25.69,41.97,5.07,6.48,3.52,4.37,0.69,0.4
5,26.82,35.33,5.18,5.94,3.54,4.47,0.65,0.53
6,27.97,16.44,5.29,4.06,3.72,3.53,0.7,-0.03
7,19.94,84.34,4.47,9.18,3.2,6.4,0.73,0.43
8,26.5,23.2,5.15,4.82,3.61,3.63,0.68,-0.23
9,27.71,13.92,5.26,3.73,3.7,2.79,0.69,0.26


In [22]:
leave_one_out_reg(EN, X, Y)

O MSE da base de treino:  26.49780826370785
O MSE da base de teste:  27.88935424973581
O RMSE da base de treino:  5.147587959141399
O RMSE da base de teste:  3.680473139693095


In [23]:
th_reg(EN, X, Y, corte = 253)

MSE treino: 20.38
MSE teste: 38.56
RMSE treino: 4.51
RMSE teste: 6.21
MAE treino: 3.25
MAE treino: 4.33
R2 treino: 0.71
R2 treino: 0.59


# Árvore de Decisão

In [24]:
DT = DecisionTreeRegressor(criterion='mse', 
                           splitter='best', 
                           max_depth=10, 
                           min_samples_split=2, 
                           min_samples_leaf=1, 
                           min_weight_fraction_leaf=0.0, 
                           max_features=None, 
                           random_state=42, 
                           max_leaf_nodes=None, 
                           min_impurity_decrease=0.0)
DT.fit(X,Y)

y_pred= DT.predict(X)

RMSE = mean_squared_error(Y, y_pred)
MAE = mean_absolute_error(Y,y_pred)
score = r2_score(Y,y_pred)

print("RMSE: {:.2f}.".format(RMSE))
print("MAE: {:.2f}.".format(MAE))
print("R2: {:.2f}.".format(score))

RMSE: 0.80.
MAE: 0.52.
R2: 0.99.


In [25]:
kfold_reg(DT, X, Y, k=10)

Unnamed: 0,MSE: 1. Treino,MSE: 2. Teste,RMSE: 1. Treino,RMSE: 2. Teste,MAE: 1. Treino,MAE: 2. Teste,R2: 1. Treino,R2: 2. Teste
0,0.8,16.88,0.9,4.11,0.53,3.33,0.99,0.52
1,0.85,9.98,0.92,3.16,0.55,2.33,0.99,0.63
2,0.72,17.79,0.85,4.22,0.47,3.3,0.99,-1.54
3,0.69,45.19,0.83,6.72,0.48,4.95,0.99,0.54
4,0.96,15.9,0.98,3.99,0.52,3.1,0.99,0.77
5,1.18,46.57,1.08,6.82,0.61,5.05,0.98,0.38
6,0.94,18.3,0.97,4.28,0.56,3.2,0.99,-0.14
7,0.45,97.11,0.67,9.85,0.37,6.15,0.99,0.35
8,0.81,61.99,0.9,7.87,0.5,4.52,0.99,-2.29
9,0.64,16.73,0.8,4.09,0.45,3.07,0.99,0.11


In [26]:
leave_one_out_reg(DT, X, Y)

O MSE da base de treino:  0.8103621626579591
O MSE da base de teste:  18.46009117420743
O RMSE da base de treino:  0.9000770519551665
O RMSE da base de teste:  2.841693201166285


In [27]:
th_reg(DT, X, Y, corte = 253)

MSE treino: 0.14
MSE teste: 38.9
RMSE treino: 0.37
RMSE teste: 6.24
MAE treino: 0.16
MAE treino: 4.24
R2 treino: 1.0
R2 treino: 0.58


# Random Forest

In [28]:
RF = RandomForestRegressor(n_estimators=10, 
                           criterion='mse', 
                           max_depth=None, 
                           min_samples_split=2, 
                           min_samples_leaf=1, 
                           min_weight_fraction_leaf=0.0, 
                           max_features='auto', 
                           max_leaf_nodes=None, 
                           min_impurity_decrease=0.0, 
                           bootstrap=True, 
                           oob_score=False, 
                           n_jobs=1, 
                           random_state=42, 
                           verbose=0, 
                           warm_start=False)
RF.fit(X,Y)

y_pred= RF.predict(X)

RMSE = mean_squared_error(Y, y_pred)
MAE = mean_absolute_error(Y,y_pred)
score = r2_score(Y,y_pred)

print("RMSE: {:.2f}.".format(RMSE))
print("MAE: {:.2f}.".format(MAE))
print("R2: {:.2f}.".format(score))

RMSE: 2.05.
MAE: 0.96.
R2: 0.98.


In [29]:
RF = RandomForestRegressor(n_estimators=10, 
                           criterion='mse', 
                           max_depth=None, 
                           min_samples_split=2, 
                           min_samples_leaf=1, 
                           min_weight_fraction_leaf=0.0, 
                           max_features='auto', 
                           max_leaf_nodes=None, 
                           min_impurity_decrease=0.0, 
                           bootstrap=True, 
                           oob_score=False, 
                           n_jobs=1, 
                           random_state=42, 
                           verbose=0, 
                           warm_start=False)

In [30]:
kfold_reg(RF, X, Y, k=10)

Unnamed: 0,MSE: 1. Treino,MSE: 2. Teste,RMSE: 1. Treino,RMSE: 2. Teste,MAE: 1. Treino,MAE: 2. Teste,R2: 1. Treino,R2: 2. Teste
0,1.61,10.39,1.27,3.22,0.9,2.69,0.98,0.7
1,1.83,5.73,1.35,2.39,0.95,1.83,0.98,0.79
2,1.97,5.8,1.4,2.41,0.93,1.79,0.98,0.17
3,1.62,35.93,1.27,5.99,0.88,4.43,0.98,0.63
4,1.53,11.28,1.24,3.36,0.85,2.63,0.98,0.84
5,1.99,28.28,1.41,5.32,0.91,3.71,0.97,0.62
6,2.55,7.93,1.6,2.82,0.99,2.19,0.97,0.5
7,1.85,86.8,1.36,9.32,0.91,5.73,0.98,0.42
8,2.07,24.21,1.44,4.92,0.95,3.26,0.97,-0.28
9,2.09,18.07,1.45,4.25,0.91,3.06,0.98,0.04


In [31]:
leave_one_out_reg(RF, X, Y)

O MSE da base de treino:  2.204325126208273
O MSE da base de teste:  12.541033003952569
O RMSE da base de treino:  1.4842958119759853
O RMSE da base de teste:  2.314802371541502


In [32]:
th_reg(RF, X, Y, corte = 253)

MSE treino: 1.59
MSE teste: 35.62
RMSE treino: 1.26
RMSE teste: 5.97
MAE treino: 0.82
MAE treino: 3.94
R2 treino: 0.98
R2 treino: 0.62


# MLP Regressor

In [33]:
MLP = MLPRegressor(hidden_layer_sizes=(20, 15, 10),
                   activation='relu', 
                   solver='adam', 
                   alpha=0.01, #default = 0.0001
                   batch_size='auto', 
                   learning_rate='constant', 
                   learning_rate_init=0.1, #default = 0.0001
                   power_t=0.5, 
                   max_iter=400, #default = 100
                   shuffle=True, 
                   random_state=42, 
                   tol=0.0001, 
                   verbose=False, 
                   warm_start=False, 
                   momentum=0.9, 
                   nesterovs_momentum=True, 
                   early_stopping=False, 
                   validation_fraction=0.1, 
                   beta_1=0.9, 
                   beta_2=0.999, 
                   epsilon=1e-08)

MLP.fit(X,Y)

y_pred= MLP.predict(X)

RMSE = mean_squared_error(Y, y_pred)
MAE = mean_absolute_error(Y,y_pred)
score = r2_score(Y,y_pred)

print("RMSE: {:.2f}.".format(RMSE))
print("MAE: {:.2f}.".format(MAE))
print("R2: {:.2f}.".format(score))

RMSE: 30.09.
MAE: 4.27.
R2: 0.64.


In [34]:
MLP = MLPRegressor(hidden_layer_sizes=(20, 15, 10),
                   activation='relu', 
                   solver='adam', 
                   alpha=0.01, #default = 0.0001
                   batch_size='auto', 
                   learning_rate='constant', 
                   learning_rate_init=0.1, #default = 0.0001
                   power_t=0.5, 
                   max_iter=400, #default = 100
                   shuffle=True, 
                   random_state=42, 
                   tol=0.0001, 
                   verbose=False, 
                   warm_start=False, 
                   momentum=0.9, 
                   nesterovs_momentum=True, 
                   early_stopping=False, 
                   validation_fraction=0.1, 
                   beta_1=0.9, 
                   beta_2=0.999, 
                   epsilon=1e-08)

In [35]:
kfold_reg(MLP, X, Y, k=10)

Unnamed: 0,MSE: 1. Treino,MSE: 2. Teste,RMSE: 1. Treino,RMSE: 2. Teste,MAE: 1. Treino,MAE: 2. Teste,R2: 1. Treino,R2: 2. Teste
0,165.45,85.39,12.86,9.24,9.87,6.97,-0.85,-1.45
1,73.7,37.95,8.58,6.16,5.85,4.58,0.19,-0.41
2,164.39,15.07,12.82,3.88,9.98,3.12,-0.81,-1.15
3,50.92,180.41,7.14,13.43,4.56,10.68,0.31,-0.84
4,68.04,112.95,8.25,10.63,5.61,7.25,0.18,-0.6
5,49.03,101.92,7.0,10.1,4.64,7.56,0.35,-0.35
6,37.47,36.73,6.12,6.06,4.6,5.52,0.59,-1.3
7,32.21,148.89,5.68,12.2,4.3,9.93,0.57,0.0
8,33.48,29.22,5.79,5.41,4.06,4.26,0.59,-0.55
9,27.46,27.28,5.24,5.22,3.74,4.11,0.69,-0.45


In [37]:
leave_one_out_reg(MLP, X, Y)

O MSE da base de treino:  51.291169204897216
O MSE da base de teste:  47.81697719863965
O RMSE da base de treino:  7.03220495225713
O RMSE da base de teste:  5.137016937568379


In [36]:
th_reg(MLP, X, Y, corte = 253)

MSE treino: 666.36
MSE teste: 529.53
RMSE treino: 25.81
RMSE teste: 23.01
MAE treino: 24.44
MAE treino: 20.89
R2 treino: -8.62
R2 treino: -4.68
