## Comparando modelos utilizando as métricas de avaliação de modelos

In [2]:
import pandas as pd
import numpy as np
import sklearn
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.linear_model import Lasso

Importando o Dataset

In [3]:
from sklearn import datasets
data  = datasets.load_diabetes()
X = data.data
y = data.target
feature_names = data.feature_names

Contextualização:

O conjunto de dados de diabetes usado neste exemplo é proveniente do sklearn.datasets. As variáveis presentes no conjunto de dados são:

*   age: Idade do paciente.
*   sex: Gênero do paciente (0: feminino, 1: masculino).
*   bmi: Índice de massa corporal (IMC).
*   bp: Pressão arterial média.
*   s1: Total de soro de lipoproteína de alta densidade (HDL).
*   s2: Total de soro de lipoproteína de baixa densidade (LDL).
*   s3: Total de soro de triglicerídeos.
*   s4: Total de soro de total de lipoproteína.
*   s5: Total de soro de ácido sérico.
*   s6: Total de soro de lamotrigina.
Cada uma dessas variáveis representa diferentes características médicas e fisiológicas dos pacientes.

O objetivo é prever a medida quantitativa da progressão da doença diabética após um ano com base nessas características. O valor alvo (target) é uma medida quantitativa da progressão da doença.

Verificando os dados carregados

In [4]:
data.feature_names #nome das variaveis

['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']

In [5]:
data.data.shape  #tamanho da base de dados

(442, 10)

In [6]:
data.target.shape

(442,)

Criando um data frame para iniciar as analises

In [7]:
db_df = pd.DataFrame(data.data,columns=data.feature_names)

In [8]:
db_df

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.050680,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204
2,0.085299,0.050680,0.044451,-0.005670,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.025930
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641
...,...,...,...,...,...,...,...,...,...,...
437,0.041708,0.050680,0.019662,0.059744,-0.005697,-0.002566,-0.028674,-0.002592,0.031193,0.007207
438,-0.005515,0.050680,-0.015906,-0.067642,0.049341,0.079165,-0.028674,0.034309,-0.018114,0.044485
439,0.041708,0.050680,-0.015906,0.017293,-0.037344,-0.013840,-0.024993,-0.011080,-0.046883,0.015491
440,-0.045472,-0.044642,0.039062,0.001215,0.016318,0.015283,-0.028674,0.026560,0.044529,-0.025930


In [9]:
db_df['progressao'] = data.target

In [10]:
X = pd.DataFrame(db_df, columns=data.feature_names)
y = pd.DataFrame(data.target, columns=['progressao'])

In [11]:
y

Unnamed: 0,progressao
0,151.0
1,75.0
2,141.0
3,206.0
4,135.0
...,...
437,178.0
438,104.0
439,132.0
440,220.0


In [12]:
db_df

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,progressao
0,0.038076,0.050680,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646,151.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204,75.0
2,0.085299,0.050680,0.044451,-0.005670,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.025930,141.0
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362,206.0
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641,135.0
...,...,...,...,...,...,...,...,...,...,...,...
437,0.041708,0.050680,0.019662,0.059744,-0.005697,-0.002566,-0.028674,-0.002592,0.031193,0.007207,178.0
438,-0.005515,0.050680,-0.015906,-0.067642,0.049341,0.079165,-0.028674,0.034309,-0.018114,0.044485,104.0
439,0.041708,0.050680,-0.015906,0.017293,-0.037344,-0.013840,-0.024993,-0.011080,-0.046883,0.015491,132.0
440,-0.045472,-0.044642,0.039062,0.001215,0.016318,0.015283,-0.028674,0.026560,0.044529,-0.025930,220.0


In [13]:
db_df.isna().sum() #verificando valores ausentes

age           0
sex           0
bmi           0
bp            0
s1            0
s2            0
s3            0
s4            0
s5            0
s6            0
progressao    0
dtype: int64

Maiores correlações com a progressão :
   - bmi : 0.59
   - s5 : 0.57
   - bp : 0.44
   - s4 : 0.43
   - s3: 0.39    

Divisão da base - Treino e Teste

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

No python é necessário criar a função para realizar o stepwise

In [15]:
def stepwise_selection(X, y, 
                       initial_list=[], 
                       threshold_in=0.01, 
                       threshold_out = 0.05, 
                       verbose=True):
    included = list(initial_list)
    while True:
        changed=False
        # forward step
        excluded = list(set(X.columns)-set(included))
        new_pval = pd.Series(index=excluded)
        for new_column in excluded:
            model = LinearRegression().fit(X[included+[new_column]], y)
            new_pval[new_column] = model.score(X[included+[new_column]], y)
        best_pval = new_pval.max()
        if best_pval > threshold_in:
            best_feature = new_pval.idxmax()
            included.append(best_feature)
            changed=True
            if verbose:
                print('Adicionado variável: {}, Score R²: {:.4f}'.format(best_feature, best_pval))
        # backward step
        model = LinearRegression().fit(X[included], y)
        p_values = model.score(X[included], y)
        worst_pval = p_values.min()
        if worst_pval < threshold_out:
            changed=True
            worst_feature = included[np.argmin(p_values)]
            included.remove(worst_feature)
            if verbose:
                print('Removido variável: {}, Score R²: {:.4f}'.format(worst_feature, worst_pval))
        if not changed:
            break
    return included

In [16]:
# Seleção de variáveis usando Stepwise
selected_features_stepwise = stepwise_selection(X_train, y_train['progressao'])

print("Variáveis selecionadas pelo Stepwise:")
print(selected_features_stepwise)

Adicionado variável: bmi, Score R²: 0.3657
Adicionado variável: s5, Score R²: 0.4583
Adicionado variável: bp, Score R²: 0.4831
Adicionado variável: s1, Score R²: 0.4991
Adicionado variável: s2, Score R²: 0.5088
Adicionado variável: sex, Score R²: 0.5235
Adicionado variável: s4, Score R²: 0.5260
Adicionado variável: s6, Score R²: 0.5268
Adicionado variável: s3, Score R²: 0.5275
Adicionado variável: age, Score R²: 0.5279
Variáveis selecionadas pelo Stepwise:
['bmi', 's5', 'bp', 's1', 's2', 'sex', 's4', 's6', 's3', 'age']


In [18]:
# Treinamento do modelo final usando as variáveis selecionadas
X_train_stepwise = X_train[selected_features_stepwise]
X_test_stepwise = X_test[selected_features_stepwise]

In [19]:
results = []

In [20]:
model_stepwise = LinearRegression()
model_step = model_stepwise.fit(X_train_stepwise, y_train)# Calcular métricas de avaliação
mae = np.mean(np.abs(y_test - model_step.predict(X_test_stepwise)))
mse = np.mean((y_test - model_step.predict(X_test_stepwise)) ** 2)


# Calcular o número de observações e o número de variáveis
n_obs = len(y_test)
n_vars = len(selected_features_stepwise)

# Calcular o AIC e o BIC manualmente
aic = n_obs * np.log(mse) + 2 * n_vars
bic = n_obs * np.log(mse) + n_vars * np.log(n_obs)


r_squared = model_stepwise.score(X_test_stepwise, y_test)

# Adicionar as métricas ao resultado
results.append({
    'Modelo': 'Modelo 1',
    'MAE': mae,
    'MSE': mse,
    'AIC': aic,
    'BIC': bic,
    'R²': r_squared
})

# Imprimir as métricas
print("Métricas do Modelo 1:")
print("MAE:", mae)
print("MSE:", mse)
print("AIC:", aic)
print("BIC:", bic)
print("R²:", r_squared)

Métricas do Modelo 1:
MAE: 42.79409467959994
MSE: 2900.193628493482
AIC: 729.5554176150596
BIC: 754.441781312381
R²: 0.45260276297191937


Observe que os métodos utilizados para gerar o modelo na comparação das métricas x o gerado agora utilizam metodologias de treinamento diferente. 

In [22]:
lm6 = sm.OLS.from_formula("progressao ~ bmi+s5+bp+s1+s2+sex+s4+s6+s3+age", data=db_df)
model_6 = lm6.fit()
print("Análise do Modelo:")
print(model_6.summary())
print("=" * 80)

Análise do Modelo:
                            OLS Regression Results                            
Dep. Variable:             progressao   R-squared:                       0.518
Model:                            OLS   Adj. R-squared:                  0.507
Method:                 Least Squares   F-statistic:                     46.27
Date:                Tue, 20 Feb 2024   Prob (F-statistic):           3.83e-62
Time:                        13:46:32   Log-Likelihood:                -2386.0
No. Observations:                 442   AIC:                             4794.
Df Residuals:                     431   BIC:                             4839.
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept    152.1335      2.576 

In [None]:
Seleção de variáveis via Regularização - Lasso (L1)

Utilizando Lasso (L1) para seleção de variáveis, e definindo o melhor lambda a partir da cross-validation

In [23]:
lasso_model = Lasso()
alphas = np.logspace(-4, 4, 100)  # Define uma gama de valores de alpha
param_grid = {'alpha': alphas}
lasso_cv = GridSearchCV(lasso_model, param_grid, cv=5)
lasso_cv.fit(X_train, y_train)

In [24]:
best_alpha = lasso_cv.best_params_['alpha']
print("Melhor valor de alpha encontrado:", best_alpha)

Melhor valor de alpha encontrado: 0.08111308307896872


Usando o melhor valor de alpha para treinar o modelo final

In [25]:
lasso_model_final = Lasso(alpha=best_alpha)
lasso_model_final.fit(X_train, y_train)

In [None]:
Avaliação do modelo final

In [26]:
lasso_predictions = lasso_model_final.predict(X_test)

# Obtendo os coeficientes do modelo Lasso
lasso_coef = lasso_model_final.coef_

# Identificando as variáveis selecionadas
selected_features_lasso = X.columns[lasso_coef != 0]

# Imprimindo as variáveis selecionadas
print("Variáveis selecionadas pelo Lasso (L1):")
print(selected_features_lasso)

lasso_mae = mean_absolute_error(y_test, lasso_predictions)
lasso_mse = mean_squared_error(y_test, lasso_predictions)


# Calcular o número de observações e o número de variáveis
n_obs = len(y_test)
n_vars = len(selected_features_lasso)

# Calcular o AIC e o BIC manualmente
aic = n_obs * np.log(lasso_mse) + 2 * n_vars
bic = n_obs * np.log(lasso_mse) + n_vars * np.log(n_obs)

# Calcular R² corretamente (antes do cálculo, é necessário redimensionar lasso_predictions)
r_squared = lasso_model_final.score(X_test, y_test)

# Adicionar as métricas ao resultado
results.append({
    'Modelo': 'lasso_model_final',
    'MAE': lasso_mae,
    'MSE': lasso_mse,
    'AIC': aic,
    'BIC': bic,
    'R²': r_squared
})

# Imprimir as métricas
print("Métricas do Lasso:")
print("MAE:", lasso_mae)
print("MSE:", lasso_mse)
print("AIC:", aic)
print("BIC:", bic)
print("R²:", r_squared)

Variáveis selecionadas pelo Lasso (L1):
Index(['sex', 'bmi', 'bp', 's1', 's3', 's5', 's6'], dtype='object')
Métricas do Lasso:
MAE: 42.80030964676334
MSE: 2799.827946110184
AIC: 720.4208789347183
BIC: 737.8413335228432
R²: 0.4715462902899882
Mean Squared Error (Lasso): 2799.827946110184


In [28]:
# Criar um DataFrame com os resultados
results_df = pd.DataFrame(results)

# Imprimir a tabela de resultados
print(results_df)

              Modelo        MAE          MSE         AIC         BIC        R²
0           Modelo 1  42.794095  2900.193628  729.555418  754.441781  0.452603
1  lasso_model_final  42.800310  2799.827946  720.420879  737.841334  0.471546


In [31]:
# Imprimindo as variáveis selecionadas
print("Variáveis selecionadas pelo Stepwise:")
print(selected_features_stepwise)
print(len(selected_features_stepwise))
print("Variáveis selecionadas pelo Lasso (L1):")
print(selected_features_lasso)
print(len(selected_features_lasso))

Variáveis selecionadas pelo Stepwise:
['bmi', 's5', 'bp', 's1', 's2', 'sex', 's4', 's6', 's3', 'age']
10
Variáveis selecionadas pelo Lasso (L1):
Index(['sex', 'bmi', 'bp', 's1', 's3', 's5', 's6'], dtype='object')
7
