# Trabalho 2

## 4.1.7 c)

Neste notebook está o código relativo à MLPRegressor.

### Imports

In [1]:
import warnings

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error

import utilidades as ut

### Inicializações e variáveis

In [2]:
warnings.filterwarnings("ignore")  # Desabilitar warnings.
plt.style.use(
    "style/estilo.mplstyle")  # Garantir que se utiliza um estilo definido centralmente e comum a todos os gráficos.
%matplotlib inline

label_encoder = LabelEncoder()

ficheiro = "dados_preparados.csv"
colunas_numericas = ["Idade", "FCV", "NRP", "CA", "FAF", "TUDE", "IMC"]
colunas_classes = ["Genero", "Historico_obesidade_familiar", "FCCAC", "Fumador", "MCC", "CCER", "CBA", "TRANS"]
colunas_classes_binarias = ['Genero', 'Historico_obesidade_familiar', 'FCCAC', 'Fumador', 'MCC']
colunas_classes_multiplos = ["CCER", "CBA", "TRANS", "Label"]

## Leitura dos dados preparados

In [3]:
# Load the prepared data set from the CSV file named in the configuration cell.
dados_trabalho = pd.read_csv(ficheiro)

Remover o atributo *Label*, que é uma classificação de obesidade.

In [4]:
# Remove the obesity class column so it cannot be used as a predictor of IMC.
# Rebinding (instead of inplace=True) together with errors="ignore" keeps this
# cell idempotent: running it a second time no longer raises KeyError.
dados_trabalho = dados_trabalho.drop(columns=["Label"], errors="ignore")

In [5]:
# Show the working data set after the Label column has been removed.
dados_trabalho

Unnamed: 0,Genero,Idade,Historico_obesidade_familiar,FCCAC,FCV,NRP,CCER,Fumador,CA,MCC,FAF,TUDE,CBA,TRANS,IMC
0,Feminino,21.000000,Sim,Nao,2.0,3.0,Ocasionalmente,Nao,2.000000,Nao,0.000000,1.000000,Nao,Transportes_Publicos,24.386526
1,Feminino,21.000000,Sim,Nao,3.0,3.0,Ocasionalmente,Sim,3.000000,Sim,3.000000,0.000000,Ocasionalmente,Transportes_Publicos,24.238227
2,Masculino,23.000000,Sim,Nao,2.0,3.0,Ocasionalmente,Nao,2.000000,Nao,2.000000,1.000000,Frequentemente,Transportes_Publicos,23.765432
3,Masculino,27.000000,Nao,Nao,3.0,3.0,Ocasionalmente,Nao,2.000000,Nao,2.000000,0.000000,Frequentemente,Caminhada,26.851852
4,Masculino,22.000000,Nao,Nao,2.0,1.0,Ocasionalmente,Nao,2.000000,Nao,0.000000,0.000000,Ocasionalmente,Transportes_Publicos,28.342381
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2106,Feminino,20.976842,Sim,Sim,3.0,3.0,Ocasionalmente,Nao,1.728139,Nao,1.676269,0.906247,Ocasionalmente,Transportes_Publicos,44.901475
2107,Feminino,21.982942,Sim,Sim,3.0,3.0,Ocasionalmente,Nao,2.005130,Nao,1.341390,0.599270,Ocasionalmente,Transportes_Publicos,43.741923
2108,Feminino,22.524036,Sim,Sim,3.0,3.0,Ocasionalmente,Nao,2.054193,Nao,1.414209,0.646288,Ocasionalmente,Transportes_Publicos,43.543817
2109,Feminino,24.361936,Sim,Sim,3.0,3.0,Ocasionalmente,Nao,2.852339,Nao,1.139107,0.586035,Ocasionalmente,Transportes_Publicos,44.071535


No entanto, é necessário começar por codificar as classes em valores numéricos. Esta operação é realizada com o ``sklearn.preprocessing.LabelEncoder`` em todas as colunas categóricas, tanto as binárias como as que têm mais de 2 classes (``CCER``, ``CBA`` e ``TRANS``).

In [6]:
ut.titulo("Valores codificados por atributo")

# Only columns still stored as text are encoded; columns that are already
# numeric are skipped, so re-running the cell leaves them untouched.
# NOTE(review): LabelEncoder assigns codes following the sorted order of the
# class names, so the integer order of the multi-class columns (CCER, CBA,
# TRANS) carries no domain meaning - confirm this is acceptable for the model.
colunas_por_codificar = [
    nome for nome in colunas_classes
    if dados_trabalho[nome].dtype == 'object'
]

for nome in colunas_por_codificar:
    codigos = label_encoder.fit_transform(dados_trabalho[nome].values)
    dados_trabalho[nome] = codigos
    valores_distintos = sorted(dados_trabalho[nome].unique())
    ut.etiqueta_e_valor(nome, str(valores_distintos))

[21;30;44mValores codificados por atributo[0m
[0;94mGenero: [1;94m[0, 1][0m
[0;94mHistorico_obesidade_familiar: [1;94m[0, 1][0m
[0;94mFCCAC: [1;94m[0, 1][0m
[0;94mFumador: [1;94m[0, 1][0m
[0;94mMCC: [1;94m[0, 1][0m
[0;94mCCER: [1;94m[0, 1, 2, 3][0m
[0;94mCBA: [1;94m[0, 1, 2, 3][0m
[0;94mTRANS: [1;94m[0, 1, 2, 3, 4][0m


## Normalização



In [7]:
# Rescale every column (the predictors and the IMC target alike) to [0, 1].
# NOTE(review): the scaler is fitted on the whole data set, before the holdout
# split, so the test rows influence the scaling - consider fitting on the
# training rows only.
escalador = MinMaxScaler()
dados_normalizados = escalador.fit(dados_trabalho).transform(dados_trabalho)

# Wrap the scaled array back into a DataFrame carrying the original column names.
dataframe_normalizado = pd.DataFrame(data=dados_normalizados, columns=dados_trabalho.columns)
dataframe_normalizado

Unnamed: 0,Genero,Idade,Historico_obesidade_familiar,FCCAC,FCV,NRP,CCER,Fumador,CA,MCC,FAF,TUDE,CBA,TRANS,IMC
0,0.0,0.148936,1.0,0.0,0.5,0.666667,0.666667,0.0,0.500000,0.0,0.000000,0.500000,0.333333,1.0,0.301162
1,0.0,0.148936,1.0,0.0,1.0,0.666667,0.666667,1.0,1.000000,1.0,1.000000,0.000000,0.666667,1.0,0.297240
2,1.0,0.191489,1.0,0.0,0.5,0.666667,0.666667,0.0,0.500000,0.0,0.666667,0.500000,0.000000,1.0,0.284736
3,1.0,0.276596,0.0,0.0,1.0,0.666667,0.666667,0.0,0.500000,0.0,0.666667,0.000000,0.000000,0.5,0.366359
4,1.0,0.170213,0.0,0.0,0.5,0.000000,0.666667,0.0,0.500000,0.0,0.000000,0.000000,0.666667,1.0,0.405778
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2106,0.0,0.148443,1.0,1.0,1.0,0.666667,0.666667,0.0,0.364070,0.0,0.558756,0.453124,0.666667,1.0,0.843697
2107,0.0,0.169850,1.0,1.0,1.0,0.666667,0.666667,0.0,0.502565,0.0,0.447130,0.299635,0.666667,1.0,0.813032
2108,0.0,0.181362,1.0,1.0,1.0,0.666667,0.666667,0.0,0.527097,0.0,0.471403,0.323144,0.666667,1.0,0.807793
2109,0.0,0.220467,1.0,1.0,1.0,0.666667,0.666667,0.0,0.926170,0.0,0.379702,0.293017,0.666667,1.0,0.821749


In [8]:
# Show the normalised data set that target and predictors are taken from.
dataframe_normalizado

Unnamed: 0,Genero,Idade,Historico_obesidade_familiar,FCCAC,FCV,NRP,CCER,Fumador,CA,MCC,FAF,TUDE,CBA,TRANS,IMC
0,0.0,0.148936,1.0,0.0,0.5,0.666667,0.666667,0.0,0.500000,0.0,0.000000,0.500000,0.333333,1.0,0.301162
1,0.0,0.148936,1.0,0.0,1.0,0.666667,0.666667,1.0,1.000000,1.0,1.000000,0.000000,0.666667,1.0,0.297240
2,1.0,0.191489,1.0,0.0,0.5,0.666667,0.666667,0.0,0.500000,0.0,0.666667,0.500000,0.000000,1.0,0.284736
3,1.0,0.276596,0.0,0.0,1.0,0.666667,0.666667,0.0,0.500000,0.0,0.666667,0.000000,0.000000,0.5,0.366359
4,1.0,0.170213,0.0,0.0,0.5,0.000000,0.666667,0.0,0.500000,0.0,0.000000,0.000000,0.666667,1.0,0.405778
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2106,0.0,0.148443,1.0,1.0,1.0,0.666667,0.666667,0.0,0.364070,0.0,0.558756,0.453124,0.666667,1.0,0.843697
2107,0.0,0.169850,1.0,1.0,1.0,0.666667,0.666667,0.0,0.502565,0.0,0.447130,0.299635,0.666667,1.0,0.813032
2108,0.0,0.181362,1.0,1.0,1.0,0.666667,0.666667,0.0,0.527097,0.0,0.471403,0.323144,0.666667,1.0,0.807793
2109,0.0,0.220467,1.0,1.0,1.0,0.666667,0.666667,0.0,0.926170,0.0,0.379702,0.293017,0.666667,1.0,0.821749


| Target | Preditores                   |
|--------|------------------------------|
| IMC    | Todos os restantes atributos |

Ver TP5

O target não muda

In [9]:
# Target: the (normalised) IMC column.
y = dataframe_normalizado["IMC"]

## Preditores

In [10]:
# Predictors: every column of the normalised data set except the IMC target.
lista_preditores = [coluna for coluna in dataframe_normalizado.columns if coluna != "IMC"]
X = dataframe_normalizado.loc[:, lista_preditores]

### Holdout

In [11]:
# Holdout: 80% of the rows for training and 20% for testing; the fixed seed
# makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=100,
)

## Cálculo

In [12]:
# Number of neurons in the single hidden layer of the network.
Nhidden = 2

# Multi-layer perceptron regressor with one hidden layer of Nhidden tanh units.
# - hidden_layer_sizes is given as an explicit tuple (one entry per hidden layer).
# - random_state fixes the weight initialisation so that a fresh
#   "Restart & Run All" reproduces the same scores and weights.
# - learning_rate_init is not passed: it is only used by the 'sgd' and 'adam'
#   solvers, so it had no effect with 'lbfgs' (0.001 is also the default).
rede_neural = MLPRegressor(
    hidden_layer_sizes=(Nhidden,),
    activation='tanh',
    solver='lbfgs',
    max_iter=1000,
    random_state=100,
)

### Treino do modelo

In [13]:
# Train the network on the training part of the holdout split.
rede_neural.fit(X_train, y_train)

### Teste
Realizar a previsão sobre o conjunto de teste.

In [14]:
# Predict IMC for the test predictors; the values are on the same normalised scale as y.
pred = rede_neural.predict(X_test)

### Calcular o coeficiente de determinação (R²) e o RMSE

In [15]:
# Root mean squared error on the test set. The target was min-max normalised,
# so this error is expressed in normalised units, not in IMC units.
mse_teste = mean_squared_error(y_test, pred)
rmse_teste = np.sqrt(mse_teste)

# Coefficient of determination (R^2) of the predictions on the test set.
rsquared_teste = rede_neural.score(X_test, y_test)

### R_squared e Valor de RMSE

In [16]:
# Report the test-set metrics. The helper already appends ": " after the label,
# so the labels are passed without a trailing colon (avoids printing "::").
ut.etiqueta_e_valor("Valor do R_squared", rsquared_teste)
ut.etiqueta_e_valor("Valor do RMSE", rmse_teste)

[0;94mValor do R_squared:: [1;94m0.7359090142954792[0m
[0;94mValor do RMSE:: [1;94m0.10863563767225858[0m


#### Pesos entre a camada de entrada e a camada oculta

In [17]:
# Weight matrix feeding the hidden layer: one row per input feature, one column per hidden neuron.
print(rede_neural.coefs_[0])

[[ 1.85590267 -1.54345197]
 [-5.33430673  4.69921713]
 [-5.94291025  5.23168877]
 [ 0.21975032 -0.04221459]
 [ 1.7021812  -1.09741435]
 [ 4.39409461 -3.79343494]
 [-0.63420092  1.0477425 ]
 [-0.72491854  0.5676688 ]
 [-0.2441372   0.2633297 ]
 [ 4.45018901 -4.08368384]
 [-0.69571184  0.56944613]
 [ 0.17189906 -0.1837866 ]
 [ 0.06069139  0.29368376]
 [ 0.12089184 -0.04032936]]


#### Pesos entre a camada oculta e a camada de saída

In [18]:
# Weight matrix from the hidden neurons to the single output neuron.
print(rede_neural.coefs_[1])

[[0.85263492]
 [0.95221578]]
