# Trabalho 2

## 4.1.7 c)

Neste notebook está o código relativo à MLPRegressor.

### Imports

In [1]:
import warnings

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import utilidades as ut
from sklearn.tree import export_text
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error

### Inicializações e variáveis

In [2]:
warnings.filterwarnings("ignore")  # Desabilitar warnings.
plt.style.use(
    "style/estilo.mplstyle")  # Garantir que se utiliza um estilo definido centralmente e comum a todos os gráficos.
%matplotlib inline

label_encoder = LabelEncoder()

ficheiro = "dados_preparados.csv"
colunas_numericas = ["Idade", "FCV", "NRP", "CA", "FAF", "TUDE", "IMC"]
colunas_classes = ["Genero", "Historico_obesidade_familiar", "FCCAC", "Fumador", "MCC", "CCER", "CBA", "TRANS"]
colunas_classes_binarias = ['Genero', 'Historico_obesidade_familiar', 'FCCAC', 'Fumador', 'MCC']
colunas_classes_multiplos = ["CCER", "CBA", "TRANS", "Label"]

## Leitura dos dados preparados

In [3]:
dados_trabalho = pd.read_csv(ficheiro)

In [4]:
dados_trabalho

Unnamed: 0,Genero,Idade,Historico_obesidade_familiar,FCCAC,FCV,NRP,CCER,Fumador,CA,MCC,FAF,TUDE,CBA,TRANS,Label,IMC
0,Feminino,21.000000,Sim,Nao,2.0,3.0,Ocasionalmente,Nao,2.000000,Nao,0.000000,1.000000,Nao,Transportes_Publicos,Peso_Normal,24.386526
1,Feminino,21.000000,Sim,Nao,3.0,3.0,Ocasionalmente,Sim,3.000000,Sim,3.000000,0.000000,Ocasionalmente,Transportes_Publicos,Peso_Normal,24.238227
2,Masculino,23.000000,Sim,Nao,2.0,3.0,Ocasionalmente,Nao,2.000000,Nao,2.000000,1.000000,Frequentemente,Transportes_Publicos,Peso_Normal,23.765432
3,Masculino,27.000000,Nao,Nao,3.0,3.0,Ocasionalmente,Nao,2.000000,Nao,2.000000,0.000000,Frequentemente,Caminhada,Excesso_Peso_Grau_I,26.851852
4,Masculino,22.000000,Nao,Nao,2.0,1.0,Ocasionalmente,Nao,2.000000,Nao,0.000000,0.000000,Ocasionalmente,Transportes_Publicos,Excesso_Peso_Grau_II,28.342381
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2106,Feminino,20.976842,Sim,Sim,3.0,3.0,Ocasionalmente,Nao,1.728139,Nao,1.676269,0.906247,Ocasionalmente,Transportes_Publicos,Obesidade_Mórbida,44.901475
2107,Feminino,21.982942,Sim,Sim,3.0,3.0,Ocasionalmente,Nao,2.005130,Nao,1.341390,0.599270,Ocasionalmente,Transportes_Publicos,Obesidade_Mórbida,43.741923
2108,Feminino,22.524036,Sim,Sim,3.0,3.0,Ocasionalmente,Nao,2.054193,Nao,1.414209,0.646288,Ocasionalmente,Transportes_Publicos,Obesidade_Mórbida,43.543817
2109,Feminino,24.361936,Sim,Sim,3.0,3.0,Ocasionalmente,Nao,2.852339,Nao,1.139107,0.586035,Ocasionalmente,Transportes_Publicos,Obesidade_Mórbida,44.071535


Temos no entanto de realizar primeiro o encoding das classes para valores numéricos, esta operação é realizada usando o ``sklearn.preprocessing.LabelEncoder`` em todas as colunas com apenas 2 classes.

In [5]:
ut.titulo("Valores codificados por atributo")

for coluna in colunas_classes_binarias:
    if dados_trabalho[coluna].dtype == 'object':
        dados_trabalho[coluna] = label_encoder.fit_transform(dados_trabalho[coluna].values)
        ut.etiqueta_e_valor(coluna, str(sorted(dados_trabalho[coluna].unique())))

[21;30;44mValores codificados por atributo[0m
[0;94mGenero: [1;94m[0, 1][0m
[0;94mHistorico_obesidade_familiar: [1;94m[0, 1][0m
[0;94mFCCAC: [1;94m[0, 1][0m
[0;94mFumador: [1;94m[0, 1][0m
[0;94mMCC: [1;94m[0, 1][0m


Para colunas que têm mais que 2 classes precisamos de utilizar o ``pandas.get_dummies()``.

In [6]:
dados_trabalho = pd.get_dummies(dados_trabalho, dtype=float)

In [7]:
dados_trabalho

Unnamed: 0,Genero,Idade,Historico_obesidade_familiar,FCCAC,FCV,NRP,Fumador,CA,MCC,FAF,...,TRANS_Transportes_Publicos,Label_Excesso_Peso_Grau_I,Label_Excesso_Peso_Grau_II,Label_Magreza_Grau_I,Label_Magreza_Grau_II,Label_Magreza_Grau_III,Label_Obesidade_Moderada,Label_Obesidade_Mórbida,Label_Obesidade_Severa,Label_Peso_Normal
0,0,21.000000,1,0,2.0,3.0,0,2.000000,0,0.000000,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0,21.000000,1,0,3.0,3.0,1,3.000000,1,3.000000,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,1,23.000000,1,0,2.0,3.0,0,2.000000,0,2.000000,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1,27.000000,0,0,3.0,3.0,0,2.000000,0,2.000000,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,22.000000,0,0,2.0,1.0,0,2.000000,0,0.000000,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2106,0,20.976842,1,1,3.0,3.0,0,1.728139,0,1.676269,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2107,0,21.982942,1,1,3.0,3.0,0,2.005130,0,1.341390,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2108,0,22.524036,1,1,3.0,3.0,0,2.054193,0,1.414209,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2109,0,24.361936,1,1,3.0,3.0,0,2.852339,0,1.139107,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


| Target | Preditores |
|--------|------------|
| IMC    | ??         |

Ver TP5

O target não muda

In [8]:
y = dados_trabalho.IMC

## Preditores

In [9]:
# O preditor vai mudando, podemos ter uma lista com o nome das colunas
# lista_preditores = ['Genero', 'Historico_obesidade_familiar', 'FCCAC', 'FCV', 'NRP', 'CCER', 'Fumador', 'CA', 'MCC', 'FAF', 'TUDE', 'CBA', 'TRANS']

lista_preditores = dados_trabalho.columns
X = dados_trabalho[lista_preditores].to_numpy()

### Holdout

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=100)

## Cálculo

TODO: Rever código é cópia do TP7

In [11]:
Nhidden = 1  # (?)

nn = MLPRegressor(
    hidden_layer_sizes=Nhidden,
    activation='tanh',
    solver='lbfgs', 
    max_iter=1000, 
    learning_rate_init=0.001
)

In [12]:
nn.fit(X_train, y_train)

### Make prediction

In [13]:
pred = nn.predict(X_test)

### Calculate accuracy and error metrics

In [14]:
test_set_rsquared = nn.score(X_test, y_test)
test_set_rmse = np.sqrt(mean_squared_error(y_test, pred))

### Print R_squared and RMSE value

In [15]:
print('R_squared value: ', test_set_rsquared)
print('RMSE: ', test_set_rmse)

R_squared value:  -0.005326609696777007
RMSE:  8.014769426745653


In [16]:
print("weights between input and first hidden layer:")
print(nn.coefs_[0])
print("\nweights between first hidden and second hidden layer:")
print(nn.coefs_[1])

weights between input and first hidden layer:
[[ 0.38849679]
 [-0.11999319]
 [ 0.40693381]
 [ 0.24887437]
 [ 0.26202937]
 [-0.29398393]
 [ 0.25779151]
 [-0.13648353]
 [-0.33673694]
 [-0.18948341]
 [ 0.30624025]
 [-0.38821341]
 [-0.1318426 ]
 [ 0.0098798 ]
 [-0.33416772]
 [-0.34747402]
 [-0.02312627]
 [-0.06601769]
 [ 0.14563922]
 [ 0.35456318]
 [ 0.18568446]
 [ 0.07220956]
 [-0.30745745]
 [ 0.13084272]
 [-0.28979873]
 [ 0.36048689]
 [-0.3802843 ]
 [-0.18381075]
 [-0.23446144]
 [ 0.17684407]
 [ 0.22079312]
 [ 0.36784094]
 [-0.06368022]
 [-0.10527582]]

weights between first hidden and second hidden layer:
[[-15.18553245]]


TODO: Abrir o dataframe do RMSE e MAE e adicionar resultados.