# Trabalho 2

## 4.1.7 a)

Neste notebook está o código relativo à Regressão Linear Múltipla (sigla em inglês MLR).

### Imports

In [1]:
import warnings

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy.stats as stats
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn import metrics

import utilidades as ut

### Inicializações e variáveis

In [2]:
warnings.filterwarnings("ignore")  # Desabilitar warnings.
plt.style.use(
    "style/estilo.mplstyle")  # Garantir que se utiliza um estilo definido centralmente e comum a todos os gráficos.
%matplotlib inline

label_encoder = LabelEncoder()

ficheiro = "dados_preparados.csv"
ficheiro_resultados = "dados_resultados.csv"
colunas_numericas = ["Idade", "FCV", "NRP", "CA", "FAF", "TUDE", "IMC"]
colunas_classes = ["Genero", "Historico_obesidade_familiar", "FCCAC", "Fumador", "MCC", "CCER", "CBA", "TRANS"]
colunas_classes_binarias = ['Genero', 'Historico_obesidade_familiar', 'FCCAC', 'Fumador', 'MCC']
colunas_classes_multiplos = ["CCER", "CBA", "TRANS", "Label"]

## Leitura dos dados preparados

In [3]:
dados_trabalho = pd.read_csv(ficheiro)

In [4]:
dados_trabalho

Unnamed: 0,Genero,Idade,Historico_obesidade_familiar,FCCAC,FCV,NRP,CCER,Fumador,CA,MCC,FAF,TUDE,CBA,TRANS,Label,IMC
0,Feminino,21.000000,Sim,Nao,2.0,3.0,Ocasionalmente,Nao,2.000000,Nao,0.000000,1.000000,Nao,Transportes_Publicos,Peso_Normal,24.386526
1,Feminino,21.000000,Sim,Nao,3.0,3.0,Ocasionalmente,Sim,3.000000,Sim,3.000000,0.000000,Ocasionalmente,Transportes_Publicos,Peso_Normal,24.238227
2,Masculino,23.000000,Sim,Nao,2.0,3.0,Ocasionalmente,Nao,2.000000,Nao,2.000000,1.000000,Frequentemente,Transportes_Publicos,Peso_Normal,23.765432
3,Masculino,27.000000,Nao,Nao,3.0,3.0,Ocasionalmente,Nao,2.000000,Nao,2.000000,0.000000,Frequentemente,Caminhada,Excesso_Peso_Grau_I,26.851852
4,Masculino,22.000000,Nao,Nao,2.0,1.0,Ocasionalmente,Nao,2.000000,Nao,0.000000,0.000000,Ocasionalmente,Transportes_Publicos,Excesso_Peso_Grau_II,28.342381
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2106,Feminino,20.976842,Sim,Sim,3.0,3.0,Ocasionalmente,Nao,1.728139,Nao,1.676269,0.906247,Ocasionalmente,Transportes_Publicos,Obesidade_Mórbida,44.901475
2107,Feminino,21.982942,Sim,Sim,3.0,3.0,Ocasionalmente,Nao,2.005130,Nao,1.341390,0.599270,Ocasionalmente,Transportes_Publicos,Obesidade_Mórbida,43.741923
2108,Feminino,22.524036,Sim,Sim,3.0,3.0,Ocasionalmente,Nao,2.054193,Nao,1.414209,0.646288,Ocasionalmente,Transportes_Publicos,Obesidade_Mórbida,43.543817
2109,Feminino,24.361936,Sim,Sim,3.0,3.0,Ocasionalmente,Nao,2.852339,Nao,1.139107,0.586035,Ocasionalmente,Transportes_Publicos,Obesidade_Mórbida,44.071535


Temos no entanto de realizar primeiro o encoding das classes para valores numéricos, esta operação é realizada usando o ``sklearn.preprocessing.LabelEncoder`` em todas as colunas com apenas 2 classes.

In [5]:
ut.titulo("Valores codificados por atributo")

for coluna in colunas_classes_binarias:
    if dados_trabalho[coluna].dtype == 'object':
        dados_trabalho[coluna] = label_encoder.fit_transform(dados_trabalho[coluna].values)
        ut.etiqueta_e_valor(coluna, str(sorted(dados_trabalho[coluna].unique())))

[21;30;44mValores codificados por atributo[0m
[0;94mGenero: [1;94m[0, 1][0m
[0;94mHistorico_obesidade_familiar: [1;94m[0, 1][0m
[0;94mFCCAC: [1;94m[0, 1][0m
[0;94mFumador: [1;94m[0, 1][0m
[0;94mMCC: [1;94m[0, 1][0m


A coluna *Label* é uma classificação, não um preditor. Tem de ser removida.

In [6]:
dados_trabalho.drop("Label", axis=1, inplace=True)

Para colunas que têm mais que 2 classes precisamos de utilizar o ``pandas.get_dummies()``.

In [7]:
dados_trabalho = pd.get_dummies(dados_trabalho, dtype=float)

In [8]:
dados_trabalho

Unnamed: 0,Genero,Idade,Historico_obesidade_familiar,FCCAC,FCV,NRP,Fumador,CA,MCC,FAF,...,CCER_Sempre,CBA_Frequentemente,CBA_Nao,CBA_Ocasionalmente,CBA_Sempre,TRANS_Automovel,TRANS_Bicicleta,TRANS_Caminhada,TRANS_Mota,TRANS_Transportes_Publicos
0,0,21.000000,1,0,2.0,3.0,0,2.000000,0,0.000000,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0,21.000000,1,0,3.0,3.0,1,3.000000,1,3.000000,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,1,23.000000,1,0,2.0,3.0,0,2.000000,0,2.000000,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1,27.000000,0,0,3.0,3.0,0,2.000000,0,2.000000,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,1,22.000000,0,0,2.0,1.0,0,2.000000,0,0.000000,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2106,0,20.976842,1,1,3.0,3.0,0,1.728139,0,1.676269,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2107,0,21.982942,1,1,3.0,3.0,0,2.005130,0,1.341390,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2108,0,22.524036,1,1,3.0,3.0,0,2.054193,0,1.414209,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2109,0,24.361936,1,1,3.0,3.0,0,2.852339,0,1.139107,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


| Target | Preditores                   |
|--------|------------------------------|
| IMC    | Todas os restantes atributos |

O target não muda

In [9]:
y = dados_trabalho.IMC

## Preditores

Colunas originais, antes do get_dummies:
lista_preditores = ["Idade", "Genero", "Historico_obesidade_familiar", "FCCAC", "FCV", "NRP", "CCER", "Fumador", "CA", "MCC", "FAF", "TUDE", "CBA", "TRANS"]

Obter a lista de todos os atributos (preditores) do dataset, é preciso remover a coluna "IMC"

In [10]:
lista_preditores = list(dados_trabalho.columns.values)
lista_preditores.remove("IMC")

X = dados_trabalho.drop("IMC", axis=1).to_numpy()

### Holdout

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=100)

## Cálculo

### Inicializar o algoritmo


In [12]:
mlr = LinearRegression()

### Fit the data

In [13]:
mlr.fit(X_train, y_train)

### Previsão 

In [14]:
y_prev_mlr = mlr.predict(X_test)

ut.titulo("Previsºao para o conjunto de teste:")
print(y_prev_mlr)

[21;30;44mPrevisºao para o conjunto de teste:[0m
[29.81814583 35.39453916 37.32918131 29.3237348  34.11337315 27.23983665
 31.59500878 13.67479004 12.89394736 31.38360027 29.96175851 21.15486789
 28.54651286 36.51155233 19.57362677 28.44680112 24.99079801 35.91189065
 34.19081968 29.02684796 31.60837093 33.06741959 21.22736111 21.22552371
 30.11906184 32.7341213  31.8008557  28.43939478 19.24429605 37.17940798
 32.3953805  33.3138143  31.0991164  37.64085085 25.02007274 34.55605403
 30.75508123 31.99247094 29.21360763 31.79854983 34.80355367 34.30964482
 35.16472828 12.81481204 22.62058159 18.35408761 29.80461463 31.32755285
 29.33874427 33.98878849 31.35876725 22.34601484 29.97048406 24.17944324
 31.20588285 30.39832446 30.55823956 37.99414179 21.44803965 25.58886875
 32.81905183 37.77864818 29.80826894 17.58018282 20.75143086 30.49706984
 31.32029337 32.14045939 33.06923473 38.08055986 30.31740656 31.99961311
 33.99680937 34.24480689 22.15554988 32.01985545 34.9387497  34.97003991


# Valores reais e valores previstos

In [15]:
mlr_diff = pd.DataFrame({"Valor Real": y_test, "Valor previsto": y_prev_mlr})
mlr_diff.head()

Unnamed: 0,Valor Real,Valor previsto
785,25.269124,29.818146
1838,47.718705,35.394539
2083,40.870732,37.329181
1105,29.146663,29.323735
361,32.87311,34.113373


### Avaliação do Modelo

In [23]:
mae = metrics.mean_absolute_error(y_test, y_prev_mlr)
mse = metrics.mean_squared_error(y_test, y_prev_mlr)

rmse = np.sqrt(metrics.mean_squared_error(y_test, y_prev_mlr))

ut.etiqueta_e_valor("R squared:", f"{(mlr.score(X, y) * 100):.5}")
ut.etiqueta_e_valor("Mean Absolute Error:", f"{mae:.5}")
ut.etiqueta_e_valor("Mean Square Error:", f"{mse:.5}")
ut.etiqueta_e_valor("Root Mean Square Error:", f"{rmse:.5}")

[0;94mR squared:: [1;94m49.794[0m
[0;94mMean Absolute Error:: [1;94m4.4425[0m
[0;94mMean Square Error:: [1;94m30.927[0m
[0;94mRoot Mean Square Error:: [1;94m5.5612[0m


In [None]:
mlr.

TODO: Abrir o dataframe do RMSE e MAE e adicionar resultados.