# Trabalho 2

## 4.1.7 a)

Neste notebook está o código relativo à regresão linear múltipla.

### Imports

In [1]:
import warnings

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

import utilidades as ut

### Inicializações e variáveis

In [2]:
warnings.filterwarnings("ignore")  # Desabilitar warnings.
plt.style.use(
    "style/estilo.mplstyle")  # Garantir que se utiliza um estilo definido centralmente e comum a todos os gráficos.
%matplotlib inline

label_encoder = LabelEncoder()

ficheiro = "dados_preparados.csv"
colunas_numericas = ["Idade", "FCV", "NRP", "CA", "FAF", "TUDE", "IMC"]
colunas_classes = ["Genero", "Historico_obesidade_familiar", "FCCAC", "Fumador", "MCC", "CCER", "CBA", "TRANS"]
colunas_classes_binarias = ['Genero', 'Historico_obesidade_familiar', 'FCCAC', 'Fumador', 'MCC']
colunas_classes_multiplos = ["CCER", "CBA", "TRANS", "Label"]

## Leitura dos dados preparados

In [3]:
dados_trabalho = pd.read_csv(ficheiro)

In [4]:
dados_trabalho

Unnamed: 0,Genero,Idade,Historico_obesidade_familiar,FCCAC,FCV,NRP,CCER,Fumador,CA,MCC,FAF,TUDE,CBA,TRANS,Label,IMC
0,Feminino,21.000000,Sim,Nao,2.0,3.0,Ocasionalmente,Nao,2.000000,Nao,0.000000,1.000000,Nao,Transportes_Publicos,Peso_Normal,24.386526
1,Feminino,21.000000,Sim,Nao,3.0,3.0,Ocasionalmente,Sim,3.000000,Sim,3.000000,0.000000,Ocasionalmente,Transportes_Publicos,Peso_Normal,24.238227
2,Masculino,23.000000,Sim,Nao,2.0,3.0,Ocasionalmente,Nao,2.000000,Nao,2.000000,1.000000,Frequentemente,Transportes_Publicos,Peso_Normal,23.765432
3,Masculino,27.000000,Nao,Nao,3.0,3.0,Ocasionalmente,Nao,2.000000,Nao,2.000000,0.000000,Frequentemente,Caminhada,Excesso_Peso_Grau_I,26.851852
4,Masculino,22.000000,Nao,Nao,2.0,1.0,Ocasionalmente,Nao,2.000000,Nao,0.000000,0.000000,Ocasionalmente,Transportes_Publicos,Excesso_Peso_Grau_II,28.342381
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2106,Feminino,20.976842,Sim,Sim,3.0,3.0,Ocasionalmente,Nao,1.728139,Nao,1.676269,0.906247,Ocasionalmente,Transportes_Publicos,Obesidade_Mórbida,44.901475
2107,Feminino,21.982942,Sim,Sim,3.0,3.0,Ocasionalmente,Nao,2.005130,Nao,1.341390,0.599270,Ocasionalmente,Transportes_Publicos,Obesidade_Mórbida,43.741923
2108,Feminino,22.524036,Sim,Sim,3.0,3.0,Ocasionalmente,Nao,2.054193,Nao,1.414209,0.646288,Ocasionalmente,Transportes_Publicos,Obesidade_Mórbida,43.543817
2109,Feminino,24.361936,Sim,Sim,3.0,3.0,Ocasionalmente,Nao,2.852339,Nao,1.139107,0.586035,Ocasionalmente,Transportes_Publicos,Obesidade_Mórbida,44.071535


Temos no entanto de realizar primeiro o encoding das classes para valores numéricos, esta operação é realizada usando o ``sklearn.preprocessing.LabelEncoder`` em todas as colunas com apenas 2 classes.

In [5]:
ut.titulo("Valores codificados por atributo")

for coluna in colunas_classes_binarias:
    if dados_trabalho[coluna].dtype == 'object':
        dados_trabalho[coluna] = label_encoder.fit_transform(dados_trabalho[coluna].values)
        ut.etiqueta_e_valor(coluna, str(sorted(dados_trabalho[coluna].unique())))

[21;30;44mValores codificados por atributo[0m
[0;94mGenero: [1;94m[0, 1][0m
[0;94mHistorico_obesidade_familiar: [1;94m[0, 1][0m
[0;94mFCCAC: [1;94m[0, 1][0m
[0;94mFumador: [1;94m[0, 1][0m
[0;94mMCC: [1;94m[0, 1][0m


Para colunas que têm mais que 2 classes precisamos de utilizar o ``pandas.get_dummies()``.

In [6]:
dados_trabalho = pd.get_dummies(dados_trabalho, dtype=float)

In [7]:
dados_trabalho

Unnamed: 0,Genero,Idade,Historico_obesidade_familiar,FCCAC,FCV,NRP,Fumador,CA,MCC,FAF,...,TRANS_Transportes_Publicos,Label_Excesso_Peso_Grau_I,Label_Excesso_Peso_Grau_II,Label_Magreza_Grau_I,Label_Magreza_Grau_II,Label_Magreza_Grau_III,Label_Obesidade_Moderada,Label_Obesidade_Mórbida,Label_Obesidade_Severa,Label_Peso_Normal
0,0,21.000000,1,0,2.0,3.0,0,2.000000,0,0.000000,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0,21.000000,1,0,3.0,3.0,1,3.000000,1,3.000000,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,1,23.000000,1,0,2.0,3.0,0,2.000000,0,2.000000,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1,27.000000,0,0,3.0,3.0,0,2.000000,0,2.000000,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,22.000000,0,0,2.0,1.0,0,2.000000,0,0.000000,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2106,0,20.976842,1,1,3.0,3.0,0,1.728139,0,1.676269,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2107,0,21.982942,1,1,3.0,3.0,0,2.005130,0,1.341390,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2108,0,22.524036,1,1,3.0,3.0,0,2.054193,0,1.414209,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2109,0,24.361936,1,1,3.0,3.0,0,2.852339,0,1.139107,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


| Target | Preditores |
|--------|------------|
| IMC    | ??         |

Ver TP5

O target não muda

In [8]:
y = dados_trabalho.IMC

## Preditores

In [9]:
# O preditor vai mudando, podemos ter uma lista com o nome das colunas
# lista_preditores = ['Genero', 'Historico_obesidade_familiar', 'FCCAC', 'FCV', 'NRP', 'CCER', 'Fumador', 'CA', 'MCC', 'FAF', 'TUDE', 'CBA', 'TRANS']

lista_preditores = dados_trabalho.columns
X = dados_trabalho[lista_preditores].to_numpy()

### Holdout

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=100)

## Cálculo

In [11]:
## Initialize algorithm
mlr = LinearRegression()

## Fit the data
mlr.fit(X_train, y_train)

print("Eq. da reta: y=", mlr.intercept_, "+", mlr.coef_, "x")

Eq. da reta: y= 4.618527782440651e-14 + [ 3.81272768e-16  1.11022302e-16  4.09981088e-15  5.07482413e-15
  6.03942055e-15  2.53399854e-17  1.93436923e-16  1.90330120e-15
 -4.46491334e-15 -7.87359697e-16 -1.77245681e-15  1.00000000e+00
 -2.20804607e-15  1.03419399e-15 -1.10778710e-16  7.88337569e-16
  9.48599814e-17 -1.83078329e-15 -1.03333632e-15  2.75595473e-15
 -9.40606649e-16 -1.27836221e-15  7.69863902e-16  1.13877052e-15
 -7.29136148e-16  5.93350219e-16  9.74521190e-16  1.84741141e-15
  1.19734197e-15  5.66570174e-16  4.99592573e-16 -1.09252942e-15
  6.97033537e-16  7.28583860e-17] x


In [12]:
#Prediction of test set
y_pred_mlr = mlr.predict(X_test)
#Predicted values
#print("Prediction for test set: {}".format(y_pred_mlr))

In [13]:
#Actual value and the predicted value
mlr_diff = pd.DataFrame({'Actual value': y_test, 'Predicted value': y_pred_mlr})
mlr_diff.head()

Unnamed: 0,Actual value,Predicted value
785,25.269124,25.269124
1838,47.718705,47.718705
2083,40.870732,40.870732
1105,29.146663,29.146663
361,32.87311,32.87311


In [14]:
#Model Evaluation
from sklearn import metrics

meanAbErr = metrics.mean_absolute_error(y_test, y_pred_mlr)
meanSqErr = metrics.mean_squared_error(y_test, y_pred_mlr)

rootMeanSqErr = np.sqrt(metrics.mean_squared_error(y_test, y_pred_mlr))

print('R squared: {:.2f}'.format(mlr.score(X, y) * 100))
print('Mean Absolute Error:', meanAbErr)
print('Mean Square Error:', meanSqErr)
print('Root Mean Square Error:', rootMeanSqErr)

R squared: 100.00
Mean Absolute Error: 2.1698429761657432e-14
Mean Square Error: 6.108655382283067e-28
Root Mean Square Error: 2.4715694168449057e-14


TODO: Abrir o dataframe do RMSE e MAE e adicionar resultados.