### Experimento 2: Outliers

Rodar uma regressão em 3 datasets que passaram por algoritmos diferentes de remoção de outliers:

- Isolation Forest
- Minimum Covariance Determinant
- One Class SVM


Fonte: https://machinelearningmastery.com/model-based-outlier-detection-and-removal-in-python/

Iremos testar tanto com a `Regressão Ridge`, quanto com a `Regressão Linear`, que tiveram resultados próximos recentemente.

In [1]:
import pandas as pd

#metrics
from sklearn.metrics import mean_absolute_error

#models
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge

#outlier detection algorithms
from sklearn.ensemble import IsolationForest
from sklearn.covariance import EllipticEnvelope

In [2]:
# load the dataset
X_train = pd.read_parquet("../Datasets/DF_split/X_train.parquet")
X_val = pd.read_parquet("../Datasets/DF_split/X_val.parquet")
y_train = pd.read_csv("../Datasets/DF_split/y_train.csv")
y_val = pd.read_csv("../Datasets/DF_split/y_val.csv")

In [3]:
# initial shape of the training dataset
print(X_train.shape, y_train.shape)

(1988340, 75) (1988340, 1)


### Isolation Forest

In [4]:
X_train = X_train.drop(columns='cid').values
X_val = X_val.drop(columns='cid').values
y_train = y_train.values
y_val = y_val.values

In [5]:
X_train

array([[ 9.23750000e+04,  1.21050000e+05,  1.80675000e+05, ...,
        -5.31710464e+00,  4.38919641e+00, -2.12549000e+04],
       [ 5.21250000e+04,  6.10000000e+04,  8.33500000e+04, ...,
        -3.59190283e-01,  5.99466813e+00, -1.95844131e+04],
       [ 5.35250000e+04,  6.49750000e+04,  8.98500000e+04, ...,
         6.09535025e-01,  6.88448042e+00, -1.70947811e+04],
       ...,
       [ 8.64750000e+04,  1.03825000e+05,  1.60425000e+05, ...,
        -1.54832781e+00,  4.19599557e+00, -3.01401312e+04],
       [ 7.23000000e+04,  8.37750000e+04,  1.16775000e+05, ...,
        -7.51034227e-01,  4.84362654e+00, -2.57219912e+04],
       [ 5.41000000e+04,  6.16000000e+04,  8.40250000e+04, ...,
        -3.59190283e-01,  7.08040239e+00, -2.56850714e+04]])

In [6]:
# evaluate model performance with outliers removed using isolation forest
iso = IsolationForest(contamination=0.1)
yhat = iso.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train, y_train = X_train[mask, :], y_train[mask]
# summarize the shape of the updated training dataset
print(X_train.shape, y_train.shape)

(1789506, 74) (1789506, 1)


Linear Regression

In [7]:
# fit the model
model = LinearRegression()
model.fit(X_train, y_train)
# evaluate the model
yhat = model.predict(X_val)
# evaluate predictions
mae = mean_absolute_error(y_val, yhat)
print('MAE: %.3f' % mae)

MAE: 0.296


Ridge Regression

In [8]:
# fit the model
RD = Ridge()
RD.fit(X_train, y_train)
# evaluate the RD
yhat = RD.predict(X_val)
# evaluate predictions
mae = mean_absolute_error(y_val, yhat)
print('MAE: %.3f' % mae)

MAE: 0.248


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


### Minimum Covariance

In [9]:
# identify outliers in the training dataset
ee = EllipticEnvelope(contamination=0.01)
yhat = ee.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train, y_train = X_train[mask, :], y_train[mask]
# summarize the shape of the updated training dataset
print(X_train.shape, y_train.shape)



(1771610, 74) (1771610, 1)


Linear Regression

In [10]:
# fit the model
model = LinearRegression()
model.fit(X_train, y_train)
# evaluate the model
yhat = model.predict(X_val)
# evaluate predictions
mae = mean_absolute_error(y_val, yhat)
print('MAE: %.3f' % mae)

MAE: 0.305


Ridge Regression

In [11]:
# fit the model
RD = Ridge()
RD.fit(X_train, y_train)
# evaluate the RD
yhat = RD.predict(X_val)
# evaluate predictions
mae = mean_absolute_error(y_val, yhat)
print('MAE: %.3f' % mae)

MAE: 0.249


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


### SVM

In [None]:
from sklearn.svm import OneClassSVM

# identify outliers in the training dataset
ee = OneClassSVM(nu=0.01)
yhat = ee.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train, y_train = X_train[mask, :], y_train[mask]
# summarize the shape of the updated training dataset
print(X_train.shape, y_train.shape)

In [None]:
# fit the model
model = LinearRegression()
model.fit(X_train, y_train)
# evaluate the model
yhat = model.predict(X_val)
# evaluate predictions
mae = mean_absolute_error(y_val, yhat)
print('MAE: %.3f' % mae)

In [None]:
# plotar uma única figura para demonstrar a diferença após normalizar, escalar e sem tratar os dados