# Outlier Detection in Housing Data

In [73]:
# load and summarize the dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import IsolationForest
from sklearn.covariance import EllipticEnvelope
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM

## Reading the Data

In [82]:
url = 'https://goz39a.s3.eu-central-1.amazonaws.com/housing.csv'
df = pd.read_csv(url, header=None)

In [83]:
data = df.values
X, y = data[:, :-1], data[:, -1]
print(X.shape, y.shape)

(506, 13) (506,)


In [84]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
# summarize the shape of the train and test sets
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(379, 13) (127, 13) (379,) (127,)


## Building a Regression Model

In [85]:
model = LinearRegression()
model.fit(X_train, y_train)
y_prediction = model.predict(X_test)
mae = mean_absolute_error(y_test, y_prediction)
print('MAE: %.3f' % mae)

MAE: 3.668


## Isolation Forest

In [86]:
# identify outliers in the training dataset
iso = IsolationForest(contamination=0.2)
y_prediction = iso.fit_predict(X_train)

In [87]:
# select all rows that are not outliers
mask = y_prediction != -1
X_train, y_train = X_train[mask, :], y_train[mask]
# summarize the shape of the updated training dataset
print(X_train.shape, y_train.shape)

(303, 13) (303,)


In [88]:
# fit the model
model = LinearRegression()
model.fit(X_train, y_train)
# evaluate the model
y_prediction = model.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, y_prediction)
print('MAE: %.3f' % mae)

MAE: 3.927


## MinCovDet 

In [91]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)
# identify outliers in the training dataset
ee = EllipticEnvelope(contamination=0.2)
y_prediction = ee.fit_predict(X_train)

In [92]:
mask = y_prediction != -1
X_train, y_train = X_train[mask, :], y_train[mask]
# summarize the shape of the updated training dataset
print(X_train.shape, y_train.shape)

(271, 13) (271,)


In [93]:
# fit the model
model = LinearRegression()
model.fit(X_train, y_train)
# evaluate the model
y_prediction = model.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, y_prediction)
print('MAE: %.3f' % mae)

MAE: 4.144


## Local Outlier Factor

In [94]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

In [95]:
# identify outliers in the training dataset
lof = LocalOutlierFactor()
y_prediction = lof.fit_predict(X_train)

In [96]:
mask = y_prediction != -1
X_train, y_train = X_train[mask, :], y_train[mask]
# summarize the shape of the updated training dataset
print(X_train.shape, y_train.shape)

(298, 13) (298,)


In [97]:
# fit the model
model = LinearRegression()
model.fit(X_train, y_train)
# evaluate the model
y_prediction = model.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, y_prediction)
print('MAE: %.3f' % mae)

MAE: 3.592


## One Class SVM

In [98]:
ee = OneClassSVM(nu=0.2)
y_prediction = ee.fit_predict(X_train)

In [99]:
mask = y_prediction != -1
X_train, y_train = X_train[mask, :], y_train[mask]
# summarize the shape of the updated training dataset
print(X_train.shape, y_train.shape)

(239, 13) (239,)


In [100]:
# fit the model
model = LinearRegression()
model.fit(X_train, y_train)
# evaluate the model
y_prediction = model.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, y_prediction)
print('MAE: %.3f' % mae)

MAE: 3.812
