In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import warnings
warnings.filterwarnings('ignore')

In [2]:
# The CSV is read with header=None, so pandas assigns integer column labels;
# add_prefix turns those labels into C_0 ... C_13.
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/housing.csv'
df = pd.read_csv(url, header=None).add_prefix("C_")
df.head()

Unnamed: 0,C_0,C_1,C_2,C_3,C_4,C_5,C_6,C_7,C_8,C_9,C_10,C_11,C_12,C_13
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [3]:
# Total number of missing cells in the whole frame (0 means nothing to impute).
df.isna().sum().sum()

0

# Train test split and scaling

In [4]:
# C_13 is the regression target; every other column is a feature.
y = df["C_13"]
X = df.drop(columns="C_13")

# Hold out 33% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=1
)

# Fit the scaler on the training rows only, then apply the same statistics to
# the test rows so that no information from the test split leaks into training.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Before outlier removal

In [5]:
# Baseline: linear regression fitted on every scaled training row,
# i.e. before any outlier filtering.
lr = LinearRegression().fit(X_train_scaled, y_train)
y_pred = lr.predict(X_test_scaled)

# Score the predictions on the held-out test split.
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print('MAE: %.3f' % mae)
print('R2 Score: %.3f' % r2)

MAE: 3.417
R2 Score: 0.765


# After Local Outlier Factor (LOF)

In [6]:
# Flag outliers among the scaled training rows; rows labelled -1 are dropped.
lof = LocalOutlierFactor()
mask = lof.fit_predict(X_train_scaled) != -1

# Apply the same boolean mask to features and target so they stay aligned.
print(X_train.shape, y_train.shape)
X_train_reduced = X_train_scaled[mask, :]
y_train_redcued = y_train[mask]
print(X_train_reduced.shape, y_train_redcued.shape)

# Refit on the reduced training set; the test split is left untouched.
model = LinearRegression().fit(X_train_reduced, y_train_redcued)
yhat = model.predict(X_test_scaled)

mae = mean_absolute_error(y_test, yhat)
r2 = r2_score(y_test, yhat)
print('MAE: %.3f' % mae)
print('R2 Score: %.3f' % r2)

(339, 13) (339,)
(319, 13) (319,)
MAE: 3.244
R2 Score: 0.766


### Should `LocalOutlierFactor` be fit on the scaled `X_train` or the original `X_train`?
In most cases, it is recommended to pass the scaled `X_train` to `LocalOutlierFactor` rather than the original `X_train`. The LOF algorithm is based on distance measures, so the scale of the features can have a significant impact on its performance: without scaling, features with large numeric ranges dominate the distance computation.

# After Isolation Forest

In [7]:
from sklearn.ensemble import IsolationForest

# Share of training rows the forest is told to treat as outliers.
contamination = 0.1

# IsolationForest is a randomized estimator: without a fixed seed the set of
# flagged rows, and therefore the MAE / R2 printed below, changes on every
# run. Seed it with the same value as the train/test split so that
# Restart & Run All reproduces the same numbers.
isof = IsolationForest(contamination=contamination, random_state=1)
outlier_labels = isof.fit_predict(X_train_scaled)
mask = (outlier_labels != -1)  # rows labelled -1 are treated as outliers

# Apply the same boolean mask to features and target so they stay aligned.
print(X_train.shape, y_train.shape)
X_train_reduced, y_train_redcued = X_train_scaled[mask, :], y_train[mask]
print(X_train_reduced.shape, y_train_redcued.shape)

# Refit on the reduced training set; the test split is left untouched.
model = LinearRegression()
model.fit(X_train_reduced, y_train_redcued)

yhat = model.predict(X_test_scaled)
mae = mean_absolute_error(y_test, yhat)
print('MAE: %.3f' % mae)
r2 = r2_score(y_test, yhat)
print('R2 Score: %.3f' % r2)

(339, 13) (339,)
(305, 13) (305,)
MAE: 3.271
R2 Score: 0.777


# After One-Class SVM

In [8]:
from sklearn.svm import OneClassSVM

# nu is handed straight to OneClassSVM and influences how many training rows
# end up flagged as outliers.
nu = 0.05
clf = OneClassSVM(nu=nu)

# Rows labelled -1 are treated as outliers and removed from the training set.
outlier_labels = clf.fit_predict(X_train_scaled)
mask = outlier_labels != -1

print(X_train.shape, y_train.shape)
X_train_reduced = X_train_scaled[mask, :]
y_train_redcued = y_train[mask]
print(X_train_reduced.shape, y_train_redcued.shape)

# Refit on the reduced training set; the test split is left untouched.
model = LinearRegression().fit(X_train_reduced, y_train_redcued)
yhat = model.predict(X_test_scaled)

mae = mean_absolute_error(y_test, yhat)
r2 = r2_score(y_test, yhat)
print('MAE: %.3f' % mae)
print('R2 Score: %.3f' % r2)

(339, 13) (339,)
(315, 13) (315,)
MAE: 3.291
R2 Score: 0.774
