Goal of this notebook is to construct data model to separate outliers form normal data and noise.

In [4]:
import pandas as pd
import numpy as np
import scipy as sp

In [5]:
from pathlib import Path
from src.preprocessor import preprocessor

# Read data
data_dir = Path("C:/Users/reini/Documents/GitHub/Advanced-Regression-Techniques/data")
train_df = pd.read_csv(data_dir / "train.csv")
test_df = pd.read_csv(data_dir / "test.csv")

# Separate target form predictors
dfx = train_df.copy()
label = dfx.pop("SalePrice")

# Apply preprocessor 
X = pd.DataFrame(preprocessor.fit_transform(dfx))

In [6]:
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV, RidgeCV
from sklearn.compose import TransformedTargetRegressor

# Set outlier fraction
outliers_fraction = 0.05

# Algorithms to evaluate
anomaly_algorithms = [
    ("Robust covariance", EllipticEnvelope(contamination=outliers_fraction)),
    ("Isolation Forest", IsolationForest(contamination=outliers_fraction, random_state=42)),
    ("Local Outlier Factor", LocalOutlierFactor(n_neighbors=35, contamination=outliers_fraction,novelty=True)),
]

# Baseline model
lr_pipeline = make_pipeline(
    SelectFromModel(LassoCV()),
    TransformedTargetRegressor(
        regressor=RidgeCV(), 
        func=np.log10, 
        inverse_func=sp.special.exp10
    )
)

In [7]:
from sklearn.metrics import mean_squared_error

results_df = pd.DataFrame(columns=["Algorithm", "MSE"])

# Fit baseline
lr_pipeline.fit(X, label)
y_pred = lr_pipeline.predict(X)
mse = mean_squared_error(label, y_pred)
results_df = results_df.append({"Algorithm": "Simple Linear Regression", "MSE": mse}, ignore_index=True)

# iteratively fit anomaly detection algorithms and evaluate effect on model
for name, algorithm in anomaly_algorithms:
    # Extract the samples that are considered inliers (not anomalies)
    algorithm.fit(X)
    anomaly_scores = algorithm.decision_function(X)
    inliers = anomaly_scores > 0

    # Fit the linear regression pipeline to the inlier samples
    lr_pipeline.fit(X[inliers], label[inliers])
    y_pred = lr_pipeline.predict(X[inliers])
    mse = mean_squared_error(label[inliers], y_pred)
    results_df = results_df.append({"Algorithm": name, "MSE": mse, "Inliers Shape": X[inliers].shape, "Inliers":X[inliers], "Outliers": X[~inliers].index}, ignore_index=True)
results_df

  results_df = results_df.append({"Algorithm": "Simple Linear Regression", "MSE": mse}, ignore_index=True)
  results_df = results_df.append({"Algorithm": name, "MSE": mse, "Inliers Shape": X[inliers].shape, "Inliers":X[inliers], "Outliers": X[~inliers].index}, ignore_index=True)
  results_df = results_df.append({"Algorithm": name, "MSE": mse, "Inliers Shape": X[inliers].shape, "Inliers":X[inliers], "Outliers": X[~inliers].index}, ignore_index=True)
  results_df = results_df.append({"Algorithm": name, "MSE": mse, "Inliers Shape": X[inliers].shape, "Inliers":X[inliers], "Outliers": X[~inliers].index}, ignore_index=True)


Unnamed: 0,Algorithm,MSE,Inliers Shape,Inliers,Outliers
0,Simple Linear Regression,486036900.0,,,
1,Robust covariance,321149600.0,"(1387, 236)",0 1 2 3 ...,"Int64Index([ 17, 39, 54, 88, 90, 102..."
2,Isolation Forest,278794100.0,"(1387, 236)",0 1 2 3 ...,"Int64Index([ 39, 48, 87, 88, 125, 178..."
3,Local Outlier Factor,288493200.0,"(1392, 236)",0 1 2 3 ...,"Int64Index([ 70, 113, 159, 170, 178, 185..."


In [8]:
results_df["Outliers"][2]

Int64Index([  39,   48,   87,   88,  125,  178,  185,  197,  198,  250,  291,
             307,  335,  349,  375,  386,  431,  434,  440,  496,  515,  520,
             523,  533,  581,  614,  635,  636,  649,  664,  691,  705,  738,
             747,  769,  778,  798,  803,  825,  828,  843,  897,  898,  914,
             921,  942,  954,  977, 1011, 1030, 1061, 1142, 1169, 1173, 1181,
            1182, 1219, 1228, 1230, 1234, 1243, 1268, 1283, 1298, 1323, 1326,
            1337, 1349, 1373, 1386, 1387, 1423, 1449],
           dtype='int64')