Construct a data model to separate outliers from normal data and noise.

types of methods:
* Density based
* Distance based
* Parametric

Broadly speaking, outlier detection methods fall into three categories: density-based methods, distance-based methods, and parametric methods.
Density-based and distance-based methods fall into the category of spatial proximity algorithms. Examples include DBSCAN, k-means, and k-nearest neighbor.
Parametric methods usually assume some sort of form to the data, such as normality. Examples include Gaussian mixture models, one-class SVMs, and Z-score.
Other methods that are not specifically machine learning methods, such as Z-score, also assume normality.

In [11]:
import pandas as pd
import numpy as np
import scipy as sp

from sklearn import svm
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import mean_squared_error

from src.preprocessor import preprocessor

from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV
from sklearn.compose import TransformedTargetRegressor

from sklearn.model_selection import cross_validate, cross_val_predict, cross_val_score, train_test_split, StratifiedKFold


In [12]:
from pathlib import Path

# Load the training and test sets from the local data directory
data_dir = Path("/home/reinis/Documents/House Prices - Advanced Regression Techniques/data")
train_df, test_df = (
    pd.read_csv(data_dir / file_name) for file_name in ("train.csv", "test.csv")
)

# Split the target column off from the predictors; train_df itself is left intact
label = train_df["SalePrice"].copy()
dfx = train_df.drop(columns="SalePrice")

# Fit the project preprocessor on the predictors and wrap its output in a DataFrame
X = pd.DataFrame(preprocessor.fit_transform(dfx))

In [13]:
# Share of samples each detector is told to treat as outliers
outliers_fraction = 0.05

# (display name, unfitted detector) pairs; every detector gets the same
# contamination value so their results are comparable.
anomaly_algorithms = [
    (
        "Robust covariance",
        EllipticEnvelope(contamination=outliers_fraction),
    ),
    (
        "Isolation Forest",
        IsolationForest(contamination=outliers_fraction, random_state=42),
    ),
    (
        "Local Outlier Factor",
        LocalOutlierFactor(
            n_neighbors=35,
            contamination=outliers_fraction,
            novelty=True,
        ),
    ),
]

# Ridge regression fitted on log10(target); predictions are mapped back
# with 10**x.
log_target_ridge = TransformedTargetRegressor(
    regressor=RidgeCV(),
    func=np.log10,
    inverse_func=sp.special.exp10,
)

# Lasso-based feature selection followed by the log-target ridge regressor
lr_pipeline = make_pipeline(SelectFromModel(LassoCV()), log_target_ridge)

In [14]:
# Compare the regression pipeline trained on all samples against the same
# pipeline trained only on the samples each anomaly detector keeps as inliers.
#
# One record per experiment is collected in a plain list and the results
# table is built once at the end: DataFrame.append was removed in pandas 2.0,
# and growing a frame row by row copies it on every call.
result_rows = []

# Baseline: fit on the entire dataset and score on the same samples.
lr_pipeline.fit(X, label)
y_pred = lr_pipeline.predict(X)
mse = mean_squared_error(label, y_pred)
result_rows.append({"Algorithm": "Simple Linear Regression", "MSE": mse})

for name, algorithm in anomaly_algorithms:
    # Fit the detector and score every sample it was fitted on.
    algorithm.fit(X)
    anomaly_scores = algorithm.decision_function(X)
    # NOTE(review): for LocalOutlierFactor(novelty=True) scikit-learn documents
    # that decision_function is meant for unseen data, not the training set;
    # confirm the Local Outlier Factor row is trustworthy.

    # Positive decision scores mark inliers; everything else is an outlier.
    inliers = anomaly_scores > 0
    X_inliers = X[inliers]
    y_inliers = label[inliers]

    # Refit the shared pipeline on the inliers only and score it on those
    # same samples. Each row's MSE is therefore in-sample and computed over
    # a different subset of rows than the baseline.
    lr_pipeline.fit(X_inliers, y_inliers)
    y_pred = lr_pipeline.predict(X_inliers)
    mse = mean_squared_error(y_inliers, y_pred)

    result_rows.append(
        {
            "Algorithm": name,
            "MSE": mse,
            "Inliers Shape": X_inliers.shape,
            "Inliers": X_inliers,
            "Outliers": X[~inliers].index,
        }
    )

# The baseline row has no inlier/outlier information; those cells become NaN.
results_df = pd.DataFrame(
    result_rows,
    columns=["Algorithm", "MSE", "Inliers Shape", "Inliers", "Outliers"],
)


In [15]:
# Display the per-algorithm results table
results_df

Unnamed: 0,Algorithm,MSE,Inliers Shape,Inliers,Outliers
0,Simple Linear Regression,486036900.0,,,
1,Robust covariance,320118300.0,"(1387, 236)",0 1 2 3 ...,"Int64Index([ 17, 39, 51, 52, 88, 102..."
2,Isolation Forest,278794100.0,"(1387, 236)",0 1 2 3 ...,"Int64Index([ 39, 48, 87, 88, 125, 178..."
3,Local Outlier Factor,288493200.0,"(1392, 236)",0 1 2 3 ...,"Int64Index([ 70, 113, 159, 170, 178, 185..."


In [16]:
# Analysis of algorithm

In [17]:
# Row labels flagged as outliers in results row 2 (the Isolation Forest run)
results_df.loc[2, "Outliers"]

Int64Index([  39,   48,   87,   88,  125,  178,  185,  197,  198,  250,  291,
             307,  335,  349,  375,  386,  431,  434,  440,  496,  515,  520,
             523,  533,  581,  614,  635,  636,  649,  664,  691,  705,  738,
             747,  769,  778,  798,  803,  825,  828,  843,  897,  898,  914,
             921,  942,  954,  977, 1011, 1030, 1061, 1142, 1169, 1173, 1181,
            1182, 1219, 1228, 1230, 1234, 1243, 1268, 1283, 1298, 1323, 1326,
            1337, 1349, 1373, 1386, 1387, 1423, 1449],
           dtype='int64')