## Preparation, global imports

In [14]:
from copy import copy, deepcopy
import numpy as np
import pandas as pd
from sklearn.utils import check_random_state

# Working with data containing anomalies

### Dataset 1: Line and outliers

In [15]:
def line_and_outliers(seed=20241217):
    """Sample a noisy ``y = x`` line whose trailing points are replaced by outliers.

    70 points are drawn with ``x`` uniform in [-1, 1). The first 40 keep
    ``y = x``; the last 30 get an independent uniform ``y`` in [-1, 1).
    Gaussian noise with standard deviation 0.04 is then added to both
    coordinates of every point.

    Parameters:
    `seed`: anything accepted by `sklearn.utils.check_random_state`

    returns:
    x: np.ndarray, shape=(70,)
    y: np.ndarray, shape=(70,)
    """
    total_points, outlier_count, noise_std = 70, 30, 0.04
    rng = check_random_state(seed)

    # The order of the draws (x, outlier y, x-noise, y-noise) fixes the sample
    # obtained for a given seed, so it must not be rearranged.
    xs = rng.uniform(-1, 1, total_points)
    ys = np.array(xs)
    first_outlier = total_points - outlier_count
    ys[first_outlier:] = rng.uniform(-1, 1, outlier_count)

    xs = xs + rng.normal(scale=noise_std, size=total_points)
    ys = ys + rng.normal(scale=noise_std, size=total_points)
    return xs, ys

## Random sample consensus (RANSAC) algorithm
 
* fitting a line to a set of points with outliers

In [16]:
class RANSAC:
    """Robust model fitting by random sample consensus.

    Source: https://en.wikipedia.org/wiki/Random_sample_consensus
    """

    def __init__(self, n=10, k=100, t=0.05, d=10, model=None, loss=None, metric=None, random_state=None):
        """
        Parameters:

        `n`: Minimum number of data points to estimate parameters
        `k`: Maximum iterations allowed
        `t`: Threshold value to determine if points are fit well
        `d`: Number of close data points required to assert model fits well
        `model`: class implementing `fit` and `predict`
        `loss`: function of `y_true` and `y_pred` that returns a vector
        `metric`: function of `y_true` and `y_pred` and returns a float
        `random_state`: seed or random state passed to `check_random_state` in `fit`
        """
        self.n = n
        self.k = k
        self.t = t
        self.d = d
        self.model = model
        self.loss = loss
        self.metric = metric
        self.best_fit = None        # best consensus model found by the last `fit`
        self.best_error = np.inf    # `metric` value of `best_fit` on its own inliers
        self.random_state = random_state

    def fit(self, X, y):
        """Search for the consensus model with the lowest `metric` value.

        Each of the `k` iterations fits a copy of `model` on `n` random rows,
        collects the remaining rows whose `loss` is below `t`, and, when more
        than `d` of them are found, refits on sample plus inliers. The refit
        with the smallest `metric` value is kept in `best_fit`.

        Returns `self`. `best_fit` stays `None` when no iteration reaches the
        required consensus size.
        """
        rs = check_random_state(self.random_state)

        # Start from a clean slate: without this a second call to `fit` would
        # compare against (and possibly keep) the model of the previous call.
        self.best_fit = None
        self.best_error = np.inf

        for _ in range(self.k):
            ids = rs.permutation(X.shape[0])
            maybe_inliers = ids[: self.n]
            candidates = ids[self.n :]

            maybe_model = copy(self.model).fit(X[maybe_inliers], y[maybe_inliers])

            # Index with the candidate ids directly instead of permuting the
            # whole array first and slicing afterwards.
            thresholded = (
                self.loss(y[candidates], maybe_model.predict(X[candidates]))
                < self.t
            )
            inlier_ids = candidates[np.flatnonzero(thresholded)]

            if inlier_ids.size > self.d:
                inlier_points = np.hstack([maybe_inliers, inlier_ids])
                better_model = copy(self.model).fit(X[inlier_points], y[inlier_points])

                this_error = self.metric(
                    y[inlier_points], better_model.predict(X[inlier_points])
                )

                if this_error < self.best_error:
                    self.best_error = this_error
                    self.best_fit = better_model

        return self

    def predict(self, X):
        """Predict with the best consensus model found by `fit`.

        Raises `AttributeError` when there is no such model, i.e. `fit` was
        not called or no iteration reached the consensus size `d`.
        """
        if self.best_fit is None:
            # Same exception type as the bare `None.predict` failure, but with
            # a message that says what went wrong.
            raise AttributeError(
                "RANSAC has no fitted model: call fit() first "
                "(or no consensus set was found during fit)"
            )
        return self.best_fit.predict(X)

def square_error_loss(y_true, y_pred):
    """Return the element-wise squared difference between `y_true` and `y_pred`."""
    residual = y_true - y_pred
    return residual ** 2


def mean_square_error(y_true, y_pred):
    """Return the sum of squared errors divided by the length of the first axis of `y_true`."""
    total = np.sum(square_error_loss(y_true, y_pred))
    n_rows = y_true.shape[0]
    return total / n_rows


class LinearRegressor:
    """Least-squares linear model with an intercept term.

    `params` holds the fitted coefficients, intercept first; it is `None`
    until `fit` has been called.
    """

    def __init__(self):
        self.params = None

    @staticmethod
    def _design_matrix(X: np.ndarray) -> np.ndarray:
        """Prepend a column of ones to the 2-D array `X` (the intercept term)."""
        r, _ = X.shape
        return np.hstack([np.ones((r, 1)), X])

    def fit(self, X: np.ndarray, y: np.ndarray):
        """Fit the coefficients to `X` (n_samples, n_features) and `y`; return `self`."""
        design = self._design_matrix(X)
        # Solve the least-squares problem directly instead of inverting
        # X^T X: the explicit inverse fails on rank-deficient samples (for
        # example identical rows) and is needlessly ill-conditioned.
        # `lstsq` returns the minimum-norm solution in the degenerate case.
        self.params, *_ = np.linalg.lstsq(design, y, rcond=None)
        return self

    def predict(self, X: np.ndarray):
        """Return the model output for the rows of the 2-D array `X`."""
        return self._design_matrix(X) @ self.params

    

In [None]:
def RANSAC_demo():
    """Fit a RANSAC line to the line-and-outliers dataset and plot the result."""
    import matplotlib.pyplot as plt

    regressor = RANSAC(model=LinearRegressor(), loss=square_error_loss, metric=mean_square_error)

    x, y = line_and_outliers()
    X = x.reshape(-1, 1)

    regressor.fit(X, y)

    # Matplotlib 3.6 renamed the seaborn styles to "seaborn-v0_8-*" and 3.8
    # removed the old names; pick whichever this installation provides.
    style = "seaborn-v0_8-darkgrid"
    if style not in plt.style.available:
        style = "seaborn-darkgrid"
    plt.style.use(style)

    fig, ax = plt.subplots(1, 1)
    ax.set_box_aspect(1)

    ax.scatter(X, y)

    # RANSAC leaves `best_fit` as None when no consensus set was found;
    # in that case only the data is shown.
    if regressor.best_fit is not None:
        line = np.linspace(-1, 1, num=100).reshape(-1, 1)
        ax.plot(line, regressor.predict(line), c="peru")
    ax.set_title("RANSAC Linear Regression")
    plt.show()


RANSAC_demo()

In [None]:
def RANSAC_steps():
    """Plot the RANSAC fit obtained with 1, 3, 5 and 10 iterations side by side."""
    import matplotlib.pyplot as plt

    k_values = [1, 3, 5, 10]

    x, y = line_and_outliers()
    X = x.reshape(-1, 1)
    line = np.linspace(-1, 1, num=100).reshape(-1, 1)

    # Matplotlib 3.6 renamed the seaborn styles to "seaborn-v0_8-*" and 3.8
    # removed the old names; pick whichever this installation provides.
    style = "seaborn-v0_8-darkgrid"
    if style not in plt.style.available:
        style = "seaborn-darkgrid"
    plt.style.use(style)

    fig, axs = plt.subplots(1, len(k_values), figsize=(3*len(k_values), 3), sharex=True, sharey=True)

    for k, ax in zip(k_values, axs):
        # A fresh estimator per panel: each panel then shows exactly what `k`
        # iterations from the same seed produce, with no state carried over
        # from the previous panel.
        regressor = RANSAC(
            k=k,
            model=LinearRegressor(),
            loss=square_error_loss,
            metric=mean_square_error,
            random_state=20241217,
        )
        regressor.fit(X, y)

        ax.scatter(X, y)
        ax.set_box_aspect(1)
        ax.set_title(f"k={k}")

        # No line is drawn when no consensus set was found within k iterations.
        if regressor.best_fit is not None:
            ax.plot(line, regressor.predict(line), c="peru")
    plt.show()


RANSAC_steps()

# Anomaly detection / Novelty detection in point clouds

* anomaly: single data series, find the outliers
* novelty: training set is normal, find the outliers in the test set

### Dataset 2: two clusters with outliers

In [19]:
def two_clusters_with_outliers(seed=20241217, n_inliers=100, n_outliers=20):
    """
    Generate two clusters of points with outliers.
    The clusters are centered at (2, 2) and (-2, -2) with standard deviation 0.3.
    Both clusters are shifted copies of the same `n_inliers // 2` noise samples,
    so an odd `n_inliers` yields `n_inliers - 1` inlier points.
    The outliers are uniformly distributed in [-4, 4] x [-4, 4].
    The inliers come first, followed by the outliers.

    Parameters:
    `seed`: anything accepted by `sklearn.utils.check_random_state`
    `n_inliers`: int, total number of inliers over both clusters
    `n_outliers`: int, number of outliers (may be 0)

    returns:
    X: np.ndarray, shape=(2 * (n_inliers // 2) + n_outliers, 2)
    ground_truth: np.ndarray of the same length, 1 for inliers and -1 for outliers
    """
    rs = check_random_state(seed)

    cluster_noise = 0.3 * rs.randn(n_inliers // 2, 2)
    X_inliers = np.r_[cluster_noise + 2, cluster_noise - 2]
    X_outliers = rs.uniform(low=-4, high=4, size=(n_outliers, 2))
    X = np.r_[X_inliers, X_outliers]

    ground_truth = np.ones(len(X), dtype=int)
    # Slice from the front: `ground_truth[-n_outliers:]` would select the whole
    # array when n_outliers == 0 and label every inlier as an outlier.
    ground_truth[len(X_inliers):] = -1

    return X, ground_truth

## One-class Support Vector Machine

* based on kernel function and SVM
* https://scikit-learn.org/dev/modules/sgd.html#online-one-class-svm
* https://scikit-learn.org/dev/modules/svm.html#density-estimation-novelty-detection

In [None]:
def inspect_decision_boundary(clf, X_train, X_test, X_outliers):
    """Draw the decision function of a fitted detector over [-4.5, 4.5]^2.

    The figure shows the decision-function values as filled contours, the
    zero level as the learned frontier, the region from zero up to the
    maximum value in a highlight color, and the three point sets on top.
    The x-axis label reports how many training and test points are predicted
    as -1 and how many of `X_outliers` are predicted as 1.

    Parameters:
    `clf`: fitted estimator providing `predict` and `decision_function`
    `X_train`, `X_test`, `X_outliers`: np.ndarray, shape=(n, 2)
    """
    # Original implementation:
    # https://scikit-learn.org/dev/auto_examples/linear_model/plot_sgdocsvm_vs_ocsvm.html

    import matplotlib.lines as mlines
    from matplotlib import pyplot as plt
    from sklearn.inspection import DecisionBoundaryDisplay

    _, ax = plt.subplots(figsize=(9, 6))

    # 50 x 50 grid of evaluation points covering the plotted region.
    ticks = np.linspace(-4.5, 4.5, 50)
    xx, yy = np.meshgrid(ticks, ticks)
    X = np.column_stack([xx.ravel(), yy.ravel()])

    # Drawn in this order: background values, frontier line, highlighted region.
    layers = (
        dict(plot_method="contourf", cmap="PuBu"),
        dict(plot_method="contour", linewidths=2, colors="darkred", levels=[0]),
        dict(
            plot_method="contourf",
            colors="palevioletred",
            levels=[0, clf.decision_function(X).max()],
        ),
    )
    for layer in layers:
        DecisionBoundaryDisplay.from_estimator(
            clf, X, response_method="decision_function", ax=ax, **layer
        )

    # Regular points flagged as -1 and abnormal points flagged as 1 count as errors.
    n_error_train = int(np.count_nonzero(clf.predict(X_train) == -1))
    n_error_test = int(np.count_nonzero(clf.predict(X_test) == -1))
    n_error_outliers = int(np.count_nonzero(clf.predict(X_outliers) == 1))

    s = 20
    b1 = plt.scatter(X_train[:, 0], X_train[:, 1], c="white", s=s, edgecolors="k")
    b2 = plt.scatter(X_test[:, 0], X_test[:, 1], c="blueviolet", s=s, edgecolors="k")
    c = plt.scatter(X_outliers[:, 0], X_outliers[:, 1], c="gold", s=s, edgecolors="k")

    summary = (
        f"error train: {n_error_train}/{X_train.shape[0]}; "
        f"errors novel regular: {n_error_test}/{X_test.shape[0]}; "
        f"errors novel abnormal: {n_error_outliers}/{X_outliers.shape[0]}"
    )
    ax.set(title=str(clf), xlim=(-4.5, 4.5), ylim=(-4.5, 4.5), xlabel=summary)

    # The contour itself provides no legend handle, so a proxy line is used.
    frontier = mlines.Line2D([], [], color="darkred", label="learned frontier")
    _ = ax.legend(
        [frontier, b1, b2, c],
        [
            "learned frontier",
            "training observations",
            "new regular observations",
            "new abnormal observations",
        ],
        loc="upper left",
    )


def SVM_demo(seed=20241217):
    """Compare a kernel One-Class SVM with its Nystroem + SGD approximation.

    Both detectors are trained on 500 inliers of the two-cluster dataset and
    inspected on 20 held-out inliers and 20 outliers.
    """
    # Original implementation:
    # https://scikit-learn.org/dev/auto_examples/linear_model/plot_sgdocsvm_vs_ocsvm.html

    from sklearn.kernel_approximation import Nystroem
    from sklearn.linear_model import SGDOneClassSVM
    from sklearn.pipeline import make_pipeline
    from sklearn.svm import OneClassSVM

    # Rows 0..519 are inliers and rows 520.. are outliers; ten inliers from
    # each end of the inlier range are held out as the regular test set.
    points, _ = two_clusters_with_outliers(seed=seed, n_inliers=520, n_outliers=20)
    X_train = points[10:510]
    X_test = np.vstack([points[:10], points[510:520]])
    X_outliers = points[520:]

    # OCSVM hyperparameters
    nu, gamma = 0.05, 2.0

    # Exact kernel One-Class SVM
    exact_svm = OneClassSVM(gamma=gamma, kernel="rbf", nu=nu)
    exact_svm.fit(X_train)
    inspect_decision_boundary(exact_svm, X_train, X_test, X_outliers)

    # One-Class SVM using a kernel approximation and SGD
    approx_svm = make_pipeline(
        Nystroem(gamma=gamma, random_state=seed),
        SGDOneClassSVM(
            nu=nu, shuffle=True, fit_intercept=True, random_state=seed, tol=1e-4
        ),
    )
    approx_svm.fit(X_train)
    inspect_decision_boundary(approx_svm, X_train, X_test, X_outliers)


SVM_demo()

## Isolation Forest

* based on distance from root in a random forest classifier
* https://scikit-learn.org/dev/modules/outlier_detection.html#isolation-forest


In [None]:
def IsolationForest_demo():
    """Fit an IsolationForest on the two-cluster dataset and plot its decision function."""
    # Original implementation:
    # https://scikit-learn.org/dev/auto_examples/ensemble/plot_isolation_forest.html

    import matplotlib.pyplot as plt
    from sklearn.ensemble import IsolationForest
    from sklearn.inspection import DecisionBoundaryDisplay

    X, y = two_clusters_with_outliers()

    clf = IsolationForest(max_samples=100, random_state=0)
    clf.fit(X)

    disp = DecisionBoundaryDisplay.from_estimator(
        clf,
        X,
        response_method="decision_function",
        alpha=0.5,
        cmap='viridis'
    )
    scatter = disp.ax_.scatter(X[:, 0], X[:, 1], c=y, s=20, edgecolor="k", cmap='viridis')
    handles, _ = scatter.legend_elements()
    # Only one title is set: a preceding plt.title() call on the same axes
    # would be overwritten by this one.
    disp.ax_.set_title("Path length decision boundary \nof IsolationForest")
    disp.ax_.axis("square")
    # The color values are -1 (outlier) and 1 (inlier), hence this label order.
    disp.ax_.legend(handles=handles, labels=["outliers", "inliers"], title="true class")
    plt.colorbar(scatter)
    plt.show()

    # Split by the ground-truth labels rather than by a fixed row count, so the
    # call stays correct if the dataset sizes change.
    inspect_decision_boundary(clf, X, X[y == 1], X[y == -1])


IsolationForest_demo()

## Local Outlier Factor (LOF)

* based on $k$ nearest neighbors
* https://scikit-learn.org/dev/modules/outlier_detection.html#local-outlier-factor


In [None]:
def LOF_demo():
    """Run Local Outlier Factor on the two-cluster dataset and plot the scores.

    Every point is drawn with a red circle whose area grows with its outlier
    score; the x-axis label reports how many predictions differ from the
    ground truth.
    """
    # Original implementation:
    # https://scikit-learn.org/dev/auto_examples/neighbors/plot_lof_outlier_detection.html

    import matplotlib.pyplot as plt
    from matplotlib.legend_handler import HandlerPathCollection
    from sklearn.neighbors import LocalOutlierFactor

    X, ground_truth = two_clusters_with_outliers()

    detector = LocalOutlierFactor(n_neighbors=20, contamination=0.1)
    predicted = detector.fit_predict(X)
    n_errors = (predicted != ground_truth).sum()
    scores = detector.negative_outlier_factor_

    def update_legend_marker_size(handle, orig):
        "Customize size of the legend marker"
        handle.update_from(orig)
        handle.set_sizes([20])

    # Rescale the scores to [0, 1]: the lowest (most abnormal) score maps to 1.
    score_max, score_min = scores.max(), scores.min()
    radius = (score_max - scores) / (score_max - score_min)

    fig, ax = plt.subplots()
    ax.scatter(X[:, 0], X[:, 1], color="k", s=3.0, label="Data points")
    # circles with size proportional to the rescaled outlier scores
    circles = ax.scatter(
        X[:, 0],
        X[:, 1],
        s=1000 * radius,
        edgecolors="r",
        facecolors="none",
        label="Outlier scores",
    )
    ax.set_xlabel("prediction errors: %d" % (n_errors))
    marker_handler = HandlerPathCollection(update_func=update_legend_marker_size)
    ax.legend(handler_map={circles: marker_handler})
    ax.set_title("Local Outlier Factor (LOF)")
    plt.show()

LOF_demo()

# Anomaly detection in time series

* Takens' theorem, 1981
* time-delay embedding: interpret values in a sliding window as elements of a vector
* use a metric to find non-typical configurations

## Time series discord

In [23]:
def sine_with_baseline_shift(seed=20241217):
    """Generate 1000 samples of a noisy sine wave whose baseline jumps by 2.5.

    Samples 0..499 are a plain sine, samples 500..599 are replaced by a linear
    ramp from 0 to 2.5, and samples 600..999 are the sine shifted by 2.5.
    Gaussian noise with standard deviation 0.05 is added to every sample.

    returns:
    t: np.ndarray, shape=(1000,), time axis from 0 to 20*pi
    y: np.ndarray, shape=(1000,), signal values
    """
    n_samples, shift, noise = 1000, 2.5, 0.05
    rng = check_random_state(seed)

    t = np.linspace(0, 20 * np.pi, num=n_samples)
    signal = np.sin(t)
    signal[600:] += shift
    # The transition segment is overwritten by the ramp, not added to the sine.
    signal[500:600] = np.linspace(0, shift, num=100)

    return t, signal + noise * rng.randn(n_samples)


def plot_sine_with_baseline_shift(mark=(), windows=(), title=None):
    """Plot the baseline-shift signal with optional highlighted points and spans.

    Parameters:
    `mark`: iterable of sample indices drawn as red dots
    `windows`: iterable of (start_index, end_index) pairs shaded in gray;
               indices past the last sample are clipped to the last sample
    `title`: plot title, defaults to "Sine wave with baseline shift"
    """
    from matplotlib import pyplot as plt
    t, y = sine_with_baseline_shift()
    if title is None:
        title = "Sine wave with baseline shift"
    plt.plot(t, y)
    mark = list(mark)
    plt.scatter(t[mark], y[mark], c="red")

    last = len(t) - 1
    for window in windows:
        # A window given as (start, start + size) ends one past the last
        # sample when it touches the end of the series; clip instead of
        # raising IndexError.
        start = min(window[0], last)
        stop = min(window[1], last)
        plt.axvspan(t[start], t[stop], color="gray", alpha=0.5)
    plt.title(title)
    plt.show()


In [24]:
def time_series_discord(ts: np.ndarray, window_size: int, n_discords: int):
    """
    Find the `n_discords` most dissimilar subsequences in `ts` using a sliding window of size `window_size`.

    The discord score of a window is the Euclidean distance to its nearest
    non-overlapping window.

    Parameters:
    `ts`: np.ndarray, shape=(n_samples,)
    `window_size`: int
    `n_discords`: int, non-negative

    returns:
    np.ndarray with the start indices of the selected windows, ordered by
    increasing discord score (the strongest discord is the last element).
    At most one index per window is returned.

    Raises `ValueError` when `n_discords` is negative.
    """
    from scipy.spatial.distance import cdist

    if n_discords < 0:
        raise ValueError("n_discords must be non-negative")

    windows = np.lib.stride_tricks.sliding_window_view(ts, window_shape=(window_size,))
    n_windows = windows.shape[0]

    # Calculate pairwise Euclidean distances between windows
    distances = cdist(windows, windows, metric="euclidean")
    # Make overlapping windows (start indices closer than window_size) have
    # infinite distance. Building the mask from index differences also works
    # when there are fewer windows than window_size.
    starts = np.arange(n_windows)
    overlapping = np.abs(starts[:, None] - starts[None, :]) < window_size
    distances[overlapping] = np.inf
    # Note: if n_samples is large, this is very inefficient and memory-intensive
    # a more efficient way is to use cKDTree and discard overlapping vectors

    order = np.argsort(distances.min(axis=1))
    # Explicit start index: `order[-n_discords:]` would return every window
    # for n_discords == 0.
    n_selected = min(n_discords, n_windows)
    return order[n_windows - n_selected:]

In [None]:
def demo_time_series_discord():
    """Print the top-10 discords of the baseline-shift signal and highlight the strongest one."""
    window_size = 50
    t, y = sine_with_baseline_shift()
    discord_ids = time_series_discord(y, window_size=window_size, n_discords=10)
    print(discord_ids)

    # The ids come ordered by increasing discord score, so the strongest
    # discord is the last entry, not the first.
    top = int(discord_ids[-1])
    # Keep the end of the highlighted span inside the series.
    stop = min(top + window_size, len(y) - 1)
    plot_sine_with_baseline_shift(windows=[(top, stop)],
                                  title="Time series discord detection")

demo_time_series_discord()

## Temporal outlier factor

* anomalous events occur rarely, so points similar to them are concentrated in time

In [None]:
def temporal_outlier_factor(ts: np.ndarray, window_size: int, n_outliers: int, n_neighbors: int = 20):
    """
    Find the `n_outliers` most concentrated subsequences in `ts` using a sliding window of size `window_size`.

    For every window the mean absolute index distance to its nearest
    neighbours (Euclidean distance between windows) is computed and divided
    by the same quantity for equispaced reference indices. The windows with
    the smallest ratio are returned.

    Parameters:
    `ts`: np.ndarray, shape=(n_samples,)
    `window_size`: int
    `n_outliers`: int
    `n_neighbors`: int, number of nearest neighbours per window (default 20);
                   reduced to the number of other windows when there are fewer

    returns:
    np.ndarray with the start indices of the selected windows, smallest ratio first.

    Raises `ValueError` when `n_neighbors` is smaller than 1.
    """
    from scipy.spatial import cKDTree

    if n_neighbors < 1:
        raise ValueError("n_neighbors must be at least 1")

    windows = np.lib.stride_tricks.sliding_window_view(ts, window_shape=(window_size,))
    n_windows = windows.shape[0]
    if n_windows < 2:
        # No other window to compare with; nothing to rank.
        return np.arange(n_windows)[:n_outliers]

    # Never ask for more neighbours than exist: cKDTree pads missing
    # neighbours with the index n_windows, which would distort the mean below.
    num = min(n_neighbors, n_windows - 1)
    positions = np.arange(n_windows)[:, None]

    # Calculate how far similar sequences are in time.
    # Column 0 of the query result is dropped as the window itself.
    _, indices = cKDTree(windows).query(windows, k=num + 1)
    mean_time_gap = np.mean(np.abs(indices[:, 1:] - positions), axis=1)

    # Generate reference values because the achievable distance depends on the index
    # The paper gives an exact formula for this, std. dev can be calculated too
    # At least two reference points keep the reference strictly positive.
    equispaced = np.linspace(0, n_windows, num=max(num, 2))
    reference = np.mean(np.abs(equispaced - positions), axis=1)

    outlier_ids = np.argsort(mean_time_gap / reference)[:n_outliers]
    return outlier_ids

def demo_temporal_outlier_factor():
    """Print the 10 windows picked by the temporal outlier factor and mark their start samples."""
    _, signal = sine_with_baseline_shift()
    flagged = temporal_outlier_factor(signal, window_size=50, n_outliers=10)
    print(flagged)
    plot_sine_with_baseline_shift(
        mark=flagged,
        title="Temporal outlier factor detection",
    )

demo_temporal_outlier_factor()
    

# Anomaly detection with deep neural networks

### Dataset 3: pump sensor data

* Download from https://www.kaggle.com/datasets/nphantawee/pump-sensor-data

In [None]:
# Location of a manually downloaded copy of the dataset:
# the CSV is read from "<path>/sensor<ext>".
path = "../data"
ext = ".csv.zip"

In [None]:
import kagglehub

# Download latest version
# Alternative to the local copy: fetch the dataset through kagglehub.
# This overrides `path` and `ext` set for the manual download.
# NOTE(review): the CSV is later read from f"{path}/sensor{ext}" - confirm that
# the downloaded folder really contains "sensor.zip" and not "sensor.csv".
ext = '.zip'
path = kagglehub.dataset_download("nphantawee/pump-sensor-data")

print(f"Path to dataset files: {path}")

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, TimeSeriesSplit

# Load the sensor table; the first CSV column becomes the index.
df = pd.read_csv(f"{path}/sensor{ext}", index_col=0)
# Encode the label as 1 for 'NORMAL' rows and 0 for every other status.
df['machine_status'] = df['machine_status'].eq('NORMAL').astype(int)
# Drop columns that are entirely empty first, then every row that still has a gap.
df = df.dropna(axis=1, how='all')
df = df.dropna(axis=0, how='any')
print(df.shape, df['machine_status'].sum())
df.head()

In [None]:
# Fit the scaler on normal-operation rows only, then scale the whole table.
ss = StandardScaler().fit(
    df.loc[df['machine_status'] == 1].drop(columns=['timestamp', 'machine_status'])
)
scaled = ss.transform(df.drop(columns=['timestamp', 'machine_status']).fillna(0))
print(scaled.shape)

# Train and validate on normal rows only, split in time order (no shuffling).
train_dataset, val_dataset = train_test_split(scaled[df['machine_status'] == 1], shuffle=False)
# Turn each table into overlapping windows of 64 consecutive rows.
train_dataset, val_dataset, whole_dataset = (
    np.lib.stride_tricks.sliding_window_view(table, 64, axis=0)
    for table in (train_dataset, val_dataset, scaled)
)
print(train_dataset.shape, val_dataset.shape, whole_dataset.shape)

## Autoencoder

In [29]:
# Implement pytorch autoencoder for sequence length n_seq and number of features n_features
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

class Autoencoder(nn.Module):
    """Fully connected autoencoder: n_features -> 128 -> 64 -> 12 -> hidden_dim and back.

    ReLU follows every linear layer except the last one of each half; the
    decoder output passes through Tanh.
    """

    def __init__(self, n_features, hidden_dim):
        super().__init__()
        encoder_sizes = (n_features, 128, 64, 12, hidden_dim)
        decoder_sizes = encoder_sizes[::-1]
        # The encoder is built before the decoder so that the parameters are
        # created (and randomly initialised) in a fixed order.
        self.encoder = nn.Sequential(*self._stack(encoder_sizes))
        self.decoder = nn.Sequential(*self._stack(decoder_sizes), nn.Tanh())

    @staticmethod
    def _stack(sizes):
        """Return Linear layers for consecutive `sizes`, with ReLU between them."""
        layers = []
        for fan_in, fan_out in zip(sizes[:-1], sizes[1:]):
            layers += [nn.Linear(fan_in, fan_out), nn.ReLU(True)]
        # No activation after the final linear layer.
        return layers[:-1]

    def forward(self, x):
        """Encode `x` and return its reconstruction."""
        return self.decoder(self.encoder(x))

In [30]:
def train_model(model, train_dataset, val_dataset, criterion, optimizer, n_epochs=10, batch_size=512):
    """Train `model` to reconstruct its input and track the best validation epoch.

    Every batch is flattened to (batch, -1), moved to the module-level
    `device`, and used as both input and target of `criterion`.

    Parameters:
    `model`: torch module to train (updated in place)
    `train_dataset`, `val_dataset`: datasets accepted by `DataLoader`
    `criterion`: loss called as `criterion(prediction, target)`, returning a scalar
    `optimizer`: optimizer over the parameters of `model`
    `n_epochs`: number of passes over the training data (default 10)
    `batch_size`: batch size of both loaders (default 512)

    returns:
    model: the model with the weights of the LAST epoch
    history: dict with the per-epoch mean batch loss under 'train' and 'val'
    best_model_wts: state dict of the epoch with the lowest validation loss
    """
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    best_loss = np.inf
    # Defined up front: if the validation loss never drops below infinity
    # (NaN losses, empty validation set, zero epochs) the return statement
    # would otherwise hit an unbound name.
    best_model_wts = deepcopy(model.state_dict())
    history = dict(train=[], val=[])

    for _ in tqdm(range(n_epochs)):
        model = model.train()

        train_losses = []
        for seq_true in train_loader:
            seq_true = torch.flatten(seq_true, 1).to(device)
            seq_pred = model(seq_true)
            loss = criterion(seq_pred, seq_true)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            train_losses.append(loss.item())

        val_losses = []
        model = model.eval()
        with torch.no_grad():
            for seq_true in val_loader:
                seq_true = torch.flatten(seq_true, 1).to(device)
                seq_pred = model(seq_true)

                loss = criterion(seq_pred, seq_true)
                val_losses.append(loss.item())

        train_loss = np.mean(train_losses)
        val_loss = np.mean(val_losses)

        history['train'].append(train_loss)
        history['val'].append(val_loss)

        if val_loss < best_loss:
            best_loss = val_loss
            # Snapshot a copy: the live state dict keeps changing with training.
            best_model_wts = deepcopy(model.state_dict())
    return model, history, best_model_wts


In [None]:
# Each window is flattened before entering the network, so the input width is
# the product of the per-window dimensions.
model = Autoencoder(n_features=np.prod(train_dataset.shape[1:]), hidden_dim=8)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
# Summed absolute reconstruction error per batch.
criterion = nn.L1Loss(reduction='sum').to(device)


trained, history, best_model_wts = train_model(
    model=model.to(device),
    train_dataset=torch.tensor(train_dataset, dtype=torch.float32),
    val_dataset=torch.tensor(val_dataset, dtype=torch.float32),
    criterion=criterion,
    optimizer=optimizer,
)

In [33]:
def find_anomalies(model, test_dataset, criterion, weights=None):
    """Return the mean reconstruction error of every sample in `test_dataset`.

    Parameters:
    `model`: torch module; its weights are overwritten with `weights`
    `test_dataset`: dataset accepted by `DataLoader`
    `criterion`: element-wise loss (no reduction) called as `criterion(prediction, target)`
    `weights`: state dict to load into `model`; when omitted, the module-level
               `best_model_wts` is used

    returns:
    np.ndarray with one value per sample: the loss averaged over the flattened features
    """
    if weights is None:
        # Backward-compatible default: fall back to the training result kept
        # at module level.
        weights = best_model_wts
    model.load_state_dict(weights)
    model = model.eval()

    loader = DataLoader(test_dataset, batch_size=512, shuffle=False)
    losses = []
    with torch.no_grad():
        for seq_true in loader:
            seq_true = torch.flatten(seq_true, 1).to(device)
            seq_pred = model(seq_true)
            # Average the element-wise loss over the feature axis.
            loss = criterion(seq_pred, seq_true).mean(dim=1)
            losses.append(loss.cpu().numpy())
    return np.concatenate(losses, axis=0)

# Score every window of the full series, abnormal periods included.
# The loss must be element-wise (reduction='none') so it can be averaged per window.
losses = find_anomalies(
    model=model,
    test_dataset=torch.tensor(whole_dataset, dtype=torch.float32),
    criterion=nn.L1Loss(reduction='none').to(device),
)

In [None]:
def plot_losses():
    """Plot the per-window reconstruction loss together with the machine status.

    The status is drawn in red on the left axis and the loss on a twin right axis.
    """
    from matplotlib import pyplot as plt

    _, status_axis = plt.subplots()
    loss_axis = status_axis.twinx()

    loss_axis.plot(losses)
    loss_axis.set_ylabel('Loss')

    # NOTE(review): `losses` has one value per 64-row window and `df` one row
    # per sample, so the two curves differ in length - confirm the intended alignment.
    status_axis.plot(df['machine_status'].values, color='red')
    status_axis.set_ylabel('Machine status')
    status_axis.set_xlabel('Time')
    plt.show()

plot_losses()

## Read more...

* More implementations (RNN, LSTM): https://medium.com/@artur.shaikhatarov/anomaly-detection-using-recurrent-neural-networks-autoencoders-41bdf52d7b53
* Other datasets
  * https://developer.ibm.com/tutorials/iot-deep-learning-anomaly-detection-5/
  * https://github.com/claimed-framework/component-library/blob/master/component-library/anomaly/anomaly-score-unsupervised/test-anomaly-score-unsupervised.ipynb
  * https://compete.hexagon-ml.com/practice/competition/39