## Exercise 1: K-Means Anomaly Detection
Use the K-Means algorithm to detect anomalies in a two-dimensional synthetic dataset. Consider points that are more than two standard deviations from the nearest cluster centroid as anomalies.

In [None]:
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans, DBSCAN
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.neighbors import LocalOutlierFactor

In [None]:
def kmeans_anomaly_detection(X, n_clusters=3, threshold=2):
    """Detect anomalies as points unusually far from their K-Means centroid.

    Every sample is assigned to a cluster, and its Euclidean distance to that
    cluster's centroid is measured. A sample is reported as an anomaly when
    its distance exceeds ``mean + threshold * std`` of all such distances.

    Parameters:
    X (array-like): The input features for clustering, one row per sample.
    n_clusters (int): The number of clusters to form.
    threshold (float): How many standard deviations above the mean
        point-to-centroid distance a sample must lie to count as an anomaly.

    Returns:
    anomalies (ndarray): The indices of the anomalies in the dataset.
    """
    X = np.asarray(X)

    # n_init is pinned explicitly so the result does not shift (and no
    # FutureWarning is emitted) on scikit-learn versions that change the
    # default number of initialisations.
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    labels = kmeans.fit_predict(X)
    centroids = kmeans.cluster_centers_

    # Distance from each point to the centroid of the cluster it was assigned
    distances = np.linalg.norm(X - centroids[labels], axis=1)

    # Distances are non-negative and centred well above zero, so the cut-off
    # has to be measured from their mean. Comparing against threshold * std
    # alone puts the cut-off near the typical distance and flags a large
    # share of ordinary points.
    cutoff = distances.mean() + threshold * distances.std()

    return np.flatnonzero(distances > cutoff)

# Generate synthetic data: 300 points drawn from 4 blobs (fixed seed)
X, _ = make_blobs(n_samples=300, centers=4, cluster_std=0.60, random_state=0)

# Detect anomalies. Cluster with as many centroids as there are blobs so that
# no two blobs are forced to share a centroid, which would inflate distances.
anomalies = kmeans_anomaly_detection(X, n_clusters=4)
print(f'Anomalies detected at indices: {anomalies}')


Anomalies detected at indices: [  0   2   5   6   7   8  10  11  12  16  19  22  23  24  27  28  32  38
  41  42  43  44  47  50  56  57  62  64  70  73  74  76  77  85  87  93
  95  98  99 101 104 107 109 110 115 119 127 129 131 132 135 136 142 143
 144 145 151 152 157 159 164 165 166 167 169 170 173 181 182 183 185 186
 187 192 194 195 196 197 198 200 203 204 205 206 208 210 211 213 218 223
 225 229 230 235 239 241 242 247 249 250 251 253 254 255 260 261 262 266
 268 271 272 273 274 277 282 290 298 299]


  super()._check_params_vs_input(X, default_n_init=10)


## Exercise 2: DBSCAN Anomaly Detection
Use the DBSCAN algorithm to detect anomalies in the same synthetic dataset. Consider points classified as '-1' by DBSCAN as anomalies.

In [None]:
def dbscan_anomaly_detection(X, eps=0.5, min_samples=5):
    """Report the samples that DBSCAN labels as -1.

    Parameters:
    X (array-like): The input features for clustering.
    eps (float): Passed to DBSCAN as ``eps``: the maximum distance between two samples for one to count as a neighbor of the other.
    min_samples (int): Passed to DBSCAN as ``min_samples``: the number of samples (or total weight) a neighborhood needs for a point to be a core point.

    Returns:
    anomalies (array-like): The indices of the anomalies in the dataset.
    """
    # Cluster the data and collect one label per sample
    labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(X)

    # A label of -1 marks a sample as an anomaly
    is_noise = labels == -1
    return np.flatnonzero(is_noise)

# Detect anomalies using DBSCAN on the synthetic dataset X generated above,
# relying on the function's default eps=0.5 and min_samples=5, then print the
# indices of the samples it labelled -1.
anomalies_dbscan = dbscan_anomaly_detection(X)
print(f'DBSCAN Anomalies detected at indices: {anomalies_dbscan}')


DBSCAN Anomalies detected at indices: [  5  25  42  62  88 143 152 166 174 205 218 242 249 256 273 274 290 298]


## Exercise 3: Isolation Forest Anomaly Detection
Apply an Isolation Forest to identify anomalies in the dataset. Use the anomaly score to determine if a point is an anomaly or not.

In [None]:

def isolation_forest_anomaly_detection(X, contamination=0.1):
    """Report the samples that an Isolation Forest predicts as -1.

    Parameters:
    X (array-like): The input features for anomaly detection.
    contamination (float): Passed to IsolationForest as ``contamination``: the proportion of outliers in the data set.

    Returns:
    anomalies (array-like): The indices of the anomalies in the dataset.
    """
    # Seeded so repeated runs on the same data give the same forest
    model = IsolationForest(contamination=contamination, random_state=42)
    verdicts = model.fit_predict(X)

    # A prediction of -1 marks a sample as an anomaly
    outlier_mask = verdicts == -1
    return np.flatnonzero(outlier_mask)

# Detect anomalies using Isolation Forest on the synthetic dataset X generated
# above, relying on the function's default contamination=0.1, then print the
# indices of the samples it predicted as -1.
anomalies_iso_forest = isolation_forest_anomaly_detection(X)
print(f'Isolation Forest Anomalies detected at indices: {anomalies_iso_forest}')


Isolation Forest Anomalies detected at indices: [  5   8  11  13  28  35  37  56  62  66  68  88  92  99 101 140 144 165
 166 196 218 229 238 242 250 273 274 285 290 298]


## Exercise 4: One-Class SVM Anomaly Detection
Implement anomaly detection using a One-Class SVM on the provided dataset. Choose an appropriate kernel and adjust the `nu` parameter, which is an upper bound on the fraction of training points treated as outliers (and a lower bound on the fraction of support vectors).

In [None]:
def one_class_svm_anomaly_detection(X, nu=0.05, kernel='rbf', gamma='scale'):
    """Report the samples that a One-Class SVM predicts as -1.

    The model is fitted on X and then asked to classify the same X.

    Parameters:
    X (array-like): The input features for anomaly detection.
    nu (float): Passed to OneClassSVM as ``nu``: an upper bound on the fraction of training errors and a lower bound of the fraction of support vectors.
    kernel (str): Passed to OneClassSVM as ``kernel``: the kernel type to be used in the algorithm.
    gamma (str): Passed to OneClassSVM as ``gamma``: kernel coefficient for 'rbf', 'poly' and 'sigmoid'.

    Returns:
    anomalies (array-like): The indices of the anomalies in the dataset.
    """
    model = OneClassSVM(kernel=kernel, gamma=gamma, nu=nu)

    # fit() returns the fitted estimator, so training and scoring chain
    labels = model.fit(X).predict(X)

    # A prediction of -1 marks a sample as an anomaly
    return np.flatnonzero(labels == -1)

# Detect anomalies using One-Class SVM on the synthetic dataset X generated
# above, relying on the function's defaults (nu=0.05, kernel='rbf',
# gamma='scale'), then print the indices of the samples it predicted as -1.
anomalies_oc_svm = one_class_svm_anomaly_detection(X)
print(f'One-Class SVM Anomalies detected at indices: {anomalies_oc_svm}')


One-Class SVM Anomalies detected at indices: [  5   6  11  28  62  99 101 165 218 229 238 250 290]
