# Overview

In this notebook we will detect outliers with PyCaret using several detection strategies, and then compare the results.

### Detecting Outliers using Pycaret

In [None]:
from pycaret.anomaly import AnomalyExperiment
from pycaret.datasets import get_data
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA

In [None]:
# Fetch the 'anomaly' sample dataset through pycaret.datasets
data = get_data('anomaly')

# Configure the experiment: fixed session_id, with normalization and
# transformation of the features switched on
exp = AnomalyExperiment()
exp.setup(data, session_id=123, normalize=True, transformation=True)

# Detector IDs in training order ('cblof' = Cluster-Based Local Outlier Factor,
# 'svm' = One-Class SVM)
model_ids = ['iforest', 'knn', 'lof', 'cblof', 'svm']

# Fit every detector first ...
fitted_models = {model_id: exp.create_model(model_id) for model_id in model_ids}
iforest, knn, lof, cblof, svm = (fitted_models[model_id] for model_id in model_ids)

# ... then label the data with each fitted detector, in the same order
labelled_frames = {model_id: exp.assign_model(fitted_models[model_id]) for model_id in model_ids}
iforest_results, knn_results, lof_results, cblof_results, svm_results = (
    labelled_frames[model_id] for model_id in model_ids
)

In [None]:
# Start from a copy of the original data and append one label column per
# detector, so all five assignments can be compared side by side
anomaly_df = data.copy().assign(
    IForest_Anomaly=iforest_results['Anomaly'],
    KNN_Anomaly=knn_results['Anomaly'],
    LOF_Anomaly=lof_results['Anomaly'],
    CBLOF_Anomaly=cblof_results['Anomaly'],
    SVM_Anomaly=svm_results['Anomaly'],
)

In [None]:
# Persist the combined table to disk, leaving the row index out of the file
anomaly_df.to_csv(path_or_buf='anomaly_comparison.csv', index=False)


### Analysis of the Detected Anomalies

We now examine the anomalies flagged by each model, using the single combined dataset.

In [None]:
# Explore dataset statistics
print("Dataset Summary:")
print(anomaly_df.describe())

# Restrict the per-model analysis to the label columns. Slicing with
# iloc[:, 1:] only skipped the first column, so the remaining feature columns
# were being counted and correlated as if they were detector output.
anomaly_columns = [col for col in anomaly_df.columns if str(col).endswith('_Anomaly')]
anomaly_labels = anomaly_df[anomaly_columns]

# Count anomalies detected by each model (a label of 1 marks an anomaly)
anomaly_counts = anomaly_labels.eq(1).sum()
print("\nAnomaly Counts per Model:")
print(anomaly_counts)

# Correlation analysis among anomaly detections: how strongly the models'
# 0/1 assignments agree with each other
correlation_matrix = anomaly_labels.corr()
print("\nCorrelation Matrix:")
print(correlation_matrix)

# Visualize correlations using a heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Correlation Matrix of Anomaly Assignments")
plt.show()

### Visualisation of the Detected Anomalies

In [None]:
def plot_anomaly_scatter(df, model_column, feature1, feature2):
    """Scatter two columns of ``df`` against each other, coloured by one label column.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing ``feature1``, ``feature2`` and ``model_column``.
    model_column : str
        Column whose values select the point colour (0 -> blue, 1 -> red).
    feature1, feature2 : str
        Columns plotted on the x and y axes respectively.
    """
    _fig, ax = plt.subplots(figsize=(8, 6))
    label_colours = {0: "blue", 1: "red"}
    sns.scatterplot(
        x=df[feature1],
        y=df[feature2],
        hue=df[model_column],
        palette=label_colours,
        ax=ax,
    )
    ax.set_xlabel(feature1)
    ax.set_ylabel(feature2)
    ax.set_title(f'Anomalies Detected by {model_column}')
    ax.legend(title="Anomaly (1=Yes, 0=No)")
    plt.show()

# Example usage: plot the first two columns of the combined table. These are
# feature columns, because the *_Anomaly label columns were appended after the
# original data. The previous placeholder names 'Feature1'/'Feature2' are not
# guaranteed to exist in the dataset and raise a KeyError when they do not.
feature_x, feature_y = anomaly_df.columns[:2]
plot_anomaly_scatter(anomaly_df, 'IForest_Anomaly', feature_x, feature_y)

Since datasets often have multiple features, PCA reduces them to two principal components for visualization.


In [None]:
def plot_pca_anomalies(df, model_column):
    """Project the feature columns onto two principal components and plot them,
    coloured by one model's anomaly labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the feature columns plus one or more ``*_Anomaly`` label
        columns. All non-label columns are passed to PCA.
    model_column : str
        Label column used to colour the points (0 -> blue, 1 -> red).

    Raises
    ------
    KeyError
        If ``model_column`` is not a column of ``df``.
    """
    # Treat every '*_Anomaly' column (and the requested label column) as a
    # label rather than a feature. Detecting them by suffix, instead of a
    # hard-coded list of five names, keeps the function usable on tables that
    # hold a different set of models and stops labels leaking into the PCA.
    label_columns = [col for col in df.columns if str(col).endswith('_Anomaly')]
    if model_column not in label_columns:
        label_columns.append(model_column)
    features = df.drop(columns=label_columns)

    pca = PCA(n_components=2)
    principal_components = pca.fit_transform(features)
    df_pca = pd.DataFrame(principal_components, columns=['PC1', 'PC2'])
    # Attach labels positionally: df_pca has a fresh RangeIndex, so assigning
    # the Series directly would align on index and yield NaN whenever df does
    # not carry a default 0..n-1 index.
    df_pca[model_column] = df[model_column].to_numpy()

    plt.figure(figsize=(8, 6))
    sns.scatterplot(x=df_pca['PC1'], y=df_pca['PC2'], hue=df_pca[model_column], palette={0: "blue", 1: "red"})
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.title(f'PCA Projection of Anomalies ({model_column})')
    plt.legend(title="Anomaly (1=Yes, 0=No)")
    plt.show()

# Example: PCA view of the points, coloured by the Isolation Forest labels
plot_pca_anomalies(df=anomaly_df, model_column='IForest_Anomaly')

Finally, we plot the distribution of anomaly labels produced by each model so they can be compared side by side.


In [None]:

def plot_anomaly_comparison(*datasets, labels):
    """Overlay histograms of the ``'Anomaly'`` column of several result tables.

    Parameters
    ----------
    *datasets
        Result tables, each exposing an ``'Anomaly'`` column of 0/1 labels.
    labels : sequence of str
        Legend entry for each dataset, in the same order as ``datasets``.

    Raises
    ------
    ValueError
        If the number of labels differs from the number of datasets.
    """
    labels = list(labels)
    # zip() would silently drop the unmatched datasets/labels, so fail loudly
    # before anything is drawn.
    if len(labels) != len(datasets):
        raise ValueError(
            f"expected {len(datasets)} labels (one per dataset), got {len(labels)}"
        )

    # 'Anomaly' holds binary labels, so use one fixed bin centred on 0 and one
    # on 1. bins=3 produced an always-empty middle bin and bin edges that
    # depended on each dataset's own min/max.
    label_bins = [-0.5, 0.5, 1.5]

    plt.figure(figsize=(10, 6))
    for dataset, label in zip(datasets, labels):
        plt.hist(dataset['Anomaly'], bins=label_bins, alpha=0.5, label=label)
    plt.xticks([0, 1])
    # The plotted column is the 0/1 label, not a continuous score.
    plt.xlabel('Anomaly Label (0 = normal, 1 = anomaly)')
    plt.ylabel('Frequency')
    plt.title('Comparison of Anomalies Across Models')
    plt.legend()
    plt.show()

# Compare the label distributions of all five detectors in a single figure
results_by_model = {
    'IForest': iforest_results,
    'KNN': knn_results,
    'LOF': lof_results,
    'CBLOF': cblof_results,
    'SVM': svm_results,
}
plot_anomaly_comparison(*results_by_model.values(), labels=list(results_by_model))