# Outliers Detect Package Demonstration

This notebook demonstrates the use of the `outliers_detect` package for outlier detection in embeddings.

In [1]:
import ast

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from outliers_detect import outliers_detect_df

## 1. Creating Sample Data

Let's create a dataset of normally distributed embeddings centered at the origin, plus a few outliers centered far away from them, to demonstrate the different detection methods.

In [None]:
# Fix the global NumPy seed so every run draws the same samples
np.random.seed(42)

# Inlier cloud: standard-normal points around the origin
n_samples = 100
embedding_dim = 3
normal_embeddings = np.random.normal(loc=0, scale=1, size=(n_samples, embedding_dim))

# Five planted outliers: same spread, but centred at 5 on every axis
outlier_embeddings = np.random.normal(loc=5, scale=1, size=(5, embedding_dim))

# Inliers come first, so the planted outliers occupy the last five rows
all_embeddings = np.concatenate((normal_embeddings, outlier_embeddings), axis=0)

# One row per embedding; each vector is stored as its list-literal string
embedding_strings = [str(row) for row in all_embeddings.tolist()]
df = pd.DataFrame({
    'id': range(len(embedding_strings)),
    'embedding': embedding_strings,
})

print(f"Total number of samples: {len(df)}")
print(f"Number of known outliers: {len(outlier_embeddings)}")
df.head()

## 2. Visualizing the Data

Let's visualize the data in 3D to get an idea of the point distribution.

In [None]:
# Axes3D is not referenced below; older Matplotlib releases needed this import
# to register the '3d' projection, so it is kept for compatibility.
from mpl_toolkits.mplot3d import Axes3D

fig, ax = plt.subplots(figsize=(10, 8), subplot_kw={'projection': '3d'})

# (points, scatter styling) for each group, drawn in this order
point_groups = [
    (normal_embeddings, {'c': 'blue', 'label': 'Normal', 'alpha': 0.6}),
    (outlier_embeddings, {'c': 'red', 'label': 'Outlier', 's': 100}),
]
for points, style in point_groups:
    # The first three embedding dimensions map to the x, y and z axes
    ax.scatter(points[:, 0], points[:, 1], points[:, 2], **style)

ax.set_xlabel('Dimension 1')
ax.set_ylabel('Dimension 2')
ax.set_zlabel('Dimension 3')
ax.legend()
ax.set_title('Embeddings Distribution')
plt.show()

## 3. Testing Different Detection Methods

Let's test each of the available methods and compare their results.

In [None]:
# Detection strategies to compare, in the order they will be run
methods = ['percentile', 'zscore', 'iqr', 'cosine', 'pca_reconstruction']
results = {}

# Every threshold is passed on each call, whichever method is selected
# (presumably each method only reads the ones it needs; confirm in the package docs)
threshold_settings = {
    'percentile_threshold': 95,
    'z_threshold': 2.0,
    'cosine_threshold': 0.10,
}

for method in methods:
    print(f"\nTesting method: {method}")
    result = outliers_detect_df(df, method=method, plot=True, **threshold_settings)

    # Summing the 'is_outlier' column counts the flagged rows
    n_outliers = result['is_outlier'].sum()
    outlier_pct = (n_outliers / len(df)) * 100
    print(f"Number of outliers found: {n_outliers}")
    print(f"Percentage of outliers: {outlier_pct:.2f}%")

    # Keep the full result so later cells can compare and plot it
    results[method] = result

## 4. Comparing Results

Let's compare the results from different methods to see how they behave.

In [None]:
# Count the flagged rows once per method, then derive the percentage from it
outlier_counts = [results[m]['is_outlier'].sum() for m in methods]

# One summary row per detection method, in the order of `methods`
comparison = pd.DataFrame({
    'Method': methods,
    'Number of Outliers': outlier_counts,
    'Percentage': [(count / len(df)) * 100 for count in outlier_counts],
})

comparison

## 5. Visualizing Detected Outliers

Let's visualize the outliers found by each method, drawing one 3D plot per method.

In [None]:
def plot_outliers(method, result, data=None):
    """Draw a 3D scatter plot of the embeddings with flagged outliers highlighted.

    Parameters
    ----------
    method : str
        Name of the detection method; only used in the plot title.
    result : pandas.DataFrame
        Detection output with an ``is_outlier`` column. The flags are applied
        positionally, so they are assumed to be in the same row order as the
        embeddings (TODO confirm against the package's return value).
    data : pandas.DataFrame, optional
        Frame whose ``embedding`` column stores each vector as a list-literal
        string such as ``"[0.1, 0.2, 0.3]"``. Defaults to the notebook-level
        ``df``, which keeps existing two-argument calls working.
    """
    source = df if data is None else data

    # literal_eval only parses Python literals, so a malformed or malicious
    # string cannot run arbitrary code the way eval() would.
    embeddings = np.array([ast.literal_eval(emb) for emb in source['embedding']])

    # Use plain boolean NumPy masks: `~` is then always a logical NOT, even if
    # the flags are stored as 0/1 integers rather than booleans.
    mask_outlier = result['is_outlier'].to_numpy(dtype=bool)
    mask_normal = ~mask_outlier

    fig, ax = plt.subplots(figsize=(10, 8), subplot_kw={'projection': '3d'})

    # Only the first three embedding dimensions are drawn (x, y, z)
    ax.scatter(embeddings[mask_normal, 0], embeddings[mask_normal, 1], embeddings[mask_normal, 2],
               c='blue', label='Normal', alpha=0.6)
    ax.scatter(embeddings[mask_outlier, 0], embeddings[mask_outlier, 1], embeddings[mask_outlier, 2],
               c='red', label='Outlier', s=100)

    ax.set_xlabel('Dimension 1')
    ax.set_ylabel('Dimension 2')
    ax.set_zlabel('Dimension 3')
    ax.legend()
    ax.set_title(f'Detected Outliers - Method: {method}')
    plt.show()

# One figure per detection method, in the order the methods were run
for method in methods:
    plot_outliers(method, results[method])

## 6. Adjusting Parameters

Let's see how the results change when we vary the `percentile_threshold` parameter of the percentile method.

In [None]:
# Thresholds to sweep, from least to most strict
percentiles = [90, 95, 97.5, 99]
percentile_results = {}

for threshold in percentiles:
    print(f"\nTesting percentile: {threshold}")
    result = outliers_detect_df(df, method='percentile', percentile_threshold=threshold, plot=True)

    # Summing the 'is_outlier' column counts the flagged rows
    n_outliers = result['is_outlier'].sum()
    outlier_pct = (n_outliers / len(df)) * 100
    print(f"Number of outliers found: {n_outliers}")
    print(f"Percentage of outliers: {outlier_pct:.2f}%")

    # Results are keyed by the threshold that produced them
    percentile_results[threshold] = result