In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.ensemble import IsolationForest # new!
from sklearn import set_config
set_config(transform_output='pandas')

In [5]:
from sklearn.datasets import make_blobs
# Create dataset with 3 centers
X_, y_ = make_blobs(n_samples=300, centers=3, random_state=321)
df = pd.DataFrame(X_, columns=['x1','x2'])
df.head()

Unnamed: 0,x1,x2
0,7.793419,-7.149916
1,8.783409,-9.979164
2,7.467731,1.424818
3,9.204612,-5.804796
4,10.24253,-3.605225


In [6]:
# Define the original features
X = df[['x1','x2']]
X.head()

Unnamed: 0,x1,x2
0,7.793419,-7.149916
1,8.783409,-9.979164
2,7.467731,1.424818
3,9.204612,-5.804796
4,10.24253,-3.605225


In [7]:
# Isolation Forest

# Instantiate the model with a contaimination of 0.05 (we will identify 5% as anamolous)
iso_05 = IsolationForest(contamination=0.05, random_state = 42)
# fit the model using .values to avoid a warning
iso_05.fit(X.values)

In [8]:
# Obtain results from the model
predictions = iso_05.predict(X.values)
predictions[:100]

array([ 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
       -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1, -1,  1,  1,  1,  1,  1,  1, -1,  1,  1, -1,  1,  1])

In [9]:
# Change the labels to match our columns from the kmeans dataframe
# Not anamolies
predictions[predictions ==1] = 0
# Anomalies
predictions[predictions ==-1] = 1
# Preview new labels
predictions[:100]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0])

In [10]:
# Add isolation forest predictions to the dataframe
df['anomaly-iso_05'] = predictions
# Preview dataframe with new column
df.head()

Unnamed: 0,x1,x2,anomaly-iso_05
0,7.793419,-7.149916,0
1,8.783409,-9.979164,0
2,7.467731,1.424818,0
3,9.204612,-5.804796,0
4,10.24253,-3.605225,0


In [11]:
# Comparing number of anomalies found
kmeans_anomalies = df['anomaly-kmeans'].sum()
print(f'Kmeans (threshold = 0.95) identified {kmeans_anomalies} anomalies.')
isoforest_anomalies = df['anomaly-iso_05'].sum()
print(f'Isoforest (contaminatin = 0.05) identified {isoforest_anomalies} anomalies.')

KeyError: 'anomaly-kmeans'

In [None]:
# Plot the original data points
fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(df['x1'], df['x2'], ec= 'black', alpha = 0.5)
ax.set_title("Identified Anomalies")  
    
# Plot the anomalies identified by the isolation forest
df.loc[idx_anomalies].plot(
    x="x1", y="x2", kind="scatter", color="magenta", label="Anomalies (Isolation Forest)", 
    ax=ax, marker='P', s=50);

In [None]:
# Define anomalies identified by kmeans
kmeans_95_anomalies = df[df['anomaly-kmeans'] == 1]
# Define anomalies identified by isolation forest
iso_05_anomalies = df[df['anomaly-iso_05'] == 1]
# Define anomalies identified by both models
both_anomalies = df[(df['anomaly-iso_05'] == 1)& (df['anomaly-kmeans'] == 1)]
# Plot the data points and identified clusters
fig, ax= plt.subplots(figsize=(8, 8))
# Plot data
ax.scatter(df["x1"], df["x2"], ec = 'black', alpha=0.5)
ax.set_title("Comparing Models")
# Annotate KMeans anomalies
kmeans_95_anomalies.plot(x="x1", y="x2", kind="scatter", color="magenta", label="Anomalies only KMeans Distance", ax=ax, marker='P',s=50)
# Annotate Iso Forest anomalies
iso_05_anomalies.plot(x="x1", y="x2", kind="scatter", color="blue", label="Anomalies only Isolation Forest", ax=ax, marker='P',s=50)
# Annotate Anomolies identified by both models
both_anomalies.plot(x="x1", y="x2", kind="scatter", color="red", label="Anomalies Models Agree on", ax=ax, marker='P',s=50);

In [None]:
# Change Contamination Parameter

# Instantiate the model with a contaimination of 0.20 (we will identify 20% as anamolous)
iso_20 = IsolationForest(contamination=0.20, random_state = 42)
# fit the model using .values to avoid a warning
iso_20.fit(X.values)
# Obtain results from the model
predictions = iso_20.predict(X.values)
predictions[:100]

In [None]:
# Change the labels to match our columns from the kmeans dataframe
# Not anamolies
predictions[predictions ==1] = 0
# Anomalies
predictions[predictions ==-1] = 1
# Add isolation forest predictions to the dataframe
df['anomaly-iso_20'] = predictions
# Preview dataframe with new column
df.head()

In [None]:
# Comparing number of anomalies found with different contamination values
isoforest_anomalies_05 = df['anomaly-iso_05'].sum()
print(f'Isoforest (contaminatin = 0.05) identified {isoforest_anomalies_05} anomalies.')
isoforest_anomalies_20 = df['anomaly-iso_20'].sum()
print(f'Isoforest (contaminatin = 0.20) identified {isoforest_anomalies_20} anomalies.')

In [None]:
# Create a filter to identify anomalies
idx_anomalies = df[df['anomaly-iso_20'] == 1].index
idx_anomalies
# Plot the original data points
fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(df['x1'], df['x2'], ec= 'black', alpha = 0.5)
ax.set_title("Identified Anomalies")  
    
# Plot the anomalies identified by the isolation forest
df.loc[idx_anomalies].plot(
    x="x1", y="x2", kind="scatter", color="magenta", label="Anomalies (Isolation Forest)", 
    ax=ax, marker='P', s=50);