In [32]:
import numpy as np
from sklearn.metrics import balanced_accuracy_score

contamination = 0.1

samples = 1000
features = 3

mu = np.array([3, 7, 2])
print("Mean:", mu)

cov = np.array([[3, 3, 2],
                [2, 9, 4],
                [1, 5, 6]])
cov = cov + np.eye(features) * 0.1  # to make sure it's positive definite
print("Covariance Matrix:\n", cov)


outliers = int(contamination * samples)
inliners = samples - outliers

data_inliners = np.random.randn(inliners, features)
data_outliers = np.random.randn(outliers, features) * 5

L = np.linalg.cholesky(cov)
data = np.vstack((data_inliners, data_outliers))
data = data @ L.T + mu

# marking the labels
labels = np.hstack((np.zeros(inliners), np.ones(outliers)))

z_score = (data - np.mean(data, axis=0)) / np.std(data, axis=0)
z_aggr = np.max(np.abs(z_score), axis=1)
threshold = np.quantile(z_aggr, 1 - contamination)

y_pred = (np.abs(z_aggr) > threshold).astype(int)

print("Balanced Accuracy:", balanced_accuracy_score(labels, y_pred))

Mean: [3 7 2]
Covariance Matrix:
 [[3.1 3.  2. ]
 [2.  9.1 4. ]
 [1.  5.  6.1]]
Balanced Accuracy: 0.9555555555555556
