In [1]:
import numpy as np
from sklearn.covariance import MinCovDet

In [2]:
# Generate a random dataset with outliers
def generate_outlier_dataset(num_samples, num_features, outlier_fraction, outlier_scale=10.0):
    """Build a synthetic dataset of standard-normal inliers plus scaled outliers.

    Inlier rows are drawn from a standard normal distribution; outlier rows
    are standard-normal draws multiplied by ``outlier_scale``. Inliers come
    first in the result, followed by the outliers. Draws use NumPy's global
    random state, so call ``np.random.seed`` beforehand for reproducibility.

    Parameters
    ----------
    num_samples : int
        Total number of rows in the returned dataset.
    num_features : int
        Number of columns in the returned dataset.
    outlier_fraction : float
        Fraction of rows that are outliers; must lie in [0, 1].
    outlier_scale : float, optional
        Multiplier applied to the outlier draws (default 10.0).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(num_samples, num_features)``.

    Raises
    ------
    ValueError
        If ``outlier_fraction`` is outside [0, 1].
    """
    if not 0.0 <= outlier_fraction <= 1.0:
        raise ValueError(
            f"outlier_fraction must be between 0 and 1, got {outlier_fraction!r}"
        )

    # Round instead of truncating: products such as 0.29 * 100 evaluate to
    # 28.999999999999996, which int() alone would turn into 28.
    num_outliers = int(round(outlier_fraction * num_samples))
    # Derive the inlier count as the remainder so both groups always add up
    # to exactly num_samples.
    num_inliers = num_samples - num_outliers

    # Inliers are drawn before outliers; keeping this order keeps seeded
    # results stable.
    inliers = np.random.randn(num_inliers, num_features)
    outliers = outlier_scale * np.random.randn(num_outliers, num_features)
    return np.vstack([inliers, outliers])

In [3]:
# Estimate covariance matrix using the sample covariance and MCD estimator
def estimate_covariance_matrix(dataset):
    """Compute the sample covariance and the MCD covariance of a dataset.

    Parameters
    ----------
    dataset : array-like
        Data matrix in which each column is treated as a variable
        (``rowvar=False`` is passed to ``np.cov``).

    Returns
    -------
    tuple
        ``(sample_cov, mcd_cov)``: the ``np.cov`` result followed by the
        ``covariance_`` attribute of a default ``MinCovDet`` fitted on the
        same data.
    """
    # Sample covariance first, then the MinCovDet fit.
    classical_estimate = np.cov(dataset, rowvar=False)
    robust_estimate = MinCovDet().fit(dataset).covariance_
    return classical_estimate, robust_estimate

In [4]:
def compare_covariance_matrices(sample_cov, mcd_cov):
    """Print both covariance matrices followed by their difference.

    Parameters
    ----------
    sample_cov : array-like
        The sample covariance matrix.
    mcd_cov : array-like
        The MCD covariance matrix.

    The printed difference is ``sample_cov - mcd_cov``. Nothing is returned.
    """
    # sep="\n" puts each heading and its matrix on consecutive lines, exactly
    # as two separate print calls would.
    print("Sample Covariance Matrix:", sample_cov, sep="\n")
    print("\nMCD Covariance Matrix:", mcd_cov, sep="\n")

    # Emit the heading before subtracting so it still appears even if the
    # subtraction itself fails.
    print("\nDifference between the matrices:")
    gap = sample_cov - mcd_cov
    print(gap)

In [5]:
# Experiment driver: for each contamination level, generate a dataset and
# print the sample covariance, the MCD covariance, and their difference.

# Seed NumPy's global random state once so the random draws made by
# generate_outlier_dataset repeat from run to run.
np.random.seed(42)
# Total rows and columns requested for every generated dataset.
num_samples = 100
num_features = 3
# Fractions of outlier rows to compare.
outlier_fractions = [0.1, 0.2, 0.3]
# The seed is not reset inside the loop, so each iteration continues the same
# random stream instead of restarting it.
for outlier_fraction in outlier_fractions:
    # int() truncates the percentage shown in the heading.
    print(f"\nGenerating dataset with {int(outlier_fraction * 100)}% outliers:")
    dataset = generate_outlier_dataset(num_samples, num_features, outlier_fraction)
    sample_cov, mcd_cov = estimate_covariance_matrix(dataset)
    print("\nComparing covariance matrices:")
    compare_covariance_matrices(sample_cov, mcd_cov)


Generating dataset with 10% outliers:

Comparing covariance matrices:
Sample Covariance Matrix:
[[ 9.77826035 -3.70391397  3.45120391]
 [-3.70391397  7.37128737 -4.96073006]
 [ 3.45120391 -4.96073006  9.5865595 ]]

MCD Covariance Matrix:
[[ 0.54165739 -0.04922735 -0.10961889]
 [-0.04922735  0.91676002 -0.15491047]
 [-0.10961889 -0.15491047  0.94779386]]

Difference between the matrices:
[[ 9.23660296 -3.65468662  3.5608228 ]
 [-3.65468662  6.45452735 -4.8058196 ]
 [ 3.5608228  -4.8058196   8.63876564]]

Generating dataset with 20% outliers:

Comparing covariance matrices:
Sample Covariance Matrix:
[[23.51572186 -7.1339849   0.84439369]
 [-7.1339849  22.56958587 -2.88239039]
 [ 0.84439369 -2.88239039 17.96732802]]

MCD Covariance Matrix:
[[ 0.84914283 -0.0393283  -0.06345819]
 [-0.0393283   0.89745006  0.03037772]
 [-0.06345819  0.03037772  0.97531991]]

Difference between the matrices:
[[22.66657903 -7.0946566   0.90785188]
 [-7.0946566  21.67213581 -2.91276812]
 [ 0.90785188 -2.91276