In [1]:
# Question: Multivariate Outlier Detection Using Mahalanobis Distance
# Description: Implement Mahalanobis distance to detect multivariate outliers in a dataset.



In [2]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import mahalanobis
from scipy.linalg import inv
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Create a DataFrame with potential multivariate outliers
data_multi = {'Feature_A': [1, 2, 3, 4, 5, 10],
              'Feature_B': [6, 7, 8, 9, 10, 20],
              'Feature_C': [11, 12, 13, 14, 15, 50]}
df_multi = pd.DataFrame(data_multi)

print("Original Multivariate DataFrame:\n", df_multi)

# 2. Calculate the Mahalanobis Distance for each data point
# a. Calculate the mean vector of the data
mean_vector = df_multi.mean().values

# b. Calculate the covariance matrix of the data
covariance_matrix = df_multi.cov().values

# c. Calculate the inverse of the covariance matrix
inv_covariance = inv(covariance_matrix)

# d. Calculate the Mahalanobis distance for each point
mahalanobis_distances = []
for index, row in df_multi.iterrows():
    diff = row.values - mean_vector
    mahalanobis_dist = np.sqrt(diff.reshape(1, -1) @ inv_covariance @ diff.reshape(-1, 1))
    mahalanobis_distances.append(mahalanobis_dist[0][0])

df_multi['Mahalanobis_Distance'] = mahalanobis_distances
print("\nDataFrame with Mahalanobis Distances:\n", df_multi)

# 3. Identify potential outliers based on a threshold
# The Mahalanobis distance follows a chi-squared distribution with k degrees of freedom (where k is the number of variables)
# under the assumption that the data is multivariate normal. We can use a threshold based on the chi-squared distribution.
from scipy.stats import chi2

# Degrees of freedom (number of features)
degrees_freedom = df_multi.shape[1] - 1 # Subtract the Mahalanobis distance column

# Set a significance level (alpha)
alpha = 0.05

# Calculate the critical value from the chi-squared distribution
critical_value = chi2.ppf(1 - alpha, df=degrees_freedom)
print("\nCritical Chi-Squared Value (alpha={}, df={}): {:.2f}".format(alpha, degrees_freedom, critical_value))

# Identify outliers
df_outliers_mahalanobis = df_multi[df_multi['Mahalanobis_Distance'] > critical_value]
print("\nPotential Multivariate Outliers (based on Mahalanobis Distance > {:.2f}):\n".format(critical_value), df_outliers_mahalanobis)

# 4. Visualize the Mahalanobis Distances (optional)
plt.figure(figsize=(8, 6))
sns.histplot(df_multi['Mahalanobis_Distance'], kde=True)
plt.title('Distribution of Mahalanobis Distances')
plt.xlabel('Mahalanobis Distance')
plt.ylabel('Frequency')
plt.axvline(critical_value, color='red', linestyle='--', label=f'Critical Value ({critical_value:.2f})')
plt.legend()
plt.show()

Original Multivariate DataFrame:
    Feature_A  Feature_B  Feature_C
0          1          6         11
1          2          7         12
2          3          8         13
3          4          9         14
4          5         10         15
5         10         20         50


LinAlgError: singular matrix