In [1]:
import numpy as np
import pandas as pd

In [16]:
iris_data = pd.read_csv('iris.csv')
iris_data

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Virginica
146,6.3,2.5,5.0,1.9,Virginica
147,6.5,3.0,5.2,2.0,Virginica
148,6.2,3.4,5.4,2.3,Virginica


In [69]:
varieties = list(set(iris_data.variety))
test_data = pd.DataFrame({})
subsets = [iris_data[iris_data.variety == variety].sample(1) for variety in varieties]

test_data = pd.concat(subsets)

In [77]:
actual_classes = test_data.variety
test_data = test_data.drop(['variety'], axis=1)
test_data

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width
60,5.0,2.0,3.5,1.0
142,5.8,2.7,5.1,1.9
46,5.1,3.8,1.6,0.2


In [78]:
train_data = iris_data.drop(test_data.index)

In [94]:
columns = list(train_data.columns)[0:4]

mean_vector = []
for variety in varieties:
    mean_var = []
    for column in columns:
        data = train_data[train_data.variety == variety][column]
        mean_var.append(np.mean(data.to_numpy()))
    mean_vector.append(mean_var)

covar_vector = []
for variety in varieties:
    # Extract data for all features (columns) for the given variety
    data = train_data[train_data['variety'] == variety][columns].to_numpy()
    
    # Calculate the covariance matrix for all features for this variety
    covar_matrix = np.cov(data, rowvar=False)  # rowvar=False treats columns as variables
    covar_vector.append(covar_matrix)

covar_vector

[array([[0.25335884, 0.0716369 , 0.17158588, 0.05045493],
        [0.0716369 , 0.08791667, 0.07193452, 0.03672619],
        [0.17158588, 0.07193452, 0.21313776, 0.06935799],
        [0.05045493, 0.03672619, 0.06935799, 0.03766156]]),
 array([[0.39956633, 0.0911267 , 0.30203656, 0.04800595],
        [0.0911267 , 0.10457483, 0.07023384, 0.0478869 ],
        [0.30203656, 0.07023384, 0.30659014, 0.04863095],
        [0.04800595, 0.0478869 , 0.04863095, 0.07666667]]),
 array([[0.12664966, 0.10053997, 0.01642007, 0.01063776],
        [0.10053997, 0.1437415 , 0.01085034, 0.00985544],
        [0.01642007, 0.01085034, 0.03038265, 0.00633078],
        [0.01063776, 0.00985544, 0.00633078, 0.01129252]])]

In [98]:
def mahalanobis_distance(x, mean, cov):
    diff = np.array(x) - np.array(mean)
    inv_cov = np.linalg.inv(cov)
    return np.sqrt(np.dot(np.dot(diff, inv_cov), diff.T))

distances = []

for j in range(len(test_data)):
    distance = []
    for i in range(len(mean_vector)):
        distance.append(mahalanobis_distance(list(test_data.iloc[j]), mean_vector[i], covar_vector[i]))
    distances.append(distance)

distances

[[np.float64(2.7520202995923464),
  np.float64(4.855772866647456),
  np.float64(13.85501630276377)],
 [np.float64(4.353212711364535),
  np.float64(1.418327570896674),
  np.float64(23.894522522633274)],
 [np.float64(11.354076236259075),
  np.float64(13.490375049884507),
  np.float64(1.7268283806806726)]]

In [114]:
predicted_classes = []
for test in distances:
    predicted_classes.append(varieties[np.argmin(test)])

type(predicted_classes)

list

In [113]:
for i in range(len(predicted_classes)):
    print(f'Predicted class: {predicted_classes[i]}, Actual class: {actual_classes[i]}, mahalanobis distance: {np.min(distances[i])}')


KeyError: 0