In [71]:
from sklearn.neighbors import KNeighborsClassifier

import numpy as np

# import warnings filter
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)

path = '/workspace/data/crops_and_embeddings/kwitonda_portraits/embeddings'

embeddings = np.load(f'{path}/embeddings_dino.npy')
labels = np.load(f'{path}/individual_ids.npy')

assert len(embeddings) == len(labels)

print(embeddings.shape)
print(labels.shape)

print(len(set(labels)))

from collections import Counter

counter = Counter(labels)
print(counter.most_common())

(1731, 1536)
(1731,)
20
[('KAV', 376), ('MBI', 241), ('NTI', 130), ('INO', 124), ('RWU', 107), ('SUL', 98), ('KHA', 92), ('IZO', 82), ('ICM', 76), ('REM', 65), ('KAT', 60), ('KBU', 58), ('NRJ', 50), ('CYU', 50), ('GAH', 49), ('WIG', 45), ('NAN', 21), ('NOT VISIBLE', 4), ('IKR', 2), ('ELEPHANT', 1)]


In [72]:
def predict_one(embeddings, labels, test_idx, k=5):
    train_idx = [i for i in range(len(embeddings)) if i != test_idx]
    train_embeddings = embeddings[train_idx]
    train_labels = labels[train_idx]

    test_embedding = embeddings[test_idx]

    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(train_embeddings, train_labels)

    return knn.predict([test_embedding])[0], labels[test_idx]

# predict_one(embeddings, labels, 3)

# Calculate precision, recall, f1

def calculate_metrics(embeddings, labels, min_samples=5, k=2):
    # only predicting on individuals with at least min_samples
    unique_labels, counts = np.unique(labels, return_counts=True)
    unique_labels = unique_labels[counts >= min_samples]
    
    mask = np.isin(labels, unique_labels)
    # embeddings = embeddings[mask]
    # labels = labels[mask]
    
    idx = [i for i in range(len(embeddings)) if labels[i] in unique_labels]
    
    print(f'Calculating metrics on {len(idx)} samples from {len(unique_labels)} individuals with {len(embeddings)} samples in total.')
    
    predictions = [predict_one(embeddings, labels, i, k=k) for i in idx]
    predictions = np.array(predictions)
    
    tp = np.sum(predictions[:, 0] == predictions[:, 1])
    
    average_precision =  tp / len(embeddings)
    
    return average_precision
 

print(calculate_metrics(embeddings, labels, min_samples=1, k=3))
print(calculate_metrics(embeddings, labels, min_samples=5, k=3))

Calculating metrics on 1731 samples from 20 individuals with 1731 samples in total.
0.9641825534373195
Calculating metrics on 1724 samples from 17 individuals with 1731 samples in total.
0.9624494511842865


In [None]:
Calculating metrics on 138 samples from 22 individuals with 138 samples in total.
0.36231884057971014
Calculating metrics on 119 samples from 10 individuals with 138 samples in total.
0.36231884057971014