In [147]:

import numpy as np
import torch
from torchvision.models import resnet50,  resnet18
from torch import nn
from torch.utils.data import DataLoader

from sklearn.metrics.pairwise import cosine_similarity

In [148]:
from scipy.spatial.distance import cdist
import numpy as np

# Toy input: the three rows of this 3x3 matrix are the sample vectors.
vectors = np.arange(1, 10).reshape(3, 3)

# All-pairs Euclidean distances between the rows; the result is a symmetric
# 3x3 matrix with zeros on the diagonal.
distances = cdist(vectors, vectors, metric='euclidean')

print(distances)


[[ 0.          5.19615242 10.39230485]
 [ 5.19615242  0.          5.19615242]
 [10.39230485  5.19615242  0.        ]]


In [149]:
# Load the serialized test split from disk (presumably a pickled Dataset object — verify).
# NOTE(review): torch.load unpickles the file, which can execute arbitrary code;
# only load dataset files that come from a trusted source.
test_dataset = torch.load("data/test_dataset.pt")

In [150]:
import os

# Build the baseline embedding network: a ResNet-50 whose classification head is
# replaced by an identity so the forward pass returns the backbone features.
BASELINE_CHECKPOINT = "models/baseline.pt"
has_baseline_checkpoint = os.path.exists(BASELINE_CHECKPOINT)

# The ImageNet weights are only a fallback: a checkpoint overwrites every
# parameter, so skip the download when one is available.
baseline = resnet50(pretrained=not has_baseline_checkpoint)

# The head has to exist while the state dict is loaded (strict loading);
# presumably 910 is the number of training classes — TODO confirm.
baseline.fc = nn.Linear(2048, 910)

if has_baseline_checkpoint:
    # map_location="cpu" lets the checkpoint load on machines without the device
    # it was saved from; the model is moved to the target device later.
    baseline.load_state_dict(torch.load(BASELINE_CHECKPOINT, map_location="cpu"))
else:
    # Make the fallback visible instead of silently evaluating the wrong weights.
    print(f"WARNING: {BASELINE_CHECKPOINT} not found; baseline keeps its ImageNet-pretrained weights.")

# Inference mode: BatchNorm layers use their running statistics.
baseline.eval()

# Drop the classification head so the model outputs embeddings.
baseline.fc = torch.nn.Identity()



In [151]:
from torchvision import models

# Build the "triplet" embedding network: a SqueezeNet 1.1 whose classifier is
# replaced by an identity so the forward pass returns the flattened features.
bs = 1024  # only used to select the checkpoint file; presumably the training batch size — TODO confirm
TRIPLET_CHECKPOINT = f"models/squeezenet_{bs}.pt"
has_triplet_checkpoint = os.path.exists(TRIPLET_CHECKPOINT)

# The ImageNet weights are only a fallback: a checkpoint overwrites every
# parameter, so skip the download when one is available.
triplet = models.squeezenet1_1(pretrained=not has_triplet_checkpoint)

# The classifier is removed before loading, so the checkpoint is expected to
# contain the feature-extractor weights only (strict loading).
triplet.classifier = torch.nn.Identity()

# Inference mode for any mode-dependent layers.
triplet.eval()

if has_triplet_checkpoint:
    # map_location="cpu" lets the checkpoint load on machines without the device
    # it was saved from; the model is moved to the target device later.
    triplet.load_state_dict(torch.load(TRIPLET_CHECKPOINT, map_location="cpu"))
else:
    # Make the fallback visible instead of silently evaluating the wrong weights.
    print(f"WARNING: {TRIPLET_CHECKPOINT} not found; triplet keeps its ImageNet-pretrained weights.")




In [152]:
def get_accuracy(embeddings, targets, chunk_size=1024):
    """Nearest-neighbour (top-1 retrieval) accuracy under cosine similarity.

    Every sample is assigned the label of its most similar *other* sample; the
    accuracy is the percentage of samples whose assigned label equals their own.

    The similarity matrix is computed ``chunk_size`` rows at a time, so at most
    ``chunk_size x N`` similarities are held in memory instead of the full
    ``N x N`` matrix.

    Args:
        embeddings: Sequence of N equally sized 1-D embedding vectors
            (anything ``np.asarray`` turns into an ``(N, D)`` array).
        targets: Sequence of N labels, aligned with ``embeddings``.
        chunk_size: Number of query rows compared against all samples per step.

    Returns:
        Tuple ``(accuracy, predicted_labels)``: ``accuracy`` is a float in
        ``[0, 100]`` and ``predicted_labels`` is a list holding the label of
        each sample's nearest neighbour.

    Raises:
        ValueError: If ``chunk_size`` is not positive, if the inputs differ in
            length, or if fewer than two samples are given (a lone sample has
            no neighbour to be compared with).
    """
    embeddings = np.asarray(embeddings)
    targets = np.asarray(targets)
    n_samples = len(embeddings)

    if chunk_size < 1:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if n_samples != len(targets):
        raise ValueError(
            f"embeddings and targets differ in length: {n_samples} != {len(targets)}"
        )
    if n_samples < 2:
        raise ValueError("at least two samples are required to find a nearest neighbour")

    nearest = np.empty(n_samples, dtype=np.intp)
    for start in range(0, n_samples, chunk_size):
        stop = min(start + chunk_size, n_samples)

        # Similarity of this block of queries against every sample.
        similarities = cosine_similarity(embeddings[start:stop], embeddings)

        # Mask each sample's similarity to itself with -inf so that it can
        # never be selected as its own nearest neighbour.
        similarities[np.arange(stop - start), np.arange(start, stop)] = -np.inf

        nearest[start:stop] = similarities.argmax(axis=1)

    # Label of the most similar other sample, for every sample.
    predicted = targets[nearest]
    n_correct = int(np.count_nonzero(predicted == targets))

    accuracy = 100 * (n_correct / n_samples)

    return accuracy, list(predicted)

In [153]:
def get_embeddings(model, dataloader, device):
    """Run ``model`` over every batch of ``dataloader`` and collect the outputs.

    The forward passes run without gradient tracking and, when ``model`` is a
    ``torch.nn.Module``, in evaluation mode, so layers such as BatchNorm and
    Dropout behave deterministically and running statistics are not updated.
    The model's previous train/eval mode is restored before returning.

    Args:
        model: Callable mapping a batch of inputs to a batch of embeddings;
            expected to already live on ``device``.
        dataloader: Iterable yielding ``(inputs, labels)`` pairs of tensors.
        device: Device the inputs are moved to before the forward pass.

    Returns:
        Tuple ``(embeddings, targets)``: ``embeddings`` is a list with one CPU
        tensor per sample and ``targets`` is a list with the matching labels
        as plain Python values.
    """
    embeddings = []
    targets = []

    # Remember the current mode so the caller gets the model back unchanged.
    is_module = isinstance(model, torch.nn.Module)
    was_training = model.training if is_module else False
    if is_module:
        model.eval()

    try:
        with torch.no_grad():
            for inputs, labels in dataloader:
                batch_emb = model(inputs.to(device))

                # Iterating the batch tensor yields one entry per sample.
                embeddings.extend(batch_emb.cpu())
                # Labels are only collected, so they never need to visit the device.
                targets.extend(labels.cpu().tolist())
    finally:
        if is_module:
            model.train(was_training)

    return embeddings, targets

In [154]:
# Batches of 128 test samples; shuffling is enabled, so the sample order
# differs on every pass over the loader.
test_dataloader = DataLoader(dataset=test_dataset, batch_size=128, shuffle=True)

In [155]:
# Prefer the GPU, but fall back to the CPU so the notebook still runs on
# machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"


In [156]:
# Move the baseline network to the selected device and embed the whole test set.
baseline = baseline.to(device)

# Pass the same `device` the model was moved to, rather than a separate literal,
# so model and inputs can never end up on different devices.
baseline_emb, baseline_targ = get_embeddings(baseline, test_dataloader, device)

In [157]:
# Convert every embedding to a NumPy array. Entries that are already arrays are
# passed through, so re-running this cell does not fail.
baseline_emb = [
    emb.detach().cpu().numpy() if torch.is_tensor(emb) else np.asarray(emb)
    for emb in baseline_emb
]

In [158]:
# Sanity check: after the conversion above, each embedding should be a numpy.ndarray.
type(baseline_emb[0])

numpy.ndarray

In [159]:
# baseline_accuracy, baseline_preds = get_accuracy(baseline_emb, baseline_targ)

In [160]:
# baseline_accuracy

In [161]:
# Move the triplet network to the selected device and embed the whole test set.
# The same `device` is used for the model and the inputs.
triplet = triplet.to(device)
triplet_emb, triplet_targ = get_embeddings(triplet, test_dataloader, device)

# Convert every embedding to a NumPy array. Entries that are already arrays are
# passed through, so re-running this cell does not fail.
triplet_emb = [
    emb.detach().cpu().numpy() if torch.is_tensor(emb) else np.asarray(emb)
    for emb in triplet_emb
]

# Nearest-neighbour accuracy (in percent) and the per-sample predicted labels.
triplet_accuracy, triplet_preds = get_accuracy(triplet_emb, triplet_targ)

In [163]:
# Summarise the test split: number of samples and number of distinct labels.
print(f"Test set with {len(test_dataset)} samples of {len(set(test_dataset.labels))} classes.")
# Disabled: baseline_accuracy is not computed, because the cell that evaluates the baseline is commented out.
# print(f"Baseline model openset performance: {baseline_accuracy}")
# Nearest-neighbour accuracy of the triplet model, in percent (get_accuracy scales by 100).
print(f"Triplet model openset performance: {triplet_accuracy}")

Test set with 75576 samples of 400 classes.
Triplet model openset performance: 40.66105642002752


TODO: Compare the best models' performance against a majority-class classifier and a random classifier, and point out that even the weaker model still has some knowledge (i.e. it outperforms these trivial baselines).