In [1]:
import os
from itertools import product
from random import shuffle

import numpy as np
import torch
from tqdm import tqdm

from KNN_Embeddings import *

(50000, 768)
Accuracy: 0.987
Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1000
           1       0.98      0.99      0.99      1000
           2       1.00      0.98      0.99      1000
           3       0.96      0.98      0.97      1000
           4       0.98      0.99      0.99      1000
           5       0.99      0.96      0.98      1000
           6       0.99      0.99      0.99      1000
           7       1.00      1.00      1.00      1000
           8       1.00      1.00      1.00      1000
           9       0.99      0.98      0.99      1000

    accuracy                           0.99     10000
   macro avg       0.99      0.99      0.99     10000
weighted avg       0.99      0.99      0.99     10000

Confusion Matrix:
[[997   0   0   0   0   0   0   0   3   0]
 [  0 992   0   0   0   0   0   0   0   8]
 [  5   0 979   3   7   0   6   0   0   0]
 [  2   2   1 982   3   6   4   0   0   0]

In [2]:
# Feature normalization
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader, TensorDataset, Dataset
import random

class TripletDataset(Dataset):
    """Dataset that serves (anchor, positive, negative) triplets chosen by label.

    Item ``i`` uses ``embeddings[i]`` as the anchor, a randomly chosen other
    sample with the same label as the positive, and a randomly chosen sample
    from a randomly chosen different label as the negative.

    Args:
        embeddings: indexable collection of samples (a tensor in this notebook).
        labels: 1-D tensor of class labels aligned with ``embeddings``; it is
            converted with ``.numpy()``.

    Attributes:
        labels_set: set of the distinct label values.
        label_to_indices: maps each label to the array of positions holding it.
    """

    def __init__(self, embeddings, labels):
        self.embeddings = embeddings
        self.labels = labels
        # Convert once instead of once per class inside the comprehension.
        labels_np = labels.numpy()
        self.labels_set = set(labels_np)
        self.label_to_indices = {label: np.where(labels_np == label)[0]
                                 for label in self.labels_set}

    def __getitem__(self, index):
        """Return ``(anchor, positive, negative, anchor_label, negative_label)``.

        Raises:
            ValueError: if the dataset holds fewer than two distinct labels, so
                no negative sample exists.
        """
        anchor = self.embeddings[index]
        anchor_label = self.labels[index].item()

        same_class = self.label_to_indices[anchor_label]
        if len(same_class) > 1:
            # Rejection-sample until we land on a sample other than the anchor.
            positive_index = index
            while positive_index == index:
                positive_index = random.choice(same_class)
        else:
            # The anchor is the only member of its class: resampling could
            # never succeed, so fall back to the anchor itself as positive.
            positive_index = index

        other_labels = list(self.labels_set - {anchor_label})
        if not other_labels:
            raise ValueError(
                "TripletDataset needs at least two distinct labels to draw a negative sample"
            )
        negative_label = random.choice(other_labels)
        negative_index = random.choice(self.label_to_indices[negative_label])

        positive = self.embeddings[positive_index]
        negative = self.embeddings[negative_index]
        return anchor, positive, negative, anchor_label, negative_label

    def __len__(self):
        """Number of anchors, i.e. number of stored embeddings."""
        return len(self.embeddings)


# Seed the RNGs before any data is drawn so a fresh "Restart & Run All" is
# repeatable: Python's `random` drives the triplet sampling in TripletDataset,
# and torch's RNG drives DataLoader shuffling (and, per the PyTorch
# documentation, the seeds handed to DataLoader worker processes).
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)

# Loader settings shared by the train and test loaders.
BATCH_SIZE = 64
NUM_WORKERS = 4

# Create a scaler object
scaler = StandardScaler()

# Fit on training data only, then apply the same transform to train and test
# so no test statistics leak into the normalisation.
X_train_normalized = scaler.fit_transform(X_train)
X_test_normalized = scaler.transform(X_test)

# Features become float tensors, labels become long tensors.
X_train_tensor = torch.tensor(X_train_normalized, dtype=torch.float)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_test_tensor = torch.tensor(X_test_normalized, dtype=torch.float)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

train_dataset = TripletDataset(X_train_tensor, y_train_tensor)
test_dataset = TripletDataset(X_test_tensor, y_test_tensor)

# The test loader also yields triplets; evaluate_model only consumes the anchor
# and its label from each batch.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS)



## DNNH Model 

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class DNNH(nn.Module):
    """Two-layer MLP that maps a feature vector to a bounded, unit-length code.

    The network is Linear -> ReLU -> Linear -> tanh, followed by L2
    normalisation of the resulting code vector.

    Args:
        input_dim: size of each input feature vector.
        num_bits: length of the produced code.
        hidden_dim: width of the hidden layer. Defaults to 512, the value that
            was previously hard-coded.
    """

    def __init__(self, input_dim, num_bits, hidden_dim=512):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, num_bits)

    def forward(self, x):
        """Return the code for ``x``.

        ``x`` may be a batch ``(..., input_dim)`` or a single vector
        ``(input_dim,)``; the output has the same leading shape with
        ``num_bits`` as the last dimension.
        """
        x = self.fc2(self.relu(self.fc1(x)))
        x = torch.tanh(x)  # squash every component into (-1, 1)
        # Normalise along the code dimension. For 2-D input this equals the
        # former dim=1, and it also accepts an unbatched 1-D vector.
        return F.normalize(x, p=2, dim=-1)

def to_one_hot(labels, num_classes):
    """Convert integer class labels to one-hot rows.

    Args:
        labels: integer tensor of class indices in ``[0, num_classes)``.
        num_classes: number of classes, i.e. the width of each one-hot row.

    Returns:
        Float tensor of shape ``labels.shape + (num_classes,)`` on the same
        device as ``labels``.
    """
    # Build the identity matrix directly on the labels' device. Indexing a CPU
    # matrix with indices that live on another device fails before any
    # subsequent `.to(...)` could run.
    return torch.eye(num_classes, device=labels.device)[labels]

class DNNHLoss(nn.Module):
    """Triplet hinge loss on squared Euclidean distances.

    Args:
        margin: required gap between the anchor-negative and anchor-positive
            squared distances before a triplet stops contributing loss.
        epsilon: half-width of the band around 0.5 that ``threshold`` leaves
            untouched.
    """

    def __init__(self, margin=1.0, epsilon=0.05):
        super().__init__()
        self.margin = margin
        self.epsilon = epsilon

    def forward(self, anchor, positive, negative):
        """Return ``mean(relu(margin + ||a - p||^2 - ||a - n||^2))`` over the batch.

        All three inputs are expected to be 2-D and of equal shape; distances
        are summed over dimension 1.
        """
        d_ap = torch.sum((anchor - positive) ** 2, dim=1)
        d_an = torch.sum((anchor - negative) ** 2, dim=1)
        # Use the configured margin; it was previously ignored in favour of a
        # hard-coded 1.
        return F.relu(self.margin + d_ap - d_an).mean()

    def threshold(self, s):
        """Piecewise quantiser: values below ``0.5 - epsilon`` become 0, values
        above ``0.5 + epsilon`` become 1, and values in between are kept."""
        return torch.where(s < 0.5 - self.epsilon, torch.zeros_like(s),
                           torch.where(s > 0.5 + self.epsilon, torch.ones_like(s), s))


# Run on the GPU when one is visible, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 12-dimensional DNNH network over the normalised features, with Adam and the
# triplet loss.
model = DNNH(input_dim=X_train_tensor.shape[1], num_bits=12)
model = model.to(device)
loss_func = DNNHLoss(margin=1.0, epsilon=0.05).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)

for epoch in range(10):
    model.train()
    running_loss = 0.0
    # Each batch is (anchor, positive, negative, anchor_label, negative_label);
    # the two label entries are not needed for the triplet loss.
    for anc, pos, neg, _, _ in train_loader:
        anc, pos, neg = (part.to(device) for part in (anc, pos, neg))

        optimizer.zero_grad()
        # The same network embeds all three members of the triplet.
        loss = loss_func(model(anc), model(pos), model(neg))
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Report the mean per-batch loss for this epoch.
    print(f'Epoch {epoch + 1}, Loss: {running_loss / len(train_loader)}')


Epoch 1, Loss: 0.015424929529457065
Epoch 2, Loss: 0.006242638817080833
Epoch 3, Loss: 0.005183660232311929
Epoch 4, Loss: 0.0047881783455457835
Epoch 5, Loss: 0.004065484077672062
Epoch 6, Loss: 0.0038715133817432937
Epoch 7, Loss: 0.0036545027383243487
Epoch 8, Loss: 0.004494864479317079
Epoch 9, Loss: 0.00355258080250848
Epoch 10, Loss: 0.0033369085065486945


## KNN on hashed embeddings

In [4]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import average_precision_score
from sklearn.preprocessing import label_binarize
import numpy as np

def evaluate_model(model, test_loader, device):
    """Embed the anchor of every batch in ``test_loader`` and collect the results.

    The model is switched to eval mode (and left there) and gradients are
    disabled while iterating.

    Args:
        model: callable module applied to each anchor batch.
        test_loader: iterable of 5-tuples ``(anchor, positive, negative,
            anchor_label, negative_label)``; only the anchor and its label
            are used.
        device: device the anchors are moved to before the forward pass.

    Returns:
        ``(embeddings, labels)``: model outputs moved to the CPU and the
        anchor labels, each concatenated over all batches in loader order.
    """
    model.eval()
    code_batches, label_batches = [], []
    with torch.no_grad():
        for anchor_batch, _pos, _neg, anchor_labels, _neg_labels in test_loader:
            codes = model(anchor_batch.to(device))
            code_batches.append(codes.cpu())
            label_batches.append(anchor_labels)
    return torch.cat(code_batches), torch.cat(label_batches)

# `model` (the trained DNNH network) and `device` must already exist in the
# kernel, e.g.
#   device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#   model = DNNH(input_dim=X_train_tensor.shape[1], num_bits=12).to(device)

# Extract the codes produced by the network for both splits.
# NOTE: these are the real-valued outputs of DNNH.forward (tanh followed by L2
# normalisation); they are not binarised before being given to KNN.
train_codes, train_labels = evaluate_model(model, train_loader, device)
test_codes, test_labels = evaluate_model(model, test_loader, device)

# Classification with KNN
knn = KNeighborsClassifier(n_neighbors=5, metric='euclidean')
knn.fit(train_codes, train_labels)
predictions = knn.predict(test_codes)
y_pred_proba = knn.predict_proba(test_codes)

print(classification_report(test_labels, predictions))

# One-vs-rest indicator matrix with exactly one column per class seen in
# training, in the order of `knn.classes_` (which is also the column order of
# `predict_proba`). Built by hand because `label_binarize` collapses a
# two-class problem to a single column, which would misalign labels and
# probability columns in the loop below.
test_labels_np = np.asarray(test_labels)
y_test_binarized = (test_labels_np[:, None] == knn.classes_[None, :]).astype(int)

# Average precision of each class, scored with that class's predicted probability
average_precisions = [
    average_precision_score(y_test_binarized[:, i], y_pred_proba[:, i])
    for i in range(y_test_binarized.shape[1])
]

# Compute the mean of the average precisions
map_score = np.mean(average_precisions)
print(f'Mean Average Precision (MAP): {map_score}')


              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1000
           1       0.98      0.99      0.99      1000
           2       0.99      0.99      0.99      1000
           3       0.96      0.97      0.97      1000
           4       0.99      0.98      0.99      1000
           5       0.98      0.97      0.98      1000
           6       0.99      0.99      0.99      1000
           7       1.00      0.99      1.00      1000
           8       0.99      0.99      0.99      1000
           9       0.99      0.98      0.99      1000

    accuracy                           0.99     10000
   macro avg       0.99      0.99      0.99     10000
weighted avg       0.99      0.99      0.99     10000

Mean Average Precision (MAP): 0.982478881939176
