# Neighbour Extraction using Gram Matrices

---

In [None]:
import sys

# Report which Python executable and version this kernel is running on.
for detail in (sys.executable, sys.version):
    print(detail)

In [None]:
from ipynb.fs.full.Helper import getDataLoader, getNames, dist
from ipynb.fs.full.GramMatrix import convertModel, GramMatrixLayer
from ipynb.fs.full.LabelDataset import createDirectories

import torch
import torch.nn as nn

%matplotlib inline

---

## Feature Extraction

In [None]:
# Build the loader over the image subset; shuffle=False keeps the batch order fixed.
loader = getDataLoader(
    '/scratch/bam_subset_2_0',
    batch_size=4,
    shuffle=False,
    num_workers=4,
    testing=True,
)

In [None]:
from torchvision import models

# Instantiate VGG-19, requesting the pretrained weights.
vgg19 = models.vgg19(pretrained=True)

In [None]:
# Layer names relu1_1 ... relu5_1 handed to convertModel.
# NOTE(review): convertModel is a project helper; presumably it attaches a
# GramMatrixLayer at each named layer and returns them in gram_matrices — verify.
gramMatrixLayers = ['relu{}_1'.format(block) for block in range(1, 6)]
vgg19, model, gram_matrices = convertModel(vgg19, gramMatrixLayers, testing=False)

In [None]:
# Prefer the first CUDA device and fall back to the CPU when CUDA is unavailable.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

# Put the network in evaluation mode, then move it onto the chosen device.
model.eval()
model.to(device)

In [None]:
# Push one batch through the network to measure D, the width of the
# concatenated Gram-matrix features.
dataIter = iter(loader)

# next() is the portable iterator protocol; newer PyTorch DataLoader iterators
# no longer provide a .next() method.
(data, classes), names = next(dataIter)
# Follow `device` instead of calling .cuda() so the cell also runs without a GPU.
data = data.to(device)
with torch.no_grad():  # inference only, so no autograd graph is needed
    out = model(data)
# Each entry of gram_matrices exposes its latest result as .gramMatrix; the
# results are joined along dim 1 into a single 2-D tensor.
G = torch.cat([layer.gramMatrix for layer in gram_matrices], 1)

a, D = G.size()

print(D)

Find a 'safe' number of components (the target dimensionality) for the random projection.

With high probability, a random projection p changes the distance between any two points in a Euclidean space only by a factor of (1 ± eps).

In [None]:
from sklearn.random_projection import johnson_lindenstrauss_min_dim

# TODO: Remove hardcoded dataset size
numSamples = 121000
maxDistortion = 0.3

# K is the number of components returned by the Johnson-Lindenstrauss bound
# for numSamples points and a distortion of maxDistortion.
K = johnson_lindenstrauss_min_dim(n_samples=numSamples, eps=maxDistortion)

print(K)

In [None]:
# gaussian_random_matrix was deprecated and later made private in newer
# scikit-learn releases; fall back to the private name when the public one is gone.
try:
    from sklearn.random_projection import gaussian_random_matrix
except ImportError:
    from sklearn.random_projection import _gaussian_random_matrix as gaussian_random_matrix

# Fixed seed: the matrix is written to disk and every stored feature depends on
# it, so a re-run must reproduce the same projection.
RPM = gaussian_random_matrix(K, D, random_state=0)
# The generator returns (K, D); transpose so features can be right-multiplied.
RPM = RPM.transpose()
print(RPM.shape)

In [None]:
import os

if not os.path.exists('/scratch/kshitij98'):
    print("Creating", '/scratch/kshitij98')
# exist_ok avoids a failure if the directory appears between the check and the call.
os.makedirs('/scratch/kshitij98', exist_ok=True)

# torch.as_tensor accepts both a NumPy array and an existing tensor, so the cell
# can be re-run. Down-cast to float32 on the host first so only the smaller
# matrix is transferred, and follow `device` rather than hard-requiring CUDA.
RPM = torch.as_tensor(RPM).float().to(device)
print(RPM.shape)

torch.save(RPM, "/scratch/kshitij98/rpm")

In [None]:
import time

createDirectories('/scratch/bam_subset_2_0_features/')

# Derive the number of batches from the loader instead of hardcoding the
# dataset size and batch size.
# NOTE(review): assumes getDataLoader returns an object that supports len()
# (e.g. a torch DataLoader over a sized dataset) — confirm.
numBatches = len(loader)
t = time.time()

with torch.no_grad():  # inference only, so no autograd graph is needed
    for i, ((data, classes), names) in enumerate(loader):
        # Follow `device` instead of calling .cuda() directly.
        data = data.to(device)
        out = model(data)
        # Join the per-layer Gram results along dim 1, then project with RPM.
        G = torch.cat([layer.gramMatrix for layer in gram_matrices], 1)
        G = torch.mm(G, RPM)
        for j, gm in enumerate(G):
            # A row of G is a view onto the batch storage; clone it so each file
            # holds only its own feature vector rather than the whole batch.
            torch.save(gm.clone(), names[j].replace('bam_subset_2_0', 'bam_subset_2_0_features'))
        done = i + 1
        # ETA = average seconds per finished batch * batches left, in minutes.
        print(done, "\tETA: ", ((time.time() - t) / done) * (numBatches - done) * (1 / 60), "minutes", end='\r')

---

## Get Neighbours

In [None]:
import time

t = time.time()

# Load every stored feature tensor and stack them into a single tensor X.
names = getNames('/scratch/bam_subset_2_0_features/')
total = len(names)
X = []
for loaded, fileName in enumerate(names, start=1):
    X.append(torch.load(fileName))
    # Remaining files times the average seconds spent per file so far.
    print("ETA: ", (total - loaded) * ((time.time() - t) / loaded), end='\r')
X = torch.stack(X, 0)
print(X.shape)

In [None]:
# Prepare the output locations for the top and the bottom neighbour lists.
for neighbourDir in ('/scratch/bam_subset_2_0_top_neighbours/',
                     '/scratch/bam_subset_2_0_bottom_neighbours/'):
    createDirectories(neighbourDir)

In [None]:
import numpy as np

k = 15
t = time.time()

# For every feature vector, store the file names of its k nearest and k
# farthest neighbours.
for i, source in enumerate(X):
    source = torch.unsqueeze(source, 0)

    # dist is a project helper; its result is indexed as [0, ...] below, so it is
    # treated as a single row of distances from `source` to every entry of X.
    d = dist(source, X)
    d, indices = d.sort()

    # Remove the query itself by index instead of assuming it sorted to position 0:
    # duplicate features or rounding can put another sample first.
    order = indices[0]
    order = order[order != i]

    # The sort is ascending: the first k entries are the closest, the last k the
    # farthest (left in ascending order; torch does not allow negative-step slices).
    topIds = order[:k].tolist()
    bottomIds = order[-k:].tolist()

    top = np.asarray([names[idx] for idx in topIds])
    bottom = np.asarray([names[idx] for idx in bottomIds])

    np.save(names[i].replace('bam_subset_2_0_features', 'bam_subset_2_0_top_neighbours'), top)
    np.save(names[i].replace('bam_subset_2_0_features', 'bam_subset_2_0_bottom_neighbours'), bottom)

    print("ETA: ", (len(X) - i - 1) * ((time.time() - t) / (i+1)) * (1 / 60), end='\r')