# Training file Hotel Id assignment #
*more info here*

In [None]:
# Global constants and hyper-parameters.

# --- Training schedule ---
NMBR_EPOCHS = 5        # number of training epochs
VAL_AFTER_EPOCHS = 5   # validation runs every VAL_AFTER_EPOCHS epochs
PASSES_PER_EPOCH = 5   # the triplet dataset length is nmbr_hotels * PASSES_PER_EPOCH

# --- Optimisation ---
LR = 0.0005            # learning rate handed to the optimizer
GAMMA = 0.8            # decay factor handed to the learning-rate scheduler

# --- Data & model ---
NMBR_HOTELS = 3116
#NMBR_HOTELS = 10
EMBEDDING_SIZE = 1500  # width of the embedding layer / FAISS index dimension
MAX_WINDOWS_FILENAME_CHAR_LENGTH = 260  # item size of the fixed-width string arrays
Model_description = "EfficientNet, pretrained & constant, Linear embedding layer"

# Banner describing the configuration of this run.
_RUN_SUMMARY = """FULL RUN:
Passes per Epoch: {0};
Embedding Size: {1};
# Epochs: {2};
Learning Rate: {3};
Gamma: {4};
Model: {5}.
"""
print(_RUN_SUMMARY.format(
    PASSES_PER_EPOCH,
    EMBEDDING_SIZE,
    NMBR_EPOCHS,
    LR,
    GAMMA,
    Model_description,
))

# Imports, Setup, etc. #

In [None]:
# Imports:
! pip install timm --no-index --find-links=file:///kaggle/input/timm-package/timm
! python -m pip -qq install --no-index --find-links /kaggle/input/faiss-163/ faiss-cpu==1.6.3 && \
echo "Successfully installed FAISS package"

In [None]:
import numpy as np # linear algebra
import math as m
from datetime import datetime
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
import torch.nn as nn
import torchvision
import random
import numpy.random as rnd
import os
import matplotlib.pyplot as plt
from PIL import Image as pil_image
import albumentations.pytorch as APT
from torch.utils.data import Dataset, DataLoader
import torchvision.models as models
import timm
import cv2


# Don't know what this does, so I'll just keep it there just in case...
#import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
#    for filename in filenames:
#        pass
        #print(os.path.join(dirname, filename))

# File & folder names (not all used atm).
# Every folder constant ends with "/" so that sub-paths can be appended by
# plain string concatenation without losing or doubling a separator.
PROJECT_FOLDER = "../input/hotel-id-to-combat-human-trafficking-2022-fgvc9/"
DATA_FOLDER = PROJECT_FOLDER + "train_images/" # I added this one but we do not use it currently... (includes all pictures shuffled and padded.)
IMAGE_FOLDER = DATA_FOLDER + "images/"
OUTPUT_FOLDER = ""  # prefix for the checkpoint and submission files; "" = working directory

# Make randomness deterministic

def seed_everything(seed):
    """Seed all random number generators used in this notebook.

    Seeds Python's ``random``, NumPy, and PyTorch (CPU and CUDA), exports the
    seed as ``PYTHONHASHSEED`` and asks cuDNN for deterministic algorithms.

    Args:
        seed: Integer seed shared by all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    torch.backends.cudnn.deterministic = True


seed_everything(42)

# Image Processing
##### Includes creating datasets & Loaders, and data transformations

### Dataset class v2
##### used in rest of code

In [None]:
class Hotel_Dataset_Merlijn_Triplets(Dataset):
    """Dataset that serves random (anchor, positive, negative) image triplets.

    All ``.jpg`` files in the image folder are grouped by the hotel id found
    in the CSV file. Every hotel gets a consecutive integer label (0, 1, ...)
    and its images are split into a training and a validation part.
    Indexing the dataset selects the hotel ``index % nmbr_hotels`` and draws a
    fresh random triplet for it, so the same index yields different triplets
    on different calls. The dataset length is ``nmbr_hotels * PASSES_PER_EPOCH``.

    Args:
        root_dir: Folder that contains the image folder and the CSV file.
        data_path: Image folder, relative to ``root_dir``.
        hotel_file_path: CSV file, relative to ``root_dir``; only its
            "hotel_id" column is used.
        train_partition: Fraction of each hotel's images that goes to the
            training part (always at least one image).
        is_trainSet: Sample from the training part if True, otherwise from
            the validation part.
        transform: Optional callable used as ``transform(image=array)["image"]``.
            Without it ``__getitem__`` returns file paths instead of images.
        nmbr_hotels: Number of hotels to sample from; defaults to all hotels
            that were found.
    """

    def __init__(self, root_dir="../input/hotelid-2022-train-images-256x256/", data_path="images/", hotel_file_path="train.csv",
                 train_partition = 0.9, is_trainSet = True, transform = None, nmbr_hotels = None):
        self.transform = transform
        self.is_trainSet = is_trainSet
        self.data_path = os.path.join(root_dir, data_path)

        self.csv_data = pd.read_csv(os.path.join(root_dir, hotel_file_path))["hotel_id"]

        # Group the image numbers per hotel id. The directory listing is
        # sorted so that the label assignment and the train/val split do not
        # depend on the order in which the file system returns the files.
        self.image_hotelId_dict = {}
        for image_file in sorted(os.listdir(self.data_path)):
            if not image_file.endswith(".jpg"):
                continue
            image_nmbr = int(image_file[:-4])
            # `in` on a Series tests the index, so the number in the file name
            # is treated as an index label of the hotel_id column.
            # NOTE(review): presumably file names are CSV row numbers - confirm.
            if image_nmbr in self.csv_data.index:
                hotel_id = self.csv_data[image_nmbr]
                self.image_hotelId_dict.setdefault(hotel_id, []).append(image_nmbr)

        if nmbr_hotels is None:
            self.nmbr_hotels = len(self.image_hotelId_dict)
        else:
            self.nmbr_hotels = nmbr_hotels
        self.data = np.arange(self.nmbr_hotels*PASSES_PER_EPOCH)

        # Re-write hotel ids into consecutive labels, create val & train set.
        self.Label_to_Id = np.zeros(len(self.image_hotelId_dict), dtype = int)
        self.image_label_dict_train = {}
        self.image_label_dict_val = {}
        self.image_label_dict_total = {}

        for label, (hotel_id, images) in enumerate(self.image_hotelId_dict.items()):
            self.Label_to_Id[label] = hotel_id
            max_train_index = max(m.floor(len(images)*train_partition), 1)
            train_images = images[:max_train_index]
            val_images = images[max_train_index:]
            self.image_label_dict_train[label] = train_images
            # Hotels with too few images reuse their training images for validation.
            self.image_label_dict_val[label] = val_images or train_images
            self.image_label_dict_total[label] = images

    def label_to_id(self, label):
        """Return the hotel id that belongs to an integer label."""
        return self.Label_to_Id[int(label)]

    def labelList_to_id(self, labels):
        """Return the hotel ids for a sequence of labels, as a list."""
        return [self.label_to_id(label) for label in labels]

    def __len__(self):
        return len(self.data)

    def __size__(self):
        """Return the shape of the index array backing ``__len__``."""
        return np.shape(self.data)

    def __get_label_list__(self):
        """Return the array that maps label -> hotel id."""
        return self.Label_to_Id

    def __images_to_filepaths__(self, ims):
        """Turn image numbers into paths of the form ``<data_path>/000000042.jpg``."""
        # File names are the image number zero-padded to nine digits.
        return [os.path.join(self.data_path, str(im).zfill(9) + ".jpg") for im in ims]

    def __get_all_files__(self):
        """Return the dict that maps label -> list of all image numbers."""
        return self.image_label_dict_total

    def __get_labels__(self, images):
        """Return the label of every image number in ``images``.

        Raises:
            KeyError: If an image number is not part of the dataset.
        """
        # The reverse (image -> label) lookup is derived from
        # image_label_dict_total on first use and then cached.
        lookup = getattr(self, "_image_to_label", None)
        if lookup is None:
            lookup = {
                image: label
                for label, label_images in self.image_label_dict_total.items()
                for image in label_images
            }
            self._image_to_label = lookup
        return [lookup[image] for image in images]

    def _load_transformed(self, path):
        """Read the image at ``path`` and run it through ``self.transform``."""
        # The context manager closes the file as soon as the pixels are copied.
        with pil_image.open(path) as image:
            pixels = np.array(image)
        return self.transform(image=pixels)["image"]

    def __getitem__(self, label): # AKA: get a (random!) triplet from given class.
        """Return ``[anchor, positive, negative, (label, neg_label)]``.

        Raises:
            ValueError: If fewer than two hotels are available, because no
                negative example can be drawn then.
        """
        if self.nmbr_hotels < 2:
            raise ValueError("at least two hotels are needed to sample a negative image")

        if self.is_trainSet:
            image_dict = self.image_label_dict_train
        else:
            image_dict = self.image_label_dict_val

        label = label % self.nmbr_hotels

        # Two positives: distinct images whenever the hotel has more than one,
        # so that the anchor is not compared with itself.
        positives = image_dict[label]
        im1, im2 = rnd.choice(positives, 2, replace=len(positives) < 2)

        # One negative from a random different class.
        neg_label = label
        while neg_label == label:
            neg_label = rnd.randint(0, self.nmbr_hotels)
        im3 = rnd.choice(image_dict[neg_label], 1)[0]

        # Get file paths:
        im1, im2, im3 = self.__images_to_filepaths__([im1, im2, im3])

        # Rewrite to tensors (or whatever the transform produces):
        if self.transform:
            im1 = self._load_transformed(im1)
            im2 = self._load_transformed(im2)
            im3 = self._load_transformed(im3)

        return [im1, im2, im3, (label, neg_label)]

# Smoke test: building the dataset with its default arguments reads the CSV
# file and scans the image folder.
# NOTE(review): `test` is not referenced again in the visible notebook.
test = Hotel_Dataset_Merlijn_Triplets()

### Defining data transformations:

In [None]:
class AddRandomMaskTransform:
    """Paste a randomly chosen mask image at a random position onto an image.

    The mask is picked from ``masks_path`` and resized, keeping its aspect
    ratio, so that its bounding box covers ``max_covered_area_ratio`` of the
    source image area.
    """

    def __init__(self):
        self.masks_path = r'../input/hotel-id-to-combat-human-trafficking-2022-fgvc9/train_masks'
        self.max_covered_area_ratio=0.5
        # File names found in masks_path; read on first use and then reused,
        # so the directory is not listed again for every image.
        self._mask_file_names = None

    def __call__(self, source_img: pil_image.Image):
        """Paste a random mask onto ``source_img`` and return it.

        Args:
            source_img: PIL image; it is modified in place.

        Returns:
            The same ``source_img`` object with the mask pasted on.
        """
        source_width, source_height = source_img.size
        source_area = source_width*source_height

        # Retrieve a random mask
        if self._mask_file_names is None:
            self._mask_file_names = os.listdir(self.masks_path)
        random_mask_file_name = random.choice(self._mask_file_names)
        random_mask_path = os.path.join(self.masks_path, random_mask_file_name)

        # The training masks provided by the competition often seem to be at least as large as some of the images
        # of the hotels. Resize the mask to constrain its surface area to max_covered_area_ratio, while retaining
        # its aspect ratio.
        with pil_image.open(random_mask_path) as random_mask:
            original_mask_width, original_mask_height = random_mask.size
            max_mask_area = source_area*self.max_covered_area_ratio
            # Clamp to one pixel: a zero-sized target cannot be resized to.
            mask_width = max(1, m.floor(m.sqrt((max_mask_area*original_mask_width)/original_mask_height)))
            mask_height = max(1, m.floor(m.sqrt((max_mask_area*original_mask_height)/original_mask_width)))
            # resize() returns a new image, so the file can be closed afterwards.
            mask = random_mask.resize((mask_width, mask_height))

        # Paste the mask over a random position in the image
        max_x_coord = max(0, source_width - mask_width)
        max_y_coord = max(0, source_height - mask_height)
        random_x_coord = random.randint(0, max_x_coord)
        random_y_coord = random.randint(0, max_y_coord)
        # The mask doubles as its own transparency mask.
        # NOTE(review): this assumes the mask files carry an alpha channel - confirm.
        source_img.paste(mask, (random_x_coord, random_y_coord), mask)
        # Mind that most masks have a transparent border that offsets them from (0,0), preventing
        # most of them from ever showing up all the way at the left or the top of the image.
        return source_img

In [None]:
IMAGE_SIZE_WIDTH = 256
IMAGE_SIZE_HEIGHT = 256
IMG_SIZE = IMAGE_SIZE_WIDTH

import albumentations as A
import albumentations.pytorch as APT
import cv2


def _occlusion_dropout():
    """Build the single large dropout patch that simulates occlusions in the test data."""
    return A.CoarseDropout(
        p=0.75,
        max_holes=1,
        min_height=IMG_SIZE//4,
        max_height=IMG_SIZE//2,
        min_width=IMG_SIZE//4,
        max_width=IMG_SIZE//2,
        fill_value=(255,0,0),
    )


def _to_float_tensor():
    """Build the two closing steps shared by all pipelines: float conversion, then tensor."""
    return [A.ToFloat(), APT.transforms.ToTensorV2()]


# used for training dataset - augmentations and occlusions
train_transform = A.Compose(
    [
        A.HorizontalFlip(p=0.75),
        A.VerticalFlip(p=0.25),
        A.ShiftScaleRotate(p=0.5, border_mode=cv2.BORDER_CONSTANT),
        A.OpticalDistortion(p=0.25),
        A.Perspective(p=0.25),
        # normal coarse dropout: up to six small holes
        A.CoarseDropout(
            p=0.5,
            min_holes=1,
            max_holes=6,
            min_height=IMG_SIZE//16,
            max_height=IMG_SIZE//4,
            min_width=IMG_SIZE//16,
            max_width=IMG_SIZE//4,
        ),
        _occlusion_dropout(),
        #A.RandomBrightnessContrast(p=0.75),
    ]
    + _to_float_tensor()
)

# used for validation dataset - only occlusions
val_transform = A.Compose([_occlusion_dropout()] + _to_float_tensor())

# no augmentations
base_transform = A.Compose(_to_float_tensor())

In [None]:
class Hotel_Dataset_Merlijn(Dataset):
    """Dataset that serves single hotel images, with or without labels.

    Two modes, selected with ``labels_known``:

    * ``labels_known=True``: ``data_path`` holds one sub-folder per hotel whose
      name is the integer hotel id. For each of the first ``NMBR_HOTELS``
      folders, ``max_images`` file names are sampled (with replacement, so
      duplicates are possible). Items are ``(image, label)``.
    * ``labels_known=False``: ``data_path`` holds the image files directly.
      Items are ``(image, file_name)``.

    Args:
        root_dir: Base folder of the data.
        data_path: Folder with the images, relative to ``root_dir``.
        labels_known: Selects the mode described above.
        max_images: Number of images sampled per hotel (labelled mode only).
        transform: Optional callable used as ``transform(image=array)["image"]``.
            Without it the PIL image is returned.
        label_list: Label -> hotel id mapping to expose in unlabelled mode.
        save_names: Unused; kept so existing calls keep working.
    """

    def __init__(self, root_dir="../input/hotel-id-to-combat-human-trafficking-2022-fgvc9/", data_path="train_images/",
                 labels_known = True, max_images=10, transform=None, label_list=None, save_names=False):
        self.transform = transform
        self.max_images = max_images
        self.labels_known = labels_known

        # Directory paths:
        self.root_dir = root_dir
        self.data_path = data_path
        self.full_data_path = os.path.join(root_dir, data_path)
        # Sorted so that labels and item order do not depend on the file system.
        dirs = sorted(os.listdir(self.full_data_path))

        if not labels_known:
            # Test data: every directory entry is one image file.
            if label_list is not None:
                self.label_to_id_arr = label_list
            else:
                self.label_to_id_arr = None
                print("Warning: no lable list provided for test dataset!")
            self.data = dirs
            return

        # Train or val data (labels known): sample max_images files per hotel.
        hotel_dirs = dirs[:NMBR_HOTELS]
        # Sized by the folders actually present, so no row is left uninitialised.
        self.data = np.chararray((len(hotel_dirs)*self.max_images, 2),
                                 itemsize=MAX_WINDOWS_FILENAME_CHAR_LENGTH)
        self.label_to_id_arr = np.zeros( NMBR_HOTELS, dtype=np.int32 )

        for (dir_nmbr, this_dir) in enumerate(hotel_dirs):
            self.label_to_id_arr[dir_nmbr] = int(this_dir)

            # Sample pictures:
            path_to_dir = os.path.join(self.full_data_path, this_dir)
            files = os.listdir(path_to_dir)
            sampled_pics = rnd.choice(files, size=self.max_images) #Note: these could be duplicates...

            # Save as (file name, label) rows; the label is stored as text
            # because the array holds fixed-width strings.
            rows = np.column_stack((sampled_pics, np.full(self.max_images, str(dir_nmbr))))
            (index_start, index_end) = ( (dir_nmbr)*self.max_images, (dir_nmbr+1)*self.max_images)
            self.data[index_start: index_end] = rows

    def label_to_id(self, label):
        """Return the hotel id that belongs to an integer label."""
        return self.label_to_id_arr[int(label)]

    def __len__(self):
        return len(self.data)

    def __size__(self):
        """Return the shape of the underlying data container."""
        return np.shape(self.data)

    def __get_label_list__(self):
        """Return the label -> hotel id mapping (None if none was provided)."""
        return self.label_to_id_arr

    def __getitem__(self, idx):
        """Return ``(image, label)`` in labelled mode, else ``(image, file_name)``."""
        if self.labels_known:
            record = self.data[idx].decode()
            key = int(record[1])
            hotel_id = self.label_to_id(key)
            image_path = os.path.join(self.full_data_path, str(hotel_id), record[0])
        else:
            key = self.data[idx]
            image_path = os.path.join(self.full_data_path, key)

        image = pil_image.open(image_path)
        if self.transform:
            # Close the file once the pixels are copied into an array.
            with image:
                pixels = np.array(image)
            image = self.transform(image=pixels)["image"]
        return (image, key)

### Creating Datasets & Loaders

In [None]:
# Method 1: both sets have each hotel represented, but duplicates are possible:

#dataset_train = Hotel_Dataset_Merlijn(max_images = 1, transform=train_transform)
#dataset_val = Hotel_Dataset_Merlijn(max_images = 1, transform=train_transform)
# Not sure if we cannot just use the same one twice, but never mind...


# Method 2: only duplicates in dataset may occur, no guarantee about labels
nmbr_images, val_perc = 10, 0.8
# Each split gets the pipeline that was defined for it: augmentations plus
# occlusions for training, occlusions only for validation.
dataset_train = Hotel_Dataset_Merlijn_Triplets(transform=train_transform, nmbr_hotels = NMBR_HOTELS)
dataset_val = Hotel_Dataset_Merlijn_Triplets(is_trainSet = False, transform = val_transform, nmbr_hotels = NMBR_HOTELS)

# The test set has no labels; it reuses the label -> hotel id mapping of the training set.
labels = dataset_train.__get_label_list__()
dataset_test = Hotel_Dataset_Merlijn(data_path="test_images/", transform=base_transform, labels_known = False, label_list = labels)

train_loader = DataLoader(dataset_train, batch_size = 128, shuffle=True) # these are now the classes that get taken: that should be fixed...
val_loader = DataLoader(dataset_val, batch_size = 128, shuffle=True)
test_loader = DataLoader(dataset_test, batch_size = 1) #this seems sloppy, but makes predict-code nicer now...

# Method 3
# dataset_train = Hotel_Dataset_Merlijn_Bram(max_images = 20, training=True, transform=train_transform)
# dataset_val = Hotel_Dataset_Merlijn_Bram(max_images = 20, training=False, transform=test_transform)
# dataset_test = Hotel_Dataset_Merlijn_Bram(max_images = 20, training=False, transform=test_transform)

# train_loader = DataLoader(dataset_train, batch_size = 5, shuffle=True)#len(dataset_train))
# val_loader = DataLoader(dataset_test, batch_size = 5, shuffle=True)
# test_loader = DataLoader(dataset_test, batch_size = 1, shuffle=True)

### Visualising/testing data stuff

In [None]:
# Test/visualise datasets:

# Function to visualise datasets:
def show_images(ds, n_images=10, labels_known = True):
    """Plot the first ``n_images`` images of ``ds`` in a grid of ten columns.

    Args:
        ds: Dataset whose items are either an image or a tuple/list that
            starts with an image (e.g. ``(image, label)`` or a triplet, of
            which the first image is shown). Images are expected
            channel-first, as the tensor transforms in this file produce them.
        n_images: Number of images to show.
        labels_known: Kept for backward compatibility; the item layout is
            detected from the item itself.
    """
    xlen, ylen = 10, max(m.ceil(n_images/10), 1)
    # squeeze=False keeps the axes array 2-D even when there is a single row.
    fig, ax = plt.subplots(ylen, xlen, figsize=(22,8), squeeze=False)

    # ax.flat walks the grid row by row, i.e. position y*10+x.
    for position, axis in enumerate(ax.flat):
        if position >= n_images:
            axis.axis("off")  # hide the unused cells of the last row
            continue
        sample = ds[position]
        image = sample[0] if isinstance(sample, (tuple, list)) else sample
        # imshow wants channel-last data, so move the channel axis to the end.
        axis.imshow(np.transpose(np.asarray(image), (1,2,0)))

def visualise_data():
    """Print the sizes of the three module-level datasets and show some training samples."""
    # The placeholders are filled in again; without .format() the literal
    # "{0}", "{1}" and "{2}" were printed.
    print("""Sizes Datasets:
        Train: {0}
        Validate: {1}
        Test: {2}""".format(dataset_train.__size__(), dataset_val.__size__(), dataset_test.__size__()))
    print(dataset_train[0])

    show_images(dataset_train,20, labels_known = True) # This gives an error, but I am too lazy to fix it now...
#visualise_data()



## Auxiliary functions

In [None]:
def save_checkpoint(model, scheduler, optimizer, epoch, name, loss=None, score=None):
    """Write the training state to ``<OUTPUT_FOLDER>checkpoint-<name>.pt``.

    The file holds the epoch number, the model and optimizer state dicts and
    the optional ``loss`` and ``score`` values. The scheduler is accepted but
    its state is not stored.
    """
    target_file = f"{OUTPUT_FOLDER}checkpoint-{name}.pt"
    state = dict(
        epoch=epoch,
        model=model.state_dict(),
        # scheduler=scheduler.state_dict(),  # currently not stored
        optimizer=optimizer.state_dict(),
        loss=loss,
        score=score,
    )
    torch.save(state, target_file)

In [None]:
def get_all_embeddings(model, dataloader, transform):
    """Embed the images of every hotel and return the embeddings with their classes.

    Args:
        model: Network whose forward pass returns ``(embeddings, _)``.
        dataloader: Despite the name, a dataset object that provides
            ``__get_all_files__()`` (class -> image numbers) and
            ``__images_to_filepaths__()``.
        transform: Callable used as ``transform(image=array)["image"]``.

    Returns:
        Tuple ``(embeddings, classes)``: the stacked embeddings and an array
        holding, per embedding, the key under which its image was listed.
    """
    print ("Creating all embeddings:")
    max_images_per_hotel = 100  # Filtering out our giant hotel-set just for testing...
    all_embeddings = []
    all_classes = []
    model.eval()
    with torch.no_grad():
        all_files_dict = dataloader.__get_all_files__()
        for (i, (hotel, image_list)) in enumerate(all_files_dict.items()):
            if i >= NMBR_HOTELS:
                break  # only the first NMBR_HOTELS hotels are embedded

            image_numbers = image_list[:max_images_per_hotel]
            if len(image_numbers) == 0:
                continue  # nothing to stack for a hotel without images

            all_images = []
            for path in dataloader.__images_to_filepaths__(image_numbers):
                # Close each file as soon as its pixels are copied.
                with pil_image.open(path) as image:
                    pixels = np.array(image)
                all_images.append(transform(image=pixels)["image"])

            all_images = torch.stack(all_images).to(device)
            embeddings, _p = model(all_images)
            embeddings = embeddings.to('cpu').detach().numpy()

            all_embeddings.extend(embeddings)
            all_classes.extend([hotel] * len(embeddings))

    return (np.stack(all_embeddings), np.array(all_classes))

def get_closest_labels(embedding, all_embeddings, all_labels, n=5, get_ids = False):
    """Return, per query, the classes of its nearest neighbours, closest first.

    Up to 100 neighbours are looked up per query with an exact L2 index. The
    very first hit is dropped because it is taken to be the query itself.
    From the rest the first ``n`` distinct classes are kept, in order of
    first appearance.

    Args:
        embedding: 2-D array of query embeddings, one row per query.
        all_embeddings: 2-D array of the embeddings to search in.
        all_labels: Array with the class of every row of ``all_embeddings``.
        n: Maximum number of classes returned per query.
        get_ids: If True, translate the classes into hotel ids with the label
            list of the module-level ``dataset_train``.

    Returns:
        List with one array of at most ``n`` classes (or ids) per query.
    """
    index = faiss.IndexFlatL2(all_embeddings.shape[1])
    index.add(all_embeddings)
    nmbr_neighbours = min(100, index.ntotal)
    _ds, closest_images = index.search(embedding, nmbr_neighbours)
    closest_images = closest_images[:,1:] #filter itself!

    all_top_ns = []
    for ci in closest_images:
        # FAISS pads missing neighbours with -1; those must not be used as
        # indices (they would silently select the last label).
        ci = ci[ci >= 0]
        closest_classes = all_labels[ci]
        # np.unique sorts by value; re-order by first occurrence to get the
        # classes from nearest to farthest.
        unique_classes, ind = np.unique(closest_classes, return_index=True)
        top_n = unique_classes[np.argsort(ind)][:n]
        if get_ids:
            top_n = dataset_train.__get_label_list__()[top_n]
        all_top_ns.append(top_n)

    return all_top_ns
    
def get_accuracy(top5s, target):
    """Return the mean rank-weighted hit score of ranked predictions.

    A query scores ``0.5 ** rank`` (rank 0 = first prediction) if its target
    occurs in its prediction list and 0 otherwise; the scores are averaged
    over all targets.

    Args:
        top5s: Sequence with one array of ranked predictions per query; the
            arrays may hold fewer than five entries.
        target: Sequence with the correct class of every query.

    Returns:
        The average score, or 0.0 when ``target`` is empty.
    """
    if len(target) == 0:
        return 0.0

    accuracy = 0.0
    for predicted, wanted in zip(top5s, target):
        predicted = np.asarray(predicted)
        if predicted.size == 0:
            continue  # no prediction at all counts as a miss
        is_correct = (predicted == wanted).astype(float)
        # Weights follow the actual number of predictions, so short lists
        # do not cause a shape mismatch.
        accuracy += np.max(is_correct * ( (0.5) ** np.arange(predicted.size) ) )
    return accuracy/len(target)

In [None]:
class TripletLoss(nn.Module): # As taken from https://towardsdatascience.com/a-hands-on-introduction-to-image-retrieval-in-deep-learning-with-pytorch-651cd6dba61e
    """Margin-based triplet loss on squared Euclidean distances.

    For each triplet the loss is ``relu(d(anchor, positive) - d(anchor, negative) + margin)``;
    the result is the mean over the batch.
    """

    def __init__(self, margin=1.0):
        super().__init__()
        self.margin = margin

    def calc_euclidean(self, x1, x2):
        """Return the row-wise sum of squared differences (no square root is taken)."""
        difference = x1 - x2
        return difference.pow(2).sum(dim=1)

    def val_score(self, x1, x2, x3):
        """Score a triplet batch exactly like ``forward`` does."""
        return self.forward(x1, x2, x3)

    def forward(self, anchor, positive, negative):
        """Return the mean triplet loss of the batch as a scalar tensor."""
        pull = self.calc_euclidean(anchor, positive)   # should become small
        push = self.calc_euclidean(anchor, negative)   # should become large
        per_triplet = torch.relu(pull - push + self.margin)
        return per_triplet.mean()

## Model

In [None]:
# Define model:

class Hotel_Model (nn.Module):
    """Frozen pretrained EfficientNet-B0 backbone followed by a trainable embedding layer.

    Args:
        n_classes: Output size of the (currently unused) classification layer.
        embedding_size: Size of the embedding vector produced by ``forward``.
    """

    def __init__(self, n_classes = NMBR_HOTELS, embedding_size = EMBEDDING_SIZE):
        super().__init__()

        # Pretrained model for the heavy lifting. The weights come from a local
        # file; map_location lets it load on machines without a GPU as well.
        self.main_model = timm.create_model("efficientnet_b0", num_classes=1000, pretrained=False)
        pretrained_state = torch.load(
            "../input/timm-pretrained-efficientnet/efficientnet/efficientnet_b0_ra-3dd342df.pth",
            map_location="cpu")
        self.main_model.load_state_dict(pretrained_state)
        for param in self.main_model.parameters():
            param.requires_grad = False
        self.main_model.eval()

        # Use own torch-layers for last embedding step and classification:
        in_features = self.main_model.get_classifier().in_features
        self.main_model.classifier = nn.Identity() # turn off classifier in pre-trained model
        self.embedding = nn.Sequential(
            nn.ReLU(),
            nn.Linear(in_features, embedding_size),
            nn.ReLU())

        # Note: classification is not used for loss, only for gauging accuracy!
        self.classifier = nn.Linear(embedding_size, n_classes)

    def train(self, mode=True):
        """Switch the trainable layers to ``mode`` but keep the backbone in eval mode.

        Freezing the parameters alone does not stop layers such as batch
        normalisation from updating their running statistics in training
        mode, so the backbone is put back into eval mode after every switch.
        """
        super().train(mode)
        self.main_model.eval()
        return self

    def forward(self,x):
        """Return the embedding of ``x`` twice (the second slot stands in for class scores)."""
        x = self.main_model(x)
        #x = x.view(x.size(0), -1) #re-scale
        x = self.embedding(x)
        return x, x #self.classifier(x)
    
# Smoke test: building the model creates the network and loads the pretrained weights file.
# NOTE(review): `model_test` is not referenced again in the visible notebook.
model_test = Hotel_Model()

## Training Cycle

In [None]:
def train_epoch(train_loader, optimizer, loss_fn, model):
    """Train ``model`` for one pass over ``train_loader``.

    Args:
        train_loader: Yields batches of the form
            ``(anchor, positive, negative, (label, neg_label))``.
        optimizer: Optimizer that updates the model parameters.
        loss_fn: Callable computing the loss from the three embeddings.
        model: Network whose forward pass returns ``(embeddings, _)``.
    """
    for i, data in enumerate(train_loader): # one iteration per batch
        # Initialisation:
        optimizer.zero_grad()
        x1, x2, x3,  (_label, _neg_label) = data
        x1, x2, x3 = x1.to(device), x2.to(device), x3.to(device)

        # Run through model
        e1, _p1 = model(x1)
        e2, _p2 = model(x2)
        e3, _p3 = model(x3)

        # Compute loss & alter model. The embeddings stay on the model's
        # device: the loss needs no copy to the CPU.
        loss = loss_fn(e1, e2, e3)
        loss.backward()
        optimizer.step()

        # Log progress (item() gives the plain number rather than the tensor repr)
        print("Training batch {} done! (Loss = {})".format(i+1, loss.item()))

        # Gather data and stuff: TBW

    print("All training completed!")
    
    # Clear all arrays from memory:
    #data, inputs, labels, embedding, prediction, loss = [],[],[],[],[],[]
    
def val_epoch(val_loader, loss_fn, model):
    """Compute the retrieval accuracy of ``model`` on ``val_loader``.

    The images of the module-level ``dataset_train`` are embedded with
    ``val_transform`` and serve as the search set. For every validation batch
    the anchor images are embedded, their closest classes are looked up and
    scored with ``get_accuracy``.

    Args:
        val_loader: Yields batches of the form
            ``(anchor, positive, negative, (label, neg_label))``.
        loss_fn: Unused; kept for a uniform signature with ``train_epoch``.
        model: Network whose forward pass returns ``(embeddings, _)``.

    Returns:
        The accuracy averaged over all validation images, or 0.0 if the
        loader yields no batches.
    """
    # get embeddings of all images:
    all_embeddings, all_labels = get_all_embeddings(model, dataset_train, val_transform)
    print(all_embeddings.shape, EMBEDDING_SIZE)

    # Just like train_epoch, but without adjusting the model...
    summed_accuracy = 0.0
    nmbr_images = 0

    for i, vdata in enumerate(val_loader):
        x1, _x2, _x3, (label, _neg_label) = vdata
        x1 = x1.to(device)

        emb, _p1 = model(x1)
        emb, label = emb.to('cpu').detach().numpy(), label.detach().numpy()
        closest_image_labels = get_closest_labels(emb, all_embeddings, all_labels)
        accuracy = get_accuracy(closest_image_labels, label)
        # Weight by batch size so that a smaller last batch does not count as
        # much as a full one.
        summed_accuracy += accuracy * len(label)
        nmbr_images += len(label)
        print( "Validation batch {0} done! Acc: {1}".format((i+1), accuracy) )

    if nmbr_images == 0:
        return 0.0
    return summed_accuracy / nmbr_images

In [None]:
def train_complete(train_loader, val_loader, model, optimizer, scheduler, loss_fn, nmbr_epochs = 5):
    """Train ``model`` for ``nmbr_epochs`` epochs, validating and checkpointing on the way.

    After every epoch the scheduler is stepped and a checkpoint named
    "TestTraining" is written (overwriting the previous one). Validation runs
    every ``VAL_AFTER_EPOCHS`` epochs.

    Args:
        train_loader: Loader passed to ``train_epoch``.
        val_loader: Loader passed to ``val_epoch``.
        model: Network to train.
        optimizer: Optimizer that updates the model.
        scheduler: Learning-rate scheduler, stepped once per epoch.
        loss_fn: Loss function passed to ``train_epoch`` and ``val_epoch``.
        nmbr_epochs: Number of epochs to train.
    """
    val_epochs = VAL_AFTER_EPOCHS
    for epoch in range(nmbr_epochs):
        print("Starting epoch {}".format(epoch+1))

        # Train one epoch:
        model.train()
        train_epoch(train_loader, optimizer, loss_fn, model)
        scheduler.step()

        if (epoch+1)%val_epochs == 0:
            # Validate epoch:
            print("Start Validation...")
            model.eval()
            with torch.no_grad():
                accuracy = val_epoch(val_loader, loss_fn, model)
            print("Validation complete: Total Accuracy = {}. Save model & start new epoch:".format(accuracy))

        # Save current model. The real scheduler is handed over instead of a
        # placeholder string, so the call stays valid if its state gets stored.
        save_checkpoint(model, scheduler, optimizer, epoch, "TestTraining")

In [None]:
import numpy as np
import faiss

# Toy example: nearest-neighbour search with FAISS on four 5-D vectors.
a1 = np.array([0.,0.,0.,0.,0.])
a2 = np.array([1.,2.,3.,4.,4.])
a3 = np.array([1.,2.,3.,3.,3.])
a4 = np.array([1.,2.,2.,2.,2.])

# FAISS works on float32 data.
embeddings = np.array([a3,a1,a4,a2])
embeddings = embeddings.astype(np.float32)
print("Embeddings: \n{}".format(embeddings))

target = np.array([1.,2.,3.,4.,5.])
target = target.astype(np.float32)
print("Target: \n{}".format(target))

n = 3

def find_nearest_n_neighbours(target, embeddings, n: int):
    """Find the ``n`` rows of ``embeddings`` closest to ``target`` with an exact L2 index.

    Args:
        target: 1-D query vector.
        embeddings: 2-D array with one candidate vector per row.
        n: Number of neighbours to look up.

    Returns:
        Tuple ``(distances, indices)`` exactly as returned by the index
        search for the single query.
    """
    index = faiss.IndexFlatL2(len(target))
    index.add(embeddings)

    # The search expects a batch of queries, hence the extra dimension.
    distances, indices = index.search(np.array([target]), n)
    #print("Indices: \n{}".format(indices))
    #print("Distances: \n{}".format(distances))
    return distances, indices

# Assigned to "_" so that the notebook cell still shows no output.
_ = find_nearest_n_neighbours(target,embeddings,n)

In [None]:
def predict(test_loader, model, labels_to_id, train_embeddings, train_labels):
    """Predict the 5 most likely hotels for every test image.

    One search index is built over ``train_embeddings`` and reused for all
    batches. Per image, up to 100 nearest neighbours are looked up and the
    first five distinct classes among them are kept, nearest first. No
    neighbour is discarded: test images are not part of the searched
    embeddings, so the closest hit is a real candidate.

    Args:
        test_loader: Yields ``(images, names)`` batches of any batch size.
        model: Network whose forward pass returns ``(embeddings, _)``.
        labels_to_id: Translates class labels into hotel ids; either a
            callable taking a sequence of labels or an indexable mapping
            label -> id.
        train_embeddings: 2-D array of the embeddings to search in.
        train_labels: Array with the class of every row of ``train_embeddings``.

    Returns:
        Fixed-width byte-string array with one ``(image name, "id id id id id")``
        row per test image.
    """
    index = faiss.IndexFlatL2(train_embeddings.shape[1])
    index.add(train_embeddings)
    nmbr_neighbours = min(100, index.ntotal)

    rows = []
    for (images, names) in test_loader:
        images = images.to(device)
        embeddings, _p = model(images)
        embeddings = embeddings.to('cpu').detach().numpy()
        _distances, neighbours = index.search(embeddings, nmbr_neighbours)

        # The loader hands the names over as one sequence per batch, which is
        # why a single name has to be picked out of it.
        for name, neighbour_row in zip(names, neighbours):
            neighbour_row = neighbour_row[neighbour_row >= 0]  # drop FAISS "-1" padding
            closest_classes = train_labels[neighbour_row]
            # np.unique sorts by value; re-order by first occurrence to rank
            # the classes from nearest to farthest.
            unique_classes, first_seen = np.unique(closest_classes, return_index=True)
            top_5 = unique_classes[np.argsort(first_seen)][:5]
            if callable(labels_to_id):
                hotel_ids = labels_to_id(top_5)
            else:
                hotel_ids = np.asarray(labels_to_id)[top_5]
            rows.append((name, ' '.join(map(str, hotel_ids))))

    predictions_top_5 = np.chararray((len(rows), 2), itemsize=MAX_WINDOWS_FILENAME_CHAR_LENGTH)
    for i, (name, hotel_ids_str) in enumerate(rows):
        predictions_top_5[i,0] = name
        predictions_top_5[i,1] = hotel_ids_str

    return predictions_top_5


## Running the code & Making submission:

In [None]:
# NOTE(review): this list is not referenced anywhere else in the visible notebook - confirm whether it is still needed.
all_images_tensor = []

In [None]:
# Device: first GPU when CUDA is available, CPU otherwise.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

# Model, placed on that device.
model = Hotel_Model()
model = model.to(device)

# Optimizer & Loss function: Placeholders for now:
optimizer = torch.optim.Adagrad(params=model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer=optimizer, gamma=GAMMA)
loss_fn = TripletLoss(margin=1.0)

In [None]:
# Train the model, then embed the training images and predict the test set.

nmbr_epochs = NMBR_EPOCHS
train_complete(train_loader, val_loader, model, optimizer, scheduler, loss_fn, nmbr_epochs=nmbr_epochs)

model.eval()
with torch.no_grad():
    # NOTE(review): the search set is embedded with train_transform, i.e. with
    # random augmentations - confirm that this is intended.
    all_embeddings, all_labels = get_all_embeddings(model, dataset_train, train_transform)

    # Predictions arrive as an array of byte strings ...
    final_preds = predict(test_loader, model, dataset_train.labelList_to_id, all_embeddings, all_labels)
    # ... and are decoded cell by cell into a DataFrame of text.
    final_preds = pd.DataFrame(final_preds).stack().str.decode('utf-8').unstack()
    final_preds.columns = ['image_id','hotel_id'] # rename header

print(final_preds)
final_preds.to_csv(f"{OUTPUT_FOLDER}submission.csv", index=False)