# Setup
This notebook was originally written for Google Colab, so the Colab-specific cells are commented out to make it run on Kaggle.

In [None]:
# !nvidia-smi

In [None]:
# from google.colab import drive
# drive.mount('/gdrive')
# %cd /gdrive

In [None]:
# !pip install efficientnet_pytorch
# !pip install git+https://github.com/rwightman/pytorch-image-models
# !pip install pytorch-metric-learning
# !pip install faiss-gpu
# !pip install imgaug -U
# !pip install albumentations -U

# Imports

In [None]:
import sys
sys.path.append('../input/timm-pytorch-image-models/pytorch-image-models-master')

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import random
import os
import math

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.utils import class_weight
from PIL import Image as pil_image
from tqdm import tqdm
import scipy

import matplotlib
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader

import timm
from timm.optim import Lookahead, RAdam
#from pytorch_metric_learning import miners, losses, samplers , distances, regularizers 

# Global

In [None]:
# Side length, in pixels, that test images are resized to before inference.
IMG_SIZE = 512
# Seed passed to seed_everything and to the pandas validation sampling.
SEED = 42

# Kaggle input dataset that holds train.csv and the training images.
PROJECT_FOLDER = "../input/hotelid-2022-train-images-512x512/"
# Training images sit in one sub-folder of that dataset. Both names are kept
# because later cells use them interchangeably; deriving them from
# PROJECT_FOLDER keeps the three paths from drifting apart.
DATA_FOLDER = PROJECT_FOLDER + "images/"
TRAIN_DATA_FOLDER = DATA_FOLDER
# Competition test images (read with OpenCV, padded and resized at load time).
TEST_DATA_FOLDER = "../input/hotel-id-to-combat-human-trafficking-2022-fgvc9/test_images/"
# Checkpoints and result files are written relative to the working directory.
OUTPUT_FOLDER = "./"

# PROJECT_FOLDER = "/gdrive/MyDrive/Projects/Hotel-ID/"
# DATA_FOLDER = "/home/data/"
# OUTPUT_FOLDER = PROJECT_FOLDER + "output/"

In [None]:
# !mkdir {DATA_FOLDER}
# !unzip -qq {PROJECT_FOLDER}data/train-{IMG_SIZE}x{IMG_SIZE}.zip -d /home/data/

In [None]:
# Sanity check that the input dataset is mounted: show the entries at the
# dataset root and count the entries in the image folder.
print(os.listdir(PROJECT_FOLDER))
print(len(os.listdir(DATA_FOLDER)))

# Helper functions - seed and metric calculator

In [None]:
def seed_everything(seed):
    """Seed every random number generator used in this notebook.

    Covers Python's ``random``, the hash seed environment variable, NumPy and
    PyTorch (CPU and all CUDA devices), and switches cuDNN to deterministic
    kernel selection.

    Args:
        seed: integer seed applied to all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one; a no-op without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick different kernels between runs, so keep it off.
    torch.backends.cudnn.benchmark = False

# Dataset and transformations

In [None]:
import albumentations as A
import albumentations.pytorch as APT
import cv2 

# train_transform = A.Compose([
#     # A.Resize(IMG_SIZE, IMG_SIZE),
#     # A.CLAHE(p=1), 
    
# #     A.HorizontalFlip(p=0.75),
#     A.VerticalFlip(p=0.25),
# #     A.ShiftScaleRotate(p=0.5, border_mode=cv2.BORDER_CONSTANT),
#     A.OpticalDistortion(p=0.25),
#     A.Perspective (p=0.25),
# #     A.CoarseDropout(p=0.5),

#     A.RandomBrightnessContrast(p=0.75),
#     A.ToFloat(),
#     APT.transforms.ToTensorV2(),
# ])


# val_transform = A.Compose([
#     # A.Resize(IMG_SIZE, IMG_SIZE),
#     # A.CLAHE(p=1),
#     A.ToFloat(),
#     APT.transforms.ToTensorV2(),
# ])

In [None]:
class HotelTrainDataset:
    """Map-style dataset that yields an image together with its encoded hotel label.

    Args:
        data: table of samples, indexed with ``.iloc``; each row must provide
            ``image_id`` and ``hotel_id_code``.
        transform: optional callable invoked as ``transform(image=array)`` that
            returns a mapping with an ``"image"`` entry. When omitted, the
            untransformed ``uint8`` array is returned.
        data_path: string prefix concatenated with ``image_id`` to form the
            file path.
    """

    def __init__(self, data, transform=None, data_path="train_images/"):
        self.data = data
        self.data_path = data_path
        self.transform = transform
        # When True, __getitem__ returns random 32x32 noise instead of reading
        # the file, so a loader can be iterated without touching the disk.
        self.fake_load = False

    def __len__(self):
        """Return the number of rows in the sample table."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``{"image": ..., "target": ...}`` for the row at position ``idx``."""
        record = self.data.iloc[idx]

        if self.fake_load:
            image = np.random.randint(0, 255, (32, 32, 3)).astype(np.uint8)
        else:
            image_path = self.data_path + record["image_id"]
            image = np.array(pil_image.open(image_path)).astype(np.uint8)

        # Without a transform the raw array is returned; previously this path
        # failed because the transformed result was never assigned.
        if self.transform:
            image = self.transform(image=image)["image"]

        return {
            "image" : image,
            "target" : record['hotel_id_code'],
        }

# Model

In [None]:
class EmbeddingNet(nn.Module):
    """timm backbone followed by an embedding head and a classification head.

    The backbone's own classification layer is replaced with an identity so the
    backbone emits its pooled features. ``post`` maps those features to an
    ``embed_size`` vector, and ``classifier`` maps the embedding to
    ``n_classes`` logits.

    Args:
        n_classes: output size of the classification head.
        embed_size: dimensionality of the embedding returned by ``forward``.
        backbone_name: model name passed to ``timm.create_model``
            (created with ``pretrained=False``).

    Raises:
        Exception: if the backbone's last registered sub-module is not named
            ``classifier``, ``head.fc`` or ``fc``.
    """

    def __init__(self, n_classes=100, embed_size=64, backbone_name="efficientnet_b0"):
        super().__init__()

        self.embed_size = embed_size
        self.backbone = timm.create_model(backbone_name, pretrained=False)
        in_features = self.backbone.get_classifier().in_features

        # The last registered sub-module is treated as the classification layer.
        fc_name = list(self.backbone.named_modules())[-1][0]
        if fc_name not in ('classifier', 'head.fc', 'fc'):
            raise Exception("unknown classifier layer: " + fc_name)

        # Walk the dotted path (e.g. "head.fc") to the layer's owner and swap it out.
        *owner_path, attr_name = fc_name.split('.')
        owner = self.backbone
        for part in owner_path:
            owner = getattr(owner, part)
        setattr(owner, attr_name, nn.Identity())

        hidden_size = self.embed_size * 2
        post_layers = [
            nn.utils.weight_norm(nn.Linear(in_features, hidden_size), dim=None),
            nn.BatchNorm1d(hidden_size),
            nn.Dropout(0.2),
            nn.utils.weight_norm(nn.Linear(hidden_size, self.embed_size)),
        ]
        self.post = nn.Sequential(*post_layers)

        classifier_layers = [
            nn.BatchNorm1d(self.embed_size),
            nn.Dropout(0.2),
            nn.Linear(self.embed_size, n_classes),
        ]
        self.classifier = nn.Sequential(*classifier_layers)

    def embed_and_classify(self, x):
        """Return ``(embedding, class_logits)`` for the batch ``x``."""
        embedding = self.forward(x)
        logits = self.classifier(embedding)
        return embedding, logits

    def forward(self, x):
        """Return the embedding for the batch ``x``."""
        features = self.backbone(x)
        # Flatten everything after the batch dimension before the linear head.
        flat = features.view(features.size(0), -1)
        return self.post(flat)

# Model helper functions

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def get_embeds(loader, model, bar_desc="Generating embeds"):
    """Run ``model`` over ``loader`` in eval mode and collect the outputs.

    Args:
        loader: iterable of batches; each batch is a mapping with an
            ``'image'`` tensor.
        model: module called directly on the image batch. It is switched to
            eval mode and left in that mode.
        bar_desc: label shown on the progress bar.

    Returns:
        A list with one NumPy array per input sample, in loader order.
    """
    # Use the device the model lives on rather than a notebook-level `args`
    # object, so the function works wherever it is called from.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    outputs_all = []

    model.eval()
    with torch.no_grad():
        for sample in tqdm(loader, desc=bar_desc):
            images = sample['image']
            if device is not None:
                images = images.to(device)
            batch_embeds = model(images)
            # extend() on a 2-D array appends one row per sample.
            outputs_all.extend(batch_embeds.detach().cpu().numpy())

    return outputs_all

In [None]:
def get_distance_matrix(embeds, base_embeds, distance_func):
    """Compute ``distance_func`` between every query and every base embedding.

    Queries are processed in batches of 1024 against the full set of base
    embeddings.

    Args:
        embeds: query embeddings, convertible with ``torch.Tensor``.
        base_embeds: reference embeddings, convertible with ``torch.Tensor``.
        distance_func: callable ``(query_batch, base) -> tensor`` with one row
            per query in the batch.

    Returns:
        NumPy array stacking one row of distances per query.
    """
    reference = torch.Tensor(base_embeds)
    query_dataset = torch.utils.data.TensorDataset(torch.Tensor(embeds))
    query_loader = DataLoader(query_dataset, num_workers=2, batch_size=1024, shuffle=False)

    rows = []
    # Each batch from a single-tensor TensorDataset is a one-element sequence.
    for (query_batch,) in tqdm(query_loader):
        batch_distances = distance_func(query_batch, reference)
        rows.extend(batch_distances.numpy())

    return np.array(rows)

In [None]:
def save_checkpoint(model, scheduler, optimizer, epoch, name, loss=None, score=None):
    """Write the training state to ``{OUTPUT_FOLDER}checkpoint-{name}.pt``.

    The saved dictionary holds the epoch number, the state dicts of the model,
    scheduler and optimizer, and the given ``loss`` and ``score``.
    """
    state = {}
    state["epoch"] = epoch
    state["model"] = model.state_dict()
    state["scheduler"] = scheduler.state_dict()
    state["optimizer"] = optimizer.state_dict()
    state["loss"] = loss
    state["score"] = score

    target_path = f"{OUTPUT_FOLDER}checkpoint-{name}.pt"
    torch.save(state, target_path)


def load_checkpoint(model, scheduler, optimizer, name, checkpoint_path=None):
    """Restore model and scheduler state from a saved checkpoint.

    Args:
        model: module whose weights are loaded from ``checkpoint["model"]``.
        scheduler: scheduler whose state is loaded from ``checkpoint["scheduler"]``.
        optimizer: returned unchanged; its state is deliberately not restored.
        name: accepted for signature compatibility; it does not influence
            which file is read.
        checkpoint_path: file to load. Defaults to the pretrained
            ecaresnet50d_pruned CosFace checkpoint from the Kaggle input dataset.

    Returns:
        ``(model, scheduler, optimizer, epoch)`` where ``epoch`` is the value
        stored in the checkpoint.
    """
    if checkpoint_path is None:
        checkpoint_path = "../input/hotelidcosfaceecaresnet50dtrained/checkpoint-cosface-model-ecaresnet50d_pruned-512x512-4096embeds-3116hotels.pt"

    # Without a GPU the tensors must be remapped to CPU; with one, keep the
    # default placement recorded in the file.
    map_location = None if torch.cuda.is_available() else torch.device('cpu')
    checkpoint = torch.load(checkpoint_path, map_location=map_location)

    model.load_state_dict(checkpoint["model"])
    scheduler.load_state_dict(checkpoint["scheduler"])
    # optimizer.load_state_dict(checkpoint["optimizer"])

    return model, scheduler, optimizer, checkpoint["epoch"]

In [None]:
def iterate_loader(loader, epochs):
    """Iterate ``loader`` for ``epochs`` full passes while discarding the batches.

    The dataset's ``fake_load`` flag is raised for the duration, so it returns
    random noise instead of reading image files. ``train_and_validate`` calls
    this when resuming from a checkpoint, presumably to advance the loader's
    random state past the epochs already trained.

    Args:
        loader: data loader whose ``dataset`` exposes a ``fake_load`` attribute.
        epochs: number of complete passes to run.
    """
    dataset = loader.dataset
    dataset.fake_load = True
    try:
        with torch.no_grad():
            for i in range(epochs):
                for _ in tqdm(loader, desc=f"Iterating loader {i+1}/{epochs}"):
                    pass
    finally:
        # Always restore real loading, even if iteration fails or is
        # interrupted; otherwise later training would silently run on noise.
        dataset.fake_load = False

In [None]:
def train_epoch(args, model, loader, criterion, optimizer, loss_optimizer, scheduler, epoch):
    """Train ``model`` for one pass over ``loader``.

    The loss is computed from the embeddings only. The classifier logits are
    used solely to report a running accuracy on the progress bar.

    Args:
        args: settings object providing ``device`` and ``epochs``.
        model: network exposing ``embed_and_classify``.
        loader: iterable of batches with ``'image'`` and ``'target'`` tensors.
        criterion: callable ``(embeds, targets) -> loss``.
        optimizer: optimizer for the model parameters.
        loss_optimizer: optimizer for the criterion's own parameters.
        scheduler: optional learning-rate scheduler, stepped once per batch.
        epoch: current epoch number, used for display only.

    Returns:
        ``(mean_loss, score)`` where ``score`` is the fraction of samples in
        this epoch whose arg-max logit equals the target (0.0 for an empty
        loader).
    """
    batch_losses = []
    n_correct = 0
    n_seen = 0
    score = 0.0

    model.train()
    t = tqdm(loader)

    for sample in t:
        optimizer.zero_grad()
        # The criterion's parameters have their own optimizer; without this
        # their gradients would accumulate across batches.
        loss_optimizer.zero_grad()

        images = sample['image'].to(args.device)
        targets = sample['target'].to(args.device)

        embeds, outputs = model.embed_and_classify(images)

        loss = criterion(embeds, targets)

        loss.backward()
        optimizer.step()
        loss_optimizer.step()

        if scheduler:
            scheduler.step()

        loss_value = loss.item()
        batch_losses.append(loss_value)

        # Running counters replace storing every logit row and re-running
        # argmax over the whole history at each step. Sigmoid is monotonic,
        # so the arg-max of the raw logits selects the same class.
        predictions = outputs.detach().argmax(dim=1)
        n_correct += (predictions == targets).sum().item()
        n_seen += targets.numel()
        score = n_correct / n_seen

        desc = f"Epoch {epoch}/{args.epochs} - Train loss:{loss_value:0.4f}, score: {score:0.4f}"
        t.set_description(desc)

    return np.mean(batch_losses), score


def test_closest_match(base_df, base_embeds, valid_targets, valid_embeds, model, distance_func, closest, n_matches=5):
    """Rank base hotels for every validation embedding and print the accuracy.

    For each validation sample the base table is sorted by
    ``("distance", "hotel_id")`` and the first ``n_matches`` distinct
    ``hotel_id_code`` values become its predictions.

    Args:
        base_df: table with ``hotel_id`` and ``hotel_id_code`` columns, one row
            per base embedding.
        base_embeds: embeddings of the base samples.
        valid_targets: true ``hotel_id_code`` for each validation sample.
        valid_embeds: embeddings of the validation samples.
        model: not used by this function.
        distance_func: forwarded to ``get_distance_matrix``.
        closest: passed as ``ascending`` to the sort (True ranks the smallest
            value first).
        n_matches: number of distinct hotels kept per sample.

    Returns:
        ``(preds, distance_matrix)``.
    """
    distance_matrix = get_distance_matrix(valid_embeds, base_embeds, distance_func)

    n_valid = len(valid_embeds)
    ranked_codes = []
    for row in tqdm(range(n_valid), total=n_valid, desc="Getting closest match"):
        ranked = (
            base_df.assign(distance=distance_matrix[row])
            .sort_values(by=["distance", "hotel_id"], ascending=closest)
            .reset_index(drop=True)
        )
        ranked_codes.append(ranked["hotel_id_code"].unique()[:n_matches])

    preds = np.array(ranked_codes)
    # Column vector of targets broadcasts against the (N, n_matches) predictions.
    targets_column = np.repeat([valid_targets], repeats=n_matches, axis=0).T
    acc_top_1 = (preds[:, 0] == valid_targets).mean()
    acc_top_5 = (preds == targets_column).any(axis=1).mean()
    print(f"Accuracy: {acc_top_1:0.4f}, top 5 accuracy: {acc_top_5:0.4f}")
    return preds, distance_matrix


def test(base_loader, valid_loader, model, distance_func, closest):
    """Embed the base and validation sets and evaluate nearest-match accuracy.

    Both loaders must iterate their dataset in order (``shuffle=False``),
    because targets are read straight from each dataset's table and have to
    line up with the embeddings.

    Args:
        base_loader: loader over the reference set; ``dataset.data`` must
            carry a ``hotel_id_code`` column.
        valid_loader: loader over the validation set, same requirements.
        model: embedding network passed to ``get_embeds``.
        distance_func: forwarded to ``test_closest_match``.
        closest: forwarded to ``test_closest_match`` (sort direction).

    Returns:
        ``(base_embeds, valid_embeds, base_targets, valid_targets, val_preds,
        distance_matrix)``.
    """
    # get_embeds returns only the embeddings, so the labels come from the
    # dataframes behind the loaders.
    base_embeds = get_embeds(base_loader, model, "Generating embeds for train")
    valid_embeds = get_embeds(valid_loader, model, "Generating embeds for test")
    base_targets = base_loader.dataset.data["hotel_id_code"].values
    valid_targets = valid_loader.dataset.data["hotel_id_code"].values

    val_preds, distance_matrix = test_closest_match(base_loader.dataset.data, base_embeds, valid_targets, valid_embeds, model, distance_func, closest)

    return base_embeds, valid_embeds, base_targets, valid_targets, val_preds, distance_matrix

# Prepare data

In [None]:
def sample_data(n_hotels, min_images, max_images):
    """Load train.csv and keep a subset of hotels for quick experiments.

    Hotels are kept when their number of distinct images lies strictly between
    ``min_images`` and ``max_images``. The survivors are encoded as integer
    category codes, and only codes below ``n_hotels`` are retained.

    Returns:
        The filtered table with an added ``hotel_id_code`` column.
    """
    data_df = pd.read_csv(PROJECT_FOLDER + "train.csv")

    def _image_count_in_range(group):
        n_images = group["image_id"].nunique()
        return (n_images > min_images) & (n_images < max_images)

    sample_df = data_df.groupby("hotel_id").filter(_image_count_in_range)
    codes = sample_df["hotel_id"].astype('category').cat.codes.values.astype(np.int64)
    sample_df["hotel_id_code"] = codes
    sample_df = sample_df[sample_df["hotel_id_code"] < n_hotels]

    n_kept_hotels = len(sample_df.hotel_id.unique())
    n_all_hotels = len(data_df.hotel_id.unique())
    kept_percent = len(sample_df) / len(data_df) * 100
    print(f"Subsample with {n_kept_hotels} hotels out of {n_all_hotels}" +
          f" with total {len(sample_df)} images ({kept_percent:0.2f} %)")

    return sample_df

In [None]:
# FOR TESTING DIFFERENT SETTING
#data_df = sample_data(1000, 15, 50)

# FOR FINAL TRAINING
# Load the full training table and encode each hotel_id as an integer label
# (the pandas category code), stored in the hotel_id_code column.
data_df = pd.read_csv(PROJECT_FOLDER + "train.csv")
data_df["hotel_id_code"] = data_df["hotel_id"].astype('category').cat.codes.values.astype(np.int64)

# Histogram over the encoded labels: one bar per hotel, height = row count.
fig = go.Figure()
fig.add_trace(go.Histogram(x=data_df["hotel_id_code"]))
# Treat the codes as categories rather than a numeric axis.
fig.update_xaxes(type="category")
fig.show()

In [None]:
def train_and_validate(args, data_df):
    """Train the embedding model with CosFace loss, then evaluate and save results.

    ``args.val_samples`` rows per hotel are held out for validation; the rest
    form both the training set and the reference ("base") set. After every
    epoch a checkpoint is written, and after the last epoch the embeddings,
    predictions and splits are saved to
    ``{OUTPUT_FOLDER}output-{model_name}.pt``.

    NOTE: this function relies on the module-level names ``train_transform``,
    ``val_transform``, ``distances``, ``losses`` and ``regularizers``. In this
    notebook their definitions/imports are commented out, so they must be
    restored before the function can run.

    Args:
        args: settings object providing ``backbone_name``, ``embed_size``,
            ``n_classes``, ``val_samples``, ``num_workers``, ``batch_size``,
            ``device``, ``lr``, ``epochs`` and ``continue_from_checkpoint``.
        data_df: table with ``hotel_id``, ``image_id`` and ``hotel_id_code``.
    """
    model_name = f"cosface-model-{args.backbone_name}-{IMG_SIZE}x{IMG_SIZE}-{args.embed_size}embeds-{args.n_classes}hotels"
    print(model_name)

    seed_everything(seed=SEED)

    # Hold out a fixed number of images per hotel; everything else is training data.
    val_df = data_df.groupby("hotel_id").sample(args.val_samples, random_state=SEED)
    held_out = data_df["image_id"].isin(val_df["image_id"])
    train_df = data_df[~held_out]

    def _dataset_and_loader(frame, transform, **loader_options):
        dataset = HotelTrainDataset(frame, transform, data_path=DATA_FOLDER)
        loader = DataLoader(dataset, num_workers=args.num_workers, batch_size=args.batch_size, **loader_options)
        return dataset, loader

    _, train_loader = _dataset_and_loader(train_df, train_transform, shuffle=True, drop_last=True)
    base_dataset, base_loader = _dataset_and_loader(train_df, val_transform, shuffle=False)
    val_dataset, valid_loader = _dataset_and_loader(val_df, val_transform, shuffle=False)

    print(f"Base: {len(base_dataset)}\nValidation: {len(val_dataset)}")

    model = EmbeddingNet(args.n_classes, args.embed_size, args.backbone_name)
    model = model.to(args.device)

    distance = distances.CosineSimilarity()

    # The loss owns trainable parameters, hence its dedicated optimizer.
    face_regularizer = regularizers.RegularFaceRegularizer()
    criterion = losses.CosFaceLoss(
        num_classes=args.n_classes,
        embedding_size=args.embed_size,
        embedding_regularizer=face_regularizer,
    ).to(args.device)  # Accuracy: 0.7200, top 5 accuracy: 0.8460
    loss_optimizer = torch.optim.AdamW(criterion.parameters(), lr=args.lr)
    optimizer = Lookahead(torch.optim.AdamW(model.parameters(), lr=args.lr), k=3)

    # Stepped once per batch by train_epoch, hence steps_per_epoch.
    scheduler = torch.optim.lr_scheduler.OneCycleLR(
        optimizer,
        max_lr=args.lr,
        epochs=args.epochs,
        steps_per_epoch=len(train_loader),
        div_factor=10,
        final_div_factor=1,
        pct_start=0.1,
        anneal_strategy="cos",
    )

    start_epoch = 1
    if args.continue_from_checkpoint:
        model, scheduler, optimizer, last_epoch = load_checkpoint(model, scheduler, optimizer, model_name)
        # Run through the loader once per finished epoch before resuming.
        iterate_loader(train_loader, last_epoch)
        start_epoch = start_epoch + last_epoch

    torch.cuda.empty_cache()

    for epoch in range(start_epoch, args.epochs + 1):
        train_loss, train_score = train_epoch(args, model, train_loader, criterion, optimizer, loss_optimizer, scheduler, epoch)
        save_checkpoint(model, scheduler, optimizer, epoch, model_name, train_loss, train_score)
        # Early evaluation after the first epoch only; the result is discarded.
        if epoch == 1:
            test(base_loader, valid_loader, model, distance, closest=False)

    results = test(base_loader, valid_loader, model, distance, closest=False)
    base_embeds, valid_embeds, base_targets, valid_targets, val_preds, distance_matrix = results

    output = {
        "base_embeds": base_embeds,
        "valid_embeds": valid_embeds,
        "base_targets": base_targets,
        "valid_targets": valid_targets,
        "val_preds": val_preds,
        "distance_matrix": distance_matrix,
        "train_df": train_df,
        "valid_df": val_df,
    }

    torch.save(output, f"{OUTPUT_FOLDER}output-{model_name}.pt")

In [None]:
def test_and_validate(args, data_df):
    """Evaluate the checkpointed model on a held-out validation split.

    Uses the same split as ``train_and_validate`` (``args.val_samples`` rows
    per hotel, sampled with ``SEED``), restores the model via
    ``load_checkpoint`` and runs ``test``. Nothing is written to disk.

    NOTE: this function relies on the module-level names ``train_transform``,
    ``val_transform`` and ``distances``. In this notebook their
    definitions/imports are commented out, so they must be restored before the
    function can run.

    Args:
        args: settings object providing ``backbone_name``, ``embed_size``,
            ``n_classes``, ``val_samples``, ``num_workers``, ``batch_size``,
            ``device``, ``lr`` and ``epochs``.
        data_df: table with ``hotel_id``, ``image_id`` and ``hotel_id_code``.

    Returns:
        Dict with the base/validation embeddings and targets, the validation
        predictions, the distance matrix and both dataframes. Previously the
        dict was built and then discarded.
    """
    model_name = f"cosface-model-{args.backbone_name}-{IMG_SIZE}x{IMG_SIZE}-{args.embed_size}embeds-{args.n_classes}hotels"
    print(model_name)

    seed_everything(seed=SEED)

    val_df = data_df.groupby("hotel_id").sample(args.val_samples, random_state=SEED)
    train_df = data_df[~data_df["image_id"].isin(val_df["image_id"])]

    # The training loader is only needed for its length: the scheduler must be
    # constructed before load_checkpoint can restore its state.
    train_dataset = HotelTrainDataset(train_df, train_transform, data_path=DATA_FOLDER)
    train_loader = DataLoader(train_dataset, num_workers=args.num_workers, batch_size=args.batch_size, shuffle=True, drop_last=True)
    base_dataset = HotelTrainDataset(train_df, val_transform, data_path=DATA_FOLDER)
    base_loader = DataLoader(base_dataset, num_workers=args.num_workers, batch_size=args.batch_size, shuffle=False)
    val_dataset = HotelTrainDataset(val_df, val_transform, data_path=DATA_FOLDER)
    valid_loader = DataLoader(val_dataset, num_workers=args.num_workers, batch_size=args.batch_size, shuffle=False)

    print(f"Base: {len(base_dataset)}\nValidation: {len(val_dataset)}")

    model = EmbeddingNet(args.n_classes, args.embed_size, args.backbone_name)
    model = model.to(args.device)

    distance = distances.CosineSimilarity()

    # No loss or loss optimizer here: evaluation never computes a training loss.
    optimizer = Lookahead(torch.optim.AdamW(model.parameters(), lr=args.lr), k=3)

    scheduler = torch.optim.lr_scheduler.OneCycleLR(
                    optimizer,
                    max_lr=args.lr,
                    epochs=args.epochs,
                    steps_per_epoch=len(train_loader),
                    div_factor=10,
                    final_div_factor=1,
                    pct_start=0.1,
                    anneal_strategy="cos",
                )

    model, scheduler, optimizer, _ = load_checkpoint(model, scheduler, optimizer, model_name)

    base_embeds, valid_embeds, base_targets, valid_targets, val_preds, distance_matrix = test(base_loader, valid_loader, model, distance, closest=False)

    output = {"base_embeds": base_embeds,
              "valid_embeds": valid_embeds,
              "base_targets": base_targets,
              "valid_targets": valid_targets,
              "val_preds": val_preds,
              "distance_matrix": distance_matrix,
              "train_df" : train_df,
              "valid_df": val_df,
              }

#     torch.save(output, f"{OUTPUT_FOLDER}output-{model_name}.pt")
    return output

# Train and evaluate

In [None]:
# %%time 

# class args:
#     epochs = 9
#     lr = 1e-3
#     batch_size = 24
#     num_workers = 2
#     embed_size = 4096
#     val_samples = 1
#     continue_from_checkpoint = False
#     backbone_name = "ecaresnet50d_pruned"
#     n_classes = data_df["hotel_id_code"].nunique()
#     device = ('cuda' if torch.cuda.is_available() else 'cpu')

    
# # print(data_df["hotel_id"].nunique())

# # val_df = data_df.groupby("hotel_id").sample(args.val_samples, random_state=SEED)
# # train_df = data_df

# # print(val_df["hotel_id"].nunique())
# # print(train_df["hotel_id"].nunique())


# test_and_validate(args, data_df)



In [None]:
def get_distances(input, base_embeds, model):
    """Return cosine similarities between a batch and the base embeddings.

    Args:
        input: image batch passed directly to ``model``.
        base_embeds: mapping whose entry ``0`` holds the base embeddings.
        model: embedding network.

    Returns:
        Array with one row per sample in ``input`` and one column per base
        embedding, as produced by ``cosine_similarity``.
    """
    # The earlier None check and multiplicative branch could never run with a
    # single model, so the similarity is returned directly.
    output = model(input).detach().cpu().numpy()
    return cosine_similarity(output, base_embeds[0])

def predict(loader, base_df, base_embeds, model, n_matches=5, bar_desc="Generating embeds"):
    """Predict the ``n_matches`` most similar hotels for every sample in ``loader``.

    For each sample the base table is sorted by descending similarity (ties
    broken by descending ``hotel_id``) and the first ``n_matches`` distinct
    ``hotel_id`` values are kept.

    Args:
        loader: iterable of batches with an ``'image'`` tensor.
        base_df: table with a ``hotel_id`` column, one row per base embedding.
        base_embeds: mapping whose entry ``0`` holds the base embeddings.
        model: embedding network; it is switched to eval mode.
        n_matches: number of distinct hotels returned per sample.
        bar_desc: label shown on the progress bar.

    Returns:
        List with one array of hotel ids per sample, in loader order.
    """
    # Use the model's own device instead of a notebook-level `args` object.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    # Inference must not depend on a previous call having set eval mode.
    model.eval()

    preds = []
    with torch.no_grad():
        for sample in tqdm(loader, desc=bar_desc):
            images = sample['image']
            if device is not None:
                images = images.to(device)
            distances = get_distances(images, base_embeds, model)

            for sample_distances in distances:
                tmp_df = base_df.copy()
                tmp_df["distance"] = sample_distances
                tmp_df = tmp_df.sort_values(by=["distance", "hotel_id"], ascending=False).reset_index(drop=True)
                preds.append(tmp_df["hotel_id"].unique()[:n_matches])

    return preds


In [None]:
def find_closest_match(args, test_loader, base_loader, model, n_matches=5):
    """Embed the base set, then predict the closest hotels for the test set.

    Args:
        args: not used by this function.
        test_loader: loader over the samples to predict.
        base_loader: loader over the reference samples; its ``dataset.data``
            table is used to map embeddings back to hotels.
        model: embedding network.
        n_matches: number of hotels returned per test sample.

    Returns:
        The list of predictions produced by ``predict``.
    """
    reference_embeds = get_embeds(base_loader, model, "Generating embeds for train")
    # predict() looks the embeddings up under key 0.
    embeds_by_model = {0: reference_embeds}
    reference_table = base_loader.dataset.data

    return predict(test_loader, reference_table, embeds_by_model, model, n_matches, "Generating predictions")

In [None]:

# Sample submission shipped with the competition data.
sample_submission_df = pd.read_csv("../input/hotel-id-to-combat-human-trafficking-2022-fgvc9/sample_submission.csv")
# One row per file in the test image folder, sorted by file name; hotel_id
# starts empty and is filled in with the predictions later.
test_df = pd.DataFrame(data={"image_id": os.listdir(TEST_DATA_FOLDER), "hotel_id": ""}).sort_values(by="image_id")
print(test_df)

In [None]:
# Pipeline for reference (training) images: scale to float and convert to a
# channel-first tensor, with no augmentation.
base_transform = A.Compose([
    A.ToFloat(),
    APT.transforms.ToTensorV2(),
])

# Test-time augmentation variants, keyed by name. Each must be deterministic.
# A scalar `limit` makes A.Rotate sample a random angle from a range, so the
# two rotation entries pin the angle with an explicit (min, max) pair.
test_tta_transforms = {
    "base": A.Compose([A.ToFloat(), APT.transforms.ToTensorV2(),]),
    "h_flip": A.Compose([A.ToFloat(), A.HorizontalFlip(p=1), APT.transforms.ToTensorV2(),]),
    "v_flip": A.Compose([A.ToFloat(), A.VerticalFlip(p=1), APT.transforms.ToTensorV2(),]),
    "rotate+90": A.Compose([A.ToFloat(), A.Rotate(limit=(90, 90), p=1), APT.transforms.ToTensorV2(),]),
    "rotate-90": A.Compose([A.ToFloat(), A.Rotate(limit=(-90, -90), p=1), APT.transforms.ToTensorV2(),]),
#     "rand_bright": A.Compose([A.ToFloat(), A.RandomBrightness(p=1), APT.transforms.ToTensor(),]),
}

def pad_image(img):
    """Pad the shorter side of a 3-D image array with black borders.

    The same border is added on both sides of the shorter axis, each half the
    size difference truncated to an integer, so the result is square up to one
    pixel.

    Args:
        img: array whose shape unpacks into three values
            (rows, columns, channels).

    Returns:
        The padded image from ``cv2.copyMakeBorder``.
    """
    n_rows, n_cols, _ = np.shape(img)
    if n_rows > n_cols:
        # Taller than wide: add columns on the left and right.
        margin = int((n_rows - n_cols) / 2)
        top, bottom, left, right = 0, 0, margin, margin
    else:
        # Wider than tall (or square): add rows on the top and bottom.
        margin = int((n_cols - n_rows) / 2)
        top, bottom, left, right = margin, margin, 0, 0

    return cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=0)
def open_and_preprocess_image(image_path):
    """Read an image file and return it as an RGB ``IMG_SIZE`` x ``IMG_SIZE`` array.

    The image is read with OpenCV, converted from BGR to RGB, padded to a
    square with ``pad_image`` and resized.

    Args:
        image_path: path of the image file.

    Raises:
        FileNotFoundError: if OpenCV cannot read the file.
    """
    img = cv2.imread(image_path)
    # imread signals a missing or undecodable file by returning None; fail
    # with the path instead of an opaque error from the colour conversion.
    if img is None:
        raise FileNotFoundError(f"could not read image: {image_path}")
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = pad_image(img)
    return cv2.resize(img, (IMG_SIZE, IMG_SIZE))

class HotelImageDataset:
    """Map-style dataset that yields images only (no labels) for inference.

    Images from a folder whose path contains ``"test"`` are read with
    ``open_and_preprocess_image``; all others are opened with PIL as they are.

    Args:
        data: table of samples, indexed with ``.iloc``; each row must provide
            ``image_id``.
        transform: optional callable invoked as ``transform(image=array)`` that
            returns a mapping with an ``"image"`` entry. When omitted, the
            untransformed ``uint8`` array is returned.
        data_folder: string prefix concatenated with ``image_id`` to form the
            file path.
    """

    def __init__(self, data, transform=None, data_folder="train_images/"):
        self.data = data
        self.data_folder = data_folder
        self.transform = transform

    def __len__(self):
        """Return the number of rows in the sample table."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``{"image": ...}`` for the row at position ``idx``."""
        record = self.data.iloc[idx]
        image_path = self.data_folder + record["image_id"]

        if "test" in self.data_folder:
            image = np.array(open_and_preprocess_image(image_path)).astype(np.uint8)
        else:
            image = np.array(pil_image.open(image_path)).astype(np.uint8)

        # Without a transform the raw array is returned; previously this path
        # failed because the transformed result was never assigned.
        if self.transform:
            image = self.transform(image=image)["image"]

        return {
            "image" : image,
        }

In [None]:
class args:
    """Inference settings, used as a plain namespace (never instantiated)."""
    batch_size = 32
    num_workers = 4
    device = ('cuda' if torch.cuda.is_available() else 'cpu')
    backbone_name = "ecaresnet50d_pruned"
    embed_size = 4096
    # lr and epochs are only needed to rebuild the optimizer/scheduler whose
    # state load_checkpoint restores.
    lr = 1e-3
    epochs = 9
    # Number of distinct encoded hotel labels; sizes the classifier head.
    n_classes = data_df["hotel_id_code"].nunique()
    
    
    
seed_everything(seed=SEED)

# Reference ("base") set: every row of data_df, read from the training image
# folder in table order (shuffle=False).
base_dataset = HotelImageDataset(data_df, base_transform, data_folder=TRAIN_DATA_FOLDER)
base_loader = DataLoader(base_dataset, num_workers=args.num_workers, batch_size=args.batch_size, shuffle=False)

# Query set: the test images, using only the "base" TTA pipeline.
test_dataset = HotelImageDataset(test_df, test_tta_transforms["base"], data_folder=TEST_DATA_FOLDER)
test_loader = DataLoader(test_dataset, num_workers=args.num_workers, batch_size=args.batch_size, shuffle=False)

In [None]:
model_name = f"cosface-model-{args.backbone_name}-{IMG_SIZE}x{IMG_SIZE}-{args.embed_size}embeds-{args.n_classes}hotels"

model = EmbeddingNet(args.n_classes, args.embed_size, args.backbone_name)
model = model.to(args.device)

# The optimizer and scheduler are not used for inference; they exist only
# because load_checkpoint restores the scheduler state alongside the weights.
optimizer = Lookahead(torch.optim.AdamW(model.parameters(), lr=args.lr), k=3)

scheduler = torch.optim.lr_scheduler.OneCycleLR(
                    optimizer,
                    max_lr=args.lr,
                    epochs=args.epochs,
                    steps_per_epoch=len(test_loader),
                    div_factor=10,
                    final_div_factor=1,
                    pct_start=0.1,
                    anneal_strategy="cos",
                )

model, scheduler, optimizer, last_epoch = load_checkpoint(model, scheduler, optimizer, model_name)

if len(test_df) > 0:
    print("predicting full test set")
    preds = find_closest_match(args, test_loader, base_loader, model, n_matches=5)
    # Join the ids explicitly: the string form of a list of NumPy integers
    # depends on the NumPy version (newer releases print "np.int64(…)"),
    # which would corrupt the space-separated submission format.
    test_df["hotel_id"] = [" ".join(str(hotel_id) for hotel_id in pred) for pred in preds]

test_df.to_csv("submission.csv", index=False)
test_df.head()