# Imports

In [None]:
import sys
sys.path.append('../input/timm-pytorch-image-models/pytorch-image-models-master')
import timm

In [None]:
# Shared Imports
import random
import os
import pathlib
from typing import Iterator, List, Optional, Tuple
import json

import math
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

# model imports
## PyTorch/model stuff
import torch
import torch.nn as nn
from torchmetrics import Accuracy
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, LearningRateMonitor
from pytorch_lightning.loggers import TensorBoardLogger

from sklearn.metrics.pairwise import cosine_similarity

# Pre-processing
import albumentations as A
import albumentations.pytorch as APT
import cv2 
from tqdm import tqdm

# Globals

In [None]:
SEED = 42
# Whether to zero-pad the images (via pad) before they are resized
PAD = True
# Target size handed to cv2.resize for every image
PATCH = (256, 256)
# Default number of distinct hotel ids returned per query image
N_MATCHES = 5

# Set the global random seed through PyTorch Lightning
pl.seed_everything(SEED)

In [None]:
# Directory containing the test images
TEST_DIR = pathlib.Path("../input/hotel-id-to-combat-human-trafficking-2022-fgvc9/test_images")
# Backbone architecture name (handed to get_model as `backbone_name`)
BASE_MODEL = "eca_nfnet_l0"
# Directory holding the model weights, the hotel id/code mapping CSV and the saved embeddings
MODEL_WEIGHTS_DIR = pathlib.Path("../input/experimenten-weights")
# Lightning checkpoint that is restored for inference
MODEL_WEIGHTS = pathlib.Path("../input/experimenten-weights/logs/lightning_logs/version_0/checkpoints/epoch_0009.step_000025139.val-map_0.4332.last.ckpt")
# Pickled DataFrame with the pre-computed base embeddings
BASE_EMB = MODEL_WEIGHTS_DIR / "base_image-embeddings.pkl"

# Loading data

In [None]:
# One row per test image, ordered by file name; "hotel_id" is filled in at submission time
test_image_ids = os.listdir(TEST_DIR)
test_df = pd.DataFrame(data={"image_id": test_image_ids, "hotel_id": ""})
test_df = test_df.sort_values(by="image_id")

# Mapping between hotel codes and hotel ids, stored next to the model weights
mapping_csv = MODEL_WEIGHTS_DIR / 'hotel_id_code_mapping.csv'
hotel_id_code_df = pd.read_csv(mapping_csv)
code_indexed_df = hotel_id_code_df.set_index('hotel_code')
hotel_id_code_map = code_indexed_df.to_dict()["hotel_id"]
hotel_id_code_df.head()

In [None]:
# Preview the first rows of the (still unlabelled) test frame
test_df.head()

## Pre-process

In [None]:
def open_preprocess_img(img_path: pathlib.Path):
    """Read an image from disk and bring it into the model's input format.

    The image is loaded with OpenCV, converted from BGR to RGB, optionally
    padded (when the module-level ``PAD`` flag is set) and finally resized
    to ``PATCH``.

    Args:
        img_path: location of the image file.

    Returns:
        The pre-processed image as returned by ``cv2.resize``.

    Raises:
        OSError: if OpenCV could not read the file (missing or undecodable).
    """
    img = cv2.imread(str(img_path))
    if img is None:
        # cv2.imread reports failure by returning None instead of raising;
        # fail here with the offending path rather than inside cvtColor.
        raise OSError(f"Could not read image: {img_path}")

    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    if PAD:
        img = pad(img)

    return cv2.resize(img, PATCH)

def pad(img):
    """Zero-pad an image to a square, keeping the content centred.

    The shorter of the two spatial axes is extended with zeros until it
    matches the longer one. When the size difference is odd, the extra
    pixel goes to the trailing side (bottom / right).

    NOTE(review): the earlier implementation read ``np.shape`` as
    (width, height) although arrays are laid out (rows, cols), so it added
    the border to the LONGER side and never produced a square. If the
    checkpoint and the stored base embeddings were created with that
    behaviour, re-validate them against this corrected padding.

    Args:
        img: array whose first two axes are (rows, cols); any further axes
            (e.g. channels) are left untouched.

    Returns:
        ``img`` itself when it is already square, otherwise a new
        zero-padded array of the same dtype.
    """
    rows, cols = np.shape(img)[:2]
    if rows == cols:
        return img

    missing = abs(rows - cols)
    before = missing // 2
    after = missing - before  # carries the extra pixel for odd differences

    # Axis 0 (rows) is short for wide images, axis 1 (cols) for tall ones.
    short_axis = 0 if rows < cols else 1
    pad_width = [(0, 0)] * np.ndim(img)
    pad_width[short_axis] = (before, after)
    return np.pad(img, pad_width, mode="constant", constant_values=0)

## Augmentations

In [None]:
# Inference-time pipeline: convert the image to float, then to a torch tensor
base_transform = A.Compose(
    [
        A.ToFloat(),
        APT.transforms.ToTensorV2(),
    ]
)

## Dataloader

In [None]:
class ImageDataset(Dataset):
    """Dataset yielding pre-processed images listed in a DataFrame.

    Each item is looked up positionally in ``data``; its ``"image_id"``
    value is joined onto ``data_path`` and loaded with
    ``open_preprocess_img``. When a transform is set, it is called as
    ``transform(image=...)`` and the ``"image"`` entry of its result is
    returned.
    """

    def __init__(self,
                 data: pd.DataFrame,
                 data_path: pathlib.Path,
                 transform: Optional = None,
                ):
        self.data = data
        self.data_path = data_path
        self.transform = transform

    def __len__(self) -> int:
        """Number of rows (images) in the underlying frame."""
        return len(self.data)

    def __getitem__(self, idx: int):
        """Load, pre-process and (optionally) transform the image at ``idx``."""
        image_id = self.data.iloc[idx]["image_id"]

        pixels = open_preprocess_img(self.data_path / image_id)
        pixels = np.array(pixels).astype(np.uint8)

        # Without a transform the raw uint8 array is handed back unchanged
        if not self.transform:
            return pixels

        return self.transform(image=pixels)["image"]

In [None]:
class args:
    """Plain namespace of run settings shared by the loader and the helpers."""
    # Worker processes used by the DataLoader
    num_workers = 2
    # Number of distinct hotel ids in the id/code mapping
    n_classes = hotel_id_code_df["hotel_id"].nunique()
    # Use the GPU when torch can see one, otherwise fall back to the CPU
    if torch.cuda.is_available():
        device = 'cuda'
    else:
        device = 'cpu'

# Test images are served one per batch, in the row order of test_df
test_dataset = ImageDataset(data=test_df, data_path=TEST_DIR, transform=base_transform)
test_loader = DataLoader(
    test_dataset,
    batch_size=1,
    shuffle=False,
    num_workers=args.num_workers,
)

# Model

In [None]:
# source: https://github.com/ronghuaiyang/arcface-pytorch/blob/master/models/metrics.py
class ArcMarginProduct(nn.Module):
    r"""Large-margin arc distance head: ``s * cos(theta + m)`` for the target class.

    Inputs and class weights are L2-normalised, so their product is the
    cosine of the angle ``theta`` between them. For each sample, the logit
    of its labelled class is replaced by ``cos(theta + m)`` (or a fallback,
    see ``forward``); all logits are then scaled by ``s``.

    Args:
        in_features: size of each input sample
        out_features: size of each output sample (number of classes)
        s: scale applied to the final logits
        m: additive angular margin
        easy_margin: if True, the margin is applied only where
            ``cos(theta) > 0`` and the plain cosine is used elsewhere
    """
    def __init__(self, in_features, out_features, s=30.0, m=0.50, easy_margin=False):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.s = s
        self.m = m
        # torch.empty replaces the legacy torch.FloatTensor constructor; the
        # values are overwritten by the Xavier initialisation right below.
        self.weight = nn.Parameter(torch.empty(out_features, in_features))
        nn.init.xavier_uniform_(self.weight)

        self.easy_margin = easy_margin
        # Pre-computed constants for cos(theta + m) = cos*cos_m - sin*sin_m
        self.cos_m = math.cos(m)
        self.sin_m = math.sin(m)
        # Threshold on cos(theta) below which theta + m would exceed pi,
        # and the linear penalty that is used instead in that region.
        self.th = math.cos(math.pi - m)
        self.mm = math.sin(math.pi - m) * m

    def forward(self, input, label):
        """Return margin-adjusted, scaled logits.

        Args:
            input: batch of feature vectors, one row per sample.
                (The name shadows the builtin but is kept for
                keyword-argument compatibility.)
            label: class index per sample; reshaped to a column and cast
                to long before use.

        Returns:
            Tensor shaped like the cosine matrix (samples x out_features),
            on the same device and with the same dtype as that matrix.
        """
        # --------------------------- cos(theta) & phi(theta) ---------------------------
        cosine = F.linear(F.normalize(input), F.normalize(self.weight))
        # clamp guards against tiny negative values from rounding before the sqrt
        sine = torch.sqrt((1.0 - torch.pow(cosine, 2)).clamp(0, 1))
        phi = cosine * self.cos_m - sine * self.sin_m
        if self.easy_margin:
            phi = torch.where(cosine > 0, phi, cosine)
        else:
            phi = torch.where(cosine > self.th, phi, cosine - self.mm)
        # --------------------------- convert label to one-hot ---------------------------
        # Allocate alongside `cosine` instead of hard-coding 'cuda', so the
        # layer also runs on CPU-only machines.
        one_hot = torch.zeros_like(cosine)
        one_hot.scatter_(1, label.view(-1, 1).long(), 1)
        # Target class gets phi, every other class keeps the plain cosine
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output *= self.s

        return output

In [None]:
class HotelModel(pl.LightningModule):
    """Image-embedding network: timm backbone -> top model -> optional ArcFace head.

    ``forward`` runs the backbone, flattens its output, and maps it through
    ``top_model`` to a vector of ``n_embeddings`` values. When targets are
    supplied, that vector is additionally passed through ``arc_face`` to
    obtain margin-adjusted class logits.

    Args:
        n_hotels: number of classes of the ArcFace head.
        steps_per_epoch: forwarded to the OneCycleLR scheduler.
        n_embeddings: size of the embedding produced by ``top_model``.
        base_model: model name handed to ``timm.create_model``.
        pretrained: accepted, but not forwarded to ``timm.create_model``
            (the backbone is always created with ``pretrained=False``).
        learning_rate: learning rate of AdamW and ``max_lr`` of the scheduler.
    """
    def __init__(self,
                n_hotels: int,
                steps_per_epoch: int,
                n_embeddings: int = 256,
                base_model = None,
                pretrained: bool = False,
                learning_rate: float = 0.003,
                
                ):
        super().__init__()
        
        # Hyperparams
        self.n_embeddings = n_embeddings
        self.n_hotels = n_hotels
        self.learning_rate = learning_rate
        self.steps_per_epoch = steps_per_epoch
        
        # Metrics
        self.loss_fn = nn.CrossEntropyLoss()
        self.train_acc = Accuracy()
        self.val_acc = Accuracy()
        
        # Model Definition 
        ## Base model
        # NOTE: the `pretrained` argument is ignored here; weights are expected
        # to come from a checkpoint instead.
        self.base_model = timm.create_model(base_model, pretrained=False)        
        # NOTE(review): in_features is not referenced again in the visible code.
        in_features = self.base_model.get_classifier().in_features
        
        # Swap the classifier for an Identity only when the LAST registered
        # sub-module carries one of the known names; otherwise the backbone is
        # left untouched (the fallback error below is disabled).
        fc_name, _ = list(self.base_model.named_modules())[-1]
        if fc_name == 'classifier':
            self.base_model.classifier = nn.Identity()
        elif fc_name == 'head.fc':
            self.base_model.head.fc = nn.Identity()
        elif fc_name == 'fc':
            self.base_model.fc = nn.Identity()
        #else:
            #raise Exception("unknown classifier layer: " + fc_name)
        
        ## Arcface module
        self.arc_face = ArcMarginProduct(self.n_embeddings, n_hotels, s=30.0, m=0.20, easy_margin=False)
        
        ## Top model
        # NOTE(review): the first Linear expects 1000 input features —
        # presumably the backbone's unmodified classifier output when none of
        # the branches above matched. Confirm before changing: the checkpoint
        # is loaded with strict=True, so layer shapes must stay as they are.
        self.top_model = nn.Sequential(
            nn.utils.weight_norm(nn.Linear(1000, self.n_embeddings*2), dim=None),
            nn.BatchNorm1d(self.n_embeddings*2),
            nn.Dropout(0.2),
            nn.utils.weight_norm(nn.Linear(self.n_embeddings*2, self.n_embeddings)),
            nn.BatchNorm1d(self.n_embeddings),
        )
        
        # Save hyper params
        self.save_hyperparameters()

    def configure_optimizers(self):
        """Build a Lookahead-wrapped AdamW optimizer with a one-cycle LR schedule.

        NOTE(review): `Lookahead` and `EPOCHS` are not defined in the visible
        part of this file, so calling this method here would presumably raise
        NameError. The inference code shown does not call it directly.
        """
        optimizer = Lookahead(torch.optim.AdamW(self.parameters(), lr=self.learning_rate), k=3)
        
        scheduler = torch.optim.lr_scheduler.OneCycleLR(
                    optimizer,
                    max_lr=self.learning_rate,
                    epochs=EPOCHS,
                    steps_per_epoch=self.steps_per_epoch,
                    div_factor=10,
                    final_div_factor=1,
                    pct_start=0.1,
                    anneal_strategy="cos",
                )
        
        schedule = {
            # Required: the scheduler instance.
            "scheduler": scheduler,
        }
        return [optimizer], [schedule]
    
    def forward(self, x, targets = None):
        """Return embeddings, or ArcFace logits when ``targets`` is given."""
        y_hat = self.base_model(x)
        # Flatten everything after the batch dimension
        y_hat = y_hat.view(y_hat.size(0), -1)
        y_hat = self.top_model(y_hat)
        
        # The margin head needs the labels, so it only runs when they are supplied
        if targets is not None:
            y_hat = self.arc_face(y_hat, targets)

        return y_hat

    def training_step(self, batch, batch_idx):
        """Compute the cross-entropy loss on ArcFace logits for one (x, y) batch."""
        x, y = batch
        
        # Forward pass
        y_hat = self.forward(x, y)
        loss = self.loss_fn(y_hat, y)
        # Update the running training accuracy with the margin-adjusted logits
        self.train_acc(y_hat, y)

        # Store results
        self.log("train_loss", loss, prog_bar=False)
        
        return loss
    
    def training_epoch_end(self, train_step_outputs) -> None:
        """Log the training accuracy metric under "train_acc"."""
        # Log metrics
        self.log("train_acc", self.train_acc, prog_bar=True)

    def validation_step(self, batch, batch_idx):
        """Log the validation loss for one (x, y) batch and return the logits."""
        x, y = batch
        
        # Forward pass
        y_hat = self.forward(x, y)
        loss = self.loss_fn(y_hat, y)
        # Update the running validation accuracy
        self.val_acc(y_hat, y)

        # Store results
        self.log("val_loss", loss, prog_bar=False)
        return y_hat
        
    def validation_epoch_end(self, validation_step_outputs) -> None:
        """Log the validation accuracy metric under "val_acc"."""
        self.log("val_acc", self.val_acc, prog_bar=True)
        
    def predict_step(self, batch, batch_idx):
        """Return embeddings for a batch (forward is called without targets)."""
        y_hat = self.forward(batch)
        return y_hat

In [None]:
def get_model(model_type, backbone_name, checkpoint_path, args):
    """Restore a ``HotelModel`` from a Lightning checkpoint, mapped to the CPU.

    Only ``checkpoint_path`` is used; ``model_type``, ``backbone_name`` and
    ``args`` are accepted but not read. The checkpoint is loaded strictly.
    """
    return HotelModel.load_from_checkpoint(
        checkpoint_path,
        strict=True,
        map_location='cpu',
    )

In [None]:
# Restore the trained network from the checkpoint
model = get_model(
    model_type="classification",
    backbone_name=BASE_MODEL,
    checkpoint_path=MODEL_WEIGHTS,
    args=args,
)

## Helper Functions

In [None]:
def generate_embeddings(args, loader, model, bar_desc="Generating embeds"):
    """Run ``model`` over every batch of ``loader`` and collect the outputs.

    The model is moved to ``args.device`` and switched to eval mode; each
    batch is moved to the same device and evaluated without gradients.

    Returns:
        A flat list with one NumPy array per sample, in loader order.
    """
    model = model.to(args.device)
    model.eval()

    embeddings = []
    with torch.no_grad():
        for batch in tqdm(loader, desc=bar_desc):
            batch_out = model(batch.to(args.device))
            # extend() splits the batch array into its per-sample rows
            embeddings.extend(batch_out.detach().cpu().numpy())

    return embeddings

In [None]:
def find_matches(query, base_embeds, base_targets, k=N_MATCHES):
    """Return the ``k`` best-matching distinct hotel ids for one query embedding.

    Args:
        query: embedding vector of the query image.
        base_embeds: iterable of base embedding vectors.
        base_targets: hotel id for each entry of ``base_embeds``.
        k: maximum number of distinct hotel ids to return.

    Returns:
        Array of up to ``k`` unique hotel ids, most similar first.
    """
    ranking = pd.DataFrame(index=np.arange(len(base_targets)), data={"hotel_id": base_targets})
    # Cosine SIMILARITY of the query to every base embedding (higher = closer)
    ranking["similarity"] = cosine_similarity([query], list(base_embeds))[0]
    # Most similar first; equal similarities are ordered by descending hotel_id
    ranking = ranking.sort_values(by=["similarity", "hotel_id"], ascending=False).reset_index(drop=True)
    # unique() keeps first occurrences in order, i.e. each hotel's best rank.
    # Honour the caller's k rather than the module-level default.
    return ranking["hotel_id"].unique()[:k]


def predict(args, base_embeddings_df, test_loader, model):
    """Embed every test image and look up its best-matching hotel ids.

    Returns:
        A list with one ``find_matches`` result per test embedding, in
        loader order.
    """
    test_embeds = generate_embeddings(args, test_loader, model, "Generate test embeddings")

    # Pull the reference arrays out once; they are the same for every query
    reference_embeds = base_embeddings_df["embeddings"].values
    reference_ids = base_embeddings_df["hotel_id"].values

    return [
        find_matches(query_embeds, reference_embeds, reference_ids)
        for query_embeds in tqdm(test_embeds, desc="Similarity - match finding")
    ]

## Load base embeddings

In [None]:
# NOTE(review): pd.read_pickle unpickles arbitrary Python objects — only load
# BASE_EMB from a trusted source.
# The frame is later read through its "embeddings" and "hotel_id" columns.
base_embeddings_df = pd.read_pickle(BASE_EMB)
display(base_embeddings_df.head())

# Submission

In [None]:
preds = predict(args, base_embeddings_df, test_loader, model)
# Join each row of predicted hotel ids into one space-separated string.
# Formatting every id with str() avoids relying on the repr of a Python list,
# which embeds type wrappers for NumPy scalars on NumPy >= 2.
test_df["hotel_id"] = [" ".join(str(hotel_id) for hotel_id in matches) for matches in preds]

test_df.to_csv("submission.csv", index=False)
test_df.head()