# Intro



This notebook is based on a starter notebook made by user [Michaln](https://www.kaggle.com/michaln), for the [Hotel-ID to Combat Human Trafficking 2022 - FGVC9](https://www.kaggle.com/competitions/hotel-id-to-combat-human-trafficking-2022-fgvc9) competition. 

- Starter Training notebook: [Hotel-ID starter - similarity - training](https://www.kaggle.com/code/michaln/hotel-id-starter-similarity-training)
- Starter Inference notebook: [Hotel-ID starter - similarity - inference](https://www.kaggle.com/code/michaln/hotel-id-starter-similarity-inference)
- Another starter notebook: [Hotel-ID starter - classification - traning](https://www.kaggle.com/code/michaln/hotel-id-starter-classification-traning)
- With its inference part: [Hotel-ID starter - classification - inference](https://www.kaggle.com/code/michaln/hotel-id-starter-classification-inference)

## Basic Overview
A pre-trained network generates an embedding for each input image. This embedding is compared, using cosine similarity, against the embeddings of all known hotel images, and the 5 closest distinct hotels are chosen for the MAP@5 score. We use ArcMargin to push apart the embeddings of different classes. Additionally, we use various data augmentation techniques to diversify the training data.


## Data
This notebook uses preprocessed images that were resized and padded to 512x512 pixels.

Dataset used: [Hotel-ID 2022 train images 512x512](https://www.kaggle.com/datasets/michaln/hotelid-2022-train-images-512x512), created by a notebook from [Michaln](https://www.kaggle.com/michaln).

# Imports

In [None]:
!pip install timm

In [None]:
!pip install git+https://github.com/ufoym/imbalanced-dataset-sampler.git
from torchsampler import ImbalancedDatasetSampler

In [None]:
import numpy as np
import pandas as pd
import random
import os

In [None]:
from sklearn.metrics import accuracy_score
from PIL import Image as pil_image
from tqdm import tqdm

import matplotlib
import matplotlib.pyplot as plt

import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import timm
from timm.optim import Lookahead

from sklearn.metrics.pairwise import cosine_similarity

# Global

In [None]:
# Side length (pixels) of the square input images; also sizes the augmentations
# and the dummy tensor used to probe the backbone output width.
IMG_SIZE = 512
# Seed handed to seed_everything() and to the validation sampling.
SEED = 42
# Number of candidate hotels kept per query when scoring (top-N).
N_MATCHES = 5

# Kaggle input locations: competition data and the pre-resized training images.
PROJECT_FOLDER = "../input/hotel-id-to-combat-human-trafficking-2022-fgvc9/"
DATA_FOLDER = "../input/hotelid-2022-train-images-512x512/"
IMAGE_FOLDER = DATA_FOLDER + "images/"
# Prefix for every file written by this notebook ("" = current working directory).
OUTPUT_FOLDER = ""

# NOTE(review): train_and_validate() builds its own local `train_df`; this
# module-level frame appears unused in the code visible here - confirm before removing.
train_df = pd.read_csv(os.path.join(DATA_FOLDER, 'train.csv'))

In [None]:
# Sanity check: show what the competition input folder contains.
print(os.listdir(PROJECT_FOLDER))

In [None]:
def seed_everything(seed):
    """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN kernels.

    Args:
        seed: integer seed applied to every generator.
    """
    # Exported as a string because environment variables are text.
    os.environ['PYTHONHASHSEED'] = str(seed)

    # The generators are independent of each other, so they can be seeded in a loop.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for seed_fn in seeders:
        seed_fn(seed)

    torch.backends.cudnn.deterministic = True

# Dataset and transformations

Coarse dropout with fill_value=(255,0,0) (a solid red patch) is used to simulate occlusions like the ones in the test dataset. 
```python
A.CoarseDropout(p=1., max_holes=1, 
                min_height=IMG_SIZE//4, max_height=IMG_SIZE//2,
                min_width=IMG_SIZE//4,  max_width=IMG_SIZE//2, 
                fill_value=(255,0,0))
```

Peculiarly, we found that applying the random brightness augmentation after the dropout did not make the results better or significantly worse.  

In [None]:
import albumentations as A
import albumentations.pytorch as APT
import cv2 

# NOTE(review): the keyword names below (height/width, max_holes, min_height, ...)
# follow the albumentations API this notebook was written against; newer releases
# may have renamed some of them - confirm against the installed version.

# used for training dataset - augmentations and occlusions
# Order matters: geometric augmentations first, then the two dropouts, then
# brightness/contrast (so it also alters the red occlusion patch), and finally
# conversion to a float tensor.
train_transform = A.Compose([
    A.RandomResizedCrop(height=IMG_SIZE, width=IMG_SIZE, scale=(0.25, 1.0), p=0.75),
    A.HorizontalFlip(p=0.75),
    A.VerticalFlip(p=0.25),
    A.ShiftScaleRotate(p=0.5, border_mode=cv2.BORDER_CONSTANT),
    A.OpticalDistortion(p=0.25),
    A.Perspective(p=0.25),
    A.CoarseDropout(p=0.5, min_holes=1, max_holes=6, 
                    min_height=IMG_SIZE//16, max_height=IMG_SIZE//4,
                    min_width=IMG_SIZE//16,  max_width=IMG_SIZE//4), # normal coarse dropout
    
    # one large red rectangle, between 1/4 and 1/2 of the image side in each dimension
    A.CoarseDropout(p=0.75, max_holes=1, 
                    min_height=IMG_SIZE//4, max_height=IMG_SIZE//2,
                    min_width=IMG_SIZE//4,  max_width=IMG_SIZE//2, 
                    fill_value=(255,0,0)),# simulating occlusions in test data

    A.RandomBrightnessContrast(p=0.75),
    # presumably rescales uint8 pixels to floats in [0, 1] - see albumentations ToFloat
    A.ToFloat(),
    # presumably converts the HWC array to a CHW torch tensor - see ToTensorV2
    APT.transforms.ToTensorV2(),
])

# used for validation dataset - only occlusions
# Same red occlusion patch as in training, but no other augmentation. The patch is
# random, so validation inputs differ from run to run unless the RNGs are seeded.
val_transform = A.Compose([
    A.CoarseDropout(p=0.75, max_holes=1, 
                    min_height=IMG_SIZE//4, max_height=IMG_SIZE//2,
                    min_width=IMG_SIZE//4,  max_width=IMG_SIZE//2, 
                    fill_value=(255,0,0)),# simulating occlusions
    A.ToFloat(),
    APT.transforms.ToTensorV2(),
])

# no augmentations
# Used for the "base" images whose embeddings form the similarity-search index.
base_transform = A.Compose([
    A.ToFloat(),
    APT.transforms.ToTensorV2(),
])

In [None]:
class HotelTrainDataset(Dataset):
    """Dataset yielding hotel images together with their encoded hotel id.

    Args:
        data: DataFrame with an ``image_id`` column (file name relative to
            ``data_path``) and a ``hotel_id_code`` column (integer class label).
        transform: optional callable invoked as ``transform(image=array)`` that
            returns a mapping with an ``"image"`` entry (the way the
            albumentations pipelines in this notebook are called).
        data_path: directory that contains the image files.
    """

    def __init__(self, data, transform=None, data_path="train_images/"):
        self.data = data
        self.data_path = data_path
        self.transform = transform

    def __len__(self):
        """Return the number of rows (images) in the dataset."""
        return len(self.data)

    def __getitem__(self, idx):
        """Load, optionally transform and return the sample at position ``idx``.

        Returns:
            dict with ``"image"`` (transformed image, or an HxWx3 uint8 array when
            no transform is set) and ``"target"`` (the row's ``hotel_id_code``).
        """
        record = self.data.iloc[idx]
        image_path = os.path.join(self.data_path, record["image_id"])

        # Open inside a context manager so the file handle is always released,
        # and force 3-channel RGB: grayscale / RGBA / CMYK files would otherwise
        # produce arrays the 3-channel backbone and the RGB fill_value of the
        # occlusion augmentation cannot handle.
        with pil_image.open(image_path) as handle:
            image = np.asarray(handle.convert("RGB"), dtype=np.uint8)

        if self.transform:
            image = self.transform(image=image)["image"]

        return {
            "image": image,
            "target": record['hotel_id_code'],
        }

    def get_labels(self):
        """Return every sample's label as a list (passed to ImbalancedDatasetSampler)."""
        return list(self.data.loc[:, 'hotel_id_code'])

# Model

Includes the ArcMargin class.

In [None]:
# from https://github.com/ronghuaiyang/arcface-pytorch
import math
import torch.nn.functional as F

class ArcMarginProduct(nn.Module):
    r"""Additive angular margin (ArcFace) classification head.

    Produces ``s * cos(theta + m)`` for the target class and ``s * cos(theta)``
    for every other class, where ``theta`` is the angle between the
    L2-normalised input and the L2-normalised class weight vector.

    Args:
        in_features: size of each input sample (embedding width)
        out_features: size of each output sample (number of classes)
        s: scale applied to the final cosine logits
        m: angular margin in radians added to the target-class angle
        easy_margin: if True, apply the margin only where cos(theta) > 0 and
            fall back to the plain cosine elsewhere
    """
    def __init__(self, in_features, out_features, s=30.0, m=0.50, easy_margin=False):
        super(ArcMarginProduct, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.s = s
        self.m = m
        # One weight row per class; values are filled in by the Xavier init below.
        self.weight = nn.Parameter(torch.empty(out_features, in_features))
        nn.init.xavier_uniform_(self.weight)

        self.easy_margin = easy_margin
        self.cos_m = math.cos(m)
        self.sin_m = math.sin(m)
        # Beyond this cosine, theta + m would exceed pi; the forward pass then
        # uses the linear penalty ``cosine - mm`` instead of cos(theta + m).
        self.th = math.cos(math.pi - m)
        self.mm = math.sin(math.pi - m) * m

    def forward(self, input, label):
        """Return margin-adjusted, scaled logits.

        Args:
            input: 2-D tensor of embeddings, one row per sample.
            label: tensor with one integer class index per sample.

        Returns:
            Tensor with one row per sample and ``out_features`` columns, on the
            same device as ``input``.
        """
        # --------------------------- cos(theta) & phi(theta) ---------------------------
        cosine = F.linear(F.normalize(input), F.normalize(self.weight))
        sine = torch.sqrt((1.0 - torch.pow(cosine, 2)).clamp(0, 1))
        # cos(theta + m) = cos(theta)cos(m) - sin(theta)sin(m)
        phi = cosine * self.cos_m - sine * self.sin_m
        if self.easy_margin:
            phi = torch.where(cosine > 0, phi, cosine)
        else:
            phi = torch.where(cosine > self.th, phi, cosine - self.mm)
        # --------------------------- convert label to one-hot ---------------------------
        # Allocate on the logits' own device instead of a hard-coded 'cuda', so the
        # head also works on CPU-only machines.
        one_hot = torch.zeros(cosine.size(), device=cosine.device)
        one_hot.scatter_(1, label.view(-1, 1).long().to(cosine.device), 1)
        # Margin logit for the target class, plain cosine for all other classes.
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output *= self.s

        return output

In [None]:
class EmbeddingModel(nn.Module):
    """timm backbone followed by a projection head and an ArcMargin classifier.

    Args:
        n_classes: number of classes of the ArcMargin head.
        embedding_size: width of the embedding returned when no targets are given.
        backbone_name: timm model name; created with ``num_classes=0`` and
            pretrained weights.
    """

    def __init__(self, n_classes=100, embedding_size=64, backbone_name="efficientnet_b0"):
        super(EmbeddingModel, self).__init__()
        self.embed_size = embedding_size
        self.backbone = timm.create_model(backbone_name, num_classes=0, pretrained=True)
        in_features = self._probe_backbone_width()

        self.arcface = ArcMarginProduct(embedding_size, n_classes, m=0.2)

        # backbone features -> 2 * embed_size -> embed_size
        self.post = nn.Sequential(
            nn.utils.weight_norm(nn.Linear(in_features, self.embed_size*2), dim=None),
            nn.BatchNorm1d(self.embed_size*2),
            nn.Dropout(0.2),
            nn.utils.weight_norm(nn.Linear(self.embed_size*2, self.embed_size)),
            nn.BatchNorm1d(self.embed_size),
        )

    def _probe_backbone_width(self):
        """Return the backbone's output width by pushing one dummy image through it."""
        # Run the probe in eval mode and without autograd: in training mode a
        # random-noise batch would update any BatchNorm running statistics of the
        # pretrained backbone, and an unused autograd graph would be built.
        was_training = self.backbone.training
        self.backbone.eval()
        with torch.no_grad():
            probe = self.backbone(torch.randn(1, 3, IMG_SIZE, IMG_SIZE))
        self.backbone.train(was_training)
        return probe.shape[1]

    def forward(self, x, targets = None):
        """Return embeddings, or ArcMargin logits when ``targets`` is given.

        Args:
            x: batch of images fed to the backbone.
            targets: optional class indices; when provided, the embeddings are
                passed through the ArcMargin head together with the targets.

        Returns:
            Embeddings of width ``embed_size`` if ``targets`` is None, otherwise
            the output of the ArcMargin head.
        """
        x = self.backbone(x)
        x = x.view(x.size(0), -1)
        x = self.post(x)
        if targets is not None:
            x = self.arcface(x, targets)
        return x

# Model helper functions

In [None]:
# method to iterate loader and generate embeddings of images
# returns embeddings and image class
def generate_embeddings(loader, model, bar_desc="Generating embeds"):
    """Run ``model`` over ``loader`` and collect its outputs and the sample targets.

    The model is switched to eval mode (and left in it) and called without
    targets, so it returns embeddings rather than classification logits.

    Args:
        loader: iterable of batches, each a mapping with ``'image'`` and
            ``'target'`` tensors.
        model: module called as ``model(images)``.
        bar_desc: description shown on the progress bar.

    Returns:
        Tuple ``(outputs, targets)`` of float32 numpy arrays, in loader order.
    """
    # Use the device the model actually lives on rather than a global config
    # object, so the function works wherever it is called from.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    targets_all = []
    outputs_all = []

    model.eval()
    with torch.no_grad():
        for sample in tqdm(loader, desc=bar_desc):
            images = sample['image'].to(device)
            output = model(images)

            # Targets are only collected, never used on the device.
            targets_all.extend(sample['target'].cpu().numpy())
            outputs_all.extend(output.detach().cpu().numpy())

    targets_all = np.array(targets_all).astype(np.float32)
    outputs_all = np.array(outputs_all).astype(np.float32)

    return outputs_all, targets_all

In [None]:
def save_checkpoint(model, scheduler, optimizer, epoch, name, loss=None, score=None):
    """Write model, scheduler and optimizer state to ``checkpoint-<name>.pt``.

    Args:
        model: module whose ``state_dict()`` is stored under ``"model"``.
        scheduler: LR scheduler whose ``state_dict()`` is stored under ``"scheduler"``.
        optimizer: optimizer whose ``state_dict()`` is stored under ``"optimizer"``.
        epoch: epoch number stored under ``"epoch"``.
        name: suffix used in the output file name.
        loss: optional loss value stored for reference.
        score: optional score value stored for reference.
    """
    # Store plain Python floats: callers pass numpy scalars (e.g. np.mean(...)),
    # and those are not accepted by torch.load's weights-only unpickler, which
    # is the default in recent PyTorch releases.
    checkpoint = {"epoch": epoch,
                  "model": model.state_dict(),
                  "scheduler": scheduler.state_dict(),
                  "optimizer": optimizer.state_dict(),
                  "loss": None if loss is None else float(loss),
                  "score": None if score is None else float(score),
                  }

    torch.save(checkpoint, f"{OUTPUT_FOLDER}checkpoint-{name}.pt")


def load_checkpoint(model, scheduler, name, optimizer=None):
    """Restore model and scheduler (and optionally optimizer) state from a file.

    Args:
        model: module that receives the stored ``"model"`` state dict.
        scheduler: LR scheduler that receives the stored ``"scheduler"`` state dict.
        name: path of the checkpoint file.
        optimizer: optional optimizer; when given and the checkpoint holds an
            ``"optimizer"`` entry, its state is restored too. Pass it after the
            model has been moved to its final device, since the optimizer state
            is matched to the parameters at load time.

    Returns:
        Tuple ``(model, scheduler, epoch)`` where ``epoch`` is the stored epoch.
    """
    # Load onto the CPU first so a checkpoint written on a GPU machine can also
    # be opened on a CPU-only one; load_state_dict copies the values into the
    # existing parameters whatever device those live on.
    checkpoint = torch.load(name, map_location="cpu")

    model.load_state_dict(checkpoint["model"])
    scheduler.load_state_dict(checkpoint["scheduler"])
    if optimizer is not None and "optimizer" in checkpoint:
        optimizer.load_state_dict(checkpoint["optimizer"])
    return model, scheduler, checkpoint["epoch"]

# Train and validation functions

In [None]:
def train_epoch(args, model, loader, criterion, optimizer, scheduler, epoch):
    """Train ``model`` for one pass over ``loader``.

    The model is called with the targets, so the logits it returns (and the
    running accuracy shown on the progress bar) are those produced by the
    target-dependent branch of the model's forward pass.

    Args:
        args: configuration object; ``args.device`` and ``args.epochs`` are read.
        model: module called as ``model(images, targets)``.
        loader: iterable of batches with ``'image'`` and ``'target'`` tensors.
        criterion: loss called as ``criterion(outputs, targets)``.
        optimizer: optimizer stepped once per batch.
        scheduler: optional LR scheduler, stepped once per batch when given.
        epoch: current epoch number (only used for the progress-bar text).

    Returns:
        Tuple ``(mean_loss, accuracy)`` over the batches seen in this epoch.
    """
    losses = []
    n_correct = 0
    n_seen = 0
    score = 0.0  # defined up front so an empty loader does not raise

    model.train()
    progress = tqdm(loader)

    for sample in progress:
        optimizer.zero_grad()

        images = sample['image'].to(args.device)
        targets = sample['target'].to(args.device)

        outputs = model(images, targets)
        loss = criterion(outputs, targets)

        loss.backward()
        optimizer.step()

        if scheduler:
            scheduler.step()

        loss_value = loss.item()
        losses.append(loss_value)

        # Keep running counts instead of storing every logit of the epoch and
        # re-running argmax over all of them at each step. The argmax is taken on
        # the raw logits: a sigmoid is monotonic, but in float32 it saturates to
        # exactly 1.0 for large logits and would create artificial ties.
        predictions = outputs.detach().argmax(dim=1)
        n_correct += (predictions == targets).sum().item()
        n_seen += targets.size(0)

        score = n_correct / n_seen
        progress.set_description(f"Epoch {epoch}/{args.epochs} - Train loss:{loss_value:0.4f}, score: {score:0.4f}")

    mean_loss = np.mean(losses) if losses else float("nan")
    return mean_loss, score

In [None]:
def test_classification(loader, model):
    """Evaluate the model's classification output on ``loader``.

    The model is called with the targets, exactly as during training. The value
    printed as "MAP@5" is the fraction of samples whose target is among the
    top ``N_MATCHES`` predicted classes.

    Args:
        loader: iterable of batches with ``'image'`` and ``'target'`` tensors.
        model: module called as ``model(images, targets)``.

    Returns:
        The top-``N_MATCHES`` hit rate described above.
    """
    # Use the device the model lives on rather than a global config object.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    top1_hits = []
    topk_hits = []

    model.eval()
    with torch.no_grad():
        for sample in tqdm(loader, desc="Classification"):
            images = sample['image'].to(device)
            targets = sample['target'].to(device)

            outputs = model(images, targets)

            # Rank on the raw logits: a sigmoid keeps the order in theory, but in
            # float32 it saturates to exactly 1.0 for large logits, which creates
            # ties that would then be resolved by class index.
            logits = outputs.cpu().numpy()
            labels = targets.cpu().numpy()

            # Reduce each batch immediately so only per-sample booleans are kept.
            top_k = np.argsort(-logits, axis=1)[:, :N_MATCHES]
            topk_hits.extend((top_k == labels[:, None]).any(axis=1))
            top1_hits.extend(np.argmax(logits, axis=1) == labels)

    # check if any of top 5 predictions are correct and calculate mean accuracy
    acc_top_5 = np.mean(topk_hits)
    # calculate prediction accuracy
    acc_top_1 = np.mean(top1_hits)

    print(f"Classification accuracy: {acc_top_1:0.4f}, MAP@5: {acc_top_5:0.4f}")
    return acc_top_5

In [None]:
# find the k most similar images from different hotels and return their hotel_id_code
def find_matches(query, base_embeds, base_targets, k=N_MATCHES):
    """Return the codes of the ``k`` distinct hotels closest to ``query``.

    Args:
        query: 1-D embedding of the query image.
        base_embeds: 2-D array with one reference embedding per row.
        base_targets: hotel code of each row of ``base_embeds``.
        k: maximum number of distinct hotel codes to return.

    Returns:
        Array of at most ``k`` unique hotel codes, most similar first. It is
        shorter than ``k`` when the reference set holds fewer distinct hotels.
    """
    ranking = pd.DataFrame(index=np.arange(len(base_targets)), data={"hotel_id_code": base_targets})
    # The column is named "distance" but holds cosine *similarity*
    # (higher = closer), hence the descending sort below.
    ranking["distance"] = cosine_similarity([query], base_embeds)[0]
    # sort by similarity, breaking ties by hotel code
    ranking = ranking.sort_values(by=["distance", "hotel_id_code"], ascending=False).reset_index(drop=True)
    # unique() keeps first-appearance order, i.e. the best match of each hotel;
    # honour the caller's k instead of the module constant.
    return ranking["hotel_id_code"].unique()[:k]
    

def test_similarity(args, base_loader, test_loader, model):
    """Evaluate retrieval quality by matching test embeddings to base embeddings.

    For every test image the ``N_MATCHES`` most similar distinct hotels are looked
    up among the base embeddings. The value printed as "MAP@5" is the fraction of
    test images whose true hotel is among those matches.

    Args:
        args: configuration object (not read here; kept for a uniform signature).
        base_loader: loader over the reference images.
        test_loader: loader over the query images.
        model: embedding model passed to ``generate_embeddings``.

    Returns:
        The top-``N_MATCHES`` hit rate described above.
    """
    base_embeds, base_targets = generate_embeddings(base_loader, model, "Generate base embeddings")
    test_embeds, test_targets = generate_embeddings(test_loader, model, "Generate test embeddings")

    rows = []
    for query_embeds in tqdm(test_embeds, desc="Similarity - match finding"):
        matches = np.asarray(find_matches(query_embeds, base_embeds, base_targets), dtype=np.float32)
        # Pad with -1 (never a valid hotel code) so every row has N_MATCHES
        # entries even when fewer distinct hotels are available; otherwise the
        # rows could not be stacked into a rectangular array.
        row = np.full(N_MATCHES, -1, dtype=np.float32)
        row[:len(matches)] = matches[:N_MATCHES]
        rows.append(row)

    preds = np.array(rows, dtype=np.float32).reshape(-1, N_MATCHES)
    # check if any of top 5 predictions are correct and calculate mean accuracy
    acc_top_5 = (preds == test_targets[:, None]).any(axis=1).mean()
    # calculate prediction accuracy
    acc_top_1 = np.mean(test_targets == preds[:, 0])
    print(f"Similarity accuracy: {acc_top_1:0.4f}, MAP@5: {acc_top_5:0.4f}")
    # Returned for parity with test_classification; existing callers may ignore it.
    return acc_top_5

# Prepare data

In [None]:
# Load the training table that ships with the resized-image dataset.
data_df = pd.read_csv(DATA_FOLDER + "train.csv")
# encode hotel ids
# Category codes are consecutive integers starting at 0 (one per distinct
# hotel_id), stored as int64 so they can be used directly as class indices.
data_df["hotel_id_code"] = data_df["hotel_id"].astype('category').cat.codes.values.astype(np.int64)

In [None]:
# save hotel_id encoding for later decoding
# Dropping image_id and de-duplicating leaves one row per distinct combination of
# the remaining columns (presumably one row per hotel - confirm train.csv schema).
hotel_id_code_df = data_df.drop(columns=["image_id"]).drop_duplicates().reset_index(drop=True)
hotel_id_code_df.to_csv(OUTPUT_FOLDER + 'hotel_id_code_mapping.csv', index=False)
# hotel_id_code_map = hotel_id_code_df.set_index('hotel_id_code').to_dict()["hotel_id"]

## Example of images

In [None]:
def show_images(ds, title_text, n_images=5):
    """Plot the first ``n_images`` samples of ``ds`` side by side.

    Args:
        ds: indexable dataset whose items are mappings with an ``"image"`` entry,
            either a channel-first torch tensor or an array imshow accepts.
        title_text: label written on the y-axis of the first panel.
        n_images: number of samples to show (capped at ``len(ds)``).
    """
    n_images = min(n_images, len(ds))
    if n_images <= 0:
        return

    # squeeze=False keeps the axes 2-D even for a single panel.
    fig, axes = plt.subplots(1, n_images, figsize=(22, 8), squeeze=False)
    axes = axes[0]

    axes[0].set_ylabel(title_text)

    for i, axis in enumerate(axes):
        image = ds[i]["image"]
        if torch.is_tensor(image):
            # Channel-first (C, H, W) -> channel-last (H, W, C) for imshow.
            # `.T` would reverse all three axes to (W, H, C) and draw the
            # picture transposed.
            image = image.permute(1, 2, 0).cpu().numpy()
        axis.imshow(image)

# Train and evaluate

In [None]:
def train_and_validate(args, data_df):
    """Train the embedding model, optionally validate it, and export embeddings.

    Steps, in order:
      1. seed all RNGs;
      2. optionally hold out ``args.val_samples`` images per hotel for validation;
      3. build the data loaders, model, optimizer and LR schedule;
      4. optionally resume model/scheduler state from ``args.checkpoint``;
      5. train epoch by epoch, saving a "-last" checkpoint every epoch and a
         "-best" checkpoint whenever the validation top-5 score improves;
      6. if validating, run the similarity evaluation with the final weights;
      7. embed every image of ``data_df`` and pickle the table.

    Args:
        args: configuration namespace; reads backbone_name, split_val,
            val_samples, num_workers, batch_size, break_at_epoch, n_classes,
            embedding_size, lr, epochs, checkpoint, device and use_sampler.
        data_df: DataFrame with ``image_id``, ``hotel_id`` and ``hotel_id_code``
            columns. NOTE: an ``embeddings`` column is added to it in place.

    Returns:
        None. Checkpoints and the embeddings pickle are written under OUTPUT_FOLDER.
    """
    model_name = f"embedding-model-{args.backbone_name}-{IMG_SIZE}x{IMG_SIZE}"
    print(model_name)

    seed_everything(seed=SEED)
    
    if args.split_val:
        # split data into train and validation set
        hotel_image_count = data_df.groupby("hotel_id")["image_id"].count()
        # hotels that have more images than samples for validation
        valid_hotels = hotel_image_count[hotel_image_count > args.val_samples]
        # data that can be split into train and val set
        valid_data = data_df[data_df["hotel_id"].isin(valid_hotels.index)]
        # if hotel had less than required val_samples it will be only in the train set
        valid_df = valid_data.groupby("hotel_id").sample(args.val_samples, random_state=SEED)
        train_df = data_df[~data_df["image_id"].isin(valid_df["image_id"])]
    else: 
        train_df = data_df
    
    train_dataset = HotelTrainDataset(train_df, train_transform, data_path=IMAGE_FOLDER)
    # Two loaders over the same dataset: plain shuffling, and class-rebalanced
    # sampling. Which one is used per epoch is decided in the training loop below.
    train_loader  = DataLoader(train_dataset, num_workers=args.num_workers, batch_size=args.batch_size, shuffle=True, drop_last=True)
    train_loader_sampler = DataLoader(train_dataset, sampler=ImbalancedDatasetSampler(train_dataset), num_workers=args.num_workers, batch_size=args.batch_size, shuffle=False, drop_last=True)
    if args.split_val:
        valid_dataset = HotelTrainDataset(valid_df, val_transform, data_path=IMAGE_FOLDER)
        valid_loader  = DataLoader(valid_dataset, num_workers=args.num_workers, batch_size=args.batch_size, shuffle=False)
    
    # base dataset for image similarity search
    # (training images without augmentation; no gradients are needed for these,
    # hence the 4x larger batch)
    base_dataset  = HotelTrainDataset(train_df, base_transform, data_path=IMAGE_FOLDER)
    base_loader   = DataLoader(base_dataset, num_workers=args.num_workers, batch_size=args.batch_size*4, shuffle=False)
    break_at_epoch = args.break_at_epoch
    
    model = EmbeddingModel(args.n_classes, args.embedding_size ,args.backbone_name)
    optimizer = Lookahead(torch.optim.AdamW(model.parameters(), lr=args.lr), k=3)
    # The schedule is sized from len(train_loader) and stepped once per batch by
    # train_epoch.
    # NOTE(review): this assumes the sampler loader yields the same number of
    # batches per epoch as train_loader - confirm for ImbalancedDatasetSampler.
    scheduler = torch.optim.lr_scheduler.OneCycleLR(
            optimizer,
            max_lr=args.lr,
            epochs=args.epochs,
            steps_per_epoch=len(train_loader),
            div_factor=10,
            final_div_factor=1,
            pct_start=0.1,
            anneal_strategy="cos",
)
    
    # Only the model and scheduler are handed to load_checkpoint, so on resume
    # the optimizer starts from a fresh state.
    if args.checkpoint:
        model, scheduler, start_epoch = load_checkpoint(model, scheduler, args.checkpoint)
        start_epoch += 1
    else:
        start_epoch = 1
    
    model = model.to(args.device)
    criterion = nn.CrossEntropyLoss()
    
    best_val_acc = 0
    for epoch in range(start_epoch, args.epochs+1):
        # Odd epochs (or all epochs when the sampler is disabled) use plain
        # shuffling; even epochs use the class-rebalancing sampler.
        if epoch%2==1 or (not args.use_sampler):
            train_loss, train_score = train_epoch(args, model, train_loader, criterion, optimizer, scheduler, epoch)
        else:
            train_loss, train_score = train_epoch(args, model, train_loader_sampler, criterion, optimizer, scheduler, epoch)
        save_checkpoint(model, scheduler, optimizer, epoch, f"{model_name}-last", train_loss, train_score)
        if args.split_val:
            val_acc_top5 = test_classification(valid_loader, model)
            if val_acc_top5 > best_val_acc:
                best_val_acc = val_acc_top5
                save_checkpoint(model, scheduler, optimizer, epoch, f"{model_name}-best", train_loss, train_score)
        # Optional early stop (e.g. to fit a session time limit - presumably).
        if epoch == break_at_epoch:
            break
    # NOTE(review): this evaluation and the export below use the weights of the
    # last trained epoch, not the "-best" checkpoint.
    if args.split_val:
        test_similarity(args, base_loader, valid_loader, model)
    
    # generate embeddings for all train images and save them for inference
    # (all rows of data_df, i.e. including any held-out validation images)
    base_dataset   = HotelTrainDataset(data_df, base_transform, data_path=IMAGE_FOLDER)
    base_loader    = DataLoader(base_dataset, num_workers=args.num_workers, batch_size=args.batch_size*4, shuffle=False)
    base_embeds, _ = generate_embeddings(base_loader, model, "Generate embeddings for all images")
    # Mutates the caller's DataFrame: one embedding vector per row.
    data_df["embeddings"] = list(base_embeds)
    data_df.to_pickle(f"{OUTPUT_FOLDER}{model_name}_image-embeddings.pkl")

## Training

In [None]:
%%time 

# Run configuration, used as a plain attribute namespace (the class is never
# instantiated; functions read e.g. args.device directly).
class args:
    epochs = 15  # last epoch number of the training loop; also sizes the OneCycleLR schedule
    lr = 1e-3  # AdamW learning rate and OneCycleLR max_lr
    batch_size = 32  # training/validation batch size (embedding generation uses 4x this)
    num_workers = 2  # DataLoader worker processes
    split_val = True  # hold out validation images and run the evaluations
    val_samples = 1  # images sampled per hotel for validation
    embedding_size = 1800  # width of the embedding produced by the model
    backbone_name = "eca_nfnet_l0"  # timm model name
    n_classes = data_df["hotel_id_code"].nunique()  # evaluated once, when this class body runs
    device = ('cuda' if torch.cuda.is_available() else 'cpu')
    # path of a checkpoint to resume from, or None to start from scratch
    checkpoint = None #"../input/hotelidcheckpoints/checkpoint-embedding-model-eca_nfnet_l0-512x512-embedding-1800.pt"
    break_at_epoch = 15  # stop training after this epoch number
    use_sampler = True  # use the imbalanced-dataset sampler on even epochs

# Launch the full pipeline: training, validation and embedding export.
train_and_validate(args, data_df)