The hope of this notebook was to use the final layer of the convolutional network as an embedding. By training with the ArcFace loss, I hoped the embeddings would represent their class cluster well enough that, rather than needing the cluster center, any random example of a class would have an embedding similar to those of the other members of that class. However, this doesn't seem to be the case.

See version 6 of this notebook to see actual results.

## What's Special about this NB:
- Used Nvidia Apex to get mixed precision on top of pytorch
- Used Lamb optimizer + lr scheduler which decays learning rate every 100 iterations
- Used ArcFace loss. Jump to the `Loss` section to see the calculation. It is different from the author's definition.
- Used efficientnet as base
- Only trained the weight centers and the final convolutional layer of efficientnet (`conv_head`) together with its batch norm (`_bn1`).
- 30 minutes to run through 1 epoch of ~300k images (since I only kept classes that had at least 100 instances). See the `PyTorch Datasets + DataLoaders` section to see what I did to get it to run this fast.


In [None]:
%%capture
# Build NVIDIA Apex from source; the --cpp_ext/--cuda_ext options request its compiled extensions
# (presumably required by the fused optimizers imported in the next cell — confirm against the Apex README).
!git clone https://github.com/NVIDIA/apex
!pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./apex
# efficientnet_pytorch provides the EfficientNet backbone imported in the next cell.
!pip install efficientnet_pytorch torchtoolbox

In [None]:
import os
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import multiprocessing as mp
from functools import partial
import pickle

import cv2

from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim.lr_scheduler import LambdaLR
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import _LRScheduler, StepLR

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

from apex.optimizers import FusedAdam, FusedLAMB
from apex import amp

import albumentations as A
from albumentations.pytorch import ToTensorV2

from efficientnet_pytorch import EfficientNet

import warnings
warnings.simplefilter('ignore')
%matplotlib inline

torch.backends.cudnn.benchmark = True

## Config

In [None]:
BASE = "efficientnet-b0"  # EfficientNet variant name handed to EfficientNet.from_pretrained
SIZE = (128, 128)  # size passed to cv2.resize for every image that is loaded
EPOCHS = 2  # number of passes over the training DataLoader
GRAD_ACCUMULATE = 1  # number of batches whose gradients are accumulated per optimizer step
BS = 512  # batch size for the training and validation DataLoaders
p = 0.5  # NOTE(review): no visible code reads this value — confirm whether it is still needed
LR_RANGE = [1e-7, 2e-4]  # NOTE(review): not referenced by any visible code — confirm whether it is still needed

## Data

### Get Training/ Validation dataframes

In [None]:
# Read the training index and derive the on-disk location of every image.
path = "/kaggle/input/landmark-recognition-2020/"
df = pd.read_csv(path + "train.csv")
# Images are nested in folders named after the first three characters of their id.
df["path"] = [
    "/".join((path + "train", *image_id[:3], image_id + ".jpg"))
    for image_id in df["id"]
]
# Order rows by landmark and renumber them from zero.
df = df.sort_values("landmark_id").reset_index(drop=True)

In [None]:
# Keep only the landmarks that have at least `k` training images.
k = 100
counts = df["landmark_id"].value_counts()
topk = counts.loc[counts.ge(k)].index
df = df.loc[df["landmark_id"].isin(topk)]
df.shape

In [None]:
# Map raw landmark ids onto contiguous class indices, in order of first appearance.
id2y = {
    landmark: class_index
    for class_index, landmark in enumerate(df["landmark_id"].unique())
}
df["target"] = df["landmark_id"].map(id2y)

# Per-row weight = 1 / (number of rows of that class).
weights = 1 / df["target"].value_counts()
df["weights"] = df["target"].map(weights).values
print(f"There are {len(id2y)} classes")

print(df.shape)
df.head()

In [None]:
# 85/15 split, stratified on the class index, with a fixed seed.
train_df, val_df = train_test_split(
    df,
    test_size=0.15,
    random_state=42,
    stratify=df["target"],
)
# Renumber both partitions; the training rows are additionally shuffled (unseeded).
train_df = train_df.reset_index(drop=True).sample(frac=1)
val_df = val_df.reset_index(drop=True)

In [None]:
# Walk the test directory tree and record the path of every file found.
test_files = []
for folder, _subdirs, names in tqdm(os.walk(path + "test/")):
    test_files += [folder + "/" + name for name in names]

test_df = pd.DataFrame({"path": test_files})

### PyTorch Datasets + DataLoaders

In [None]:
# Preprocessing applied to every image: per-channel normalisation, then
# conversion to a torch tensor via ToTensorV2.
_normalize = A.Normalize(
    mean=[0.485, 0.456, 0.406],
    std=[0.229, 0.224, 0.225],
)
tfms = A.Compose([_normalize, ToTensorV2()])

In [None]:
class Images(Dataset):
    """Dataset that loads images from disk on demand.

    Every item is read with OpenCV, resized to the module-level ``SIZE``,
    converted from BGR to RGB and passed through the module-level ``tfms``
    pipeline.
    """

    def __init__(self, df: pd.DataFrame, train: bool = True):
        """
        Parameters:
            df (pd.DataFrame): frame with a ``path`` column and, when ``train``
                is True, ``target`` and ``weights`` columns. The frame is
                copied, so the caller's object is never modified.
            train (bool): True to yield ``(image, target, weight)`` triples,
                False to yield only the image (no label columns are read).
        """
        # reset_index returns a new frame, so the cast below stays local to
        # this dataset instead of silently changing the caller's dtype.
        frame = df.reset_index(drop=True)
        if train:
            frame["weights"] = frame["weights"].astype(np.float32)
        self.df = frame
        self.train = train

    def __getitem__(self, index):
        """Return the transformed image at ``index`` (plus target and weight when training).

        Raises:
            FileNotFoundError: if OpenCV cannot read the image file.
        """
        im_path = self.df.loc[index, 'path']
        image = cv2.imread(im_path)
        if image is None:
            # cv2.imread signals failure by returning None; fail with the path
            # instead of an opaque error from cv2.resize.
            raise FileNotFoundError(f"Could not read image: {im_path}")
        x = cv2.cvtColor(cv2.resize(image, SIZE), cv2.COLOR_BGR2RGB)
        x = tfms(image=x)['image']

        if not self.train:
            return x

        weights = self.df.loc[index, 'weights']
        y = self.df.loc[index, 'target']
        return x, y, weights

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.df)

In [None]:
# Wrap the three frames as datasets; only the test set is built without labels.
train_images = Images(train_df)
val_images = Images(val_df)
test_images = Images(test_df, train=False)

# Options shared by every loader: one worker per CPU core and pinned host memory.
_loader_opts = dict(num_workers=mp.cpu_count(), pin_memory=True)
train_dl = DataLoader(train_images, BS, shuffle=True, drop_last=True, **_loader_opts)
val_dl = DataLoader(val_images, BS, **_loader_opts)
test_dl = DataLoader(test_images, BS * 4, **_loader_opts)

## Loss

In [None]:
class ArcFaceLoss(nn.Module):
    """Angular-margin loss computed on cosine logits.

    The margin ``m`` is added to the angle of the target class only; all
    cosines are then scaled by ``s`` and fed to a per-sample cross-entropy.
    """

    def __init__(self, s=64, m=0.5, eps=1e-7):
        """
        Parameters:
            s: scale applied to the cosines before the cross-entropy.
            m: margin (in radians) added to the target-class angle.
            eps: distance kept from +/-1 before ``acos`` so its gradient stays finite.
        """
        super().__init__()
        self.s, self.m, self.eps = s, m, eps
        self.cross_entropy = partial(F.cross_entropy, reduction='none')

    def forward(self, costheta, y):
        """Return the un-reduced loss, one value per row of ``costheta``.

        ``costheta`` is left untouched: the margin is written into a copy, so
        the caller can keep using the raw cosines (e.g. for accuracy).
        """
        rows = torch.arange(len(y), device=costheta.device)
        target_cos = costheta[rows, y]

        # Do the trigonometry in at least float32: in half precision the eps
        # guard below would round away to exactly 1.
        work_dtype = torch.promote_types(target_cos.dtype, torch.float32)
        target_cos = target_cos.to(work_dtype).clamp(-1 + self.eps, 1 - self.eps)
        target_cos = torch.cos(torch.acos(target_cos) + self.m)

        logits = costheta.clone()
        logits[rows, y] = target_cos.type(costheta.dtype)

        return self.cross_entropy(self.s * logits, y)

## Model

In [None]:
class Model(nn.Module):
    """EfficientNet feature extractor with one learnable centre per class.

    ``forward`` returns the dot product between each L2-normalised image
    embedding and each L2-normalised class centre, i.e. cosine similarities.
    """

    def __init__(self, classes, base=BASE, unfreeze=None):
        """
        Parameters:
            classes (int): number of class centres (columns of ``centers``).
            base (str): variant name passed to ``EfficientNet.from_pretrained``.
            unfreeze: name fragment, or iterable of name fragments, of the
                parameters ``freeze`` must leave trainable. ``None`` means
                ``freeze`` disables gradients for every parameter.
        """
        super().__init__()

        # EfficientNet
        self.base = EfficientNet.from_pretrained(base)

        # Class centres sized to the input width of the backbone's `_fc` layer;
        # forward() only goes through extract_features and these centres.
        self.centers = nn.Parameter(torch.randn(self.base._fc.in_features, classes))
        self.unfreeze = unfreeze

    def get_embedding(self, x):
        """Return one unit-length embedding per image in the batch ``x``."""
        features = self.base.extract_features(x)
        pooled = F.adaptive_avg_pool2d(features, 1).view(x.shape[0], -1)

        # Divide each row by its Euclidean norm.
        norms = torch.sqrt((pooled ** 2).sum(dim=-1, keepdim=True))
        return pooled / norms

    def forward(self, x):
        """Return the (batch, classes) matrix of cosines between embeddings and centres."""
        embeds = self.get_embedding(x)

        # Normalise each centre (a column of `centers`) to unit length.
        norms = torch.sqrt((self.centers ** 2).sum(dim=0, keepdim=True))
        centers = self.centers / norms

        return embeds.matmul(centers)

    def freeze(self):
        """Disable gradients for every parameter whose name matches no ``unfreeze`` fragment."""
        keep = self.unfreeze
        if keep is None:
            # Default constructor argument: nothing is exempt from freezing.
            keep = ()
        elif isinstance(keep, str):
            # A bare string is one fragment, not a sequence of characters.
            keep = (keep,)

        for name, param in self.named_parameters():
            if not any(fragment in name for fragment in keep):
                param.requires_grad = False

In [None]:
# Parameters whose name contains one of these fragments stay trainable.
unfreeze = ["conv_head", "base._bn1", "centers"]
model = Model(len(id2y), unfreeze=unfreeze).to(device)
model.freeze()

arcface_loss = ArcFaceLoss()

# Fused LAMB optimizer, wrapped together with the model by Apex AMP at opt level "O1".
optimizer = FusedLAMB(model.parameters(), lr=1e-2)
model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
# Multiply the learning rate by 0.7 after every 100 scheduler steps.
scheduler = StepLR(optimizer, gamma=0.7, step_size=100)

In [None]:
def metrics(y_pred, y, weights, k=5):
    """
    Weighted accuracy and top-k accuracy
    parameters:
    - y_pred: predicted logits or probabilities, one row per instance
    - y: Actual class
    - weights: importance of each instance **must sum to one**
    - k: number of categories to look for; capped at the number of
      columns of y_pred so small class counts do not raise
    returns:
    - (accuracy, top-k accuracy) as 0-dim tensors
    """
    # topk raises when k exceeds the size of the dimension it searches.
    k = min(k, y_pred.shape[-1])
    # hits[i, j] is True when the j-th best prediction of row i is the true class.
    hits = y_pred.topk(k=k, dim=-1)[1] == y[:, None]
    acc = (weights * hits[:, 0].float()).sum()
    topk_acc = (weights * hits.any(dim=-1).float()).sum()
    return acc, topk_acc

## Training

In [None]:
# Validate every `val_p` fraction of a training epoch.
val_p = 0.5
# At least 1, so the modulo below cannot divide by zero on a very short loader.
val_k = max(1, int(val_p * len(train_dl)))
# Per-batch validation metrics, appended across every validation pass.
val_accs = []
val_topk_accs = []

# Per-step training history.
accs = []
topk_accs = []
losses = []

for param in model.parameters(): param.grad = None
model.train()
for _ in range(EPOCHS):
    for i, (x, y, weights) in tqdm(enumerate(train_dl), total=len(train_dl)):
        x, y, weights = x.to(device), y.to(device), weights.to(device)

        y_pred = model(x)

        # Metrics are taken from the raw cosines, before the loss call, so they
        # can never include the angular margin applied to the target class.
        acc, topk_acc = metrics(y_pred.detach(), y, weights / weights.sum())

        loss_all = arcface_loss(y_pred, y)
        loss = (loss_all * weights).sum()
        loss = loss / GRAD_ACCUMULATE

        print(f"\rLoss: {loss:.4f}, Accuracy: {acc:.4f}, top-5 Accuracy: {topk_acc:.4f}", end="")

        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()

        # Step only once every GRAD_ACCUMULATE batches; gradients add up in between.
        if (i+1) % GRAD_ACCUMULATE == 0:
            optimizer.step()
            for param in model.parameters(): param.grad = None
            scheduler.step()

        losses.append(loss.detach().cpu().numpy())
        accs.append(acc)
        topk_accs.append(topk_acc)

        if (i+1) % val_k == 0:
            print("\n")
            sum_weights = 0.0
            val_loss = 0.0
            val_acc = 0.0
            val_topk_acc = 0.0
            model.eval()
            with torch.no_grad():
                for x, y, weights in tqdm(val_dl):
                    x, y, weights = x.to(device), y.to(device), weights.to(device)
                    batch_weight = weights.sum()
                    sum_weights += batch_weight

                    y_pred = model(x)
                    acc, topk_acc = metrics(y_pred, y, weights / batch_weight)
                    loss_all = arcface_loss(y_pred, y)

                    # Re-weight the per-batch metrics by the batch's total weight so
                    # the summary covers the whole validation set, not the last batch.
                    val_loss += (loss_all * weights).sum()
                    val_acc += acc * batch_weight
                    val_topk_acc += topk_acc * batch_weight

                    val_accs.append(acc)
                    val_topk_accs.append(topk_acc)
            print(f"\nValidation Loss: {val_loss / sum_weights:.4f}, Accuracy: {val_acc / sum_weights:.4f}, top-5 Accuracy: {val_topk_acc / sum_weights:.4f}\n")
            model.train()

In [None]:
# Display the first recorded training top-k accuracy.
topk_accs[0]

In [None]:
# Collapse each list of scalar tensors into a single NumPy array for plotting.
accs, topk_accs, val_accs, val_topk_accs = (
    torch.stack(history).cpu().numpy()
    for history in (accs, topk_accs, val_accs, val_topk_accs)
)

In [None]:
# One figure per recorded series, shown in the order listed.
for _series, _title in (
    (losses, "Loss"),
    (accs, "Accuracy"),
    (topk_accs, "Top-k"),
    (val_accs, "Validation Accuracy"),
    (val_topk_accs, "Validation Top-k"),
):
    plt.plot(_series)
    plt.title(_title)
    plt.show()

## Save

In [None]:
# Persist the model weights and the landmark-id -> class-index mapping.
torch.save(model.state_dict(), "model.ckpt")
with open("id2y.pickle", "wb") as mapping_file:
    mapping_file.write(pickle.dumps(id2y))

## Evaluate

In [None]:
# Reload the full (unfiltered) training index, with image paths, sorted by landmark.
path = "/kaggle/input/landmark-recognition-2020/"
df = pd.read_csv(path + "train.csv")
df["path"] = [
    "/".join((path + "train", *image_id[:3], image_id + ".jpg"))
    for image_id in df["id"]
]
df = df.sort_values("landmark_id").reset_index(drop=True)

# One reference row per landmark: the first row of each group.
grp_df = df.groupby("landmark_id").head(1)
# Position within grp_df -> landmark id.
int2id = dict(enumerate(grp_df["landmark_id"].values))

In [None]:
# Label-free datasets for the reference images and the test images.
grp_images = Images(grp_df, train=False)
test_images = Images(test_df, train=False)
# Inference uses a larger batch size than training did.
BS = 1024

_eval_loader_opts = dict(num_workers=mp.cpu_count(), pin_memory=True)
grp_dl = DataLoader(grp_images, BS, **_eval_loader_opts)
test_dl = DataLoader(test_images, BS, **_eval_loader_opts)

In [None]:
# Embed the reference image of every landmark. Batches are kept whole and
# concatenated once, rather than being split into one tensor per row and
# re-stacked afterwards.
train_embeds = []
model.eval()
with torch.no_grad():
    for x in tqdm(grp_dl):
        train_embeds.append(model.get_embedding(x.to(device)))

# Rows follow the order of grp_dl.
train_embeds = torch.cat(train_embeds)

In [None]:
# For every test image, find the most similar reference embedding.
ps = []          # one tensor of confidences per batch
categories = []  # predicted landmark id per test image
model.eval()
with torch.no_grad():
    for x in tqdm(test_dl):
        test = model.get_embedding(x.to(device))
        # Both sides are unit vectors, so the product holds cosine similarities;
        # max over dim 0 picks the best reference row for each test image.
        best_sim, best_row = train_embeds.matmul(test.T).max(dim=0)
        categories.extend(int2id[row] for row in best_row.cpu().tolist())
        # Rescale the cosine from [-1, 1] to [0, 1] to use it as a confidence.
        ps.append((best_sim + 1) * 0.5)

# Concatenate whole batches once instead of stacking one scalar tensor per image.
ps = torch.cat(ps).cpu().numpy()

In [None]:
# Image id = file name with its last four characters (the extension) removed.
test_df["id"] = [os.path.basename(file_path)[:-4] for file_path in test_df["path"]]
test_df["landmark"] = categories
test_df["p"] = ps
# One "<landmark> <confidence>" string per row, confidence with four decimals.
test_df["landmarks"] = test_df["landmark"].astype(str) + test_df["p"].map(" {:.4f}".format)
cols = ["id", "landmarks"]
test_df.to_csv("submission.csv", columns=cols, index=False)

In [None]:
# Show five randomly chosen test images (left) next to the reference image of
# their predicted landmark (right).
test_df = test_df.sample(frac=1)
for i in range(5):
    plt.subplot(121)
    im_path = test_df["path"].values[i]
    resized = cv2.resize(cv2.imread(im_path), SIZE)
    plt.imshow(cv2.cvtColor(resized, cv2.COLOR_BGR2RGB))

    plt.subplot(122)
    is_predicted = grp_df["landmark_id"] == test_df["landmark"].values[i]
    im_path = grp_df.loc[is_predicted, "path"].values[0]
    resized = cv2.resize(cv2.imread(im_path), SIZE)
    plt.imshow(cv2.cvtColor(resized, cv2.COLOR_BGR2RGB))
    plt.show()