In [None]:
%%bash 
# Copy the EfficientNet archive to a .tar.gz name in the working directory and
# install it from there with pip.
cp /kaggle/input/landmark-lib/efficientnet_pytorch-0.7.0.xyz /kaggle/working/efficientnet_pytorch-0.7.0.tar.gz
pip install /kaggle/working/efficientnet_pytorch-0.7.0.tar.gz

# install rest of stuff
# Iterate over the glob directly instead of parsing `ls` output, and quote the
# path, so wheel files whose names contain whitespace are not word-split.
# nullglob makes the loop a no-op when no wheel is present.
shopt -s nullglob
for file in /kaggle/input/landmark-lib/*.whl
do
    # Skip the efficientnet wheel (installed above from the archive) and Keras wheels.
    if [[ $file != *"efficientnet"* && $file != *"Keras"* ]]; then
        pip install "$file"
    fi
done

pip install pytorch-lightning

In [None]:
import os
import gc
gc.enable()
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import multiprocessing as mp
from functools import partial
import pickle

import cv2

from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim.lr_scheduler import LambdaLR
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import _LRScheduler, StepLR
import torch_optimizer as optim

import pytorch_lightning as pl


# Use the CUDA device when torch reports one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

import albumentations as A
from albumentations.pytorch import ToTensorV2

from efficientnet_pytorch import EfficientNet

import warnings
warnings.simplefilter('ignore')
%matplotlib inline

# Let cuDNN benchmark its convolution algorithms and keep the fastest one;
# the images are resized to one fixed SIZE, so input shapes do not vary.
torch.backends.cudnn.benchmark = True

# torch.cuda.memory_allocated()
# torch.cuda.get_device_properties(0).total_memory / 1e9
# Report the torch version, the selected device and the number of CPU cores.
print(torch.__version__, device, mp.cpu_count())

## Config

In [None]:
# Backbone name passed to EfficientNet.from_pretrained.
BASE = "efficientnet-b0"
# Local file holding the pretrained backbone weights.
WEIGHTS_PATH = "/kaggle/input/landmark-lib/efficientnet-b0-355c32eb.pth"
# Target size handed to cv2.resize for every image that is read.
SIZE = (128, 128)
# Number of training epochs and gradient-accumulation steps.
# NOTE(review): presumably meant for the Trainer — verify they are actually passed to it.
EPOCHS = 4
GRAD_ACCUMULATE = 2
# Batch size used by every DataLoader in this notebook.
BS = 256
# NOTE(review): LR_RANGE is not referenced in the visible cells; the optimizer sets its own lr.
LR_RANGE = [1e-7, 2e-4]
# Upper bound on the number of rows kept per landmark.
MAX_GRP_NUM = 200
# Landmarks with fewer rows than this are dropped from the data.
MIN_LANDMARK_COUNT = 100
# Rows sampled per landmark when initialising the class centers.
SAMPLES_PER_GRP = 5
# Number of top-scoring classes kept per test image at inference time.
NUM_TOP_PREDICTS = 20

## Data
### Get Training/ Validation dataframes
- We ignore classes that have fewer than `MIN_LANDMARK_COUNT` examples.
- For any class that has more than `MAX_GRP_NUM` examples, we only take `MAX_GRP_NUM` samples.
- We weight every example to account for the data imbalance.

In [None]:
path = "/kaggle/input/landmark-recognition-2020/"


def _train_image_path(image_id):
    """Build the image path: train/<c0>/<c1>/<c2>/<id>.jpg under `path`."""
    return "/".join([path + "train", *image_id[:3], image_id + ".jpg"])


# Read the label file, attach each image's path on disk, then order the rows
# by landmark and renumber the index from zero.
df = pd.read_csv(path + "train.csv")
df["path"] = df["id"].map(_train_image_path)
df = df.sort_values("landmark_id").reset_index(drop=True)

In [None]:
# Keep only the landmarks that occur at least MIN_LANDMARK_COUNT times.
counts = df["landmark_id"].value_counts()
topk = counts.loc[counts.ge(MIN_LANDMARK_COUNT)].index
df = df.loc[df["landmark_id"].isin(topk)]
df.shape

In [None]:
# Groups with fewer than MAX_GRP_NUM rows are kept whole; every other group is
# replaced by a random sample of MAX_GRP_NUM of its rows.
dfs = [
    grp if len(grp) < MAX_GRP_NUM else grp.sample(MAX_GRP_NUM)
    for _, grp in tqdm(df.groupby("landmark_id"))
]

df = pd.concat(dfs)
print(df.shape)

In [None]:
# Map each landmark id to a contiguous class index, in order of first appearance.
id2y = {landmark_id: idx for idx, landmark_id in enumerate(df["landmark_id"].unique())}
df["target"] = df["landmark_id"].map(id2y)
counts = df["target"].value_counts()

# Per-class weight: size of the largest class divided by the size of this class.
weights = counts.max() / counts
df["weights"] = df["target"].map(weights).values
print(f"There are {len(id2y)} classes")

print(df.shape)
df.head()

In [None]:
# Stratified 85/15 split on the class index.
train_df, val_df = train_test_split(
    df,
    test_size=0.15,
    random_state=42,
    stratify=df["target"],
)
# Renumber the training rows, then shuffle them (the index is left shuffled).
train_df = train_df.reset_index(drop=True).sample(frac=1)
val_df = val_df.reset_index(drop=True)

In [None]:
# Walk the test directory tree and collect the path of every file found.
test_files = [
    root + "/" + fname
    for root, _dirs, fnames in tqdm(os.walk(path + "test/"))
    for fname in fnames
]

test_df = pd.DataFrame({"path": test_files})

### PyTorch Datasets + DataLoaders
- We use Albumentations to augment the images. See below for the augmentations.
- Read in and resize the images with cv2, since it is faster that way.

In [None]:
# Shared tail of both pipelines: normalise with the given per-channel mean/std
# and convert the image to a torch tensor.
test_tfms = A.Compose([
    A.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
    ToTensorV2()
])

# Random augmentations; the group as a whole is applied with probability 0.9.
augment_tfms = A.Compose(
    [
        A.RandomBrightnessContrast(),
        A.Blur(),
        A.RandomContrast(),
        A.HorizontalFlip(),
    ],
    p=0.9,
)

# Training pipeline = augmentations followed by the shared normalise/to-tensor tail.
train_tfms = A.Compose([augment_tfms, test_tfms])

In [None]:
class Images(Dataset):
    """Dataset that reads images from the file paths listed in a DataFrame."""

    def __init__(self, df: pd.DataFrame, tfms: "A.Compose", train: bool = True):
        """
        Parameters:
            df (pd.DataFrame): DataFrame with a 'path' column; when `train` is
                True it must also provide 'target' and 'weights' columns
            tfms: image transformation called as `tfms(image=...)`; its result's
                'image' entry is returned as the sample
            train (bool): flag of whether a training dataset is being initialized or testing one
        """
        # reset_index returns a new frame, so casting the weights below no
        # longer modifies the DataFrame owned by the caller.
        frame = df.reset_index(drop=True)
        if train:
            frame["weights"] = frame["weights"].astype(np.float32)
        self.tfms = tfms
        self.df = frame
        self.train = train

    def __getitem__(self, index):
        """Return `(x, y, weights)` for training datasets, otherwise just `x`."""
        im_path = self.df.loc[index, 'path']
        image = cv2.imread(im_path)
        if image is None:
            # cv2.imread signals an unreadable/missing file by returning None;
            # fail with the offending path instead of an opaque cv2 error.
            raise FileNotFoundError(f"Could not read image: {im_path}")
        # cv2 loads BGR; resize first, then convert to RGB.
        x = cv2.cvtColor(cv2.resize(image, SIZE), cv2.COLOR_BGR2RGB)
        x = self.tfms(image=x)['image']

        if self.train:
            weights = self.df.loc[index, 'weights']
            y = self.df.loc[index, 'target']
            return x, y, weights
        else:
            return x

    def __len__(self):
        """Number of rows (images) in the dataset."""
        return len(self.df)

In [None]:
# Training data gets the augmenting pipeline; validation and test only normalise.
train_images = Images(train_df, train_tfms)
val_images = Images(val_df, test_tfms)
test_images = Images(test_df, test_tfms, train=False)

# Settings shared by all three loaders.
loader_kwargs = dict(batch_size=BS, num_workers=mp.cpu_count(), pin_memory=True)

# Only the training loader shuffles and drops the final incomplete batch.
train_dl = DataLoader(train_images, shuffle=True, drop_last=True, **loader_kwargs)
val_dl = DataLoader(val_images, **loader_kwargs)
test_dl = DataLoader(test_images, **loader_kwargs)

## Loss
The ArcFace loss is used instead of the usual cross-entropy loss. See the ArcFace paper [here](https://arxiv.org/abs/1801.07698).

The main points are:
- We already get the cos of the angle between the head and the embedding. See model below for details.
- We take the arccos to get the angle and add a margin m.
- However, if the sum is greater than 180 degrees, we don't add that margin parameter. This is done because if we don't, the loss encourages the angle to get bigger (towards 180), which is the opposite of what we want.

In [None]:
class ArcFaceLoss(nn.Module):
    """Additive angular margin (ArcFace) loss computed on cosine logits.

    Parameters:
        s: scale applied to the cosines before the cross entropy
        m: angular margin (radians) added to the angle of the true class
    """

    def __init__(self, s:float=64.0, m:float=0.5):
        super().__init__()
        self.s, self.m = torch.tensor(s), torch.tensor(m)
        # The margin is only added while angle + m stays below pi.
        self.threshold = torch.tensor(np.pi - m)
        self.cross_entropy = partial(F.cross_entropy, reduction='none')

    def forward(self, costheta, y):
        """Return the per-sample loss (no reduction).

        Parameters:
            costheta: (batch, classes) cosines between embeddings and class centers
            y: (batch,) integer class indices

        `costheta` is left unmodified, so the caller can still use it (e.g. for
        accuracy metrics) after the loss has been computed.
        """
        # Build the row index on the same device as the logits.
        rows = torch.arange(len(y), device=costheta.device)

        costheta_y = costheta[rows, y]
        if costheta_y.dtype in (torch.float16, torch.bfloat16):
            # The clamp bounds below are not representable in half precision.
            costheta_y = costheta_y.float()
        # Rounding can push a cosine marginally outside [-1, 1], where acos is
        # NaN, and the gradient of acos is infinite exactly at +/-1.
        eps = 1e-7
        costheta_y = costheta_y.clamp(-1.0 + eps, 1.0 - eps)

        angle = torch.acos(costheta_y)
        # ensure that new angle is less than pi before adding margin m
        angle = torch.where(angle < self.threshold, angle + self.m, angle)

        # Write the penalised cosine into a copy rather than the caller's tensor.
        logits = costheta.clone()
        logits[rows, y] = torch.cos(angle).type(costheta.dtype)

        return self.cross_entropy(self.s * logits, y)

## Model
The model that we are using is very similar to simply putting a "head" on top of an "efficientnet" architecture. However, the difference is that we normalise the embedding created by the base model before multiplying it with the head, which is also normalised. Note: normalising here means making the vectors unit length.

In [None]:
def metrics(y_pred, y, weights, k=5):
    """
    Weighted accuracy and top-k accuracy
    parameters:
    - y_pred: predicted logits or probabilities, shape (batch, classes)
    - y: Actual class, shape (batch,)
    - weights: importance of each instance; normalised here to sum to one, so
      raw (unnormalised) weights are accepted as well
    - k: number of categories to look for (capped at the number of classes)
    returns:
    - (acc, topk_acc): weighted top-1 and top-k accuracy as 0-dim tensors
    """
    # topk raises when k exceeds the size of the class dimension.
    k = min(k, y_pred.shape[-1])
    # Normalising is a no-op for weights that already sum to one and keeps the
    # results in [0, 1] for those that do not.
    weights = weights / weights.sum()

    # hits[i, j] is True when the j-th best prediction for row i is the true class.
    hits = y_pred.topk(k=k, dim=-1)[1] == y[:, None]
    topk_acc = (weights * hits.any(dim=-1).float()).sum()
    acc = (weights * hits[:, 0].float()).sum()
    return acc, topk_acc

In [None]:
class Model(pl.LightningModule):
    """EfficientNet backbone with a normalised "centers" head.

    The pooled backbone features and the per-class center vectors are both
    scaled to unit length, so `forward` returns cosine similarities.
    """

    def __init__(self, classes, loss_fn=None):
        """
        Parameters:
            classes: number of output classes (columns of `centers`)
            loss_fn: callable `(costheta, y) -> per-sample loss`; a fresh
                ArcFaceLoss is created when omitted
        """
        super().__init__()

        # EfficientNet
        self.base = EfficientNet.from_pretrained(BASE, WEIGHTS_PATH)

        # Replace last layer
        self.centers = nn.Parameter(torch.randn(self.base._fc.in_features, classes))
        # Created per instance: a default argument would be evaluated once at
        # definition time and shared by every Model.
        self.loss_fn = ArcFaceLoss() if loss_fn is None else loss_fn

    def get_embedding(self, x):
        """Return unit-length embeddings of shape (batch, features)."""
        pool = F.adaptive_avg_pool2d(self.base.extract_features(x), 1)
        pool = pool.view(x.shape[0], -1)

        # F.normalize guards the division against a zero-length vector.
        return F.normalize(pool, dim=-1)

    def forward(self, x):
        """Return cosine similarities between embeddings and class centers."""
        embeds = self.get_embedding(x)

        # Each column of `centers` is one class; normalise along the feature axis.
        centers = F.normalize(self.centers, dim=0)

        return embeds.matmul(centers)

    def get_loss_metrics(self, batch):
        """Compute the weighted loss plus weighted top-1 / top-k accuracy for a batch."""
        x, y, weights = batch
        y_pred = self(x)

        # Metrics are taken before the loss is evaluated so they always see the
        # plain cosines, and with weights normalised to sum to one as `metrics`
        # documents.
        with torch.no_grad():
            acc, topk_acc = metrics(y_pred, y, weights / weights.sum())

        loss_all = self.loss_fn(y_pred, y)
        loss = (loss_all * weights).mean()

        return loss, acc, topk_acc

    def training_step(self, batch, batch_idx):
        """Log training loss/accuracy and return the loss to optimise."""
        loss, acc, topk_acc = self.get_loss_metrics(batch)
        self.log_dict({'train_loss': loss, 'train_acc': acc, 'train_topk': topk_acc}, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Log validation loss/accuracy."""
        loss, acc, topk_acc = self.get_loss_metrics(batch)
        self.log_dict({'valid_loss': loss, 'valid_acc': acc, 'valid_topk': topk_acc}, prog_bar=True)

    def configure_optimizers(self):
        """Return the RAdam optimizer and its StepLR scheduler."""
        optimizer = optim.RAdam(self.parameters(), lr=1e-2)
        # NOTE(review): Lightning presumably steps a scheduler returned in this
        # form once per epoch, in which case step_size=100 never decays the lr
        # in a short run — confirm the intended interval.
        scheduler = StepLR(optimizer, step_size=100, gamma=0.9)
        return [optimizer], [scheduler]

In [None]:
# One output class per landmark kept in id2y; place the model on the selected device.
model = Model(len(id2y)).to(device)

## Initialise
It seems like a smart idea to initialise the final "center" weights so that they are the average of the embeddings for a given category.

In [None]:
# Draw SAMPLES_PER_GRP rows from every landmark, keeping the groups in groupby order
# (the center initialisation relies on the rows staying grouped, in order).
per_landmark = [
    grp.sample(SAMPLES_PER_GRP)
    for _, grp in train_df.groupby("landmark_id")
]
samples_per_landmark = pd.concat(per_landmark)

# No augmentation and no shuffling for these samples.
sample_images = Images(samples_per_landmark, test_tfms)
samples_dl = DataLoader(
    sample_images,
    BS,
    shuffle=False,
    drop_last=False,
    num_workers=mp.cpu_count(),
    pin_memory=True,
)

In [None]:
# Embed every sampled image; gradients are not needed for this pass.
embed_batches = []
with torch.no_grad():
    model.eval()
    for x, _, _ in tqdm(samples_dl, total=len(samples_dl)):
        x = x.to(device)
        embed_batches.append(model.get_embedding(x))

# One row per sampled image, in loader order.
embeds = torch.cat(embed_batches)

# get average directions
# Consecutive blocks of SAMPLES_PER_GRP rows are averaged into one vector each.
grouped = embeds.view(-1, SAMPLES_PER_GRP, embeds.shape[-1])
centers = grouped.mean(dim=1)
# Rescale each averaged vector to unit length.
lens = torch.sqrt((centers**2).sum(dim=1, keepdim=True))
centers = centers / lens
# `model.centers` is (features, classes), hence the transpose.
model.centers.data = centers.T

## Train

In [None]:
# Wire the notebook's config into the Trainer: without `max_epochs` and
# `accumulate_grad_batches` the EPOCHS / GRAD_ACCUMULATE settings were ignored
# and Lightning fell back to its own defaults.
# Validation runs twice per training epoch (val_check_interval=0.5).
trainer = pl.Trainer(
    tpu_cores=8,
    max_epochs=EPOCHS,
    accumulate_grad_batches=GRAD_ACCUMULATE,
    val_check_interval=0.5,
)
trainer.fit(model, train_dl, val_dl)

In [None]:
# NOTE(review): none of the history lists below are created in the visible
# cells (the model logs its metrics through `self.log_dict`), so plotting them
# unconditionally raised NameError on a fresh Run All. Each series is plotted
# only when a variable of that name exists; otherwise the plot is skipped with
# a message.
history_plots = [
    ("losses", "Loss"),
    ("accs", "Accuracy"),
    ("topk_accs", "Top-k"),
    ("val_losses", "Validation Loss"),
    ("val_accs", "Validation Accuracy"),
    ("val_topk_accs", "Validation Top-k"),
]

for history_name, plot_title in history_plots:
    history = globals().get(history_name)
    if history is None:
        print(f"Skipping '{plot_title}' plot: `{history_name}` is not defined.")
        continue
    plt.plot(history)
    plt.title(plot_title)
    plt.show()

## Save

In [None]:
# Persist the model weights and the landmark-id -> class-index mapping.
state = model.state_dict()
torch.save(state, "model.ckpt")

with open("id2y.pickle", "wb") as mapping_file:
    pickle.dump(id2y, mapping_file)

In [None]:
# Collect the NUM_TOP_PREDICTS most probable classes, and their probabilities,
# for every test batch.
prob_batches, class_batches = [], []
with torch.no_grad():
    model.eval()
    for x in tqdm(test_dl):
        x = x.to(device)
        # Cosines are multiplied by 64 (the ArcFaceLoss default scale) before the softmax.
        output = F.softmax(64*model(x), -1)
        p, category = torch.topk(output, NUM_TOP_PREDICTS)
        prob_batches.append(p)
        class_batches.append(category)

# One row per test image, NUM_TOP_PREDICTS columns.
ps = torch.cat(prob_batches).cpu().numpy()
categories = torch.cat(class_batches).cpu().numpy()

In [None]:
# Invert id2y so predicted class indices can be turned back into landmark ids.
y2id = dict(zip(id2y.values(), id2y.keys()))
categories = np.array([[y2id[class_idx] for class_idx in row] for row in categories])
def concat(label: np.ndarray, conf: np.ndarray) -> str:
    """Join paired labels and confidences into one 'label conf label conf ...' string."""
    pieces = []
    for lbl, score in zip(label, conf):
        pieces.append(f'{lbl} {score}')
    return ' '.join(pieces)

# One prediction string per test image: its landmark ids paired with their probabilities.
landmarks = list(map(concat, categories, ps))

In [None]:
# The image id is the file name with its last four characters (the extension) removed.
test_df["id"] = [os.path.basename(image_path)[:-4] for image_path in test_df["path"]]
test_df["landmarks"] = landmarks

# The submission file contains every column except the local file path.
submission = test_df.drop(columns="path")
submission.to_csv("/kaggle/working/submission.csv", index=False)

In [None]:
# The first two whitespace-separated tokens of each prediction string are the
# best landmark id and its confidence.
tokens = test_df["landmarks"].str.split()
test_df["landmark"] = tokens.str[0].astype(int)
test_df["landmark_p"] = tokens.str[1].astype(float)
test_df.head()

Histogram of the confidences of the top category. It is interesting to see that the peak is just above 0.01. That is OK considering there are ~2000 categories.

In [None]:
# Distribution of the top-prediction confidence over the test set, in 40 bins.
top_confidences = test_df["landmark_p"]
plt.hist(top_confidences, bins=40)
plt.show()

## Examples of highly confident test images
- The left column is a sample of 4 images from the training set.
- The right column shows up to 4 images from the test set.

In [None]:
def _read_rgb(im_path):
    """Read an image with cv2, resize it to SIZE and convert BGR to RGB."""
    return cv2.cvtColor(cv2.resize(cv2.imread(im_path), SIZE), cv2.COLOR_BGR2RGB)


# Test images whose top prediction has a confidence above 0.02, grouped by predicted landmark.
confident = test_df[test_df["landmark_p"] > 0.02]

for i, (name, grp) in enumerate(confident.groupby("landmark")):
    # Only the first five landmarks are shown.
    if i == 5:
        break
    train_examples = df[df["landmark_id"] == name].sample(4)["path"].values
    test_examples = grp.head(4)["path"].values
    print(name)
    print("="*100)
    plt.figure(figsize=(12, 12))
    # 4x2 grid: odd subplot positions (left) hold training images, even ones (right) test images.
    for row in range(4):
        for col, examples in enumerate((train_examples, test_examples), start=1):
            if row < len(examples):
                plt.subplot(4, 2, 2 * row + col)
                plt.imshow(_read_rgb(examples[row]))
    plt.show()