In [None]:
# Environment setup (shell commands run from the notebook):
#   1. uninstall the `typing` package without prompting (-y);
#   2. install catalyst from the master branch of its GitHub repository,
#      upgrading any existing install, with quiet output (-q).
# NOTE(review): the install is not pinned to a release, so the installed API
# depends on the state of master at install time - consider pinning a version.
!pip uninstall typing -y
!pip install git+https://github.com/catalyst-team/catalyst@master --upgrade -q

In [None]:
from pathlib import Path

import pandas as pd
import numpy as np

from plotly import graph_objects as go

from sklearn.model_selection import train_test_split

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import albumentations as A

from tqdm.auto import tqdm

from catalyst import dl
from catalyst import utils
from catalyst import data
from catalyst.contrib.nn.criterion import TripletMarginLossWithSampler

import cv2

In [None]:
# Seed the run through catalyst's helper so repeated executions are comparable.
SEED = 42
utils.set_global_seed(SEED)

In [None]:
# Load the competition's training CSV; later cells read its `id` and
# `landmark_id` columns. The last line previews the first rows.
train_csv_path = Path("../input/landmark-recognition-2020/train.csv")
train_df = pd.read_csv(train_csv_path)
train_df.head()

In [None]:
def load_img(id_: str, train=True, preproc=True):
    """Read one image from the Kaggle input directories with OpenCV.

    Args:
        id_: image id. It is the file stem; for the raw datasets its first
            three characters also name the three nested sub-directories.
        train: read from the train data when true, otherwise from the test data.
        preproc: when true *and* ``train`` is true, read from the flat
            224x224 train dataset instead of the raw, nested train directory.
            Ignored when ``train`` is false.

    Returns:
        The value returned by ``cv2.imread`` for ``<resolved path>.jpg``.
        NOTE(review): OpenCV decodes to BGR channel order and returns ``None``
        for unreadable files - confirm downstream code expects both.
    """
    if train and preproc:
        # Flat layout: every image sits directly in one directory.
        img_dir = Path(
            "../input/google-landmark-retrieval-2020-train-224x224/train_img"
        )
    else:
        # Raw layout: <root>/<id[0]>/<id[1]>/<id[2]>/<id>.jpg
        raw_root = Path(
            "../input/landmark-recognition-2020/train"
            if train
            else "../input/landmark-recognition-2020/test"
        )
        img_dir = raw_root / str(id_[0]) / str(id_[1]) / str(id_[2])

    img_path = str(img_dir / str(id_)) + ".jpg"
    return cv2.imread(img_path)

In [None]:
class ImgDataset(Dataset):
    """Dataset yielding transformed images (and labels in train mode).

    Each item is a dict with key ``"features"`` (the transformed image) and,
    when ``train`` is true, ``"targets"`` (the row's ``landmark_id``).

    Args:
        df: frame-like object with an ``id`` column and, in train mode, a
            ``landmark_id`` column.
        transforms: callable invoked as ``transforms(image=img)`` that returns
            a mapping with an ``"image"`` entry. Defaults to a 224x224 resize
            followed by albumentations' ``ToTensor``.
        train: whether labels are available and the train images are read.
    """

    def __init__(self, df, transforms = None, train: bool = True):
        self.id = df.id.values
        # Always define the attribute so test-mode instances fail predictably.
        self.labels = df.landmark_id.values if train else None
        self.train = train
        if transforms is None:
            # Import the submodule explicitly: `import albumentations as A`
            # alone does not guarantee that `A.pytorch` has been loaded.
            import albumentations.pytorch  # noqa: F401

            transforms = A.Compose([
                A.Resize(width=224, height=224),
                A.pytorch.ToTensor()
            ])
        self.transforms = transforms

    def __getitem__(self, idx: int):
        """Load, transform and return the sample at position ``idx``.

        Raises:
            FileNotFoundError: if the image could not be read from disk.
        """
        img = load_img(self.id[idx], train=self.train)
        if img is None:
            # cv2.imread signals an unreadable/missing file by returning None;
            # fail here with the offending id instead of inside the transforms.
            raise FileNotFoundError(
                f"Could not read image for id {self.id[idx]!r} (train={self.train})"
            )
        tensor_img = self.transforms(image=img)["image"]

        output = {"features": tensor_img}
        if self.train:
            output["targets"] = self.labels[idx]
        return output

    def __len__(self):
        # Count ids, not labels: labels do not exist for test-mode datasets.
        return len(self.id)

    def get_labels(self):
        """Return the labels as a NumPy array.

        Raises:
            ValueError: if the dataset was built with ``train=False``.
        """
        if self.labels is None:
            raise ValueError("Labels are only available when train=True")
        return np.array(self.labels)

In [None]:
# Stratified train/validation split on `landmark_id`, with a fixed random state.
train_df_, valid_df_ = train_test_split(
    train_df,
    stratify=train_df["landmark_id"].values,
    random_state=42,
)

In [None]:
# Datasets over the two splits; both use ImgDataset's default transforms.
train_ds = ImgDataset(train_df_)
valid_ds = ImgDataset(valid_df_)

# Balanced sampler built from the training labels (p=10, k=20); it also
# defines the batch size used by the loaders below.
sampler = data.BalanceBatchSampler(labels=train_ds.get_labels(), p=10, k=20)
train_dl = DataLoader(
    train_ds, sampler=sampler, batch_size=sampler.batch_size, num_workers=4
)
# The sampler's indices refer to positions in `train_ds`, so it must not be
# reused here: on the smaller `valid_ds` they would run out of range and would
# not correspond to validation classes. Validation iterates sequentially.
valid_dl = DataLoader(
    valid_ds, batch_size=sampler.batch_size, num_workers=4
)
loaders = {"train": train_dl, "valid": valid_dl}

In [None]:
from torchvision import models

In [None]:
# Backbone: torchvision's pretrained ResNeXt-50 (32x4d) with every parameter frozen.
backbone = models.resnext50_32x4d(pretrained=True)
backbone.requires_grad_(False)

# Trainable head: maps the backbone's 1000-dimensional output to a
# 100-dimensional embedding.
head = nn.Sequential(nn.Linear(1000, 512), nn.ReLU(), nn.Linear(512, 100))

# `model` is backbone followed by head; the optimizer is handed all parameters
# of the combined model, as before.
model = nn.Sequential(backbone, head)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

In [None]:
import wandb

# Explicit login call, left disabled:
#wandb.login("never", "")
# Initialise Weights & Biases for the "landmarks" project.
wandb.init(project="landmarks")

In [None]:
class MetricLearningRunner(dl.SupervisedRunner):
    """Supervised runner whose prediction step returns ``(embeddings, targets)``."""

    @torch.no_grad()
    def predict_batch(self, batch):
        """Embed one batch for inference.

        Args:
            batch: mapping with ``"features"`` (model input) and ``"targets"``.

        Returns:
            Tuple ``(embeddings, targets)``: the model output for the features
            moved to the runner's device, and the targets exactly as they
            arrived in the batch.
        """
        # Runs under no_grad: prediction must not build an autograd graph, and
        # the returned embeddings can then be converted to NumPy directly.
        embeddings = self.model(batch["features"].to(self.device))
        return embeddings, batch["targets"]

In [None]:
# Triplet margin loss (margin 0.5) with in-batch hard-triplet sampling.
sampler_inbatch = data.HardTripletsSampler(norm_required=False)
criterion = TripletMarginLossWithSampler(margin=0.5, sampler_inbatch=sampler_inbatch)

# 4. training with catalyst Runner
callbacks = [
    # The criterion is only evaluated on the "train" loader.
    dl.ControlFlowCallback(dl.CriterionCallback(), loaders="train"),
    #dl.WandbLogger(log_on_batch_end=True, project="landmarks"),
]

runner = MetricLearningRunner(device=utils.get_device())
runner.train(
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    callbacks=callbacks,
    loaders=loaders,
    # The loss exists only for the "train" loader (see the callback above), so
    # track it there explicitly, and minimise it: the previous
    # `minimize_metric=False` treated a higher loss as better.
    valid_loader="train",
    main_metric="loss",
    minimize_metric=True,
    verbose=True,
    num_epochs=200,
    check=True,  # disable if you want to train
)

In [None]:
def save_embeddings(loader, out_file_emb="embeddings.npy", out_file_labels="labels.npy"):
    """Embed every sample of ``loader.dataset`` and save embeddings and labels.

    The dataset is re-wrapped in a plain ``DataLoader`` (batch size 200, two
    workers, no sampler) and passed through the module-level ``runner``, whose
    ``predict_loader`` is expected to yield ``(embeddings, labels)`` per batch.

    Args:
        loader: object exposing a ``dataset`` attribute; only the dataset is used.
        out_file_emb: destination ``.npy`` file for the embeddings.
        out_file_labels: destination ``.npy`` file for the labels, stored in
            the same sample order as the embeddings.

    Raises:
        ValueError: if the dataset yields no batches.
    """
    loader = DataLoader(loader.dataset, batch_size=200, num_workers=2)

    emb_chunks = []
    label_chunks = []
    for c_embeddings, c_labels in tqdm(runner.predict_loader(loader=loader), total=len(loader)):
        # detach() keeps the conversion valid even if the outputs are still
        # attached to the autograd graph.
        emb_chunks.append(c_embeddings.detach().cpu().numpy())
        label_chunks.append(c_labels.detach().cpu().numpy())

    if not emb_chunks:
        raise ValueError("loader.dataset produced no batches; nothing to save")

    # Join once along the sample axis. Growing the arrays inside the loop was
    # quadratic, and vstack on 1-D label batches produced one row per batch
    # (failing outright when the last batch was smaller).
    embeddings = np.concatenate(emb_chunks, axis=0)
    labels = np.concatenate(label_chunks, axis=0)

    np.save(file=out_file_emb, arr=embeddings)
    np.save(file=out_file_labels, arr=labels)

In [None]:
# Write embeddings and labels for the training dataset to the default .npy files.
save_embeddings(loader=loaders["train"])

In [None]:
# Run catalyst-contrib's `create-index-model` command on the saved embeddings.
# Input: embeddings.npy. Outputs named on the command line: embeddings_pca.npy,
# pipeline.pkl and indexes.pkl.
# NOTE(review): presumably --n-hidden is the reduced embedding size - confirm
# against the catalyst-contrib CLI help.
!catalyst-contrib create-index-model \
--in-npy "embeddings.npy" --out-npy "embeddings_pca.npy" \
--out-pipeline "pipeline.pkl" --out-knn "indexes.pkl" --n-hidden "32"