# Setup

In [None]:
!pip install timm
!pip install pytorch-metric-learning

# Imports

In [None]:
import numpy as np
import pandas as pd
import random
import os

from PIL import Image
from tqdm import tqdm

import timm
from timm.optim import Lookahead
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from pytorch_metric_learning.distances import CosineSimilarity

# Global

In [None]:
# Side length (pixels) of the pre-resized square images in the dataset.
IMG_SIZE = 512

# Root of the Kaggle input dataset and the sub-folder holding the training images.
DATA_FOLDER = '/kaggle/input/creation-dataset-512x512/'
TRAIN_FOLDER = f'{DATA_FOLDER}train_images/'

In [None]:
# Seed every random number generator used by the notebook with the same value
# so that a fresh "Restart & Run All" is repeatable.
_SEED = 0
os.environ['PYTHONHASHSEED'] = str(_SEED)
torch.backends.cudnn.deterministic = True
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    _seed_rng(_SEED)

# Dataset and transformations

In [None]:
import albumentations as A
import albumentations.pytorch as APT
import cv2

In [None]:
# Training-time pipeline: random geometric / photometric augmentations, each
# applied with its own probability, followed by float conversion and
# conversion to a torch tensor.
train_transform = A.Compose([
    A.HorizontalFlip(p=0.5),
    A.ShiftScaleRotate(p=0.25, border_mode=cv2.BORDER_CONSTANT),
    A.OpticalDistortion(p=0.25),
    A.Perspective(p=0.25),
    A.CoarseDropout(p=0.25),
    A.RandomBrightnessContrast(p=0.25),
    A.ToFloat(),
    APT.transforms.ToTensorV2(),
])

# Evaluation-time pipeline: no random augmentation, so the embeddings of the
# base (gallery) and validation images are computed from the images as stored
# and do not depend on a random mirror decision.
val_transform = A.Compose([
    A.ToFloat(),
    APT.transforms.ToTensorV2(),
])

In [None]:
class HotelTrainDataset:
    """Map-style dataset that yields a transformed image and its class code.

    Args:
        data: DataFrame with at least the columns ``image_id`` (file name
            appended to ``data_path``) and ``hotel_id_code`` (target label).
        data_path: string prefix prepended to ``image_id`` to build the path.
        transform: callable invoked as ``transform(image=array)`` that returns
            a mapping with an ``'image'`` entry.

    Attributes:
        fake_load: when True, ``__getitem__`` skips disk access and returns a
            small random image instead (used to cheaply replay a loader).
    """

    def __init__(self, data, data_path, transform):
        self.data = data
        self.data_path = data_path
        self.transform = transform
        self.fake_load = False

    def __len__(self):
        """Number of rows in the backing DataFrame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``{'image': transformed image, 'target': hotel_id_code}`` for row ``idx``."""
        record = self.data.iloc[idx]

        if self.fake_load:
            # Placeholder pixels: only the loader / RNG side effects matter here.
            image = np.random.randint(0, 255, (32, 32, 3)).astype(np.uint8)
        else:
            path = self.data_path + record['image_id']
            with Image.open(path) as img:
                # Force three channels so grayscale / palette / RGBA files do
                # not produce tensors with a different channel layout.
                # np.array copies, so the data outlives the closed file.
                image = np.array(img.convert('RGB'), dtype=np.uint8)

        transformed = self.transform(image=image)
        return {
            'image': transformed['image'],
            'target': record['hotel_id_code'],
        }

# Model

In [None]:
class EmbeddingNet(nn.Module):
    """timm backbone with its classifier removed, followed by an embedding head
    (``post``) and a separate classification head (``classifier``).

    Args:
        n_classes: number of output logits of the classification head.
        embed_size: dimensionality of the embedding returned by ``forward``.
        backbone_name: model name passed to ``timm.create_model``.

    Raises:
        Exception: if the last module of the backbone is not named
            ``classifier``, ``head.fc`` or ``fc``.
    """

    def __init__(self, n_classes, embed_size, backbone_name):
        super().__init__()

        self.embed_size = embed_size
        self.backbone = timm.create_model(backbone_name, pretrained=True)
        in_features = self.backbone.get_classifier().in_features

        # The last registered module is taken to be the classification layer;
        # swap it for a pass-through so the backbone emits pooled features.
        head_name = list(self.backbone.named_modules())[-1][0]
        if head_name not in ('classifier', 'head.fc', 'fc'):
            raise Exception('Unknown classifier layer: ' + head_name)
        *parent_names, leaf_name = head_name.split('.')
        owner = self.backbone
        for part in parent_names:
            owner = getattr(owner, part)
        setattr(owner, leaf_name, nn.Identity())

        hidden_size = self.embed_size * 2
        self.post = nn.Sequential(
            nn.utils.weight_norm(nn.Linear(in_features, hidden_size), dim=None),
            nn.BatchNorm1d(hidden_size),
            nn.Dropout(0.2),
            nn.utils.weight_norm(nn.Linear(hidden_size, self.embed_size)),
        )

        self.classifier = nn.Sequential(
            nn.BatchNorm1d(self.embed_size),
            nn.Dropout(0.2),
            nn.Linear(self.embed_size, n_classes),
        )

    def embed_and_classify(self, x):
        """Return ``(embedding, class logits)`` for a batch of images."""
        embedding = self.forward(x)
        logits = self.classifier(embedding)
        return embedding, logits

    def forward(self, x):
        """Return the embedding for a batch of images."""
        features = self.backbone(x)
        flat = features.view(features.size(0), -1)
        return self.post(flat)

# Model helper functions

In [None]:
def save_checkpoint(model, scheduler, optimizer, epoch, loss=None, score=None):
    """Write the training state to ``classification-model-latest.pt``.

    The file holds the epoch number, the ``state_dict()`` of the model,
    scheduler and optimizer, and the supplied ``loss`` / ``score`` values.
    It is written to the current working directory and overwritten each call.
    """
    stateful_parts = {'model': model, 'scheduler': scheduler, 'optimizer': optimizer}

    checkpoint = {'epoch': epoch}
    for part_name, part in stateful_parts.items():
        checkpoint[part_name] = part.state_dict()
    checkpoint['loss'] = loss
    checkpoint['score'] = score

    torch.save(checkpoint, 'classification-model-latest.pt')

def load_checkpoint(model, scheduler, optimizer=None,
                    path='/kaggle/input/classification-training/classification-model-latest.pt'):
    """Restore training state from a checkpoint file.

    Args:
        model: object whose ``load_state_dict`` receives the ``'model'`` entry.
        scheduler: object whose ``load_state_dict`` receives the ``'scheduler'`` entry.
        optimizer: optional; when given and the checkpoint has an
            ``'optimizer'`` entry, that state is restored as well. Without it
            the optimizer restarts from scratch, as before.
        path: checkpoint location; defaults to the Kaggle input path used so far.

    Returns:
        ``(model, scheduler, epoch)`` where ``epoch`` is the value stored under
        ``'epoch'`` in the checkpoint.
    """
    # Load onto the CPU first so a checkpoint written on a GPU machine can be
    # read anywhere; load_state_dict copies values into the existing objects.
    checkpoint = torch.load(path, map_location='cpu')
    model.load_state_dict(checkpoint['model'])
    scheduler.load_state_dict(checkpoint['scheduler'])
    if optimizer is not None and checkpoint.get('optimizer') is not None:
        optimizer.load_state_dict(checkpoint['optimizer'])
    return model, scheduler, checkpoint['epoch']

In [None]:
def train_epoch(args, model, loader, criterion, optimizer, scheduler, epoch):
    """Run one training epoch and report its mean loss and running accuracy.

    Args:
        args: configuration object; only ``args.epochs`` is read (for the
            progress-bar text).
        model: module exposing ``embed_and_classify(images) -> (embeds, logits)``.
        loader: iterable of batches shaped as ``{'image': tensor, 'target': tensor}``.
        criterion: loss callable invoked as ``criterion(logits, targets)``.
        optimizer: stepped once per batch.
        scheduler: stepped once per batch, right after the optimizer.
        epoch: current epoch number, used only in the progress-bar text.

    Returns:
        ``(mean batch loss, accuracy over all samples seen)``. For an empty
        loader this is ``(nan, 0.0)``.
    """
    # Feed batches to whichever device the model lives on; fall back to CUDA
    # for a parameter-less model.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cuda')

    losses = []
    n_correct = 0
    n_seen = 0
    score = 0.0

    model.train()
    progress = tqdm(loader)
    for sample in progress:
        optimizer.zero_grad()

        images = sample['image'].to(device)
        targets = sample['target'].to(device)

        _, outputs = model.embed_and_classify(images)
        loss = criterion(outputs, targets)

        loss.backward()
        optimizer.step()
        scheduler.step()

        loss_value = loss.item()
        losses.append(loss_value)

        # Running accuracy from counters: the predicted class is the arg-max of
        # the logits, so neither a sigmoid nor a pass over all previous outputs
        # is needed.
        n_correct += (outputs.detach().argmax(dim=1) == targets).sum().item()
        n_seen += targets.size(0)
        score = n_correct / n_seen
        progress.set_description(f'Epoch {epoch}/{args.epochs} - Train loss:{loss_value:0.4f}, score: {score:0.4f}')

    mean_loss = np.mean(losses) if losses else float('nan')
    return mean_loss, score

In [None]:
def get_embeds(loader, model, bar_desc):
    """Compute embeddings for every sample of ``loader`` with ``model`` in eval mode.

    Args:
        loader: iterable of batches shaped as ``{'image': tensor, 'target': tensor}``.
        model: module called as ``model(images)``; it is switched to eval mode.
        bar_desc: text shown on the progress bar.

    Returns:
        ``(targets_all, outputs_all)``: two flat lists, one entry per sample,
        holding the target values and the per-sample output arrays (NumPy).
    """
    # Feed batches to whichever device the model lives on; fall back to CUDA
    # for a parameter-less model.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cuda')

    targets_all = []
    outputs_all = []

    model.eval()
    with torch.no_grad():
        for sample in tqdm(loader, desc=bar_desc):
            images = sample['image'].to(device)
            embeds = model(images)

            # Targets are only collected, never used on the device, so they
            # are converted directly without a device round trip.
            targets_all.extend(sample['target'].cpu().numpy())
            outputs_all.extend(embeds.detach().cpu().numpy())

    return targets_all, outputs_all

In [None]:
def get_distance_matrix(embeds, base_embeds, distance_func, batch_size=1024):
    """Evaluate ``distance_func`` between every query embedding and every base embedding.

    Args:
        embeds: sequence of equally sized query embedding vectors.
        base_embeds: sequence of equally sized base embedding vectors.
        distance_func: callable invoked as ``distance_func(query_batch, base)``
            with float32 tensors; must return a CPU tensor with one row per query.
        batch_size: number of queries handed to ``distance_func`` at once.

    Returns:
        NumPy array with one row per query and one column per base embedding
        (shape ``(0, n_base)`` when ``embeds`` is empty).
    """
    # Stack through NumPy first: building a tensor straight from a list of
    # arrays is very slow.
    base = torch.as_tensor(np.asarray(base_embeds), dtype=torch.float32)
    queries = torch.as_tensor(np.asarray(embeds), dtype=torch.float32)

    if queries.shape[0] == 0:
        return np.empty((0, base.shape[0]), dtype=np.float32)

    # The data is already in memory, so plain slicing replaces a DataLoader
    # with worker processes.
    chunks = []
    for query_batch in tqdm(torch.split(queries, batch_size)):
        chunks.append(distance_func(query_batch, base).numpy())

    return np.concatenate(chunks, axis=0)

In [None]:
def test_closest_match(base_df, base_embeds, valid_targets, valid_embeds, model, distance_func):
    distance_matrix = get_distance_matrix(valid_embeds, base_embeds, distance_func)

    preds = []
    N_val = len(valid_embeds)
    for i in tqdm(range(N_val), total=N_val, desc='Getting closest match'):
        tmp_df = base_df.copy()
        tmp_df['distance'] = distance_matrix[i]
        tmp_df = tmp_df.sort_values(by=['distance', 'hotel_id'], ascending=False).reset_index(drop=True)
        preds.extend([tmp_df['hotel_id_code'].unique()[:5]])

    y = np.repeat([valid_targets], repeats=5, axis=0).T
    preds = np.array(preds)
    acc_top_1 = (preds[:, 0] == valid_targets).mean()
    acc_top_5 = (preds == y).any(axis=1).mean()
    print(f'Accuracy: {acc_top_1:0.4f}, top 5 accuracy: {acc_top_5:0.4f}')
    return preds, distance_matrix


def test(base_loader, valid_loader, model, distance_func):
    """Embed the base and validation sets, then score closest-match retrieval.

    Returns:
        ``(base_embeds, valid_embeds, base_targets, valid_targets, val_preds,
        distance_matrix)``.
    """
    base_targets, base_embeds = get_embeds(base_loader, model, 'Generating embeds for train')
    valid_targets, valid_embeds = get_embeds(valid_loader, model, 'Generating embeds for test')

    # The DataFrame behind the base loader supplies the hotel ids for ranking.
    base_frame = base_loader.dataset.data
    val_preds, distance_matrix = test_closest_match(
        base_frame,
        base_embeds,
        valid_targets,
        valid_embeds,
        model,
        distance_func,
    )

    return (
        base_embeds,
        valid_embeds,
        base_targets,
        valid_targets,
        val_preds,
        distance_matrix,
    )

# Train

In [None]:
def iterate_loader(loader, epochs):
    """Exhaust ``loader`` ``epochs`` times with the dataset in fake-load mode.

    While iterating, ``loader.dataset.fake_load`` is True, so the dataset
    returns placeholder images instead of reading files. The batches are
    discarded. Presumably this replays the loader's shuffling for epochs that
    were already trained before a resume; confirm against the caller.

    The flag is reset to False afterwards, also when iteration raises.
    """
    loader.dataset.fake_load = True
    try:
        for i in range(epochs):
            for _ in tqdm(loader, desc=f'Iterating loader {i+1}/{epochs}'):
                pass
    finally:
        # Never leave the dataset serving placeholder images to real training.
        loader.dataset.fake_load = False

In [None]:
def train_and_validate(args, data_df):
    """Train the classification model, checkpoint it, then evaluate retrieval.

    Args:
        args: configuration object providing ``backbone_name``, ``embed_size``,
            ``n_classes``, ``val_samples``, ``num_workers``, ``batch_size``,
            ``lr``, ``epochs`` and ``continue_from_checkpoint``.
        data_df: DataFrame with ``hotel_id``, ``image_id`` and ``hotel_id_code``
            columns.

    Side effects:
        Prints progress, writes a checkpoint after every finished epoch and
        prints the retrieval accuracy of the final model.
    """
    model_name = f'classification-model-{args.backbone_name}-{args.embed_size}embeds-{args.n_classes}hotels'
    print(model_name)

    # Hold out `val_samples` images per hotel; everything else is train / base.
    val_df   = data_df.groupby('hotel_id').sample(args.val_samples, random_state=0)
    train_df = data_df[~data_df['image_id'].isin(val_df['image_id'])]

    train_dataset = HotelTrainDataset(train_df, TRAIN_FOLDER, train_transform)
    train_loader  = DataLoader(train_dataset, num_workers=args.num_workers, batch_size=args.batch_size, shuffle=True, drop_last=True)
    # Same rows as the training set, but with the evaluation transform and a
    # fixed order: this is the gallery the validation images are matched against.
    base_dataset  = HotelTrainDataset(train_df, TRAIN_FOLDER, val_transform)
    base_loader   = DataLoader(base_dataset, num_workers=args.num_workers, batch_size=args.batch_size, shuffle=False)
    val_dataset   = HotelTrainDataset(val_df, TRAIN_FOLDER, val_transform)
    valid_loader  = DataLoader(val_dataset, num_workers=args.num_workers, batch_size=args.batch_size, shuffle=False)

    print('Base:', len(base_dataset))
    print('Validation:', len(val_dataset))

    model = EmbeddingNet(args.n_classes, args.embed_size, args.backbone_name)
    model = model.to('cuda')

    distance  = CosineSimilarity()
    criterion = nn.CrossEntropyLoss()
    optimizer = Lookahead(torch.optim.AdamW(model.parameters(), lr=args.lr), k=3)
    # The scheduler is stepped once per batch, hence steps_per_epoch.
    scheduler = torch.optim.lr_scheduler.OneCycleLR(
            optimizer,
            max_lr=args.lr,
            epochs=args.epochs,
            steps_per_epoch=len(train_loader),
            div_factor=10,
            final_div_factor=1,
            pct_start=0.1,
            anneal_strategy='cos',
        )

    start_epoch = 1
    if args.continue_from_checkpoint:
        # NOTE: only model and scheduler state are restored by this call; the
        # optimizer starts from a fresh state.
        model, scheduler, last_epoch = load_checkpoint(model, scheduler)
        # Replay the loader for the epochs that were already trained.
        iterate_loader(train_loader, last_epoch)
        start_epoch = start_epoch + last_epoch

    torch.cuda.empty_cache()

    for epoch in range(start_epoch, args.epochs + 1):
        train_loss, train_score = train_epoch(args, model, train_loader, criterion, optimizer, scheduler, epoch)
        # Checkpoint after every epoch so an interrupted run can be resumed,
        # and so nothing undefined is referenced when no epoch is left to run.
        save_checkpoint(model, scheduler, optimizer, epoch, train_loss, train_score)

    test(base_loader, valid_loader, model, distance)

# Prepare data

In [None]:
# Load the image index, keep a fixed 90% sample of the rows and add an integer
# class code per hotel id for the classification loss.
data_df = (
    pd.read_csv(DATA_FOLDER + 'train_df.csv')
    .drop(columns=['Unnamed: 0'])
    .sample(frac=0.9, random_state=0)
    .assign(
        hotel_id_code=lambda frame: frame['hotel_id']
        .astype('category')
        .cat.codes.to_numpy()
        .astype(np.int64)
    )
)
data_df.head()

# Evaluate

In [None]:
%%time 

class args:
    # Data split and loading
    val_samples = 1
    batch_size = 16
    num_workers = 2

    # Model
    backbone_name = 'efficientnet_b0'
    embed_size = 4096
    n_classes = data_df['hotel_id_code'].nunique()

    # Optimisation schedule
    epochs = 10
    lr = 1e-3
    continue_from_checkpoint = False


train_and_validate(args, data_df)