# Introduction
An embedding-based PyTorch classifier for identifying individual whales and dolphins.

## Importing libraries

In [None]:
import os
import gc
import copy
import cv2
import random
import time
from datetime import datetime

import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import datasets, models, transforms
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler

from sklearn.model_selection import GroupKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestNeighbors

from PIL import Image

import albumentations as A
from albumentations.pytorch import ToTensorV2

from tqdm import tqdm
from collections import defaultdict
import joblib

import optuna
from optuna.trial import TrialState

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from colorama import Fore, Back, Style
b_ = Fore.BLUE
sr_ = Style.RESET_ALL

%matplotlib inline

## Wandb

In [None]:
!pip install git+https://github.com/rwightman/pytorch-image-models
!pip install --upgrade wandb

In [None]:
import timm
import wandb

# Log in to Weights & Biases with the Kaggle secret named `wandb_api`.
# If anything goes wrong (not running on Kaggle, secret missing, login refused)
# fall back to anonymous mode and tell the user how to configure the secret.
try:
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    api_key = user_secrets.get_secret('wandb_api')
    wandb.login(key=api_key)
    anony = None
except Exception:
    # `Exception` rather than a bare `except:` so that KeyboardInterrupt and
    # SystemExit are not swallowed by this best-effort login.
    anony = 'must'
    print('If you want to use your W&B account, go to Add-ons -> Secrets and provide your W&B access token. Use the Label name as wandb_api. \nGet your W&B access token from here: https://wandb.ai/authorize')

## Constants

In [None]:
# Central hyper-parameter / settings dictionary read throughout the notebook.
# NOTE: fetch_scheduler() reads CONFIG['T_0'] when 'scheduler' is
# 'CosineAnnealingWarmRestarts'; that key is not defined here and must be added
# before switching to that scheduler.
CONFIG = {'seed': 2022,  # passed to set_seed()
          'epochs': 5,  # epochs for the final training run and for every Optuna trial
          'img_size': 256,  # images are resized to img_size x img_size by the albumentations pipelines
          'model_name': 'tf_efficientnet_b0',  # timm backbone name
          'embedding_size': 256,  # output features of the model's final linear layer
          'train_batch_size': 32,
          'valid_batch_size': 64,
          'learning_rate': 1e-4,  # Adam learning rate for the final training run
          'scheduler': 'CosineAnnealingLR',  # selects the branch taken in fetch_scheduler()
          'min_lr': 1e-6,  # eta_min of the cosine schedulers
          'T_max': 500,  # CosineAnnealingLR T_max; the scheduler is stepped once per batch
          'weight_decay': 1e-6,  # Adam weight decay
          'n_fold': 5,  # number of GroupKFold splits
          'neigh': 100,  # n_neighbors for the NearestNeighbors search
          'margin': 0,  # margin of nn.CosineEmbeddingLoss
          'device': torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
         }

# Dataset locations (Kaggle input layout).
DATA_DIR = '../input/happy-whale-and-dolphin'
TRAIN_DIR = '../input/happy-whale-and-dolphin/train_images'
TEST_DIR = '../input/happy-whale-and-dolphin/test_images'

# Exploratory Data Analysis

## Loading Metadata

In [None]:
df_train = pd.read_csv(f'{DATA_DIR}/train.csv')
df_train['path'] = f'{DATA_DIR}/train_images/' + df_train['image']

df_train.head()

In [None]:
df_test = pd.read_csv('../input/happy-whale-and-dolphin/sample_submission.csv')
df_test['path'] = '../input/happy-whale-and-dolphin/test_images/' + df_test['image']

df_test.head()

## Data Analysis

In [None]:
print('Train samples count: ', len(df_train.index))
df_train.columns

In [None]:
print('Species Count: ',len(df_train['species'].value_counts().index))
df_train['species'].value_counts()

### Fixing duplicate labels
`beluga` -> `beluga_whale`  
`kiler_whale` -> `killer_whale`  
`bottlenose_dolpin` -> `bottlenose_dolphin`  
`globis` -> `globis_whale`

In [None]:
print('Before fixing duplicate labels : ')
print('Number of unique species : ', df_train['species'].nunique())

# Assign the result back instead of calling `.replace(..., inplace=True)` on the
# selected column: the chained in-place form operates on a temporary object and
# does not update `df_train` when pandas Copy-on-Write is active.
df_train['species'] = df_train['species'].replace({
    'bottlenose_dolpin': 'bottlenose_dolphin',
    'kiler_whale': 'killer_whale',
    'beluga': 'beluga_whale',
    'globis': 'globis_whale',
})

print('\nAfter fixing duplicate labels : ')
print('Number of unique species : ', df_train['species'].nunique())


# Coarse class = last underscore-separated token of the species name
# (e.g. 'killer_whale' -> 'whale', 'bottlenose_dolphin' -> 'dolphin').
df_train['class'] = df_train['species'].apply(lambda x: x.split('_')[-1])
df_train.head()

In [None]:
df_train.isna().sum()

In [None]:
len(os.listdir('../input/happy-whale-and-dolphin/train_images'))

## Visualization of data

In [None]:
plt.figure(figsize = (15,12))
for idx,i in enumerate(df_train.species.unique()):
    plt.subplot(4,7,idx + 1)
    df = df_train[df_train['species'] == i].reset_index(drop=True)
        
    image_path = df.loc[random.randint(0, len(df) - 1),'path']
    img = Image.open(image_path)
    img = img.resize((224,224))
    plt.imshow(img)
    plt.axis('off')
    plt.title(i)

plt.tight_layout()
plt.show()

In [None]:
def plot_species(df, species_name):
    """Display eight randomly drawn images of one species.

    Parameters
    ----------
    df : DataFrame with at least the columns 'species' and 'path'.
    species_name : value of the 'species' column to show.

    The paths are drawn with ``np.random.choice`` (sampling with replacement),
    so the same image can appear more than once.
    """
    plt.figure(figsize=(12, 12))
    matching_rows = df[df['species'] == species_name].reset_index(drop=True)
    plt.suptitle(species_name)

    sampled_paths = np.random.choice(matching_rows['path'], 8)
    for slot, image_path in enumerate(sampled_paths, start=1):
        # Images fill the first eight cells of an 8x8 subplot grid.
        plt.subplot(8, 8, slot)
        thumbnail = Image.open(image_path).resize((224, 224))
        plt.imshow(thumbnail)
        plt.axis('off')

    plt.tight_layout()
    plt.show()

In [None]:
for species in df_train['species'].unique():
    plot_species(df_train, species)

In [None]:
def plot_individual(df, individual_id):
    """Display eight randomly drawn images of one individual animal.

    Parameters
    ----------
    df : DataFrame with at least the columns 'individual_id' and 'path'.
    individual_id : value of the 'individual_id' column to show.

    The paths are drawn with ``np.random.choice`` (sampling with replacement),
    so individuals with fewer than eight images are shown with repeats.
    """
    plt.figure(figsize=(12, 12))
    individual_rows = df[df['individual_id'] == individual_id].reset_index(drop=True)
    plt.suptitle(individual_id)

    sampled_paths = np.random.choice(individual_rows['path'], 8)
    for slot, image_path in enumerate(sampled_paths, start=1):
        # Images fill the first eight cells of an 8x8 subplot grid.
        plt.subplot(8, 8, slot)
        thumbnail = Image.open(image_path).resize((224, 224))
        plt.imshow(thumbnail)
        plt.axis('off')

    plt.tight_layout()
    plt.show()

In [None]:
top_5_ids = df_train.individual_id.value_counts().head(5)
for i in top_5_ids.index:
    plot_individual(df_train , i)

# Class Distribution Analysis

## Bar chart of whales/dolphins

In [None]:
plot = sns.countplot(x = df_train['class'], color = '#2596be')
sns.despine()
plot.set_title('Class Distribution\n', font = 'serif', x = 0.1, y=1, fontsize = 16)
plot.set_ylabel('Count', x = 0.02, font = 'serif', fontsize = 12)
plot.set_xlabel('Species', fontsize = 12, font = 'serif')

for p in plot.patches:
    plot.annotate(format(p.get_height(), '.0f'), (p.get_x() + p.get_width() / 2, p.get_height()), 
       ha = 'center', va = 'center', xytext = (0, -20),font = 'serif', textcoords = 'offset points', size = 15)

## Pie chart of whales/dolphins

In [None]:
plt.figure(figsize=(5,5))
class_cnt = df_train.groupby(['class']).size().reset_index(name = 'counts')
colors = sns.color_palette('Paired')[0:9]
plt.pie(class_cnt['counts'], labels=class_cnt['class'], colors=colors, autopct='%1.1f%%')
plt.legend(loc='upper left')
plt.show()

## Bar chart by species

In [None]:
plt.figure(figsize=(8,8))
sns.countplot(data=df_train, y = 'species',  palette='crest', dodge=False)
plt.show()

## Bar charts for most frequent whales and dolphins

In [None]:
fig,ax = plt.subplots(1,2,figsize=(10,5))

whales = df_train[df_train['class'] == 'whale']
dolphins = df_train[df_train['class'] != 'whale']

sns.countplot(y='species', data=whales, order=whales.iloc[0:]['species'].value_counts().index, ax=ax[0], color='#0077b6')
ax[0].set_title('Most frequent whales')
ax[0].set_ylabel(None)
    
sns.countplot(y='species', data=dolphins,order=dolphins.iloc[0:]['species'].value_counts().index, ax=ax[1], color='#90e0ef')
ax[1].set_title('Most frequent dolphins')
ax[1].set_ylabel(None)

plt.tight_layout()
plt.show()

# Data normalization

In [None]:
# %%time
# transform = transforms.Compose([transforms.Resize(255), transforms.CenterCrop(224), transforms.ToTensor()])
# dataset = datasets.ImageFolder(DATA_DIR, transform=transform)

# kwargs = {'num_workers': 10, 'pin_memory': True, 'persistent_workers': True} if CONFIG['device'] == 'cuda' else {}
# dataloader = torch.utils.data.DataLoader(dataset, batch_size=256, shuffle=True, **kwargs)

In [None]:
# %%time
# images, labels = next(iter(dataloader))
# plt.imshow(images[0].permute(1, 2, 0))

In [None]:
def get_mean_and_std(dataloader, device):
    """Compute the per-channel mean and standard deviation of an image dataset.

    Parameters
    ----------
    dataloader : iterable of batches whose first element is an image tensor
        indexed as (batch, channel, height, width).
    device : device the batches are moved to before reducing.

    Returns
    -------
    (mean, std) : two tensors with one entry per channel.

    Raises
    ------
    ValueError
        If the dataloader yields no images.
    """
    channels_sum, channels_squared_sum = 0, 0
    num_images = 0

    for data in tqdm(dataloader):
        images = data[0].to(device, dtype=torch.float)
        batch_images = images.size(0)

        # Mean over batch, height and width, but not over the channels.
        # Each batch is weighted by its image count so that a smaller final
        # batch does not get the same weight as a full one.
        channels_sum += torch.mean(images, dim=[0, 2, 3]) * batch_images
        channels_squared_sum += torch.mean(images ** 2, dim=[0, 2, 3]) * batch_images
        num_images += batch_images

    if num_images == 0:
        raise ValueError('get_mean_and_std received an empty dataloader')

    mean = channels_sum / num_images

    # std = sqrt(E[X^2] - (E[X])^2); clamp so rounding error cannot produce a
    # slightly negative variance (and hence NaN).
    variance = channels_squared_sum / num_images - mean ** 2
    std = torch.clamp(variance, min=0) ** 0.5

    return mean, std

In [None]:
# %%time
# # ~2h 30min

# print(datetime.now())
# mean, std = get_mean_and_std(dataloader, CONFIG['device'])

# print('Mean:', mean)
# print('Standard deviation:', std)

Mean: `tensor([0.4286, 0.4748, 0.5269])`  
Standard deviation: `tensor([0.2193, 0.2160, 0.2262])`

In [None]:
mean = torch.tensor([0.4286, 0.4748, 0.5269])
std = torch.tensor([0.2193, 0.2160, 0.2262])

In [None]:
%%time
# Preview pipeline: resize, centre-crop, convert to tensor and normalise with
# the dataset statistics computed above.
transform = transforms.Compose([transforms.Resize(CONFIG['img_size']), transforms.CenterCrop(224), transforms.ToTensor(), transforms.Normalize(mean=mean, std=std)])

# CONFIG['device'] is a torch.device, so compare its `.type`; comparing the
# device object with the string 'cuda' is never true.
kwargs = {'num_workers': 10, 'pin_memory': True, 'persistent_workers': True} if CONFIG['device'].type == 'cuda' else {}
dataset = datasets.ImageFolder(DATA_DIR, transform=transform)
# Pass the worker settings on to the loader (previously they were built but unused).
dataloader = torch.utils.data.DataLoader(dataset, batch_size=256, shuffle=True, **kwargs)

In [None]:
images, labels = next(iter(dataloader))
plt.imshow(images[0].permute(1, 2, 0))

# Training preparation

In [None]:
def set_seed(seed=42):
    """Seed every random number generator the notebook uses, for reproducibility.

    Seeds Python's ``random`` module, NumPy, and PyTorch (CPU and all CUDA
    devices), and switches cuDNN to deterministic mode.

    Parameters
    ----------
    seed : int
        Seed value applied to every generator.
    """
    # The notebook draws with `random.randint`, so the stdlib generator must be
    # seeded as well.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Set a fixed value for the hash seed. This only influences Python
    # processes started afterwards; the running interpreter has already
    # fixed its hash seed.
    os.environ['PYTHONHASHSEED'] = str(seed)
    
set_seed(CONFIG['seed'])

In [None]:
gkf = GroupKFold(n_splits=CONFIG['n_fold'])

for fold, ( _, val_) in enumerate(gkf.split(X=df_train, y=df_train.individual_id, groups=df_train.individual_id)):
      df_train.loc[val_ , 'kfold'] = fold

In [None]:
df_train.kfold.value_counts()

# Dataset class

In [None]:
class HappyWhaleDataset(Dataset):
    """Image-pair dataset for training with a cosine embedding loss.

    There is one item per distinct ``individual_id``. Each item pairs a random
    image of that individual with one image drawn uniformly from the whole
    frame, so a matching (+1) pair only occurs when that second draw happens to
    hit the same individual.

    Parameters
    ----------
    df : DataFrame with the columns 'individual_id' and 'path'.
    transforms : optional callable invoked as ``transforms(image=...)`` whose
        result exposes the transformed image under the key 'image'.
    """

    def __init__(self, df, transforms=None):
        self.df = df
        # Maps individual_id -> index labels of that individual's rows.
        self.groups = df.groupby('individual_id').groups
        self.keys = list(self.groups.keys())
        self.transforms = transforms

    def __len__(self):
        return len(self.groups)

    @staticmethod
    def _load_rgb(image_path):
        """Read an image from disk and return it as an RGB array."""
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread signals failure by returning None; fail with a clear
            # message instead of an opaque cvtColor error.
            raise FileNotFoundError(f'Could not read image: {image_path}')
        return cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    def __getitem__(self, index):
        # First image: a random picture of the individual at `index`.
        # `groups` holds index *labels*, so rows are selected with `.loc`;
        # `.iloc` is only equivalent when the frame has a default RangeIndex.
        individual_id_1 = self.keys[index]
        row_labels_1 = self.groups[individual_id_1]
        image_path_1 = self.df.loc[row_labels_1].sample(n=1)['path'].values[0]
        image_1 = self._load_rgb(image_path_1)

        # Second image: a random picture from the whole frame.
        row_2 = self.df.sample(n=1).iloc[0]
        image_path_2 = row_2['path']
        individual_id_2 = row_2['individual_id']
        image_2 = self._load_rgb(image_path_2)

        # 1 if individual ids match, -1 otherwise
        target = 1 if individual_id_1 == individual_id_2 else -1

        # transform the images if transformations were specified
        if self.transforms:
            image_1 = self.transforms(image=image_1)['image']
            image_2 = self.transforms(image=image_2)['image']

        return {
            'image1': image_1,
            'image2': image_2,
            'target': torch.tensor(target, dtype=torch.int)
        }

In [None]:
# Hand albumentations plain Python floats rather than torch tensors: its
# Normalize transform is documented to take floats / sequences of floats.
_norm_mean = tuple(float(v) for v in mean)
_norm_std = tuple(float(v) for v in std)


def _normalize():
    """Build the Normalize step shared by the train and valid pipelines."""
    return A.Normalize(
        mean=_norm_mean,
        std=_norm_std,
        max_pixel_value=255.0,
        p=1.0
    )


# Albumentations pipelines keyed by split. 'train' adds random horizontal and
# vertical flips; 'valid' is deterministic.
data_transforms = {
    'train': A.Compose([
        A.Resize(CONFIG['img_size'], CONFIG['img_size']),
        A.HorizontalFlip(p=0.5),
        A.VerticalFlip(p=0.5),
        _normalize(),
        ToTensorV2()], p=1.),

    'valid': A.Compose([
        A.Resize(CONFIG['img_size'], CONFIG['img_size']),
        _normalize(),
        ToTensorV2()], p=1.)
}

In [None]:
def prepare_loaders(df, fold):
    """Build the pair-training and pair-validation loaders for one fold.

    Rows whose ``kfold`` equals ``fold`` form the validation split; all other
    rows form the training split. Both splits get a fresh 0..n-1 index.

    Returns
    -------
    (train_loader, valid_loader)
    """
    is_validation = df.kfold == fold
    train_frame = df[~is_validation].reset_index(drop=True)
    valid_frame = df[is_validation].reset_index(drop=True)

    train_loader = DataLoader(
        HappyWhaleDataset(train_frame, transforms=data_transforms['train']),
        batch_size=CONFIG['train_batch_size'],
        num_workers=2,
        shuffle=True,
        pin_memory=True,
        drop_last=True,
    )
    valid_loader = DataLoader(
        HappyWhaleDataset(valid_frame, transforms=data_transforms['valid']),
        batch_size=CONFIG['valid_batch_size'],
        num_workers=2,
        shuffle=False,
        pin_memory=True,
    )

    return train_loader, valid_loader

In [None]:
# Prepare dataloaders
train_loader, valid_loader = prepare_loaders(df_train, fold=0)

# Loss

In [None]:
def criterion(outputs1, outputs2, targets):
    """Cosine embedding loss between two embedding batches.

    ``targets`` holds 1 for pairs that should be similar and -1 for pairs that
    should be dissimilar; the margin is read from ``CONFIG['margin']``.
    """
    loss_fn = nn.CosineEmbeddingLoss(margin=CONFIG['margin'])
    return loss_fn(outputs1, outputs2, targets)

# Create model

In [None]:
class HappyWhaleModel(nn.Module):
    """timm backbone followed by dropout and a linear embedding head.

    The head is an ``nn.LazyLinear``, so its input width is only fixed on the
    first forward pass; run one batch through the model before using its
    parameters.

    Parameters
    ----------
    model_name : timm model identifier for the backbone.
    pretrained : whether timm should load pretrained backbone weights.
    """

    def __init__(self, model_name, pretrained=True):
        super().__init__()
        # num_classes=0 makes timm return pooled features instead of logits.
        self.model = timm.create_model(model_name, pretrained=pretrained, num_classes=0)
        self.fc = nn.LazyLinear(CONFIG['embedding_size'])
        self.dropout = nn.Dropout(p=0.3)

    def forward(self, images):
        """Return one embedding of size CONFIG['embedding_size'] per image."""
        pooled = self.model(images)
        return self.fc(self.dropout(pooled))
    
model = HappyWhaleModel(CONFIG['model_name'])
model.to(CONFIG['device'])

In [None]:
# dummy run to initialize the layers 
img = torch.randn(1, 3, CONFIG['img_size'], CONFIG['img_size']).to(CONFIG['device'])
model(img)

# Find best hyperparameters with optuna

In [None]:
def objective(trial):
    """Optuna objective: train briefly and return the validation loss.

    Each trial picks an optimizer (Adam or SGD) and a learning rate, trains a
    fresh model for ``CONFIG['epochs']`` epochs on a capped number of batches
    from the global ``train_loader``, and evaluates on a capped number of
    batches from the global ``valid_loader``. The per-epoch validation loss is
    logged to W&B and reported to Optuna for pruning.

    Returns
    -------
    float
        Validation cosine embedding loss of the last epoch.

    Raises
    ------
    optuna.exceptions.TrialPruned
        When Optuna decides the trial should stop early.
    """
    # Caps on how many examples each epoch may consume.
    N_TRAIN_EXAMPLES = CONFIG['train_batch_size'] * 12
    N_VALID_EXAMPLES = CONFIG['valid_batch_size'] * 4
    device = CONFIG['device']

    # Generate the model.
    model = HappyWhaleModel(CONFIG['model_name']).to(device)

    # The head is an nn.LazyLinear: materialize its parameters with a dry run
    # before handing model.parameters() to an optimizer.
    model.eval()
    with torch.no_grad():
        model(torch.randn(1, 3, CONFIG['img_size'], CONFIG['img_size'], device=device))

    # Generate the optimizers.
    optimizer_number = trial.suggest_categorical('optimizer number (0: Adam; 1: SGD)', [0, 1])
    lr = trial.suggest_float('learning rate', 1e-2, 1e-1, log=True)

    optimizer_map = {
        0: 'Adam',
        1: 'SGD',
    }

    optimizer = getattr(optim, optimizer_map[optimizer_number])(model.parameters(), lr=lr)

    config = {
        'optimizer (0: Adam; 1: SGD)': optimizer_number,
        'learning rate': lr,
    }

    # NOTE(review): the `anony` value computed in the login cell is not used
    # here; trial runs are always logged with anonymous='must'.
    run = wandb.init(project='HappyWhale',
                     name=f'trial_{trial.number + 1}',
                     group='optuna research',
                     config=config,
                     anonymous='must')

    # try/finally so the W&B run is closed even when the trial is pruned or fails.
    try:
        # Training of the model.
        for epoch in range(CONFIG['epochs']):
            model.train()

            dataset_size = 0
            running_loss = 0.0

            for batch_idx, data in enumerate(train_loader):
                # Limiting training data for faster epochs. `>=` stops after
                # exactly N_TRAIN_EXAMPLES examples (`>` let one extra batch in).
                if batch_idx * CONFIG['train_batch_size'] >= N_TRAIN_EXAMPLES:
                    break

                images1 = data['image1'].to(device, dtype=torch.float)
                images2 = data['image2'].to(device, dtype=torch.float)
                targets = data['target'].to(device, dtype=torch.int)

                outputs1 = model(images1)
                outputs2 = model(images2)
                loss = criterion(outputs1, outputs2, targets)
                loss.backward()
                optimizer.step()

                # zero the parameter gradients
                optimizer.zero_grad()

            # Validation of the model.
            model.eval()
            with torch.no_grad():
                for batch_idx, data in enumerate(valid_loader):
                    # Limiting validation data.
                    if batch_idx * CONFIG['valid_batch_size'] >= N_VALID_EXAMPLES:
                        break

                    images1 = data['image1'].to(device, dtype=torch.float)
                    images2 = data['image2'].to(device, dtype=torch.float)
                    targets = data['target'].to(device, dtype=torch.int)

                    batch_size = images1.size(0)

                    outputs1 = model(images1)
                    outputs2 = model(images2)
                    loss = criterion(outputs1, outputs2, targets)

                    running_loss += (loss.item() * batch_size)
                    dataset_size += batch_size

            epoch_loss = running_loss / dataset_size
            run.log({'Cosine Embedding Loss': epoch_loss})

            trial.report(epoch_loss, epoch)

            # handle pruning based on the intermediate value.
            if trial.should_prune():
                raise optuna.exceptions.TrialPruned()
    finally:
        run.finish()

    return epoch_loss

In [None]:
# Run the hyper-parameter search: at most 20 trials or one hour, whichever
# comes first. `objective` returns a loss and create_study() is called with
# its default settings.
study = optuna.create_study()
study.optimize(objective, n_trials=20, timeout=3600, show_progress_bar=True)

print('Number of finished trials: {}'.format(len(study.trials)))

print('Best trial:')
trial = study.best_trial

print('  Value: {}'.format(trial.value))

print('  Params: ')
for key, value in trial.params.items():
    print('    {}: {}'.format(key, value))

# Create the summary run.
summary = wandb.init(project='HappyWhale',
                     group='optuna summary',
                     name='summary')

# Getting the study trials.
trials = study.trials

# WandB summary: one W&B step per Optuna trial.
# Note that `trial` is rebound by this loop, so afterwards it refers to the
# last trial of the study rather than to the best one.
for step, trial in enumerate(trials):
    # Logging the loss (commit=False keeps the step open for the parameters).
    summary.log({'Cosine Embedding Loss': trial.value}, step=step, commit=False)

    # Logging the parameters.        
    summary.log(trial.params, commit=True)
    
summary.finish()

# Training function

In [None]:
def train_one_epoch(model, optimizer, scheduler, dataloader, device, epoch):
    """Train ``model`` for one pass over ``dataloader``.

    Each batch supplies 'image1', 'image2' and 'target'; both images go through
    the same model and the resulting embeddings are scored with ``criterion``.
    The scheduler, when given, is stepped after every batch.

    Returns
    -------
    float
        Sample-weighted mean training loss over the epoch.
    """
    model.train()

    samples_seen = 0
    loss_sum = 0.0

    progress = tqdm(enumerate(dataloader), total=len(dataloader))
    for _, batch in progress:
        first_images = batch['image1'].to(device, dtype=torch.float)
        second_images = batch['image2'].to(device, dtype=torch.float)
        pair_targets = batch['target'].to(device, dtype=torch.int)

        n_in_batch = first_images.size(0)

        first_embeddings = model(first_images)
        second_embeddings = model(second_images)

        loss = criterion(first_embeddings, second_embeddings, pair_targets)
        loss.backward()
        optimizer.step()

        # Clear the gradients so the next batch starts from zero.
        optimizer.zero_grad()

        if scheduler is not None:
            scheduler.step()

        loss_sum += loss.item() * n_in_batch
        samples_seen += n_in_batch

        # Running average shown on the progress bar; its final value is returned.
        epoch_loss = loss_sum / samples_seen

        progress.set_postfix(Epoch=epoch, Train_Loss=epoch_loss,
                             LR=optimizer.param_groups[0]['lr'])

    gc.collect()

    return epoch_loss

# Validation function

In [None]:
@torch.inference_mode()
def valid_one_epoch(model, dataloader, device, epoch, optimizer=None):
    """Evaluate ``model`` on ``dataloader`` without computing gradients.

    Parameters
    ----------
    model : network producing embeddings.
    dataloader : yields dicts with the keys 'image1', 'image2' and 'target'.
    device : device the batches are moved to.
    epoch : epoch number, shown on the progress bar only.
    optimizer : optional; when given, its current learning rate is shown on the
        progress bar. The function previously read a global named `optimizer`,
        which raised NameError whenever no such global existed.

    Returns
    -------
    float
        Sample-weighted mean validation loss.

    Raises
    ------
    ValueError
        If the dataloader yields no samples.
    """
    model.eval()

    dataset_size = 0
    running_loss = 0.0

    bar = tqdm(enumerate(dataloader), total=len(dataloader))
    for step, data in bar:
        images1 = data['image1'].to(device, dtype=torch.float)
        images2 = data['image2'].to(device, dtype=torch.float)
        targets = data['target'].to(device, dtype=torch.int)

        batch_size = images1.size(0)

        outputs1 = model(images1)
        outputs2 = model(images2)
        loss = criterion(outputs1, outputs2, targets)

        running_loss += (loss.item() * batch_size)
        dataset_size += batch_size

        epoch_loss = running_loss / dataset_size

        postfix = {'Epoch': epoch, 'Valid_Loss': epoch_loss}
        if optimizer is not None:
            postfix['LR'] = optimizer.param_groups[0]['lr']
        bar.set_postfix(**postfix)

    gc.collect()

    if dataset_size == 0:
        raise ValueError('valid_one_epoch received an empty dataloader')

    return epoch_loss

# Run training

In [None]:
def run_training(model, optimizer, scheduler, device, num_epochs):
    """Train for ``num_epochs`` epochs and keep the best-validation weights.

    Uses the notebook-level ``train_loader`` and ``valid_loader``. After every
    epoch the losses are logged to the active W&B run; whenever the validation
    loss does not get worse, the weights are remembered and also written to a
    ``Loss<loss>_epoch<epoch>.bin`` file in the working directory.

    Parameters
    ----------
    model, optimizer, scheduler : objects to train with (scheduler may be None).
    device : device batches are moved to.
    num_epochs : number of epochs to run.

    Returns
    -------
    (model, history)
        The model with the best weights loaded, and a dict with the lists
        'Train Loss' and 'Valid Loss' (one entry per epoch).
    """
    # To automatically log gradients
    wandb.watch(model, log_freq=100)

    if torch.cuda.is_available():
        print('[INFO] Using GPU: {}\n'.format(torch.cuda.get_device_name()))

    start = time.time()
    best_model_wts = copy.deepcopy(model.state_dict())
    best_epoch_loss = np.inf
    history = defaultdict(list)

    for epoch in range(1, num_epochs + 1):
        gc.collect()
        # Honour the `device` argument instead of re-reading CONFIG['device'].
        train_epoch_loss = train_one_epoch(model, optimizer, scheduler,
                                           dataloader=train_loader,
                                           device=device, epoch=epoch)

        val_epoch_loss = valid_one_epoch(model, valid_loader, device=device,
                                         epoch=epoch)

        history['Train Loss'].append(train_epoch_loss)
        history['Valid Loss'].append(val_epoch_loss)

        # Log both metrics in one call so they share the same W&B step.
        wandb.log({'Train Loss': train_epoch_loss,
                   'Valid Loss': val_epoch_loss})

        # deep copy the model
        if val_epoch_loss <= best_epoch_loss:
            print(f'{b_}Validation Loss Improved ({best_epoch_loss} ---> {val_epoch_loss})')
            best_epoch_loss = val_epoch_loss
            # Use the active run rather than a notebook global named `run`.
            wandb.run.summary['Best Loss'] = best_epoch_loss
            best_model_wts = copy.deepcopy(model.state_dict())
            PATH = 'Loss{:.4f}_epoch{:.0f}.bin'.format(best_epoch_loss, epoch)
            torch.save(model.state_dict(), PATH)
            # Save a model file from the current directory
            print(f'Model Saved{sr_}')

        print()

    end = time.time()
    time_elapsed = end - start
    print('Training complete in {:.0f}h {:.0f}m {:.0f}s'.format(
        time_elapsed // 3600, (time_elapsed % 3600) // 60, (time_elapsed % 3600) % 60))
    print('Best Loss: {:.4f}'.format(best_epoch_loss))

    # load best model weights
    model.load_state_dict(best_model_wts)

    return model, history

In [None]:
def fetch_scheduler(optimizer):
    """Create the learning-rate scheduler named by ``CONFIG['scheduler']``.

    Supported values are 'CosineAnnealingLR' (reads CONFIG['T_max']),
    'CosineAnnealingWarmRestarts' (reads CONFIG['T_0']) and ``None``.

    Returns
    -------
    The scheduler, or ``None`` when no scheduler is configured.

    Raises
    ------
    ValueError
        If the configured name is not supported (previously this surfaced as an
        UnboundLocalError).
    """
    name = CONFIG['scheduler']

    if name is None:
        return None

    if name == 'CosineAnnealingLR':
        return lr_scheduler.CosineAnnealingLR(optimizer, T_max=CONFIG['T_max'],
                                              eta_min=CONFIG['min_lr'])

    if name == 'CosineAnnealingWarmRestarts':
        return lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=CONFIG['T_0'],
                                                        eta_min=CONFIG['min_lr'])

    raise ValueError(f'Unsupported scheduler: {name!r}')

In [None]:
# Define optimizer and scheduler
optimizer = optim.Adam(model.parameters(), lr=CONFIG['learning_rate'], 
                       weight_decay=CONFIG['weight_decay'])
scheduler = fetch_scheduler(optimizer)

In [None]:
run = wandb.init(project='HappyWhale', 
                 config=CONFIG,
                 name='training',
                 group='training')

In [None]:
model, history = run_training(model, optimizer, scheduler, 
                              device=CONFIG['device'],
                              num_epochs=CONFIG['epochs'])

## Saving model

In [None]:
torch.save(model.state_dict(), 'happy-whale-model.pth')

# Save as artifact for version control.
artifact = wandb.Artifact('model', type='model')
artifact.add_file('happy-whale-model.pth')

run.log_artifact(artifact)
run.finish()

## Loading model
Uncomment if you would like to load your model

In [None]:
entity = 'artmalygin'
model_number = 0

In [None]:
# run = wandb.init(project='HappyWhale',
#                  name='loading',
#                  group='loading')

# artifact = run.use_artifact(f'{entity}/HappyWhale/model:v{model_number}', type='model')
# artifact_dir = artifact.download()

# run.finish()

In [None]:
# wandb_model = HappyWhaleModel(CONFIG['model_name']).to(CONFIG['device'])
# wandb_model.load_state_dict(torch.load(os.path.join(artifact_dir, 'happy-whale-model.pth')))
# wandb_model.eval()

# Evaluation

In [None]:
class HappyWhaleEvaluationDataset(Dataset):
    """Single-image dataset used to extract embeddings.

    Items follow the row order of ``df``: item ``i`` is the image stored at the
    'path' of the i-th row.

    Parameters
    ----------
    df : DataFrame with a 'path' column.
    transforms : optional callable invoked as ``transforms(image=...)`` whose
        result exposes the transformed image under the key 'image'.
    """

    def __init__(self, df, transforms=None):
        self.df = df
        self.transforms = transforms

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, index):
        image_path = self.df.iloc[index]['path']

        # OpenCV loads images as BGR; convert to RGB.
        bgr_image = cv2.imread(image_path)
        image = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB)

        # Apply the transformations, if any were specified.
        if self.transforms:
            image = self.transforms(image=image)['image']

        return {'image': image}

In [None]:
train_fold = 0
df_train_eval = df_train[df_train.kfold != train_fold].reset_index(drop=True)
df_valid_eval = df_train[df_train.kfold == train_fold].reset_index(drop=True)

In [None]:
def prepare_loaders_eval(df_train, df_valid, df_test):
    """Build the loaders used to extract embeddings for train, valid and test.

    The embeddings produced from these loaders are matched by position with
    labels taken from the data frames, so every loader must visit every row in
    frame order: no shuffling and no dropped last batch. All three splits use
    the deterministic 'valid' transforms so that an image always maps to the
    same embedding.

    Returns
    -------
    (train_loader, valid_loader, test_loader)
    """
    train_dataset = HappyWhaleEvaluationDataset(df_train, transforms=data_transforms['valid'])
    valid_dataset = HappyWhaleEvaluationDataset(df_valid, transforms=data_transforms['valid'])
    test_dataset = HappyWhaleEvaluationDataset(df_test, transforms=data_transforms['valid'])

    # shuffle=False / drop_last=False: shuffling or dropping samples here would
    # misalign the train embeddings with their `individual_id` labels.
    train_loader = DataLoader(train_dataset, batch_size=CONFIG['train_batch_size'],
                              num_workers=2, shuffle=False, pin_memory=True, drop_last=False)
    valid_loader = DataLoader(valid_dataset, batch_size=CONFIG['valid_batch_size'],
                              num_workers=2, shuffle=False, pin_memory=True)
    test_loader = DataLoader(test_dataset, batch_size=CONFIG['valid_batch_size'],
                             num_workers=2, shuffle=False, pin_memory=True)

    return train_loader, valid_loader, test_loader

In [None]:
train_loader_eval, valid_loader_eval, test_loader_eval = prepare_loaders_eval(df_train_eval, df_valid_eval, df_test)

In [None]:
@torch.inference_mode()
def inference(model, dataloader, device):
    """Run ``model`` over ``dataloader`` and return all embeddings on the CPU.

    Parameters
    ----------
    model : network producing embeddings of width CONFIG['embedding_size'].
    dataloader : yields dicts with an 'image' batch.
    device : device the batches are moved to.

    Returns
    -------
    torch.Tensor
        Row 0 is a random placeholder; rows 1.. are the embeddings in loader
        order. The placeholder is kept because the notebook's call sites strip
        it with ``[1:]``.
    """
    model.eval()
    # Placeholder first row, kept for backward compatibility with the callers.
    chunks = [torch.randn(1, CONFIG['embedding_size'])]

    bar = tqdm(enumerate(dataloader), total=len(dataloader))
    for step, data in bar:
        images = data['image'].to(device, dtype=torch.float)
        chunks.append(model(images).cpu())

    # Concatenate once at the end; concatenating inside the loop re-copies all
    # previously collected embeddings on every batch.
    return torch.cat(chunks, 0)

In [None]:
def predict_grid(train_predictions, valid_predictions, train_labels, valid_labels, new_individual_thres):
    """Score one 'new_individual' threshold on the validation split.

    For every validation embedding the nearest training embeddings (cosine
    metric, CONFIG['neigh'] neighbours) are looked up and the labels of the
    first five form the prediction. 'new_individual' is inserted at the first
    of those five positions whose confidence (1 - cosine distance) is below
    ``new_individual_thres``, pushing the fifth label out. For validation
    samples whose true label does not occur in ``train_labels``, a predicted
    'new_individual' is counted as a hit.

    Parameters
    ----------
    train_predictions, valid_predictions : 2-D arrays of embeddings.
    train_labels : numpy array of labels aligned with ``train_predictions``.
    valid_labels : labels aligned with ``valid_predictions``.
    new_individual_thres : confidence threshold for 'new_individual'.

    Returns
    -------
    The value returned by ``map_per_set`` for the validation split.
    """
    neigh = NearestNeighbors(n_neighbors=CONFIG['neigh'], metric='cosine')
    neigh.fit(train_predictions)

    distances, idxs = neigh.kneighbors(valid_predictions, return_distance=True)
    conf = 1 - distances
    preds = [list(train_labels[neighbour_idx]) for neighbour_idx in idxs]

    print(len(preds))

    # Build the membership set once; testing `label in train_labels` on the
    # array scans every training label for every validation sample.
    known_labels = set(train_labels)

    all_top_5_preds = []
    valid_labels_list = []

    for i, neighbour_labels in enumerate(preds):
        true_label = valid_labels[i]
        valid_labels_list.append(true_label)

        predict_top = neighbour_labels[:5]
        top_5_conf = conf[i][:5]

        # First of the five positions whose confidence is below the threshold.
        insert_at = next(
            (pos for pos, value in enumerate(top_5_conf) if value < new_individual_thres),
            None,
        )
        if insert_at is not None:
            # Insert 'new_individual' there and keep five predictions in total.
            predict_top = predict_top[:insert_at] + ['new_individual'] + predict_top[insert_at:4]

        # A sample whose true label is absent from the training labels is
        # credited when 'new_individual' was predicted.
        if 'new_individual' in predict_top and true_label not in known_labels:
            predict_top = [true_label if x == 'new_individual' else x for x in predict_top]

        all_top_5_preds.append(predict_top)

    score = map_per_set(valid_labels_list, all_top_5_preds)

    return score

In [None]:
train_predictions = np.array(inference(model, train_loader_eval, CONFIG['device']))[1:]
valid_predictions = np.array(inference(model, valid_loader_eval, CONFIG['device']))[1:]

train_labels = np.array(df_train_eval['individual_id'].values)
valid_labels = np.array(df_valid_eval['individual_id'].values)

In [None]:
def map_per_image(label, predictions):
    """Return the reciprocal rank of ``label`` within the first five predictions.

    Parameters
    ----------
    label : string
            The true label of the image
    predictions : list
            Predicted labels, best first; only the first five are considered

    Returns
    -------
    score : double
            1 / rank (rank counted from 1) of the first occurrence of ``label``
            among the first five predictions, or 0.0 when it is not among them
    """
    try:
        position = predictions[:5].index(label)
    except ValueError:
        # The label is not among the first five predictions.
        return 0.0
    return 1 / (position + 1)

def map_per_set(labels, predictions):
    """Average the per-image scores over a set of images.

    Parameters
    ----------
    labels : list
             The true labels, one per image
    predictions : list of list
             For each image, its predicted labels, best first (the first five
             are scored)

    Returns
    -------
    score : double
            Mean of ``map_per_image`` over the paired labels and predictions
    """
    per_image_scores = list(map(map_per_image, labels, predictions))
    return np.mean(per_image_scores)

In [None]:
# Grid search over the 'new_individual' confidence threshold.
iteration = 0
best_score = 0
best_thres = 0

# Build the grid once so the printed total matches the values actually tried
# (the total used to be computed from a different range than the loop).
thresholds = np.arange(0.1, 0.9, 0.1)

for thres in thresholds:
    print('iteration', iteration, 'of', len(thresholds))
    iteration += 1
    score = predict_grid(train_predictions, valid_predictions, train_labels, valid_labels, new_individual_thres=thres)

    if score > best_score:
        best_score = score
        best_thres = thres

    print('thres:', thres, ', score:', score)

print('Best score is:', best_score)
print('Best thres is:', best_thres)

In [None]:
def get_submission(train_data, valid_data, train_labels, neighbors=100, metric='cosine', new_individual_thres=0.6):
    """Write ``happy-whale-submission.csv`` with five predictions per image.

    For every row of ``valid_data`` the nearest rows of ``train_data`` are
    looked up and the labels of the first five form the prediction.
    'new_individual' is inserted at the first of those five positions whose
    confidence (1 - distance) is below ``new_individual_thres``, pushing the
    fifth label out. Image names come from the sample submission file, so
    ``valid_data`` must be in the same row order as that file.

    Parameters
    ----------
    train_data, valid_data : 2-D arrays of embeddings.
    train_labels : numpy array of labels aligned with ``train_data``.
    neighbors : number of neighbours requested from NearestNeighbors.
    metric : distance metric passed to NearestNeighbors.
    new_individual_thres : confidence threshold for 'new_individual'.

    Returns
    -------
    pandas.DataFrame
        The written table with the columns 'image' and 'predictions'
        (the function previously returned None).
    """
    neigh = NearestNeighbors(n_neighbors=neighbors, metric=metric)
    neigh.fit(train_data)

    distances, idxs = neigh.kneighbors(valid_data, return_distance=True)
    conf = 1 - distances

    df = pd.read_csv("../input/happy-whale-and-dolphin/sample_submission.csv")
    image_names = df['image'].values

    # Created before the loop so it also exists when there is nothing to predict.
    predict_top_decoded = {}

    for i, neighbour_idx in enumerate(idxs):
        predict_top = list(train_labels[neighbour_idx][:5])
        top_values = conf[i][:5]

        # First of the five positions whose confidence is below the threshold.
        insert_at = next(
            (pos for pos, value in enumerate(top_values) if value < new_individual_thres),
            None,
        )
        if insert_at is not None:
            # Insert 'new_individual' there and keep five predictions in total.
            predict_top = predict_top[:insert_at] + ['new_individual'] + predict_top[insert_at:4]

        # Predictions are stored as one space-separated string per image.
        predict_top_decoded[image_names[i]] = ' '.join(predict_top)

    predictions = pd.Series(predict_top_decoded).reset_index()
    predictions.columns = ['image', 'predictions']
    predictions.to_csv('happy-whale-submission.csv', index=False)

    return predictions

In [None]:
test_predictions = np.array(inference(model, test_loader_eval, CONFIG['device']))[1:]
all_train_data = np.concatenate((train_predictions, valid_predictions))
all_training_labels = np.concatenate((train_labels, valid_labels))
get_submission(all_train_data, test_predictions, all_training_labels, neighbors=CONFIG['neigh'], metric='cosine', new_individual_thres=best_thres)

In [None]:
# Saving submission to artifacts
run = wandb.init(project='HappyWhale', 
                 config=CONFIG,
                 name='submission',
                 group='submission')

artifact = wandb.Artifact('submission', type='submission')
artifact.add_file('happy-whale-submission.csv')

run.log_artifact(artifact)
run.finish()

# Conclusion
[View the Complete Dashboard Here](https://wandb.ai/artmalygin/HappyWhale)