# Herbarium 2022

In [None]:
# PyTorch/Lightning
import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as T
import pytorch_lightning as pl
import torchmetrics.functional as metrics
from torch.utils.data import Dataset, DataLoader

# Pandas/Numpy/etc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Other
import os
import json
import random
from PIL import Image
from pathlib import Path
from tqdm import tqdm

In [None]:
torch.__version__, torchvision.__version__, pl.__version__

In [None]:
!ls ../input/herbarium-2022-fgvc9

In [None]:
# Every competition artefact lives under the same Kaggle input directory
DATA_DIR = '../input/herbarium-2022-fgvc9'

# Metadata files
TRAIN_JSON = f'{DATA_DIR}/train_metadata.json'
TEST_JSON = f'{DATA_DIR}/test_metadata.json'

# Image folders (the trailing slash is kept on purpose)
TRAIN_IMGS = f'{DATA_DIR}/train_images/'
TEST_IMGS = f'{DATA_DIR}/test_images/'

## Utils

In [None]:
def load_annotations(metadata):
    '''
    Flatten the metadata into one row per image.

    Images are matched to their annotation through the shared ``image_id``
    key when every annotation carries one, so the result does not depend on
    the two lists being stored in the same order. When annotations have no
    ``image_id`` the lists are paired by position instead.

    Args:
        metadata (dict): JSON with annotations. Must contain the ``images``,
            ``annotations`` and ``categories`` lists.
    Returns:
        dataframe (DataFrame): Dataframe with annotations, one row per image
            with the columns ``file_name``, ``img_id``, ``category_id``,
            ``family``, ``genus`` and ``species``.
    Raises:
        KeyError: If an image has no annotation with a matching ``image_id``
            or an annotation points to an unknown ``category_id``.
    '''
    columns = ['file_name', 'img_id', 'category_id', 'family', 'genus', 'species']
    images = metadata['images']
    annotations = metadata['annotations']
    categories = {category['category_id']: (category['family'], category['genus'], category['species'])
                  for category in metadata['categories']}

    if annotations and all('image_id' in ann for ann in annotations):
        # Join on image_id: immune to ordering differences between the lists
        category_by_image = {ann['image_id']: ann['category_id'] for ann in annotations}
        category_ids = (category_by_image[img['image_id']] for img in images)
        total = len(images)
    else:
        # No join key available: fall back to positional pairing
        category_ids = (ann['category_id'] for ann in annotations)
        total = min(len(images), len(annotations))

    metadata_list = []
    # total= lets tqdm show a real progress bar instead of a bare counter
    for img, category_id in tqdm(zip(images, category_ids), total=total):
        family, genus, species = categories[category_id]
        metadata_list.append({
            'file_name': img['file_name'],
            'img_id': img['image_id'],
            'category_id': category_id,
            'family': family,
            'genus': genus,
            'species': species
        })
    # Explicit columns keep the schema stable even when there are no rows
    return pd.DataFrame(metadata_list, columns=columns)

# EDA

In [None]:
# JSON is UTF-8 by specification; do not depend on the platform's default encoding
with open(TRAIN_JSON, encoding='utf-8') as f:
    train_metadata = json.load(f)

In [None]:
# Top-level sections of the metadata
print(f'metadata keys: {[*train_metadata]}\n')

# For every section: the fields of its first entry and the number of entries
for key, entries in train_metadata.items():
    fields = [*entries[0]]
    print(f'{key}: \n\t{fields} \n\tcount: {len(entries)}')

In [None]:
# Example annotation
train_metadata['annotations'][0]

In [None]:
# Example category
train_metadata['categories'][0]

In [None]:
# Loading all required train metadata into one dataframe 
train_df = load_annotations(train_metadata)

In [None]:
train_df.sample(5)

### Top K Distribution

In [None]:
columns = ['family', 'genus', 'species']

def plot_top_K_barh(metadata, column, K=10):
    '''
    Draw a horizontal bar chart of the K most frequent values of a column.

    Args:
        metadata (DataFrame): DataFrame with annotations.
        column (str): Column whose values are counted.
        K (int): Number of most frequent values to show.
    Returns:
        ax (Axes): Axes returned by the pandas plot call.
    '''
    top_counts = metadata[column].value_counts().head(K)
    ax = top_counts.plot(title=f'Top {K} {column}', kind='barh')
    # Write the count next to every bar
    for bars in ax.containers:
        ax.bar_label(bars)
    return ax

In [None]:
fig = plt.figure(figsize=(30, 8))
fig.suptitle('Train data', fontsize=22)
# One panel per taxonomic level, side by side; add_subplot makes the new
# panel current, which is where plot_top_K_barh draws
for position, column in enumerate(columns, start=1):
    fig.add_subplot(1, 3, position)
    plot_top_K_barh(train_df, column)
plt.show()

### Saving metadata as CSV

In [None]:
train_df.to_csv('train_metadata.csv', index=False)

## Image Visualization

In [None]:
def plot_images(metadata, img_dir, by=None, name=None):
    '''
    Show a 4x4 grid of randomly sampled images with their taxonomy labels.

    Args:
        metadata (DataFrame): DataFrame with annotations.
        img_dir (str): Path to the image directory.
        by (str): Sample field (or randomly): [family, genus, species, None].
        name (str): Name of the example, in cases of non-random sampling.
    '''
    n_rows, n_cols = 4, 4

    # Expected values: family, genus, species.
    # The filter applies only when both the field and the value are given.
    filtered = bool(by) and name is not None
    if filtered:
        metadata = metadata[metadata[by] == name]

    # Never request more rows than exist: DataFrame.sample raises otherwise
    sample = metadata.sample(min(n_rows * n_cols, len(metadata)))
    filenames = sample['file_name'].to_list()
    family = sample['family'].to_list()
    genus = sample['genus'].to_list()
    species = sample['species'].to_list()

    fig, axes = plt.subplots(n_rows, n_cols, figsize=(12, 16))
    # Describe the selection that was actually applied
    title = f'{by} ({name})' if filtered else "random"
    fig.suptitle(f'Select by {title}\n', fontsize=22)
    for idx, ax in enumerate(axes.flatten()):
        ax.set_axis_off()
        if idx >= len(filenames):
            # Fewer samples than grid cells: leave the remaining cells blank
            continue
        img_path = Path(img_dir).joinpath(filenames[idx])
        # Read the pixels inside the context manager so the file is closed
        with Image.open(img_path) as img_file:
            img = np.array(img_file)
        ax.imshow(img)
        ax.title.set_text(f'family: {family[idx]}\n genus: '
                          f'{genus[idx]}\n species: {species[idx]}')
    plt.tight_layout()
    plt.show()

In [None]:
plot_images(train_df, img_dir=TRAIN_IMGS)

In [None]:
plot_images(train_df, img_dir=TRAIN_IMGS, by='family', name='Asteraceae')

In [None]:
plot_images(train_df, img_dir=TRAIN_IMGS, by='genus', name='Carex')

In [None]:
plot_images(train_df, img_dir=TRAIN_IMGS, by='species', name='californica')

## PyTorch Dataset

In [None]:
class HerbariumDataset(Dataset):
    '''
    Image classification dataset described by a metadata CSV file.

    Args:
        img_dir (str): Path to the image directory.
        metadata_csv (str): Path to a CSV file with at least the
            ``file_name`` and ``category_id`` columns.
        transform (callable): Applied to every loaded PIL image.
    '''
    def __init__(self, img_dir, metadata_csv, transform):
        self.img_dir = img_dir
        self.metadata = pd.read_csv(metadata_csv)
        self.transform = transform

    def __getitem__(self, idx):
        '''
        Args:
            idx (int): Position of the example in the metadata table.
        Returns:
            (img, label): Transformed image and its integer class label.
        Raises:
            IndexError: If ``idx`` is outside the table.
        '''
        # Positional lookup: out-of-range positions raise IndexError, which
        # is what the sequence protocol expects from __getitem__
        filename = self.metadata['file_name'].iloc[idx]
        label = int(self.metadata['category_id'].iloc[idx])

        img_path = Path(self.img_dir).joinpath(filename)
        # convert() loads the pixels, so the file can be closed right away;
        # forcing RGB guarantees three channels whatever the file's mode
        with Image.open(img_path) as img_file:
            img = img_file.convert('RGB')
        img = self.transform(img)

        return img, label

    def __len__(self):
        '''Number of rows in the metadata table.'''
        return len(self.metadata)

In [None]:
# Init all necessary transforms
RESIZE_H = RESIZE_W = 360


def _resize_steps():
    '''Steps shared by both pipelines: tensor conversion, then fixed-size resize.'''
    return [T.ToTensor(), T.Resize((RESIZE_H, RESIZE_W))]


# Training adds random horizontal and vertical flips after the shared steps
train_transform = T.Compose(_resize_steps() + [
    T.RandomHorizontalFlip(p=0.5),
    T.RandomVerticalFlip(p=0.5),
])

# Evaluation uses the shared steps only
test_transform = T.Compose(_resize_steps())

train_dataset = HerbariumDataset(
    TRAIN_IMGS,
    'train_metadata.csv',
    transform=train_transform,
)

In [None]:
# Preview nine randomly chosen training examples after the transforms
fig, axes = plt.subplots(3, 3, figsize=(10, 10))
for ax in axes.flat:
    img, label = random.choice(train_dataset)
    # Move the first tensor dimension last before handing it to imshow
    ax.imshow(img.permute(1, 2, 0))
    ax.set_axis_off()
plt.tight_layout()
plt.show()

# Proof of Concept

Training a custom convolutional neural network on the 10 most frequent *category_id* classes.

### Dataset subsample for 10 classes (by count of examples)

In [None]:
# The 10 category ids with the most examples, most frequent first
class_counts = train_df['category_id'].value_counts()
labels = class_counts.index[:10].to_list()

# Re-encode the original ids as consecutive class indices 0..9
labels_dict = dict(zip(labels, range(len(labels))))

# Keep only rows of the selected classes; reset_index() stores the original
# row labels in an 'index' column
is_selected = train_df['category_id'].isin(labels)
subsample_df = train_df[is_selected].reset_index()

subsample_df['category_id'] = subsample_df['category_id'].map(labels_dict)

In [None]:
# Encoded labels
print(labels_dict)

In [None]:
subsample_df

In [None]:
subsample_df['category_id'].value_counts()

### Train/Test Split

In [None]:
# 80% of the rows go to training; the fixed seed makes the split repeatable
train_subsample_df = subsample_df.sample(frac=0.8, random_state=200)
# Every row that was not drawn above forms the test part
held_out = ~subsample_df.index.isin(train_subsample_df.index)
test_subsample_df = subsample_df[held_out]

In [None]:
# Number of training examples per encoded class
train_class_counts = train_subsample_df['category_id'].value_counts()
ax = train_class_counts.plot(kind='barh')
for bars in ax.containers:
    ax.bar_label(bars)
plt.title('Train dataset')
plt.show()

In [None]:
# Number of test examples per encoded class
test_class_counts = test_subsample_df['category_id'].value_counts()
ax = test_class_counts.plot(kind='barh')
for bars in ax.containers:
    ax.bar_label(bars)
plt.title('Test dataset')
plt.show()

In [None]:
train_subsample_df.to_csv('train_subsample_metadata.csv', index=False)
test_subsample_df.to_csv('test_subsample_metadata.csv', index=False)

### PyTorch Lightning Custom CNN

In [None]:
# Optimisation settings
LR = 0.01
EPOCHS = 60
CLASSES = 10

# os.cpu_count() may return None when the count cannot be determined;
# fall back to 0 workers (loading in the main process) in that case
NUM_WORKERS = os.cpu_count() or 0
AVAIL_GPUS = torch.cuda.device_count()
# Larger batches when at least one GPU is visible
BATCH_SIZE = 32 if AVAIL_GPUS else 16

In [None]:
class LitCNN(pl.LightningModule):
    '''
    Custom CNN with PyTorch Lightning.

    Four convolution blocks followed by three fully connected layers that
    output CLASSES logits. The module also builds its own train and test
    dataloaders from the subsample CSV files.
    '''
    def __init__(self):
        super().__init__()
        self.conv1 = self._conv_module(3, 16)
        self.conv2 = self._conv_module(16, 32)
        self.conv3 = self._conv_module(32, 64)
        self.conv4 = self._conv_module(64, 128)
        self.flatten = nn.Flatten()
        self.drop = nn.Dropout(p=0.2)
        # Derive the flattened feature size from the configured input
        # resolution instead of hard-coding it (360x360 input -> 20x20 maps)
        feat_h = self._feature_map_size(RESIZE_H)
        feat_w = self._feature_map_size(RESIZE_W)
        self.fc1 = nn.Linear(feat_h * feat_w * 128, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, CLASSES)
        self.relu = nn.ReLU()
        self.loss_fn = nn.CrossEntropyLoss()

    @staticmethod
    def _feature_map_size(size, n_blocks=4):
        '''
        Spatial size left after n_blocks blocks built by _conv_module.

        Each block applies an unpadded 3x3 convolution (size - 2) and then a
        2x2 max pooling with stride 2 (floor division by 2).
        '''
        for _ in range(n_blocks):
            size = (size - 2) // 2
        return size

    def _conv_module(self, in_shape, out_shape):
        '''
        Build one block: 3x3 convolution, batch norm, LeakyReLU, 2x2 max pool.

        Args:
            in_shape (int): Number of input channels.
            out_shape (int): Number of output channels.
        '''
        return nn.Sequential(
            nn.Conv2d(in_shape, out_shape, kernel_size=3, stride=1),
            nn.BatchNorm2d(out_shape),
            nn.LeakyReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )

    def forward(self, x):
        '''Return the class logits for a batch of images.'''
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        x = self.flatten(x)
        # Dropout acts on the flattened convolutional features only
        x = self.drop(x)
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.fc3(x)
        return x

    def training_step(self, batch, batch_idx):
        '''Compute and log the cross-entropy loss of one training batch.'''
        x, y = batch
        logits = self(x)
        loss = self.loss_fn(logits, y)
        self.log('loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return loss

    def test_step(self, batch, batch_idx):
        '''Compute and log loss and accuracy of one test batch.'''
        x, y = batch
        logits = self(x)
        loss = self.loss_fn(logits, y)
        accuracy = metrics.accuracy(logits, y)
        metrics_dict = {'loss': loss, 'accuracy': accuracy}
        self.log_dict(metrics_dict, on_epoch=True, prog_bar=True)
        return metrics_dict

    def training_epoch_end(self, outputs):
        '''Log the mean of the per-batch training losses as train_loss.'''
        # stack keeps the already computed tensors together instead of
        # rebuilding a new tensor element by element
        avg_loss = torch.stack([out['loss'] for out in outputs]).mean()
        self.log('train_loss', avg_loss, logger=True, prog_bar=True)

    def test_epoch_end(self, outputs):
        '''Log the unweighted means of the per-batch test loss and accuracy.'''
        avg_loss = torch.stack([out['loss'] for out in outputs]).mean()
        avg_acc = torch.stack([out['accuracy'] for out in outputs]).mean()
        self.log('test_loss', avg_loss, on_epoch=True)
        self.log('test_accuracy', avg_acc, on_epoch=True)

    def configure_optimizers(self):
        '''Adam with learning rate LR, multiplied by 0.1 every 10 scheduler steps.'''
        optimizer = optim.Adam(self.parameters(), lr=LR)
        scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)
        return {'optimizer': optimizer, 'lr_scheduler': scheduler}

    def train_dataloader(self):
        '''Shuffled loader over the training part of the subsample.'''
        train_dataset = HerbariumDataset(TRAIN_IMGS, 'train_subsample_metadata.csv',
                                         transform=train_transform)
        return DataLoader(train_dataset, batch_size=BATCH_SIZE,
                          shuffle=True, num_workers=NUM_WORKERS)

    def test_dataloader(self):
        '''Loader over the held-out part of the subsample.'''
        # The held-out rows were split off the training metadata, so their
        # images are read from the training image folder as well
        test_dataset = HerbariumDataset(TRAIN_IMGS, 'test_subsample_metadata.csv',
                                        transform=test_transform)
        return DataLoader(test_dataset, batch_size=BATCH_SIZE,
                          num_workers=NUM_WORKERS)

In [None]:
model = LitCNN()
model

In [None]:
trainer = pl.Trainer(log_every_n_steps=10, 
                     gpus=AVAIL_GPUS, 
                     max_epochs=EPOCHS)

In [None]:
trainer.fit(model)

In [None]:
trainer.test(model, verbose=False)

# EfficientNet?

# Summary?