## Herbarium

This competition presents a few significant challenges:
* Huge number of classes.
* Imbalanced classes.
* A very large dataset that Kaggle kernels process slowly.

This is just a very simple kernel to help get folks started with loading and modifying the dataset.

Good luck!

## Imports

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import json
from pandas.io.json import json_normalize

import wandb

import os

from tensorflow.keras.applications import EfficientNetB0

import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.layers import Input, Dense, Dropout, Conv2D, MaxPooling2D, BatchNormalization, Flatten

import math
import cv2
from skimage.transform import resize

import seaborn as sns

import multiprocessing

from glob import glob

### Weights and Biases for Personal Logging

In [None]:
# Kaggle helper that exposes the secrets attached to this notebook.
from kaggle_secrets import UserSecretsClient

# Read the Weights & Biases credentials from the secret store so they never
# appear in the notebook source. `wandb_user` is fetched but not used by the
# calls below.
user_secrets = UserSecretsClient()
wandb_api, wandb_user = (
    user_secrets.get_secret(secret_name)
    for secret_name in ("wandb_api", "wandb_user")
)

# Authenticate with the API key, then open a run in the 'herbarium' project.
wandb.login(key=wandb_api)
init = wandb.init(project='herbarium')


## GPU or TPU usage?

In [None]:
USE_GPU = True

if USE_GPU:
    # Synchronous data-parallel strategy over the GPUs TensorFlow can see.
    strategy = tf.distribute.MirroredStrategy()
    print(tf.test.gpu_device_name())
else:
    # Use TensorFlow's current (default) strategy so that `strategy` is
    # defined no matter how the flag above is set.
    strategy = tf.distribute.get_strategy()

# Reported once, for whichever strategy was selected.
print(f'Number of devices: {strategy.num_replicas_in_sync}')

In [None]:
# CPU count as reported by `multiprocessing`; reused later as the `workers`
# argument of `model.fit`.
num_cores = multiprocessing.cpu_count()
print("CPU Cores: " + str(num_cores))

## Metadata Read and Dataframe Conversion

See my post on converting JSON to CSV here: https://www.kaggle.com/c/herbarium-2021-fgvc8/discussion/225237

I have added the csv to the kernel for easy loading and minimal processing time.

In [None]:
# Load the training metadata from the pre-converted CSV attached to the kernel.
train_df = pd.read_csv(
    filepath_or_buffer="../input/herbarium-traincsv/herb_train.csv",
)

In [None]:
# Path helpers for locating each image under the Kaggle input directory.
filepath_prefix = "../input/herbarium-2021-fgvc8/train/"
get_base = os.path.basename

# Training-image directory prefix + the file name stored in the metadata.
# NOTE: despite the column name, the prefix is relative to the working directory.
train_df["absolute_path"] = filepath_prefix + train_df["file_name"]

# File name with every directory component stripped.
train_df["filename_nopath"] = train_df["file_name"].map(get_base)

In [None]:
# Preview the first five rows, including the two path columns added above.
train_df.head(n=5)

## EDA - Quick Distribution Visualizations

In [None]:
# Distribution of rows over the `category_id` column.
sns.histplot(data=train_df, x="category_id")

In [None]:
# Distribution of rows over the `institution_id` column.
sns.histplot(data=train_df, x="institution_id")

In [None]:
# Distribution of the `height` column.
sns.histplot(data=train_df, x="height")

In [None]:
# Distribution of the `width` column.
sns.histplot(data=train_df, x="width")

In [None]:
# Distribution of rows over the `family` column.
sns.histplot(data=train_df, x="family")

In [None]:
# Distribution of rows over the `order` column.
sns.histplot(data=train_df, x="order")

## Reproducibility

In [None]:
# Single seed shared by every source of randomness configured in this notebook.
GLOBAL_SEED = 42

# Seed NumPy's global generator and TensorFlow's global generator with it.
np.random.seed(seed=GLOBAL_SEED)
tf.random.set_seed(seed=GLOBAL_SEED)

## Tensorflow / Keras Starter

In [None]:
# Dataset locations inside the Kaggle input directory.
TRAIN_DIR = "../input/herbarium-2021-fgvc8/train"
TEST_DIR = "../input/herbarium-2021-fgvc8/test"

# Training-loop settings.
BATCH_SIZE = 128          # images per batch
STAGES_PER_EPOCH = 256    # passed to `model.fit` as `steps_per_epoch`
EPOCHS = 200

# Every image is resized to a square of this size before reaching the network.
IMG_HEIGHT = IMG_WIDTH = 224
IMG_SIZE = IMG_HEIGHT, IMG_WIDTH

## Splits

#### Train Test Split

In [None]:
# NOTE: these frames deserve a rename -- despite the `X_` prefix each one still
# contains the label column (y), because the whole metadata frame is split.

# 70/30 split, stratified on the `name` column and seeded with GLOBAL_SEED.
X_train, X_val = train_test_split(
    train_df,
    test_size=0.30,
    shuffle=True,
    random_state=GLOBAL_SEED,
    stratify=train_df["name"],
)

In [None]:
# (rows, columns) of the training and validation frames, one per line.
print(X_train.shape, X_val.shape, sep="\n")

In [None]:
# Toggle: work on a random subsample so the kernel runs in reasonable time.
DO_SUBSET = True

if DO_SUBSET:
    n_samples = 20000
    # Seed the draw explicitly so the subsample does not depend on how much of
    # NumPy's global random state earlier cells consumed, and cap the request
    # at the frame size so a small split (or re-running this cell) cannot
    # raise "cannot take a larger sample than population".
    X_train = X_train.sample(min(n_samples, len(X_train)),
                             random_state=GLOBAL_SEED)
    X_val = X_val.sample(min(n_samples, len(X_val)),
                         random_state=GLOBAL_SEED)
    

In [None]:
# Shapes of the training and validation frames at this point, one per line.
print(X_train.shape, X_val.shape, sep="\n")

### Make sure the (subsampled) train and validation sets contain the same classes.
Only class names that appear in both splits are kept; rows whose class is missing from the other split are dropped.

In [None]:
# Distinct class names present in each split.
train_names = set(X_train.name)
validation_names = set(X_val.name)

# Class names that occur in BOTH the training and the validation split.
names_in_both = train_names & validation_names

# Same set under its earlier name, kept for any cell that still refers to it.
intersection_names = names_in_both

# A bare expression in the middle of a cell displays nothing, so print it.
print(f"Classes present in both splits: {len(names_in_both)}")

In [None]:
# Element-wise membership test over every cell of X_train; only the boolean
# `name` column of the result is used afterwards.
X_keep_idx = X_train.isin(names_in_both)

# Number of training rows whose class is shared (True) or train-only (False).
X_keep_idx["name"].value_counts()

In [None]:
# Keep only the training rows whose class also occurs in the validation split.
X_keep = X_train.loc[X_keep_idx["name"]]

# Shapes before and after filtering, one per line.
print(X_train.shape, X_keep.shape, sep="\n")

In [None]:
# Element-wise membership test over every cell of X_val; only the boolean
# `name` column of the result is used afterwards.
X_val_keep_idx = X_val.isin(names_in_both)

# Number of validation rows whose class is shared (True) or validation-only (False).
X_val_keep_idx["name"].value_counts()

In [None]:
# Keep only the validation rows whose class also occurs in the training split.
X_val_keep = X_val.loc[X_val_keep_idx["name"]]

# Shapes before and after filtering, one per line.
print(X_val.shape, X_val_keep.shape, sep="\n")

In [None]:
# Preview the first five rows of the filtered validation frame.
X_val_keep.head(n=5)

In [None]:
# Number of distinct class names left in the filtered training frame; this
# becomes the width of the classifier's output layer.
n_classes = X_keep["name"].nunique()
n_classes

## Image Dataset Generator Approach (Slower - Unused Right Now)

In [None]:
# Multiplier applied to every pixel value by both generators.
rescale_value = 1.0 / 255.0

# Training generator: rescaling plus random flips, rotation and shear.
# (A `preprocessing_function` / `featurewise_center` variant was tried earlier
# and is currently switched off.)
train_data_gen = keras.preprocessing.image.ImageDataGenerator(
    rescale=rescale_value,
    rotation_range=180,
    shear_range=30,
    horizontal_flip=True,
    vertical_flip=True,
)

# Validation generator: rescaling only, no augmentation.
validation_data_gen = keras.preprocessing.image.ImageDataGenerator(
    rescale=rescale_value,
)

In [None]:
# Batches of augmented training images read straight from the paths stored in
# the filtered training frame, with one-hot ("categorical") labels taken from
# the `name` column.
#
# `subset="training"` was dropped: `train_data_gen` is built without a
# `validation_split`, so the argument selected every row anyway and only
# suggested a split that does not exist. The validation iterator does not pass
# it either.
train_data_generator = train_data_gen.flow_from_dataframe(
    dataframe = X_keep,
    directory = None,               # `x_col` already holds usable paths
    x_col = "absolute_path",
    y_col = "name",
    seed = GLOBAL_SEED,
    batch_size = BATCH_SIZE,
    shuffle = True,
    class_mode = "categorical",
    target_size = IMG_SIZE,
    validate_filenames=False        # skip the per-file check on start-up
)

In [None]:
# Batches of (un-augmented) validation images read from the paths stored in
# the filtered validation frame, with one-hot labels from the `name` column.
#
# Shuffling is disabled: this iterator is used for evaluation/prediction, and
# with `shuffle=True` the rows of the prediction output could not be matched
# back to the rows of `X_val_keep`.
valid_data_generator = validation_data_gen.flow_from_dataframe(
    dataframe = X_val_keep,
    directory = None,               # `x_col` already holds usable paths
    x_col = "absolute_path",
    y_col = "name",
    seed = GLOBAL_SEED,
    batch_size = BATCH_SIZE,
    shuffle = False,
    class_mode = "categorical",
    target_size = IMG_SIZE,
    validate_filenames=False        # skip the per-file check on start-up
)

## Faster Loading and Parallel Processing (In Progress)

In [None]:
def do_img_preprocessing_pipeline(image, label):
    """Resize an image to ``IMG_SIZE`` and divide its values by 255.

    Args:
        image: Image tensor accepted by ``tf.image.resize``
            (assumed to hold 0-255 pixel values -- TODO confirm).
        label: Label belonging to ``image``; returned untouched.

    Returns:
        Tuple ``(image, label)`` where ``image`` is a ``tf.float32`` tensor of
        spatial size ``IMG_SIZE``.
    """
    resized = tf.image.resize(image, IMG_SIZE)
    rescaled = tf.cast(resized, tf.float32) / 255.

    # TODO: random flip / rotation augmentation is not implemented yet.
    return rescaled, label

In [None]:
class HerbariumBatchSequence(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that loads and resizes images one batch at a time.

    Labels are integer-encoded with a ``LabelEncoder`` fitted on ``y_set`` and
    then one-hot encoded, so the width of the label matrix equals the number
    of distinct classes in ``y_set``. Each instance fits its own encoder: two
    instances only agree on the class-to-column mapping when they are built
    from the same set of class names.
    """

    def __init__(self, x_set, y_set,
                 batch_size,
                 img_size = (224, 224),
                 augment = False):
        """
        `x_set` is a sequence (list, array, pandas Series) of image paths.
        `y_set` are the associated classes, in the same order as `x_set`.
        `batch_size` is the number of images returned per batch.
        `img_size` is the output shape handed to `skimage.transform.resize`.
        `augment` is stored on the instance but not acted on yet.
        """
        super().__init__()

        self.batch_size = batch_size
        self.img_size = img_size
        self.augment = augment

        # A plain array makes the slicing in __getitem__ purely positional,
        # whatever index a pandas Series argument carried.
        self.x = np.asarray(x_set)

        # The fitted encoder is kept so predicted column indices can be mapped
        # back to class names with `label_encoder.inverse_transform`.
        self.label_encoder = LabelEncoder()
        encoded = self.label_encoder.fit_transform(np.asarray(y_set))
        self.y = tf.keras.utils.to_categorical(
            encoded, num_classes = len(self.label_encoder.classes_))

    def __len__(self):
        """Denotes the number of batches per epoch"""
        return math.ceil(len(self.x) / self.batch_size)

    def __getitem__(self, idx):
        """Generate one batch of data: (images, one-hot labels)."""
        first_id = idx * self.batch_size
        last_id = first_id + self.batch_size

        batch_x = self.x[first_id:last_id]
        batch_y = self.y[first_id:last_id]

        images = np.array([self._load_image(file_name)
                           for file_name in batch_x])
        return images, np.array(batch_y)

    def _load_image(self, file_name):
        """Read one image from disk and resize it to `self.img_size`."""
        # NOTE: OpenCV decodes colour images in BGR channel order; the order is
        # left as-is, so training and validation batches stay consistent.
        image = cv2.imread(str(file_name))
        if image is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise FileNotFoundError(f"Could not read image: {file_name}")
        return resize(image, self.img_size)



In [None]:
# Batch loaders over the class-filtered training and validation frames:
# image paths come from `absolute_path`, labels from `name`.
TrainGenerator = HerbariumBatchSequence(
    x_set=X_keep["absolute_path"],
    y_set=X_keep["name"],
    batch_size=BATCH_SIZE,
)

ValidGenerator = HerbariumBatchSequence(
    x_set=X_val_keep["absolute_path"],
    y_set=X_val_keep["name"],
    batch_size=BATCH_SIZE,
)

## Custom Metric: Macro F1 (in progress)

In [None]:
# Source: https://www.kaggle.com/guglielmocamporese/macro-f1-score-keras

# Use the backend that ships with TensorFlow so it matches the `tf.keras`
# model built in this notebook.
from tensorflow.keras import backend as K

def macro_f1(y_true, y_pred):
    """Macro-averaged F1 score computed from rounded predictions.

    Predictions are rounded to 0/1, true positives, false positives and false
    negatives are summed over the first axis, an F1 value is computed for each
    remaining column, and the mean of those values is returned. Because of the
    rounding this is meant as a metric rather than a differentiable loss.

    Args:
        y_true: Ground-truth indicator tensor
            (assumed one-hot, shape (batch, n_classes) -- TODO confirm).
        y_pred: Predicted scores with the same shape as ``y_true``.

    Returns:
        Scalar float32 tensor holding the mean per-column F1.
    """
    # Cast both operands first so integer labels can be multiplied with the
    # floating-point predictions.
    y_pred = K.cast(K.round(y_pred), 'float32')
    y_true = K.cast(y_true, 'float32')

    tp = K.sum(y_true * y_pred, axis=0)
    fp = K.sum((1 - y_true) * y_pred, axis=0)
    fn = K.sum(y_true * (1 - y_pred), axis=0)

    # epsilon keeps the divisions finite for columns without any positives.
    p = tp / (tp + fp + K.epsilon())
    r = tp / (tp + fn + K.epsilon())

    f1 = 2*p*r / (p+r+K.epsilon())
    # `tf.is_nan` is not part of the TF 2.x API; the function is `tf.math.is_nan`.
    f1 = tf.where(tf.math.is_nan(f1), tf.zeros_like(f1), f1)
    return K.mean(f1)

## EfficientNet CNN



In [None]:
# EfficientNetB0 trained from scratch (no pretrained weights), including its
# own classification head with one output per class in the training frame.
efficientnet = EfficientNetB0(
    weights=None,
    include_top=True,
    classes=n_classes,
    input_shape=IMG_SIZE + (3,),
)

# efficientnet.summary()

In [None]:
# `model` is the name the compile / fit / predict cells below work with.
model = efficientnet

In [None]:
# Adam + categorical cross-entropy, matching the one-hot labels the batch
# generators produce. `metrics` is given as a list, which is the documented
# form; a bare string is rejected by newer Keras releases.
model.compile(optimizer = 'adam',
              loss = 'categorical_crossentropy',
              metrics = ['accuracy'])

In [None]:
#wandb_callback = wandb.keras.WandbCallback(log_weights=True)

# `steps_per_epoch` is capped at the number of batches the training Sequence
# actually holds. With the 20 000-row subsample and a batch size of 128 there
# are fewer batches than STAGES_PER_EPOCH, and asking for more makes Keras run
# out of data and interrupt training.
history = model.fit(TrainGenerator,
                    steps_per_epoch = min(STAGES_PER_EPOCH, len(TrainGenerator)),
                    validation_data = ValidGenerator,
                    workers = num_cores,
                    epochs = 3,
#                    callbacks=[wandb_callback]
                   )

In [None]:
# Predict with the same loader that was used for validation during training.
# `ValidGenerator` decodes and resizes images exactly like the training
# batches and yields them in a fixed order, whereas `valid_data_generator`
# goes through a different loading pipeline and is shuffled.
# One row per validation image, one column per class.
preds = pd.DataFrame(model.predict(ValidGenerator))

In [None]:
# Inspect the predicted class scores for the first 15 validation images.
preds.head(n=15)