In [None]:
import pandas as pd
import numpy as np
import json
import os
import random
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
#from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras import backend as K
import tensorflow_addons as tfa

EDA notebook: https://www.kaggle.com/code/hanselliott/herbarium22-eda

In [None]:
# Locations of the competition image folders
TEST_DIR = "../input/herbarium-2022-fgvc9/test_images/"
TRAIN_DIR = "../input/herbarium-2022-fgvc9/train_images/"

# Parse the train/test metadata JSON files into plain Python dicts
with open("../input/herbarium-2022-fgvc9/train_metadata.json") as json_file:
    train_meta = json.loads(json_file.read())

with open("../input/herbarium-2022-fgvc9/test_metadata.json") as json_file:
    test_meta = json.loads(json_file.read())

# Data Import Setup

In [None]:
# Create a metadata frame (one row per annotated training image) that is used
# later to locate images on disk.
#
# Annotations are matched to images through their shared `image_id` key rather
# than by list position, so a reordering of either list in the JSON cannot
# silently attach a label to the wrong file. A missing image raises KeyError.
path_by_image_id = {image["image_id"]: image["file_name"] for image in train_meta['images']}

ids = [annotation["image_id"] for annotation in train_meta['annotations']]
categories = [annotation['category_id'] for annotation in train_meta['annotations']]
paths = [path_by_image_id[image_id] for image_id in ids]

df_meta = pd.DataFrame({"id":ids, "category":categories, "path":paths})
df_meta.head()

In [None]:
## Build one lookup dict per taxonomy field, keyed by category_id,
## then attach each field to df_meta as a new column.
sci_name, family, genus, species = {}, {}, {}, {}
for cat in train_meta['categories']:
    cat_id = cat["category_id"]
    sci_name[cat_id] = cat["scientificName"]
    family[cat_id] = cat["family"]
    genus[cat_id] = cat["genus"]
    species[cat_id] = cat["species"]

for column, lookup in (("scientific_name", sci_name),
                       ("family", family),
                       ("genus", genus),
                       ("species", species)):
    df_meta[column] = df_meta["category"].map(lookup)

df_meta.head()

In [None]:
## Split each relative image path on '/' into its parent folder and child folder.
## A single vectorised split replaces the per-row lambdas (and the split whose
## result was computed but never used).
path_parts = df_meta['path'].str.split('/', expand=True)

# Add the folder components to df_meta (column 0 = parent folder, column 1 = child folder)
df_meta['parent_folder'] = path_parts[0]
df_meta['child_folder'] = path_parts[1]

df_meta.head()

# Preprocessing Helper Functions

In [None]:
def preprocess_cnn(categories, sub_categories, width, height,
                   image_dir="../input/herbarium-2022-fgvc9/train_images"):
    """
    Load and preprocess every image found under the given folder pairs.

    Ex: categories = 000, sub_categories = 00 (correspond to parent_folder,
    child_folder of an image path). The two sequences are consumed pairwise.

    Each image is resized to (width, height), its luma channel is
    histogram-equalized in YUV space, and it is returned in RGB channel order.

    Parameters
    ----------
    categories : iterable of str
        Parent folder names.
    sub_categories : iterable of str
        Child folder names, aligned element-wise with `categories`.
    width, height : int
        Target image size passed to cv2.resize.
    image_dir : str, optional
        Root directory containing the parent folders. Defaults to the
        competition's training image directory.

    Returns
    -------
    (list_img, labels) : tuple of lists
        list_img holds the processed images; labels holds, for each image, the
        part of its file name before "__" (e.g. "02774" for 02774__001.jpg).
    """
    import warnings

    list_img = [] ## a list of the images
    labels = []   ## a list of the corresponding categories
    for cat, sub_cat in zip(categories, sub_categories):
        folder = os.path.join(image_dir, cat, sub_cat)
        ## sorted() makes the image order reproducible (os.listdir order is arbitrary)
        for ig in sorted(os.listdir(folder)):
            ## read in image; cv2.imread returns None when the file cannot be decoded
            img = cv2.imread(os.path.join(folder, ig))
            if img is None:
                warnings.warn("Skipping unreadable image: " + os.path.join(folder, ig))
                continue
            ## resize
            img = cv2.resize(img, (width, height), interpolation=cv2.INTER_LINEAR)
            ## equalize: cv2.imread yields BGR, so convert from BGR (not RGB) to YUV
            img_yuv = cv2.cvtColor(img, cv2.COLOR_BGR2YUV)
            img_yuv[:,:,0] = cv2.equalizeHist(img_yuv[:,:,0]) ##equalize the luma histogram
            img_equ = cv2.cvtColor(img_yuv, cv2.COLOR_YUV2RGB) ##convert to RGB
            list_img.append(img_equ)
            labels.append(ig.split("__")[0]) ##label is the part before "__"

    return list_img, labels #X, y

In [None]:
# Sanity check of the label extraction used in preprocess_cnn:
# the label is the part of the file name before "__".
"02774__001".split("__")[0]

# Top 20 Test
We will test preprocessing and modeling on a smaller subset of the data: only the 20 most common categories (out of roughly 15,500 in the full dataset).

In [None]:
n_unique_categories = df_meta['category'].unique().size
print("Unique categories: ", n_unique_categories)
print("")

## category ids of the 20 most frequent categories
category_counts = df_meta['category'].value_counts()
index_top20 = category_counts.iloc[:20].index

## show the 20 most frequent scientific names with their image counts
print("Top 20 Most Common Categories (scientific name):")
df_meta['scientific_name'].value_counts().iloc[:20]

In [None]:
## Keep only the rows whose category is one of the top 20 found above.
is_top20 = df_meta['category'].isin(index_top20)
df_meta_top20 = df_meta[is_top20]

print("Top 20 Most Common Categories (id):")
pd.unique(df_meta_top20['category'])

In [None]:
## Summarise which folders the top-20 subset touches.
top20_parents = df_meta_top20['parent_folder'].unique()
top20_children = df_meta_top20['child_folder'].unique()

print("The subset of the data we will try:")
print("Unique parent folders: ", top20_parents)
print("Unique subfolders length: ", top20_children.shape, " corresponds to the 20 categories")
top20_children

In [None]:
# Preview the first five rows of the top-20 subset
df_meta_top20.head(n=5)

In [None]:
# List the distinct folder names and category ids present in the top-20 subset
for heading, column in (("Parent_folders", 'parent_folder'),
                        ("Child folders", 'child_folder')):
    print(heading, df_meta_top20[column].unique())

parent_cats = df_meta_top20['category'].unique()
print("Categories", parent_cats)

In [None]:
# Derive the (parent folder, child folder) pairs for the top-20 categories
# directly from the metadata instead of hand-typing the parent list.
# drop_duplicates() keeps first-appearance order and keeps each parent/child
# pair together, so the two sequences stay aligned even if the same child
# folder name occurs under different parents.
folder_pairs = df_meta_top20[['parent_folder', 'child_folder']].drop_duplicates()
cat_parent = folder_pairs['parent_folder'].tolist()
cat_child = folder_pairs['child_folder'].to_numpy()
len(cat_child)

## Preprocessing

In [None]:
# Load and preprocess every training image belonging to the top-20 categories.
# Images are resized to a square of IMG_SIDE x IMG_SIDE pixels.
IMG_SIDE = 299
X_raw, y_raw = preprocess_cnn(
    categories=cat_parent,
    sub_categories=cat_child,
    width=IMG_SIDE,
    height=IMG_SIDE,
)

In [None]:
## Stack the image list and label list into NumPy arrays
X = np.array(X_raw)
y = np.array(y_raw)

## distinct label values, and how many there are
unique_labels = np.unique(y)
num_classes = unique_labels.size

print("X.shape: ", X.shape, "~ 1600 imgs, each 299x299 array, with 3 dims (RGB)")
print("y.shape: ", y.shape)
print("unique y: ", unique_labels)

In [None]:
## Build a shuffled index array over the samples.
## NOTE(review): y_indices is permuted here but is not used to reorder X or y
## anywhere in the visible notebook — confirm whether X[y_indices] / y[y_indices]
## was intended. The seed call below also sets NumPy's global RNG state.
y_indices = np.arange(len(y))

## seed NumPy's global RNG for reproducibility, then permute the indices in place
np.random.seed(42)
np.random.shuffle(y_indices)

In [None]:
## Encode target labels with values between 0 and n_classes-1 (the raw labels are
## an assortment of numeric strings).
## Encoding starts from y_raw (not from y) so this cell is idempotent: re-running
## it does not try to label-encode an already one-hot-encoded array.
le = LabelEncoder() ##sklearn.preprocessing
y_encoded = le.fit_transform(np.asarray(y_raw))
print("unique y: ", np.unique(y_encoded))
## One-hot encode
y = tf.keras.utils.to_categorical(y_encoded)
y

### Submissions are evaluated using the macro F1 score.

In [None]:
## Macro-averaged F1 metric (TensorFlow Addons), matching the competition metric
f1_macro = tfa.metrics.F1Score(
    average='macro',
    num_classes=num_classes,
)

In [None]:
# Hold out 20% of the samples as a test set, stratified on the label
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Report the shapes of the training arrays
for caption, array in (("X_train.shape: ", X_train), ("y_train.shape: ", y_train)):
    print(caption, array.shape)

# Model

In [None]:
# NOTE(review): scratch calculation. As written the exponent is 0.0, so this
# evaluates to 1.0 — presumably a truncated expression; confirm intent or remove.
2592**0.

In [None]:
# 2D ConvNet: two (conv, conv, max-pool, dropout) stages followed by a dense
# classifier head with one softmax output per class.
def _conv3x3(filters):
    """Return a 3x3, same-padded, ReLU-activated Conv2D layer."""
    return tf.keras.layers.Conv2D(filters=filters, kernel_size=(3,3),
                                  padding='same', activation='relu')

top20mod = tf.keras.models.Sequential([
    tf.keras.layers.Input(shape=(299, 299, 3)), ## 299 x 299 images, 3 channels
    ## stage 1
    _conv3x3(32),
    _conv3x3(32),
    tf.keras.layers.MaxPool2D(pool_size=(2,2)),
    tf.keras.layers.Dropout(0.2),
    ## stage 2
    _conv3x3(64),
    _conv3x3(64),
    tf.keras.layers.MaxPool2D(pool_size=(4,4)),
    tf.keras.layers.Dropout(0.2),
    ## (an optional third stage — 32 filters, 6x6 pooling, 0.25 dropout — is currently disabled)
    ## classifier head
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(360, activation='relu'),
    tf.keras.layers.Dense(num_classes, activation='softmax')
])
top20mod.summary()

In [None]:
# Snapshot of the freshly initialised weights, taken before any training
t20weights = top20mod.get_weights()

def reset_model(model, weights):
    """Restore `model` to the given weights (used to restart training from scratch)."""
    return model.set_weights(weights)

# Softmax outputs are probabilities, hence from_logits=False
loss_fn = tf.keras.losses.CategoricalCrossentropy(from_logits=False)
optim = tf.keras.optimizers.Adam(
    learning_rate=0.0001,
    beta_1=0.9,
    beta_2=0.999,
)

top20mod.compile(
    optimizer=optim,
    loss=loss_fn,
    metrics=['accuracy', f1_macro],
)


In [None]:
# Data generators (augmentation/preprocessing applied on the fly during training)
## train: rescale pixel values to [0, 1] and apply random geometric augmentation;
## validation_split reserves 20% of the data passed to .flow() for validation
train_datagen = tf.keras.preprocessing.image.ImageDataGenerator(
                rescale=1.0/255,
                rotation_range=30,
                width_shift_range=0.1,
                height_shift_range=0.1,
                shear_range=0.1,
                zoom_range=0.2,
                horizontal_flip=True,
                fill_mode='nearest',
                validation_split=0.2
)


## test: rescaling only, no augmentation
test_datagen = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1./255)

## report how many samples land in each subset
print("Post-datagen train N:",
train_datagen.flow(X_train, y_train, batch_size=16, subset='training').n,
      "\n Post-datagen validation N:",
train_datagen.flow(X_train, y_train, batch_size=16, subset='validation').n,
)

# NOTE: top20mod is deliberately NOT rebuilt in this cell. Re-creating the
# Sequential model here would replace the already-compiled model with an
# uncompiled one, and the subsequent fit() call would fail. To restart training
# from scratch, restore the initial weights with reset_model(top20mod, t20weights).

In [None]:
# TRAINING
# Restore the initial weights so that re-running this cell restarts training.
reset_model(top20mod, t20weights)
# The generators already yield batches (16 for training, 8 for validation), so
# batch_size is not passed to fit(): Keras documents that it must not be
# specified for generator input.
# NOTE(review): the validation subset is drawn from train_datagen, so validation
# images are also randomly augmented — confirm this is intended.
history_cnn = top20mod.fit(
    train_datagen.flow(X_train, y_train, batch_size=16, subset='training'),
    validation_data = train_datagen.flow(X_train, y_train,batch_size=8,subset='validation'),
    epochs=5)

In [None]:
## Plot the fit history: one panel per metric, training vs. validation curve.
## Each entry is (history key, training legend label, validation legend label, panel title).
history_panels = [
    ('loss', 'loss', 'val loss', 'Loss'),
    ('accuracy', 'acc', 'val acc', 'Accuracy'),
    ('f1_score', 'f1', 'val f1', 'Macro F1'),
]

fig, ax = plt.subplots(nrows=1, ncols=len(history_panels), figsize=(20,10))

for axis, (key, train_label, val_label, title) in zip(ax, history_panels):
    axis.plot(history_cnn.history[key], label=train_label)
    axis.plot(history_cnn.history['val_' + key], label=val_label)
    # label every panel so the figure can be read on its own
    axis.set(title=title, xlabel='epoch', ylabel=key)
    axis.legend()

plt.show()

In [None]:
# Predict on the held-out test data.
# shuffle=False is required: flow() shuffles by default, which would return the
# predictions in a random order that no longer lines up with y_test.
y_pred = top20mod.predict(test_datagen.flow(X_test, shuffle=False))

In [None]:
# Class index with the highest predicted probability for each sample
y_pred_class = y_pred.argmax(axis=1)
# Collapse the one-hot encoded y_test back to integer class indices
y_true = y_test.argmax(axis=1)

print("y_pred.shape ", y_pred_class.shape, ", y_true.shape ", y_true.shape)

In [None]:
# Confusion matrix of true vs. predicted class indices
conf_mat = tf.math.confusion_matrix(y_true, y_pred_class)

# Render it as an annotated heatmap (rows = true class, columns = predicted class)
plt.figure(figsize=(10, 10))
heatmap_axes = sns.heatmap(conf_mat, annot=True)
heatmap_axes.set(xlabel="Pred", ylabel="True")
plt.show()