In [None]:
from fastai.vision.all import *
import os
import shutil
import cv2
from sklearn.model_selection import train_test_split
import random

With more than 2.5 million images at 1000px, it is impossible to quickly train models to try out ideas. What I want to be able to do is prototype models using a small subset of the data and then run the model on the full dataset *once* to generate a submission, since I don't want to use up hours and hours of GPU time for no reason. The challenge is ensuring that the subset is a good enough representation of the full dataset so that the modelling techniques used can be applied to the final run.

I have been experimenting with a couple of different ways to achieve this as you will see below. For reference with around 20k 256px images I can train a resnet18 model in just under 2 mins/epoch on Colab Pro, so keep that in mind when choosing how many images to include in your sample.

Hopefully this is helpful and please let me know in the comments if you have any feedback/other ideas!

In [None]:
# Code via: https://www.kaggle.com/muhammadzubairkhan92/herbarium-2021-exploratory-data-analysis
# Locations of the competition's training folder and its metadata file.
PATH_BASE = "../input/herbarium-2021-fgvc8/"
PATH_TRAIN = os.path.join(PATH_BASE, "train/")
PATH_TRAIN_META = os.path.join(PATH_TRAIN, "metadata.json")

with open(PATH_TRAIN_META) as json_file:
    metadata = json.load(json_file)

# Walk annotations and images in lockstep; each pair must describe the same image.
ids, categories, paths = [], [], []
for annotation, image in zip(metadata["annotations"], metadata["images"]):
    assert annotation["image_id"] == image["id"]
    ids.append(image["id"])
    categories.append(annotation["category_id"])
    paths.append(image["file_name"])

df_meta = pd.DataFrame({"id": ids, "category_id": categories, "file_name": paths})

# Lookups keyed by category id: species name, family and order.
d_categories = {entry["id"]: entry["name"] for entry in metadata["categories"]}
d_families = {entry["id"]: entry["family"] for entry in metadata["categories"]}
d_orders = {entry["id"]: entry["order"] for entry in metadata["categories"]}

# Attach the three taxonomy labels to every image row.
df_meta = df_meta.assign(
    category_name=df_meta["category_id"].map(d_categories),
    family_name=df_meta["category_id"].map(d_families),
    order_name=df_meta["category_id"].map(d_orders),
)

df_meta.head()

# Option 1: Sample based on order_name

Whilst there are almost 65000 different herb species in the dataset, these can be grouped into just 81 different `order_name`s:

In [None]:
# Number of images in each order, smallest orders first.
(
    df_meta
    .groupby('order_name')['file_name']
    .count()
    .sort_values()
    .reset_index()
)

Some of these have just one unique `category_id`, whilst others have thousands:

In [None]:
# Number of distinct categories within each order, fewest first.
(
    df_meta
    .groupby('order_name')
    .agg({'category_id': 'nunique'})
    .sort_values('category_id')
    .reset_index()
    .rename(columns={'category_id': 'unique_categories'})
)

My idea was to train models to predict `order_name` instead of `category_id` using a subset of the data. This is obviously an easier problem than the actual task, but it could work assuming there are some similarities between species in each `order_name`.

In [None]:
# Code via: https://www.kaggle.com/muhammadzubairkhan92/herbarium-2021-exploratory-data-analysis
def visualize_train_batch(paths, categories, families, orders):
    """Plot a single row of training images titled with their taxonomy labels.

    Args:
        paths: Image file names, relative to ``PATH_TRAIN``.
        categories: Category name for each image.
        families: Family name for each image.
        orders: Order name for each image.

    Raises:
        FileNotFoundError: If an image cannot be read from disk.
    """
    batch = list(zip(paths, categories, families, orders))

    # Keep the three-slot layout for small batches, but widen the grid when more
    # images are passed so that the subplot index never exceeds the grid size.
    n_cols = max(3, len(batch))

    plt.figure(figsize=(16, 16))

    for ind, (path, category, family, order) in enumerate(batch):
        plt.subplot(1, n_cols, ind + 1)

        image_path = os.path.join(PATH_TRAIN, path)
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread returns None instead of raising when the file is
            # missing or unreadable; fail here with a message naming the file.
            raise FileNotFoundError(f"Could not read image: {image_path}")
        # OpenCV loads images as BGR, while matplotlib expects RGB.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        plt.imshow(image)

        plt.title(
            f"FAMILY: {family} ORDER: {order}\n{category}",
            fontsize=10,
        )
        plt.axis("off")

    plt.show()
    
def _visualize_sample(df, column, value=None, n=3):
    """Plot up to ``n`` random rows of ``df``, optionally only where ``df[column] == value``."""
    subset = df if value is None else df[df[column] == value]
    if subset.empty:
        target = "the dataframe" if value is None else f"{column} == {value!r}"
        raise ValueError(f"No rows to visualize for {target}")

    # Never ask for more rows than exist: some groups hold fewer than `n` images.
    picked = subset.sample(min(n, len(subset)))

    visualize_train_batch(
        picked["file_name"].tolist(),
        picked["category_name"].tolist(),
        picked["family_name"].tolist(),
        picked["order_name"].tolist(),
    )


def visualize_by_order(df, _order=None):
    """Show up to three random images, restricted to ``_order`` when it is given.

    Args:
        df: Metadata frame with ``file_name``, ``category_name``,
            ``family_name`` and ``order_name`` columns.
        _order: Value of ``order_name`` to filter on; ``None`` samples from all rows.

    Raises:
        ValueError: If no rows match.
    """
    _visualize_sample(df, "order_name", _order)


def visualize_by_family(df, _family=None):
    """Show up to three random images, restricted to ``_family`` when it is given.

    Args:
        df: Metadata frame with ``file_name``, ``category_name``,
            ``family_name`` and ``order_name`` columns.
        _family: Value of ``family_name`` to filter on; ``None`` samples from all rows.

    Raises:
        ValueError: If no rows match.
    """
    _visualize_sample(df, "family_name", _family)

In [None]:
# Random specimens from two different orders, for a visual comparison.
visualize_by_order(df_meta, _order='Acorales')
visualize_by_order(df_meta, _order='Myrtales')

You can experiment with this more yourself, but my initial thoughts are that there are enough similarities between herbs in each `order_name` that this method could work.

We can take a stratified sample of the original dataset by grouping on `order_name`. The following code ensures that examples of each `order_name` are included, and limits the maximum number of each class to 300 (you can adjust this yourself to get a larger sample if you like).

In [None]:
max_n = 300
sample_seed = 42  # fixed so that the saved sample can be regenerated exactly

# https://stackoverflow.com/questions/44114463/stratified-sampling-in-pandas
# Shuffle all rows once, then keep the first `max_n` rows of every order: this is a
# uniform random draw of min(len(group), max_n) rows per order. It avoids
# `groupby(...).apply(...)`, whose handling of the grouping column is deprecated in
# recent pandas, so `order_name` is always present in the result.
sample = (
    df_meta
    .sample(frac=1, random_state=sample_seed)
    .groupby('order_name')
    .head(max_n)
    # Stable sort keeps the rows grouped by order without undoing the shuffle.
    .sort_values('order_name', kind='mergesort')
)

print(len(sample))
sample['order_name'].value_counts()

In [None]:
# Peek at the first few sampled rows.
sample.head(n=5)

We can copy the files we need over to a new folder in our working directory:

In [None]:
# Create exactly the folder that the copy below writes into (rather than a
# cwd-relative './sample'), and do not fail if it is left over from an earlier run.
target_dir = '/kaggle/working/sample'
os.makedirs(target_dir, exist_ok=True)

files = sample['file_name'].unique()

for file in files:
    # NOTE(review): files are copied into one flat folder, which assumes image
    # base names are unique across the source subfolders — confirm.
    shutil.copy(os.path.join(PATH_TRAIN, file), target_dir)

`fastai` has a `resize_images` function that will resize all of the images in a folder for us:

In [None]:
# Resize the copied images (max_size=256) into the 'sample_resized' folder.
resize_images(target_dir, dest='sample_resized', max_size=256)

We can now zip the resized folder and delete the original images (useful when working on an external notebook server, e.g. Colab). If you are jumping straight into modelling from here, you can skip this step.

In [None]:
# http://www.seanbehan.com/how-to-use-python-shutil-make_archive-to-zip-up-a-directory-recursively-including-the-root-folder/
def make_archive(source, destination):
    """Archive the directory ``source``, including its root folder, as ``destination``.

    The archive format is taken from the extension of ``destination``
    (e.g. ``out.zip`` -> ``zip``), so the extension must be a format name
    accepted by ``shutil.make_archive``.

    Args:
        source: Path of the directory to archive. A trailing path separator is ignored.
        destination: Path of the archive file to create.
    """
    # Drop a trailing separator, otherwise dirname() would return the directory itself.
    source = source.rstrip(os.sep)

    # splitext splits on the last dot only, so names such as 'data.v1.zip' are handled.
    base_name, extension = os.path.splitext(destination)
    archive_format = extension.lstrip('.')

    archive_from = os.path.dirname(source) or os.curdir
    archive_to = os.path.basename(source)

    # Write straight to the destination instead of staging the archive in the
    # current working directory and moving it afterwards.
    created = shutil.make_archive(base_name, archive_format, archive_from, archive_to)
    if os.path.abspath(created) != os.path.abspath(destination):
        shutil.move(created, destination)

# Zip the resized images and store the sample's metadata next to the archive.
make_archive(
    source='/kaggle/working/sample_resized',
    destination='/kaggle/working/sample_resized.zip',
)
sample.to_csv(path_or_buf='/kaggle/working/sample.csv', index=False)

In [None]:
# Remove the full-size copies of the sampled images.
shutil.rmtree(path='/kaggle/working/sample')

# The resized folder can stay if you are going straight into model building;
# otherwise it is better to delete it before saving a version/output.
shutil.rmtree(path='/kaggle/working/sample_resized')

# Option 2: Sample the most common categories
This is obviously a harder problem to solve than before, since we will be predicting `category_id` instead of `order_name`. It will be more representative of the final task, however.

In [None]:
num_cats = 5000

# Image counts for the `num_cats` most frequent categories. The columns are named
# explicitly because the labels produced by `value_counts().reset_index()` differ
# between pandas versions (there is no 'index' column on pandas >= 2.0).
most_freq = (
    df_meta['category_id']
    .value_counts()
    .head(num_cats)
    .rename_axis('category_id')
    .reset_index(name='count')
)

# Keep every image that belongs to one of those categories.
top_cats = df_meta[df_meta['category_id'].isin(most_freq['category_id'])]

print(len(top_cats))
top_cats['category_id'].value_counts()

To cut this down further, you can either use `train_test_split` to generate a stratified sample, or randomly pick N categories from `top_cats`. We could of course only select a small `num_cats` above to get a reasonable dataset size straight away, but the risk is that we might not get enough variety in the images if all of the most common `category_id`s are similar looking plants.

In [None]:
# Note: if you make train_size too small you will lose some categories, since there won't be enough examples
sample2, _ = train_test_split(
    top_cats,
    train_size=0.1,
    stratify=top_cats['category_id'],
    random_state=42,  # fixed so that the same subset is drawn on every run
)

print(len(sample2))
sample2['category_id'].value_counts()

In [None]:
num_cats_rand = 500

# Clamp to the number of categories actually present in `top_cats`: random sampling
# without replacement raises if asked for more items than the population holds.
available_cats = list(top_cats['category_id'].unique())
num_cats_rand = min(num_cats_rand, len(available_cats))

# A dedicated, seeded generator keeps the pick reproducible without touching
# the global `random` state.
cats = random.Random(42).sample(available_cats, num_cats_rand)

# Every image belonging to one of the randomly chosen categories.
sample3 = df_meta[df_meta['category_id'].isin(cats)]

print(len(sample3))
sample3['category_id'].value_counts()

You can copy the files and resize using the same process as before.