# Sample images

Get 100 randomly sampled images from each of the following 4 classes:
1. flowering
1. not flowering
1. fruiting
1. not fruiting

In [1]:
import sys

# Add the parent directory to the module search path.
# NOTE(review): presumably so project-local modules can be imported from this
# notebook; none are imported in the cells shown here -- confirm it is needed.
sys.path.append('..')

In [2]:
import shutil
import sqlite3
from pathlib import Path

import pandas as pd

In [3]:
# Root of the data tree, one level above the current working directory.
DATA_DIR = Path('..', 'data')

# Sub-directories of the data tree.
IMAGE_DIR = DATA_DIR.joinpath('images')
TEMP_DIR = DATA_DIR.joinpath('temp')

# SQLite database file opened by the sampling code below.
DB = DATA_DIR.joinpath('angiosperms.sqlite')

In [4]:
def get_image_class(cls):
    """Sample up to 100 random images of one class and copy them to TEMP_DIR.

    Selects up to 100 random rows from ``angiosperms`` joined to ``images``
    where the column named by ``cls`` equals 1, writes those rows to
    ``TEMP_DIR/<cls>.csv``, and copies each row's image file into the
    ``TEMP_DIR/<cls>/`` directory.

    Args:
        cls: Name of the flag column to filter on (e.g. ``'flowering'``).
            It is interpolated directly into the SQL text because column
            names cannot be bound as parameters, so only pass trusted,
            hard-coded column names.
    """
    sql = f"""
        select *
          from angiosperms
          join images using (coreid)
         where {cls} = 1
      order by random()
         limit 100
    """
    # A sqlite3 connection used as a context manager only commits or rolls
    # back the transaction; it does not close the connection. Close it
    # explicitly so repeated calls do not leak handles.
    cxn = sqlite3.connect(DB)
    try:
        df = pd.read_sql(sql, cxn)
    finally:
        cxn.close()

    # Create the output directory (and TEMP_DIR itself) before writing
    # anything, otherwise the CSV write fails when TEMP_DIR is missing.
    dir_ = TEMP_DIR / f'{cls}'
    dir_.mkdir(parents=True, exist_ok=True)

    path = TEMP_DIR / f'{cls}.csv'
    df.to_csv(path, index=False)

    # The 'path' column is resolved against the parent directory.
    # NOTE(review): presumably the stored paths are relative to the project
    # root -- confirm against the code that populates the images table.
    for image_path in df['path']:
        src = Path('..') / image_path
        dst = dir_ / Path(image_path).name
        shutil.copy(src, dst)

In [5]:
# Sample and copy images for each of the four classes, in order.
for class_name in ('flowering', 'not_flowering', 'fruiting', 'not_fruiting'):
    get_image_class(class_name)

## Build training, validation, and test datasets

The classes are woefully unbalanced and many images have multiple labels, so I'm going to oversample the smaller classes in the training dataset. We should be pretty safe with this approach due to the heavy data augmentation. I'm saving the splits so that I don't wind up training on my test data. Note that an image can belong to multiple classes, so I need to be careful not to add any core IDs from the test dataset to the training/validation datasets.