# Ingest images

In [2]:
from glob import glob
from pathlib import Path
from collections import namedtuple

import pandas as pd
from PIL import Image, ImageFilter
import exifread
import zbarlight
from tqdm import tqdm

import lib.db as db
import lib.util as util
from lib.dict_attr import DictAttrs

In [3]:
# Simple (width, height) pair; used below for the default window and stride
# sizes of the sliding-window search.
Dimensions = namedtuple('Dimensions', 'width height')

# Database connection shared by all of the functions and cells below.
CXN = db.connect()
# Directory that holds the image batches to ingest.
RAW_DATA = Path('..') / 'data' / 'raw'
# Directory that receives the exported CSV files.
PROCESSED_DATA = Path('..') / 'data' / 'processed'

### Create images and errors tables

In [3]:
def create_images_table():
    """Drop and recreate the images table along with its index.

    Every row pairs a sample ID with the image file it was read from. Any
    existing rows are discarded.
    """
    # NOTE(review): sample_id is already the primary key, so the extra index
    # looks redundant -- confirm before removing it.
    statements = (
        'DROP TABLE IF EXISTS images',
        """
        CREATE TABLE images (
            sample_id TEXT PRIMARY KEY NOT NULL,
            file_name TEXT NOT NULL UNIQUE
        )""",
        """CREATE INDEX image_idx ON images (sample_id)""",
    )
    for statement in statements:
        CXN.execute(statement)

In [4]:
def create_errors_table():
    """Drop and recreate the errors table along with its index.

    Every row holds an error key, a message and, once the error has been
    reviewed, an ``ok`` flag and a resolution note. Any existing rows are
    discarded.
    """
    statements = (
        'DROP TABLE IF EXISTS errors',
        """
        CREATE TABLE errors (
            error_key   TEXT NOT NULL,
            msg         TEXT,
            ok          INTEGER,
            resolution  TEXT
        )""",
        """CREATE INDEX error_idx ON errors (error_key)""",
    )
    for statement in statements:
        CXN.execute(statement)

### Search the image for the sample ID (UUID)

This helper function slides a window over the image and yields one crop box per window position. Scanning one window at a time limits the search area, which helps when looking for the QR code.

In [5]:
def window_slider(image_size, window=None, stride=None):
    """Yield crop boxes that sweep a window across an image, row by row.

    Args:
        image_size: Object exposing ``width`` and ``height`` attributes.
        window: Size of each box (``width`` / ``height`` attributes);
            400 x 400 when omitted.
        stride: Step between successive box origins (``width`` / ``height``
            attributes); 200 x 200 when omitted.

    Yields:
        ``(left, top, right, bottom)`` tuples. A box that would extend past
        the right or bottom edge is clipped to the image bounds.
    """
    window = window or Dimensions(400, 400)
    stride = stride or Dimensions(200, 200)

    for top in range(0, image_size.height, stride.height):
        # Clip the window to the bottom edge of the image.
        bottom = min(top + window.height, image_size.height)

        for left in range(0, image_size.width, stride.width):
            # Clip the window to the right edge of the image.
            right = min(left + window.width, image_size.width)
            yield (left, top, right, bottom)

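Try several strategies, in order, to read the QR code from the image: a direct scan, a sliding-window scan, rotated copies of the image, and finally a sharpened copy.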

In [6]:
def get_qr_code(image):
    """Return the first QR code found in the image, or None.

    Progressively more desperate variants of the image are scanned and the
    search stops at the first variant in which a code is found:

    1. the image as given,
    2. every crop box produced by ``window_slider``,
    3. the image rotated in 5 degree steps from 5 to 80 degrees,
    4. a sharpened copy of the image.

    Args:
        image: Image object; it must support ``crop``, ``rotate`` and
            ``filter`` and expose ``width`` / ``height``.

    Returns:
        The first decoded QR code as a UTF-8 string, or None when no
        variant yields a code.
    """
    def variants():
        # Generated lazily so later variants are only built when the
        # earlier ones have failed.
        yield image

        # The image itself is handed to window_slider, which only reads its
        # width and height attributes.
        for box in window_slider(image):
            yield image.crop(box)

        for degrees in range(5, 85, 5):
            yield image.rotate(degrees)

        yield image.filter(ImageFilter.SHARPEN)

    for variant in variants():
        found = zbarlight.scan_codes('qrcode', variant)
        if found:
            return found[0].decode('utf-8')

    return None

Read the image from disk and extract its QR code, which is used as the sample ID.


In [7]:
def get_image_data(file_name):
    """Load an image file and return the QR code found in it.

    Args:
        file_name: Path of the image file to read.

    Returns:
        Whatever ``get_qr_code`` returns for the loaded image: the decoded
        QR code string, or None when no code could be read.
    """
    with open(file_name, 'rb') as handle:
        # EXIF extraction is currently switched off:
        # exif = exifread.process_file(handle)
        image = Image.open(handle)
        # Read the image data now, while the file is still open.
        image.load()

    return get_qr_code(image)

### Ingest one image batch

In [8]:
def ingest_images(dir_name):
    """Scan every ``*.JPG`` file in a directory and store the results.

    Each file is scanned for a QR code (the sample ID). Files with a usable,
    not-yet-seen sample ID are added to the images table; files with a
    missing or duplicated sample ID are added to the errors table instead,
    keyed by file name.

    Both inserts are committed together. If either insert fails, the
    transaction is rolled back so that a half-written batch cannot be
    committed by a later, unrelated ``commit()``, and the original exception
    is re-raised.

    NOTE(review): duplicates are only detected within this batch. A sample
    ID already stored by an earlier batch is not recorded in the errors
    table; it presumably surfaces as an integrity error on insert --
    confirm whether that is the desired behavior.

    Args:
        dir_name: Path object of the directory holding the images.
    """
    pattern = str(dir_name / '*.JPG')

    sample_ids = {}  # sample_id -> first file in this batch that used it

    images = []  # A batch of images to insert
    errors = []  # A batch of errors to insert

    files = sorted(glob(pattern))

    for file_name in tqdm(files):
        sample_id = get_image_data(file_name)

        # Handle a missing sample ID
        if not sample_id:
            msg = 'MISSING: QR code missing in {}'.format(file_name)
            errors.append((file_name, msg))

        # Handle a duplicate sample ID
        elif sample_id in sample_ids:
            msg = ('DUPLICATES: Files {} and {} have the same '
                   'QR code').format(sample_ids[sample_id], file_name)
            errors.append((file_name, msg))

        # The image seems OK
        else:
            sample_ids[sample_id] = file_name
            images.append((sample_id, file_name))

    # Insert the image and error batches as one all-or-nothing unit
    try:
        sql = 'INSERT INTO images (sample_id, file_name) VALUES (?, ?)'
        CXN.executemany(sql, images)

        sql = 'INSERT INTO errors (error_key, msg) VALUES (?, ?)'
        CXN.executemany(sql, errors)

        CXN.commit()
    except Exception:
        # Discard any rows inserted before the failure, then let the caller
        # see the original error.
        CXN.rollback()
        raise

### Resolve an error

In [9]:
def resolve_error(dir_name, file_name, ok, resolution):
    """Record the outcome of reviewing a logged error.

    The error row is looked up by the key ``dir_name/<file_name>.JPG``.

    Args:
        dir_name: Path object of the directory holding the image.
        file_name: Image file name without the ``.JPG`` extension.
        ok: Value to store in the ``ok`` column.
        resolution: Text to store in the ``resolution`` column.
    """
    key = str(dir_name / '{}.JPG'.format(file_name))
    CXN.execute(
        'UPDATE errors SET ok = ?, resolution = ? WHERE error_key = ?',
        (ok, resolution, key))
    CXN.commit()

### Manually set an image record

In [10]:
def manual_insert(dir_name, file_name, sample_id):
    """Insert an image record by hand.

    Args:
        dir_name: Directory holding the image, given either relative to
            RAW_DATA (e.g. a bare directory name) or as a path that already
            starts with RAW_DATA.
        file_name: Image file name without the ``.JPG`` extension.
        sample_id: Sample ID to store for the image.
    """
    dir_path = Path(dir_name)

    # Only prepend RAW_DATA when the caller has not already done so.
    # Prepending it unconditionally turns '../data/raw/<dir>' into
    # '../data/raw/../data/raw/<dir>', which does not match the file names
    # stored by ingest_images.
    if dir_path != RAW_DATA and RAW_DATA not in dir_path.parents:
        dir_path = RAW_DATA / dir_path

    file_name = str(dir_path / f'{file_name}.JPG')
    sql = 'INSERT INTO images (sample_id, file_name) VALUES (?, ?)'
    CXN.execute(sql, (sample_id, file_name))
    CXN.commit()

### Create tables

In [11]:
# Start from empty tables: each helper drops its table before recreating it.
create_images_table()
create_errors_table()

### Ingest New York Botanical Garden (1st trip)

In [12]:
path = RAW_DATA / 'DOE-nitfix_specimen_photos'

ingest_images(path)

# Errors from this batch that were reviewed by hand, all marked as OK:
# (file name without extension, resolution note).
reviewed = [
    ('R0000149', 'OK: Genuine duplicate'),
    ('R0000151', 'OK: Genuine duplicate'),
    ('R0000158', 'OK: Genuine duplicate'),
    ('R0000165', 'OK: Genuine duplicate'),
    ('R0000674', 'OK: Is a duplicate of R0000473'),
    ('R0000835', 'OK: Is a duplicate of R0000836'),
    ('R0000895', 'OK: Genuine duplicate'),
    ('R0000937', 'OK: Genuine duplicate'),
    ('R0001055', 'OK: Genuine duplicate'),
]
for stem, note in reviewed:
    resolve_error(path, stem, 1, note)

100%|██████████| 1236/1236 [29:43<00:00,  1.44s/it]


### Ingest Harvard Herbaria

In [13]:
path = RAW_DATA / 'HUH_DOE-nitfix_specimen_photos'

ingest_images(path)

# Errors from this batch that were reviewed by hand, all marked as OK:
# (file name without extension, resolution note).
reviewed = [
    ('R0001262', 'OK: Is a duplicate of R0001263'),
    ('R0001729', 'OK: Is a duplicate of R0001728'),
]
for stem, note in reviewed:
    resolve_error(path, stem, 1, note)

100%|██████████| 483/483 [12:13<00:00,  1.52s/it]


### Ingest Ohio State University Herbarium

In [14]:
path = RAW_DATA / 'OS_DOE-nitfix_specimen_photos'

ingest_images(path)

# Errors from this batch that were reviewed by hand, all marked as OK:
# (file name without extension, resolution note).
reviewed = [
    ('R0000229', 'OK: Genuine duplicate'),
    ('R0001835', 'OK: Genuine duplicate'),
    ('R0001898', 'OK: Genuine duplicate'),
]
for stem, note in reviewed:
    resolve_error(path, stem, 1, note)

100%|██████████| 688/688 [15:29<00:00,  1.35s/it]


### Ingest California Academy of Sciences Herbarium

In [15]:
path = RAW_DATA / 'CAS-DOE-nitfix_specimen_photos'

ingest_images(path)

# Errors from this batch that were reviewed by hand, all marked as OK:
# (file name without extension, resolution note).
reviewed = [
    ('R0001361', 'OK: Genuine duplicate'),
    ('R0002349', 'OK: Genuine duplicate'),
]
for stem, note in reviewed:
    resolve_error(path, stem, 1, note)

100%|██████████| 2412/2412 [56:43<00:00,  1.41s/it]


### Ingest Missouri Botanical Garden

In [16]:
path = RAW_DATA / 'MO-DOE-nitfix_specimen_photos'

ingest_images(path)

# Errors from this batch that were reviewed by hand:
# (file name without extension, ok flag, resolution note).
reviewed = [
    ('R0002933', 1, 'OK: Genuine duplicate'),
    ('R0003226', 1, 'OK: Genuine duplicate'),
    ('R0003663', 1, 'OK: Manually fixed'),
    ('R0003509', 0, 'ERROR: Blurry image'),
]
for stem, ok, note in reviewed:
    resolve_error(path, stem, ok, note)

# The sample ID for R0003663 is supplied by hand.
manual_insert(path, 'R0003663', '2eea159f-3c25-42ef-837d-27ad545a6779')

100%|██████████| 1027/1027 [23:25<00:00,  1.37s/it]


### Ingest New York Botanical Garden (2nd trip)

In [18]:
# Second New York Botanical Garden batch; this cell records no error
# resolutions for it.
path = RAW_DATA / 'NY_visit_2'

ingest_images(path)

100%|██████████| 606/606 [14:34<00:00,  1.44s/it]


### Put data into a CSV file

In [5]:
# Export the whole images table to PROCESSED_DATA / images.csv.
csv_name = 'images.csv'

df = pd.read_sql('SELECT * FROM images', CXN)

csv_path = PROCESSED_DATA / csv_name
# index=False keeps the DataFrame's row index out of the file.
df.to_csv(csv_path, index=False)

# Show the first rows as a quick check of what was written.
df.head()

Unnamed: 0,sample_id,file_name
0,6fcdf583-e9bb-4764-84de-f277cc6ec6b7,../data/raw/DOE-nitfix_specimen_photos/R000000...
1,6fa18219-4958-4d75-8bf3-032fa909315c,../data/raw/DOE-nitfix_specimen_photos/R000000...
2,6f93bea8-43f4-45ad-95f5-ecad63f13037,../data/raw/DOE-nitfix_specimen_photos/R000000...
3,6f66cc88-3583-4e9b-97ea-03b1d681def8,../data/raw/DOE-nitfix_specimen_photos/R000000...
4,6f5bc099-ff55-4740-8a2f-e63466b47892,../data/raw/DOE-nitfix_specimen_photos/R000000...


In [6]:
# Export the whole errors table to PROCESSED_DATA / errors.csv.
# Note that csv_name, df and csv_path are reused from the images export.
csv_name = 'errors.csv'

df = pd.read_sql('SELECT * FROM errors', CXN)

csv_path = PROCESSED_DATA / csv_name
# index=False keeps the DataFrame's row index out of the file.
df.to_csv(csv_path, index=False)

# Show the first rows as a quick check of what was written.
df.head()

Unnamed: 0,error_key,msg,ok,resolution
0,../data/raw/DOE-nitfix_specimen_photos/R000014...,DUPLICATES: Files ../data/raw/DOE-nitfix_speci...,1.0,OK: Genuine duplicate
1,../data/raw/DOE-nitfix_specimen_photos/R000015...,DUPLICATES: Files ../data/raw/DOE-nitfix_speci...,1.0,OK: Genuine duplicate
2,../data/raw/DOE-nitfix_specimen_photos/R000015...,DUPLICATES: Files ../data/raw/DOE-nitfix_speci...,1.0,OK: Genuine duplicate
3,../data/raw/DOE-nitfix_specimen_photos/R000016...,DUPLICATES: Files ../data/raw/DOE-nitfix_speci...,1.0,OK: Genuine duplicate
4,../data/raw/DOE-nitfix_specimen_photos/R000067...,DUPLICATES: Files ../data/raw/DOE-nitfix_speci...,1.0,OK: Is a duplicate of R0000473
