# Download Images

In [1]:
import sys

# Add the parent directory to the module search path. Presumably this is what
# lets the `herbarium` package imported in the next cell resolve when the
# notebook runs from its own folder — confirm against the repository layout.
sys.path.append('..')

In [2]:
import multiprocessing
import shutil
import sqlite3
from pathlib import Path

import pandas as pd

from herbarium.pylib import image_util as iu

In [3]:
# Root of all data files, relative to the directory the notebook runs from.
DATA_DIR = Path('..', 'data')

# Working directories: `temp` is where the uris_*.csv files are looked up,
# `images` is the directory handed to the image helpers.
URI_DIR = DATA_DIR.joinpath('temp')
IMAGE_DIR = DATA_DIR.joinpath('images')

# Error-log paths passed to the download step (ERROR1) and the validation
# step (ERROR2).
ERROR1 = DATA_DIR.joinpath('temp', 'download_errors.txt')
ERROR2 = DATA_DIR.joinpath('temp', 'validate_errors.txt')

# SQLite database opened by the clean-up cells.
DB = DATA_DIR.joinpath('angiosperms.sqlite')

## Sample records from each image class

In [4]:
# iu.sample_records(DB, URI_DIR)

## Download images

In [5]:
# Gather every uris_*.csv file in URI_DIR; each one is submitted as a
# separate download job.
csvs = [*URI_DIR.glob('uris_*.csv')]

In [6]:
# Fan the CSV files out over six worker processes, one task per file.
# NOTE(review): `iu` is already the `image_util` module, so `iu.image_util`
# looks like it may be a stale name for the download function — confirm the
# intended callable in herbarium/pylib/image_util.py.
with multiprocessing.Pool(processes=6) as pool:
    results = [
        pool.apply_async(iu.image_util, (csv_file, IMAGE_DIR, ERROR1))
        for csv_file in csvs
    ]
    # Block until every task has finished; .get() re-raises any exception
    # raised inside a worker.
    all_results = [pending.get() for pending in results]

## Validate images

In [4]:
# Run the project's image validation over the `all` subdirectory, passing the
# database path and ERROR2 as the error target.
# NOTE(review): what validate_images does with each argument is defined in
# image_util and is not visible from this notebook.
iu.validate_images(IMAGE_DIR.joinpath('all'), DB, error=ERROR2)

15127it [1:08:02,  3.71it/s]


## Remove images not in the DB

In [None]:
# Two staging directories inside IMAGE_DIR: `bad` and `all`.
bad_dir = IMAGE_DIR.joinpath('bad')
good_dir = IMAGE_DIR.joinpath('all')

# exist_ok=True makes this cell safe to re-run.
for staging_dir in (bad_dir, good_dir):
    staging_dir.mkdir(exist_ok=True)

In [None]:
# Sweep every JPEG sitting directly in IMAGE_DIR into bad_dir. The glob is
# non-recursive, so files already inside subdirectories are left alone.
for src in IMAGE_DIR.glob('*.jpg'):
    dst = bad_dir / src.name
    shutil.move(src, dst)

In [5]:
# Select the coreid of every image row that has a matching angiosperms row.
sql = """select coreid from images join angiosperms using (coreid)"""

# Using a sqlite3 connection as a context manager does not close it, so the
# connection is closed explicitly once the loop is finished.
cxn = sqlite3.connect(DB)
try:
    for row in cxn.execute(sql):
        # NOTE(review): the direction bad_dir -> good_dir is inferred from the
        # neighbouring cells (everything is first swept into bad_dir, and
        # good_dir is later moved back into IMAGE_DIR) — confirm.
        src = bad_dir / f'{row[0]}.jpg'
        dst = good_dir / f'{row[0]}.jpg'
        # A selected coreid may have no file on disk (e.g. a failed download
        # or a coreid returned more than once); skip it rather than abort the
        # whole pass.
        if src.exists():
            shutil.move(src, dst)
finally:
    cxn.close()

In [None]:
# Keep the rejected images by moving the whole directory under DATA_DIR
# instead of deleting it. If `backup` already exists as a directory,
# shutil.move places bad_dir inside it.
shutil.move(bad_dir, DATA_DIR.joinpath('backup'))

In [None]:
# Return everything in good_dir to the top level of IMAGE_DIR, keeping each
# entry's file name.
for kept in good_dir.glob('*'):
    shutil.move(kept, IMAGE_DIR / kept.name)

In [6]:
# Delete image rows whose coreid has no matching angiosperms row.
sql = """delete from images
    where coreid not in (select coreid from images join angiosperms using (coreid));"""

# `with cxn:` commits on success and rolls back on error, but it does not
# close the connection, so close it explicitly afterwards.
cxn = sqlite3.connect(DB)
try:
    with cxn:
        cxn.execute(sql)
finally:
    cxn.close()

In [None]:
# Path.rmdir() only removes an empty directory, so this raises if anything
# is still left in good_dir.
good_dir.rmdir()

## Get image means and standard deviations

In [5]:
# Ask the project helper for normalization statistics over IMAGE_DIR; the
# result is unpacked into `mean` and `std`.
# NOTE(review): the recorded output shows 3-element tensors, presumably one
# value per colour channel — confirm in image_util.get_image_norm.
mean, std = iu.get_image_norm(IMAGE_DIR)

100%|██████████████████████████████████████████████████████| 946/946 [1:08:58<00:00,  4.37s/it]


In [6]:
# The `=` specifier in an f-string (Python 3.8+) prints each expression's
# text followed by its value, e.g. `mean=tensor([...])`.
print(f'{mean=} {std=}')

mean=tensor([0.7743, 0.7529, 0.7100]) std=tensor([0.2286, 0.2365, 0.2492])
