<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Setup" data-toc-modified-id="Setup-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Setup</a></span></li><li><span><a href="#CalAcademy-Manifest" data-toc-modified-id="CalAcademy-Manifest-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>CalAcademy Manifest</a></span></li><li><span><a href="#MOBOT-Manifest-(Old)" data-toc-modified-id="MOBOT-Manifest-(Old)-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>MOBOT Manifest (Old)</a></span></li></ul></div>

# Setup

In [2]:
from os.path import basename
from pathlib import Path

import pandas as pd

import lib.db as db
import lib.util as util

In [3]:
# Connection handle from the project's db helper; every query below reads through it.
CXN = db.connect()

# Relative location of the interim-data directory that receives the manifest CSVs.
INTERIM_DATA = Path('..', 'data', 'interim')

# CalAcademy Manifest

In [15]:
# Fetch each CalAcademy specimen photo joined to its taxon record.
sql = """
    SELECT image_file, raw_images.sample_id, scientific_name FROM raw_images
      JOIN taxon_ids ON (taxon_ids.id = raw_images.sample_id)
     WHERE image_file LIKE '%/CAS-DOE-nitfix_specimen_photos/%'
"""

# The greedy regex keeps only the text after the last '/', i.e. the bare file name.
images = pd.read_sql(sql, CXN).assign(
    image_file=lambda frame: frame['image_file'].str.extract(
        r'.*/(.*)', expand=False
    )
)

print(len(images))
images.head()

2589


Unnamed: 0,image_file,sample_id,scientific_name
0,R0001833.JPG,4360e849-b172-4ed9-973f-c5899b17842c,Begonia acetosella
1,R0001851.JPG,45ed57c1-fcbc-4aca-adaa-1c3938c9cf24,Begonia almedana
2,R0001852.JPG,45e91b4e-fca1-485e-9dce-bb030900a807,Begonia angustiloba
3,R0001853.JPG,45e3b5f1-9f85-4b15-b96c-ca03aca2148d,Begonia aptera
4,R0001799.JPG,445bbcdd-0827-40bf-89a6-bc11f0652765,Begonia arborescens


In [17]:
# Fetch the CalAcademy photos that were recorded in the image_errors table.
sql = """
    SELECT image_file FROM image_errors
     WHERE image_file LIKE '%/CAS-DOE-nitfix_specimen_photos/%'
"""

# Strip the directory part so only the file name (text after the last '/') remains.
errors = pd.read_sql(sql, CXN).assign(
    image_file=lambda frame: frame['image_file'].str.extract(
        r'.*/(.*)', expand=False
    )
)

print(len(errors))
errors

7


Unnamed: 0,image_file
0,R0000614.JPG
1,R0001311.JPG
2,R0001361.JPG
3,R0002092.JPG
4,R0002349.JPG
5,R0002381.JPG
6,R0002641.JPG


In [18]:
# Write the manifest and the list of errored images as CSVs without the index column.
images.to_csv(INTERIM_DATA.joinpath('cas_manifest.csv'), index=False)
errors.to_csv(INTERIM_DATA.joinpath('cas_manifest_missing.csv'), index=False)

# MOBOT Manifest (Old)

In [17]:
# Load the full taxonomy table and the MOBOT image records.
taxonomy = pd.read_sql('SELECT * FROM taxons', CXN)

sql = """SELECT *
           FROM images
          WHERE file_name LIKE '%/MO-DOE-nitfix_specimen_photos/%'"""

images = pd.read_sql(sql, CXN)

# Map every sample GUID listed on a taxonomy row to that row's scientific name.
# Rows are visited in table order, so a GUID that appears on several rows keeps
# the name from the last one.
taxons = {
    guid: row.scientific_name
    for _, row in taxonomy.iterrows()
    for guid in util.split_uuids(row.sample_ids)
}

In [20]:
# Attach the resolved scientific name to every image by looking its sample ID
# up in the `taxons` mapping. Series.map does the lookup in one vectorized
# pass, always creates the column (even when `images` has no rows), and leaves
# sample IDs that are absent from `taxons` as missing values.
images['resolved_name'] = images['sample_id'].map(taxons)

images.head()

Unnamed: 0,sample_id,file_name,resolved_name
0,0e6f248b-aa6c-40f1-b571-1f0db94ad3a3,../data/raw/MO-DOE-nitfix_specimen_photos/R000...,Abrus fruticulosus
1,0e669154-9371-441d-85fa-683c1761b184,../data/raw/MO-DOE-nitfix_specimen_photos/R000...,Abrus schimperi
2,0e641ae8-5219-495f-8c72-7b76449c67bc,../data/raw/MO-DOE-nitfix_specimen_photos/R000...,Adenocarpus anagyrifolius
3,0e5e5134-11b2-47ee-aef3-6b1d753b172f,../data/raw/MO-DOE-nitfix_specimen_photos/R000...,Adenocarpus bacquei
4,0e5a4cc5-bd58-4062-8918-94e0c8e9fa7e,../data/raw/MO-DOE-nitfix_specimen_photos/R000...,Adenocarpus decorticans


In [33]:
# Replace each stored path with its final path component (the bare file name).
images['file_name'] = images['file_name'].map(basename)

print(len(images))
images.head()

1024


Unnamed: 0,sample_id,file_name,resolved_name
0,0e6f248b-aa6c-40f1-b571-1f0db94ad3a3,R0002644.JPG,Abrus fruticulosus
1,0e669154-9371-441d-85fa-683c1761b184,R0002645.JPG,Abrus schimperi
2,0e641ae8-5219-495f-8c72-7b76449c67bc,R0002646.JPG,Adenocarpus anagyrifolius
3,0e5e5134-11b2-47ee-aef3-6b1d753b172f,R0002647.JPG,Adenocarpus bacquei
4,0e5a4cc5-bd58-4062-8918-94e0c8e9fa7e,R0002648.JPG,Adenocarpus decorticans


In [31]:
# Save the MOBOT manifest, without the index column, into the interim-data directory.
csv_path = INTERIM_DATA.joinpath('mobot_manifest.csv')
images.to_csv(csv_path, index=False)

In [32]:
# Boolean mask of images whose sample ID did not resolve to a scientific name.
missing = images['resolved_name'].isna()

# Keep just those unresolved rows and report how many there are.
missing_images = images.loc[missing]
len(missing_images)

32