In [1]:
import os
from glob import glob
from pathlib import Path
from datetime import datetime
import pandas as pd
import lib.db as db
import lib.google as google
import lib.util as util

In [2]:
# Handle returned by db.connect_up(); every query in this notebook runs
# through it.
CXN = db.connect_up()

# Data directories, expressed relative to the notebook's working directory.
INTERIM_DATA = Path('..', 'data', 'interim')
EXTERNAL_DATA = Path('..', 'data', 'external')

## Look for duplicate sample IDs in the image table

In [3]:
def check_image_sample_ids():
    """Return every ``images`` row whose ``sample_id`` occurs more than once.

    Returns:
        pandas.DataFrame: the offending rows ordered by ``sample_id``; the
        frame is empty when no ``sample_id`` is shared between records.
    """
    duplicate_query = """
        WITH dups AS (SELECT sample_id, COUNT(*) AS n
                        FROM images
                    GROUP BY sample_id
                    HAVING n > 1)
        SELECT *
          FROM images
         WHERE sample_id IN (SELECT sample_id FROM dups)
      ORDER BY sample_id
    """
    duplicates = pd.read_sql(duplicate_query, CXN)
    return duplicates


check_image_sample_ids()

Unnamed: 0,image_file,sample_id


## Look for duplicate IDs in the taxon IDs table

In [4]:
def check_duplicate_taxon_ids():
    """Return every ``taxon_ids`` row whose ``id`` occurs more than once.

    Each row is paired with the image file recorded for that id so the
    conflicting identifications can be checked against the photograph.

    Returns:
        pandas.DataFrame: columns ``id``, ``scientific_name`` and
        ``image_file``, ordered by id and then scientific name. ``image_file``
        is null for a duplicated id that has no row in ``images``.
    """
    # LEFT JOIN rather than an inner join: a duplicated id must be reported
    # even when no image record exists for it, otherwise the check silently
    # hides it. Columns are table-qualified so the query stays unambiguous,
    # and the secondary sort key makes the row order reproducible.
    sql = """
        WITH dups AS (SELECT id, COUNT(*) AS n
                        FROM taxon_ids
                    GROUP BY id
                    HAVING n > 1)
           SELECT taxon_ids.id AS id,
                  taxon_ids.scientific_name AS scientific_name,
                  images.image_file AS image_file
             FROM taxon_ids
        LEFT JOIN images ON (taxon_ids.id = images.sample_id)
            WHERE taxon_ids.id IN (SELECT id FROM dups)
         ORDER BY taxon_ids.id, taxon_ids.scientific_name
    """
    return pd.read_sql(sql, CXN)


check_duplicate_taxon_ids()

Unnamed: 0,id,scientific_name,image_file
0,188b1756-2a10-4db8-b897-640a91f3a858,Calliandra humilis,CAS-DOE-nitfix_specimen_photos/R0000613.JPG
1,188b1756-2a10-4db8-b897-640a91f3a858,Zapoteca formosa,CAS-DOE-nitfix_specimen_photos/R0000613.JPG
2,37fd5e47-9bc1-42b5-af0b-221140d67728,Monnina pseudosalicifolia,NY_DOE-nitfix_visit3/R0005299.JPG
3,37fd5e47-9bc1-42b5-af0b-221140d67728,Monnina weddelliana,NY_DOE-nitfix_visit3/R0005299.JPG
4,51535d90-c316-4262-9cfe-3bf6d6bbb095,Kotschya strobilantha,MO-DOE-nitfix_visit2/R0005831.JPG
5,51535d90-c316-4262-9cfe-3bf6d6bbb095,Lathyrus ochrus,MO-DOE-nitfix_visit2/R0005831.JPG
6,68d40646-2b30-4eb6-988e-c87eba536489,Jacksonia aculeata,MO-DOE-nitfix_visit2/R0006295.JPG
7,68d40646-2b30-4eb6-988e-c87eba536489,Jacksonia alata,MO-DOE-nitfix_visit2/R0006295.JPG
8,704626ba-27ae-4b82-b4fd-7c6aacb184fe,Lotus ornithopodioides,MO-DOE-nitfix_visit2/R0005988.JPG
9,704626ba-27ae-4b82-b4fd-7c6aacb184fe,Lotus polyphyllos,MO-DOE-nitfix_visit2/R0005988.JPG


In [5]:
def check_image_records():
    """Print a tally reconciling image files on disk with the image tables.

    The tally is: rows in ``images`` plus rows in ``image_errors``, minus the
    ``*.JPG`` files found under each ``util.IMAGE_DIRS`` directory, minus the
    ``images`` rows filed under ``UFBI_sample_photos/`` and
    ``missing_photos/``. A tally of zero is reported as ``pass``, anything
    else as ``fail``. Nothing is returned.
    """
    def scalar(query):
        """Run a query that yields one value and return that value."""
        return CXN.execute(query).fetchone()[0]

    # Files on disk, summed over every configured image directory.
    image_files = sum(
        len(glob(os.fspath(util.IMAGE_ROOT_UP / image_dir / '*.JPG')))
        for image_dir in util.IMAGE_DIRS)

    image_recs = scalar("""SELECT COUNT(*) FROM images""")
    errors = scalar('SELECT COUNT(*) FROM image_errors')

    pilot_sql = """SELECT COUNT(*) FROM images
              WHERE image_file LIKE 'UFBI_sample_photos/%'"""
    pilot = scalar(pilot_sql)

    missing_sql = """SELECT COUNT(*) FROM images
              WHERE image_file LIKE 'missing_photos/%'"""
    missing = scalar(missing_sql)

    # Signs mirror the printed report below.
    check = image_recs + errors - image_files - pilot - missing
    result = 'pass' if check == 0 else 'fail'

    print(f'Image records:    {image_recs:6,d}')
    print(f'Image errors:   + {errors:6,d}')
    print(f'Image files:    - {image_files:6,d} This includes errors')
    print(f'Pilot images:   - {pilot:6,d} No files for these')
    print(f'Missing images: - {missing:6,d} No files for these')
    print(f'Check:          = {check:6,d} {result}')


check_image_records()

Image records:    11,126
Image errors:   +     59
Image files:    - 10,681 This includes errors
Pilot images:   -    456 No files for these
Missing images: -     48 No files for these
Check:          =      0 pass


## Can we find the extra image record?

In [6]:
def find_missing_image_records():
    """Return the record keys that cannot be matched to a file on disk.

    The keys are the first column of each ``images`` and ``image_errors``
    row (presumably the image file name -- confirm against the schema).
    From their union the function removes the normalized names of the
    ``*.JPG`` files found under ``util.IMAGE_DIRS`` and the keys of
    ``images`` rows filed under ``UFBI_sample_photos/`` or
    ``missing_photos/``.

    Returns:
        set: the keys that are left over; empty when everything reconciles.
    """
    def first_column(query):
        """Run a query and collect the first column of every row as a set."""
        return {row[0] for row in CXN.execute(query).fetchall()}

    # Normalized names of the files actually present on disk.
    on_disk = {
        util.normalize_file_name(path)
        for image_dir in util.IMAGE_DIRS
        for path in glob(os.fspath(util.IMAGE_ROOT_UP / image_dir / '*.JPG'))
    }

    image_recs = first_column("""SELECT * FROM images""")
    errors = first_column('SELECT * FROM image_errors')

    pilot_sql = """SELECT * FROM images
              WHERE image_file LIKE 'UFBI_sample_photos/%'"""
    pilot = first_column(pilot_sql)

    missing_sql = """SELECT * FROM images
              WHERE image_file LIKE 'missing_photos/%'"""
    missing = first_column(missing_sql)

    return (image_recs | errors) - on_disk - pilot - missing


find_missing_image_records()

set()

## Verify scientific names in Werner data

Use the master taxonomy's synonyms to adjust scientific names in the
Werner Excel sheet, then look for Werner scientific names that are still
not in the master taxonomy.

In [7]:
def get_scientific_names():
    """Build the name lookups used to reconcile data against ``taxonomy``.

    Returns:
        tuple: ``(sci_names, synonyms)``. ``sci_names`` is the list of every
        ``taxonomy.scientific_name``. ``synonyms`` is a dict mapping each
        individual synonym to the scientific name of the row it was listed
        on. Synonyms are stripped of surrounding whitespace and empty
        entries are skipped.
    """
    sql = """SELECT scientific_name, synonyms, family FROM taxonomy"""
    taxonomy = pd.read_sql(sql, CXN)

    # The synonyms field holds a ';' or ',' separated list. The split pattern
    # only eats whitespace next to a delimiter, so strip the whole field
    # first; otherwise padding survives on the first and last synonym.
    split = taxonomy.synonyms.str.strip().str.split(
        r'\s*[;,]\s*', expand=True)

    # One row per (scientific_name, synonym) pair.
    wide = pd.concat([taxonomy, split], axis=1)
    pairs = wide.melt(
        id_vars=['scientific_name'],
        value_vars=split.columns,
        value_name='synonym')

    # Drop the null cells left by rows with fewer synonyms than the widest
    # row, and the empty strings produced by an empty field or a trailing
    # delimiter -- those would otherwise become a bogus '' key.
    usable = pairs.synonym.notna() & pairs.synonym.ne('')
    pairs = pairs[usable]
    synonyms = pairs.set_index('synonym').scientific_name.to_dict()

    sci_names = taxonomy.scientific_name.tolist()

    return sci_names, synonyms

In [8]:
def read_werner_data():
    """Load ``NitFixWernerEtAl2014.xlsx`` from ``EXTERNAL_DATA`` and tidy it.

    The columns listed in ``unwanted_columns`` are dropped, and ``Species``,
    ``Family`` and ``Order`` are renamed to ``scientific_name``, ``family``
    and ``order``.

    Returns:
        pandas.DataFrame: the trimmed, renamed sheet.
    """
    unwanted_columns = [
        'NFC',
        'Legume',
        'Likelihood_non-precursor',
        'Likelihood_precursor',
        'Likelihood_fixer',
        'Likelihood_stable_fixer',
        'Most_likely_state',
        'Corrected_lik_precursor',
        'Corrected_lik_stable_fixer',
    ]
    new_names = {
        'Species': 'scientific_name',
        'Family': 'family',
        'Order': 'order',
    }

    sheet = pd.read_excel(EXTERNAL_DATA / 'NitFixWernerEtAl2014.xlsx')
    trimmed = sheet.drop(unwanted_columns, axis=1)
    return trimmed.rename(columns=new_names)

In [9]:
# Load the taxonomy lookups and the Werner sheet fresh, so re-running this
# cell always starts from the same state.
sci_names, synonyms = get_scientific_names()
werner = read_werner_data()

print(werner.shape)

# Keep only the rows belonging to the four orders listed here.
in_orders = werner['order'].isin(
    ['Cucurbitales', 'Fagales', 'Fabales', 'Rosales'])
# .copy() makes the filtered frame independent of the full sheet; the
# in-place rename of scientific names further down would otherwise be an
# assignment into a slice (SettingWithCopyWarning).
werner = werner[in_orders].copy()
print(werner.shape)

# Rows to rewrite: the name is not an accepted scientific name but is listed
# as a synonym of one.
is_sci_name = werner.scientific_name.isin(sci_names)
is_synonym = werner.scientific_name.isin(synonyms)
update_it = ~is_sci_name & is_synonym

(3467, 6)
(1709, 6)


In [10]:
# Replace synonyms with the scientific name they map to. The mask is computed
# here from the current names instead of being reused from the previous cell:
# after one run the names are already replaced, so a stale mask would look up
# accepted names in `synonyms` and could raise KeyError. With a fresh mask a
# re-run is simply a no-op.
is_known = werner.scientific_name.isin(sci_names)
needs_update = ~is_known & werner.scientific_name.isin(list(synonyms))
werner.loc[needs_update, 'scientific_name'] = \
    werner.loc[needs_update, 'scientific_name'].map(synonyms)

# How many Werner names are now in the master taxonomy.
found = werner.scientific_name.isin(sci_names)
print(found.sum())

# What is left could not be resolved through synonyms either.
missing = werner[~found]
print(missing.shape)
# verify_integrity doubles as a check that no unresolved name is repeated.
missing = missing.set_index('scientific_name', verify_integrity=True)
missing = missing.sort_index()
missing

1675
(34, 6)


Unnamed: 0_level_0,order,family,Data_fixing,Source,Source2
scientific_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Acomastylis rossii,Rosales,Rosaceae,No,TRY,Wright
Amphithalea parvifolia,Fabales,Fabaceae,Yes,Sprent2009,Appendices
Aspalathus subtingens,Fabales,Fabaceae,Yes,Sprent2009,Appendices
Betula davurica,Fagales,Betulaceae,No,TRY,Wright
Cercis gigantea,Fabales,Fabaceae,No,GRIN,
Cercocarpus betuloides,Rosales,Rosaceae,Yes,Manual,Becking_1975
Cullen cinereum,Fabales,Fabaceae,Yes,Sprent2009,Appendices
Cullen leucanthum,Fabales,Fabaceae,Yes,GRIN,
Cullen parvum,Fabales,Fabaceae,Yes,Sprent2009,Appendices
Cullen patens,Fabales,Fabaceae,Yes,Sprent2009,Appendices
