<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Setup" data-toc-modified-id="Setup-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Setup</a></span></li><li><span><a href="#Get-the-Master-Taxonomy-Google-Sheet" data-toc-modified-id="Get-the-Master-Taxonomy-Google-Sheet-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Get the Master Taxonomy Google Sheet</a></span></li><li><span><a href="#Link-Images-to-Taxonomy-IDs" data-toc-modified-id="Link-Images-to-Taxonomy-IDs-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Link Images to Taxonomy IDs</a></span></li><li><span><a href="#Link-Pilot-Data-to-Taxonomy-IDs" data-toc-modified-id="Link-Pilot-Data-to-Taxonomy-IDs-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Link Pilot Data to Taxonomy IDs</a></span></li><li><span><a href="#Link-Corrales-Data-to-Taxonomy-IDs" data-toc-modified-id="Link-Corrales-Data-to-Taxonomy-IDs-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Link Corrales Data to Taxonomy IDs</a></span></li><li><span><a href="#Merge-per-ID-Data-into-the-Taxonomy-Data" data-toc-modified-id="Merge-per-ID-Data-into-the-Taxonomy-Data-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Merge per ID Data into the Taxonomy Data</a></span></li><li><span><a href="#Read-the-Genbank-Loci-Google-Sheet" data-toc-modified-id="Read-the-Genbank-Loci-Google-Sheet-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Read the Genbank Loci Google Sheet</a></span></li><li><span><a href="#Get-NitFix-1-Expedition-Data" data-toc-modified-id="Get-NitFix-1-Expedition-Data-8"><span class="toc-item-num">8&nbsp;&nbsp;</span>Get NitFix 1 Expedition Data</a></span></li><li><span><a href="#Output-the-Taxonomy-Data" data-toc-modified-id="Output-the-Taxonomy-Data-9"><span class="toc-item-num">9&nbsp;&nbsp;</span>Output the Taxonomy Data</a></span></li></ul></div>

# Setup

In [1]:
import os
import re
from pathlib import Path
from functools import partial

import numpy as np
import pandas as pd
from pandas.api.types import is_number
import dropbox
from dotenv import load_dotenv, find_dotenv

import lib.db as db
import lib.util as util
import lib.google as google

Some settings are read from a `.env` file, which is not stored in the repository.

In [2]:
# Locate the .env file and load its settings (the DROPBOX value is read later
# with os.getenv). The call's return value is shown as the cell output.
load_dotenv(find_dotenv())

True

In [3]:
# Data directories used by this notebook, given as relative paths.
INTERIM_DATA = Path('..', 'data', 'interim')      # downloaded / intermediate CSVs
PROCESSED_DATA = Path('..', 'data', 'processed')  # final CSV exports

In [4]:
def join_columns(columns, row):
    """Comma-join the string values that `row` holds under `columns`.

    Args:
        columns: Iterable of keys (column names) to look up in `row`.
        row: Mapping-like object indexable by those keys, e.g. a DataFrame
            row passed in by `DataFrame.apply(..., axis=1)`.

    Returns:
        The string values joined with ',' in the order of `columns`, or
        np.nan when none of the looked-up values is a string.

    Only `str` values are kept. Numbers (including float NaN, the usual
    missing-value marker after a left merge) are skipped as before; None and
    other non-string objects are now skipped as well instead of making
    `str.join` raise a TypeError.
    """
    parts = [row[c] for c in columns if isinstance(row[c], str)]
    value = ','.join(parts)
    return value if value else np.nan

# Get the Master Taxonomy Google Sheet

In [5]:
# Export the master taxonomy Google Sheet to a local CSV, then load that CSV.
csv_path = INTERIM_DATA / 'taxonomy.csv'
google.sheet_to_csv('NitFixMasterTaxonomy', csv_path)

# header=0 together with names= replaces the file's own header row with
# these column names.
taxonomy_columns = [
    'column_a',
    'family',
    'scientific_name',
    'authority',
    'synonyms',
    'ids',
    'provider_acronym',
    'provider_id',
    'quality_notes',
]
taxonomy = pd.read_csv(csv_path, header=0, names=taxonomy_columns)

# Split each name on whitespace once: the first word becomes the genus and
# the words re-joined with single spaces become the normalized name.
name_words = taxonomy.scientific_name.str.split()
taxonomy['genus'] = name_words.str[0]
taxonomy.scientific_name = name_words.str.join(' ')

# Lower-case the IDs and collapse their whitespace runs to single spaces.
taxonomy.ids = taxonomy.ids.str.lower().str.split().str.join(' ')

In [6]:
# Break the combined `ids` string into one column per ID. IDs are separated
# by ';' or ',' with optional whitespace on either side.
split_ids = taxonomy.ids.str.split(r'\s*[;,]\s*', expand=True)

# Rename the positional columns 0, 1, 2, ... to id_1, id_2, id_3, ...
id_cols = {i: f'id_{i + 1}' for i in split_ids.columns}
split_ids = split_ids.rename(columns=id_cols)

# Append the per-ID columns to the right of the taxonomy columns.
taxonomy = pd.concat([taxonomy, split_ids], axis='columns')

print(taxonomy.shape[0])
taxonomy.head()

38655


Unnamed: 0,column_a,family,scientific_name,authority,synonyms,ids,provider_acronym,provider_id,quality_notes,genus,id_1,id_2,id_3
0,kew-2640275,Anisophylleaceae,Anisophyllea apetala,Scort. ex King,,,,,,Anisophyllea,,,
1,kew-2640276,Anisophylleaceae,Anisophyllea beccariana,Baill.,,,,,,Anisophyllea,,,
2,kew-2640277,Anisophylleaceae,Anisophyllea boehmii,Engl.,"Anisophyllea exellii, Anisophyllea gossweileri",,,,,Anisophyllea,,,
3,kew-2640279,Anisophylleaceae,Anisophyllea buchneri,Engl. & Brehmer,,,,,,Anisophyllea,,,
4,kew-2640280,Anisophylleaceae,Anisophyllea buettneri,Engl.,Anisophyllea brachystila,,,,,Anisophyllea,,,


# Link Images to Taxonomy IDs

In [7]:
# Open the project database; this connection is reused by the later cells.
CXN = db.connect()

# Load every row of the raw_images table.
images = pd.read_sql(sql='SELECT * FROM raw_images', con=CXN)

print(images.shape[0])
images.head()

8909


Unnamed: 0,sample_id,image_file
0,6fcdf583-e9bb-4764-84de-f277cc6ec6b7,../data/raw/DOE-nitfix_specimen_photos/R000000...
1,6fa18219-4958-4d75-8bf3-032fa909315c,../data/raw/DOE-nitfix_specimen_photos/R000000...
2,6f93bea8-43f4-45ad-95f5-ecad63f13037,../data/raw/DOE-nitfix_specimen_photos/R000000...
3,6f66cc88-3583-4e9b-97ea-03b1d681def8,../data/raw/DOE-nitfix_specimen_photos/R000000...
4,6f5bc099-ff55-4740-8a2f-e63466b47892,../data/raw/DOE-nitfix_specimen_photos/R000000...


In [8]:
# Reshape to one row per (scientific_name, id column): melt puts the id
# column name in `variable` and the ID itself in `value`, renamed to `id`.
ids = (
    taxonomy
    .melt(id_vars=['scientific_name'], value_vars=split_ids.columns)
    .rename(columns={'value': 'id'})
)

# Keep only IDs longer than 4 characters. Missing IDs compare False and are
# dropped as well.
# NOTE(review): the threshold of 4 presumably screens out placeholder
# entries -- confirm against the sheet.
has_id = ids.id.str.len() > 4

# Take an explicit copy: assigning a column on the boolean-indexed slice
# directly is the chained-assignment pattern that triggers pandas'
# SettingWithCopyWarning.
ids = ids[has_id].copy()

# Collapse whitespace runs inside each ID to single spaces.
ids.id = ids.id.str.split().str.join(' ')

# Attach image data where the ID matches an image's sample_id; IDs without a
# match keep NaN in the image columns.
ids = ids.merge(right=images, how='left', left_on='id', right_on='sample_id')

ids[ids['image_file'].notna()].head()

Unnamed: 0,scientific_name,variable,id,sample_id,image_file
0,Anisophyllea purpurascens,id_1,0d31695e-7b2b-416f-80a1-1480cccc845a,0d31695e-7b2b-416f-80a1-1480cccc845a,../data/raw/NY_DOE-nitfix_visit3/R0006006.JPG
1,Apodanthes caseariae,id_1,0d41a40b-1669-418e-8b4b-31e648451c26,0d41a40b-1669-418e-8b4b-31e648451c26,../data/raw/NY_DOE-nitfix_visit3/R0006000.JPG
2,Berlinianche aethiopica,id_1,0d356dfd-5531-472a-bc76-361ee00c0b0d,0d356dfd-5531-472a-bc76-361ee00c0b0d,../data/raw/NY_DOE-nitfix_visit3/R0006004.JPG
3,Pilostyles blanchetii,id_1,0d3f2955-45a3-45bc-ad1b-09e8a33d1efd,0d3f2955-45a3-45bc-ad1b-09e8a33d1efd,../data/raw/NY_DOE-nitfix_visit3/R0006002.JPG
4,Pilostyles haussknechtii,id_1,0d352b41-5fe2-4f89-8486-a29a02140618,0d352b41-5fe2-4f89-8486-a29a02140618,../data/raw/NY_DOE-nitfix_visit3/R0006005.JPG


# Link Pilot Data to Taxonomy IDs

In [9]:
# Load every row of the raw_pilot table.
pilot = pd.read_sql(sql='SELECT * FROM raw_pilot', con=CXN)

print(pilot.shape[0])
pilot.head()

456


Unnamed: 0,pilot_id,sample_id,image_file
0,ny: cronquist 11617,2179dce7-dac2-4fc1-84a3-8725acefa8cc,../data/raw/UFBI_sample_photos/20170523_154701...
1,ny: nee 38556,00420ba6-4228-49e8-845c-30a967de4b51,../data/raw/UFBI_sample_photos/20170523_154645...
2,ny: jorengensen 65676,72b64a1e-0dd9-4f44-9f82-afaee163d57b,../data/raw/UFBI_sample_photos/20170523_154638...
3,ny: jaramillo 10160,3364f3bb-c0a1-4af4-8b3b-a780de9f1594,../data/raw/UFBI_sample_photos/20170523_154629...
4,ny: jorgensen 61589,6e76a0be-4b0f-4e01-a6e6-1cd1395d4458,../data/raw/UFBI_sample_photos/20170523_154621...


In [10]:
# Attach pilot rows whose pilot_id equals the taxonomy ID. Both frames carry
# image_file and sample_id, so the merge yields *_x (left) / *_y (right).
ids = ids.merge(right=pilot, how='left', left_on='id', right_on='pilot_id')

# Fold each _x/_y pair back into a single comma-joined column, then discard
# the pair.
for merged in ('image_file', 'sample_id'):
    columns = [f'{merged}_x', f'{merged}_y']
    ids[merged] = ids.apply(partial(join_columns, columns), axis=1)
    ids.drop(columns, axis=1, inplace=True)

print(ids.shape[0])
ids.head()

9687


Unnamed: 0,scientific_name,variable,id,pilot_id,image_file,sample_id
0,Anisophyllea purpurascens,id_1,0d31695e-7b2b-416f-80a1-1480cccc845a,,../data/raw/NY_DOE-nitfix_visit3/R0006006.JPG,0d31695e-7b2b-416f-80a1-1480cccc845a
1,Apodanthes caseariae,id_1,0d41a40b-1669-418e-8b4b-31e648451c26,,../data/raw/NY_DOE-nitfix_visit3/R0006000.JPG,0d41a40b-1669-418e-8b4b-31e648451c26
2,Berlinianche aethiopica,id_1,0d356dfd-5531-472a-bc76-361ee00c0b0d,,../data/raw/NY_DOE-nitfix_visit3/R0006004.JPG,0d356dfd-5531-472a-bc76-361ee00c0b0d
3,Pilostyles blanchetii,id_1,0d3f2955-45a3-45bc-ad1b-09e8a33d1efd,,../data/raw/NY_DOE-nitfix_visit3/R0006002.JPG,0d3f2955-45a3-45bc-ad1b-09e8a33d1efd
4,Pilostyles haussknechtii,id_1,0d352b41-5fe2-4f89-8486-a29a02140618,,../data/raw/NY_DOE-nitfix_visit3/R0006005.JPG,0d352b41-5fe2-4f89-8486-a29a02140618


# Link Corrales Data to Taxonomy IDs

In [11]:
# Load every row of the raw_corrales table.
corrales = pd.read_sql(sql='SELECT * FROM raw_corrales', con=CXN)

print(corrales.shape[0])
corrales.head()

48


Unnamed: 0,corrales_id,sample_id,image_file
0,corrales: corrales 770,eb9dc632-82a9-479f-88d5-f172ee6cc2d7,../data/raw/missing_photos/Corrales_770.jpg
1,corrales: corrales 830,4a31b17c-08f2-4236-a7b1-2261554ea658,../data/raw/missing_photos/Corrales_830.jpg
2,corrales: corrales 792,3ffa0f4c-1180-4268-8ba5-c9cc2e251350,../data/raw/missing_photos/Corrales_792.jpg
3,corrales: corrales 704,b6b6b66e-3b3b-4d5c-8197-a959a8fe715e,../data/raw/missing_photos/Corrales_704.jpg
4,corrales: corrales 754,56013e5d-6b1b-4c1c-8f01-3bb7e57c3a83,../data/raw/missing_photos/Corrales_754.jpg


In [12]:
# Attach Corrales rows whose corrales_id equals the taxonomy ID. The shared
# image_file / sample_id columns come back as *_x (left) / *_y (right).
ids = ids.merge(
    right=corrales, how='left', left_on='id', right_on='corrales_id')

# Fold each _x/_y pair into a comma-joined plural column (image_files,
# sample_ids), then discard the pair.
for singular in ('image_file', 'sample_id'):
    columns = [f'{singular}_x', f'{singular}_y']
    ids[f'{singular}s'] = ids.apply(partial(join_columns, columns), axis=1)
    ids.drop(columns, axis=1, inplace=True)

# Discard the rows whose ID marks a Corrales sample without a voucher.
no_id = ids.id.str.contains('corrales: corrales no voucher')
ids = ids.loc[~no_id]

print(ids.shape[0])
ids.head()

9685


Unnamed: 0,scientific_name,variable,id,pilot_id,corrales_id,image_files,sample_ids
0,Anisophyllea purpurascens,id_1,0d31695e-7b2b-416f-80a1-1480cccc845a,,,../data/raw/NY_DOE-nitfix_visit3/R0006006.JPG,0d31695e-7b2b-416f-80a1-1480cccc845a
1,Apodanthes caseariae,id_1,0d41a40b-1669-418e-8b4b-31e648451c26,,,../data/raw/NY_DOE-nitfix_visit3/R0006000.JPG,0d41a40b-1669-418e-8b4b-31e648451c26
2,Berlinianche aethiopica,id_1,0d356dfd-5531-472a-bc76-361ee00c0b0d,,,../data/raw/NY_DOE-nitfix_visit3/R0006004.JPG,0d356dfd-5531-472a-bc76-361ee00c0b0d
3,Pilostyles blanchetii,id_1,0d3f2955-45a3-45bc-ad1b-09e8a33d1efd,,,../data/raw/NY_DOE-nitfix_visit3/R0006002.JPG,0d3f2955-45a3-45bc-ad1b-09e8a33d1efd
4,Pilostyles haussknechtii,id_1,0d352b41-5fe2-4f89-8486-a29a02140618,,,../data/raw/NY_DOE-nitfix_visit3/R0006005.JPG,0d352b41-5fe2-4f89-8486-a29a02140618


# Merge per ID Data into the Taxonomy Data

In [13]:
def join_aggregate(values):
    """Comma-join the string entries of `values`.

    Args:
        values: Iterable of cell values, e.g. the per-group Series that
            `groupby(...).aggregate` passes in.

    Returns:
        The string entries joined with ',' in iteration order, or np.nan
        when `values` contains no string.

    Only `str` entries are kept. Numbers (including float NaN) are skipped
    as before; None and other non-string objects are now skipped as well
    instead of making `str.join` raise a TypeError.
    """
    value = ','.join([v for v in values if isinstance(v, str)])
    return value if value else np.nan

In [14]:
# Collapse the per-ID rows to one row per scientific name, comma-joining the
# values of each listed column within a name.
aggregated_columns = ['id', 'pilot_id', 'sample_ids', 'image_files']
groups = ids.groupby('scientific_name').agg(
    dict.fromkeys(aggregated_columns, join_aggregate))

print(groups.shape[0])
groups.head()

9619


Unnamed: 0_level_0,id,pilot_id,sample_ids,image_files
scientific_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Abarema abbottii,6552469c-0952-4185-8b93-7fde26747b0e,,6552469c-0952-4185-8b93-7fde26747b0e,../data/raw/NY_DOE-nitfix_visit3/R0004892.JPG
Abarema acreana,652a908c-3a83-4ac4-9f26-ef1cd895b409,,652a908c-3a83-4ac4-9f26-ef1cd895b409,../data/raw/NY_DOE-nitfix_visit3/R0004898.JPG
Abarema adenophora,655cbcbb-19f8-4664-9eb5-e59291dbbd37,,655cbcbb-19f8-4664-9eb5-e59291dbbd37,../data/raw/NY_DOE-nitfix_visit3/R0004890.JPG
Abarema agropecuaria,6529081a-34d5-4971-8264-3cb754732b2c,,6529081a-34d5-4971-8264-3cb754732b2c,../data/raw/NY_DOE-nitfix_visit3/R0004899.JPG
Abarema alexandri,653fc52a-c880-4507-96b4-eaf3f2db2218,,653fc52a-c880-4507-96b4-eaf3f2db2218,../data/raw/NY_DOE-nitfix_visit3/R0004893.JPG


In [15]:
# Bring the per-name aggregates onto the taxonomy rows. `groups` is indexed
# by scientific_name, so the join pairs the taxonomy column with that index.
# Names absent from `groups` keep NaN in the added columns.
taxonomy = taxonomy.merge(
    groups, left_on='scientific_name', right_index=True, how='left')
taxonomy.head()

Unnamed: 0,column_a,family,scientific_name,authority,synonyms,ids,provider_acronym,provider_id,quality_notes,genus,id_1,id_2,id_3,id,pilot_id,sample_ids,image_files
0,kew-2640275,Anisophylleaceae,Anisophyllea apetala,Scort. ex King,,,,,,Anisophyllea,,,,,,,
1,kew-2640276,Anisophylleaceae,Anisophyllea beccariana,Baill.,,,,,,Anisophyllea,,,,,,,
2,kew-2640277,Anisophylleaceae,Anisophyllea boehmii,Engl.,"Anisophyllea exellii, Anisophyllea gossweileri",,,,,Anisophyllea,,,,,,,
3,kew-2640279,Anisophylleaceae,Anisophyllea buchneri,Engl. & Brehmer,,,,,,Anisophyllea,,,,,,,
4,kew-2640280,Anisophylleaceae,Anisophyllea buettneri,Engl.,Anisophyllea brachystila,,,,,Anisophyllea,,,,,,,


# Read the Genbank Loci Google Sheet

In [16]:
# Export the genbank_loci Google Sheet to a local CSV, then load that CSV.
csv_path = INTERIM_DATA / 'loci.csv'
google.sheet_to_csv('genbank_loci', csv_path)

# header=0 together with names= replaces the file's own header row with
# these column names.
loci_columns = ['scientific_name', 'its', 'atpb', 'matk', 'matr', 'rbcl']
loci = pd.read_csv(csv_path, header=0, names=loci_columns)

# Collapse whitespace runs in the names to single spaces.
loci.scientific_name = loci.scientific_name.str.split().str.join(' ')

print(loci.shape[0])
loci.head()

38564


Unnamed: 0,scientific_name,its,atpb,matk,matr,rbcl
0,Abarema abbottii,0,0,0,0,0
1,Abarema acreana,0,0,0,0,0
2,Abarema adenophora,0,0,0,0,0
3,Abarema adenophorum,0,0,0,0,0
4,Abarema agropecuaria,0,0,0,0,0


In [17]:
# Add the loci columns to each taxonomy row by scientific name. Taxonomy
# rows with no matching loci row get NaN in those columns.
taxonomy = taxonomy.merge(loci, on='scientific_name', how='left')
taxonomy.head()

Unnamed: 0,column_a,family,scientific_name,authority,synonyms,ids,provider_acronym,provider_id,quality_notes,genus,...,id_3,id,pilot_id,sample_ids,image_files,its,atpb,matk,matr,rbcl
0,kew-2640275,Anisophylleaceae,Anisophyllea apetala,Scort. ex King,,,,,,Anisophyllea,...,,,,,,0.0,0.0,0.0,0.0,0.0
1,kew-2640276,Anisophylleaceae,Anisophyllea beccariana,Baill.,,,,,,Anisophyllea,...,,,,,,0.0,0.0,0.0,0.0,0.0
2,kew-2640277,Anisophylleaceae,Anisophyllea boehmii,Engl.,"Anisophyllea exellii, Anisophyllea gossweileri",,,,,Anisophyllea,...,,,,,,0.0,0.0,1.0,0.0,1.0
3,kew-2640279,Anisophylleaceae,Anisophyllea buchneri,Engl. & Brehmer,,,,,,Anisophyllea,...,,,,,,0.0,0.0,0.0,0.0,0.0
4,kew-2640280,Anisophylleaceae,Anisophyllea buettneri,Engl.,Anisophyllea brachystila,,,,,Anisophyllea,...,,,,,,0.0,0.0,0.0,0.0,0.0


# Get NitFix 1 Expedition Data

In [18]:
# The value handed to the Dropbox client comes from the DROPBOX environment
# variable (None when it is unset), so it never appears in the notebook.
DROPBOX = os.environ.get('DROPBOX')
DBX = dropbox.Dropbox(DROPBOX)

In [19]:
# Download the NitFix 1 expedition CSV from Dropbox to a local file.
csv_path = str(INTERIM_DATA / 'nitfix01.csv')
dbx_path = 'id:zSBrtnqOfSAAAAAAAAAAKw/5657_Nit_Fix_I.reconcile.4.2.csv'
file_metadata = DBX.files_download_to_file(csv_path, dbx_path)

nitfix01 = pd.read_csv(csv_path)

# Map every original header to a lower-case name made of [a-z0-9_] only.
columns = {}
for old in nitfix01.columns:
    new = old.lower()
    # Spell out the coordinate symbols. The doubled quote has to be replaced
    # before the single quote, or "''" would turn into 'minmin'.
    for symbol, word in (('⁰', 'deg'), ("''", 'sec'), ("'", 'min')):
        new = new.replace(symbol, word)
    # Every run of other characters becomes one underscore ...
    new = re.sub(r'[^a-z0-9_]+', '_', new)
    # ... and a single leading or trailing underscore is trimmed.
    new = re.sub(r'^_|_$', '', new)
    columns[old] = new

# The QR-code column is renamed to sample_id, the key used in the merge below.
columns['subject_qr_code'] = 'sample_id'

nitfix01 = nitfix01.rename(columns=columns)

# Attach the expedition record to each taxonomy ID that matches a sample_id.
ids = ids.merge(
    right=nitfix01, how='left', left_on='id', right_on='sample_id')

nitfix01.head()

Unnamed: 0,subject_id,country,state_province,county,location,minimum_elevation,maximum_elevation,main_dropdown,latitude_deg,latitude_min,...,month_1,day_1,year_1,month_2,day_2,year_2,subject_image_name,subject_nybg_bar_code,subject_resolved_name,sample_id
0,16192935,United States of America,North Carolina,Wayne,"Town of Fremont, along NC Rt.222, east of Evan...",,,feet,35,32,...,9 - September,25,2011,,,,R0001220.JPG,NYBG 3196996,Senna obtusifolia,8e37959f-dfa6-44b6-a201-b94215340016
1,16192937,United States of America,Arizona,Maricopa,"Salt Rivr at 35th Avenue bridge in Phoenix, ju...",1022.0,,feet,33.411913,,...,1 - January,21,2012,,,,R0001205.JPG,NYBG 3196995,Senna artemisioides,90a9d5ee-a1c6-4dd3-b6b1-6932ea796abd
2,16192938,Gabon,Ogooué-Lolo,,"Makande surroundings, c. 65 km SSW of Booué. I...",,,,- 0,41 S,...,2 - February,11,1999,,,,R0001202.JPG,NYBG 3196994,Scorodophloeus zenkeri,90f68e06-c5cb-48dc-9de1-5c0512314486
3,16192939,"Tanzania, United Republic of",Tanga,,"Mkaramo Parish, Mkwaja Subchiefeom, Mwera Chie...",150.0,,feet,,,...,7 - July,10,1957,,,,R0001201.JPG,NYBG 3196992,Scorodophloeus fischeri,90fb8362-a4ed-407d-a8b1-32dc56506101
4,16192941,Congo (Democratic Republic of the),Kasaï-Central,,Babadi - Kasai,,,unknown,,,...,12 - December,Not Shown,1934,,,,R0001199.JPG,NYBG 3196991,Leonardoxa romii,911525c9-04f7-4213-8781-a9842216c2d8


# Output the Taxonomy Data

In [20]:
# Persist the merged taxonomy twice: as a database table (replacing any
# existing one) and as a CSV in the processed-data directory.
name = 'taxonomy'
taxonomy.to_sql(name=name, con=CXN, if_exists='replace', index=False)
taxonomy.to_csv(PROCESSED_DATA.joinpath(f'{name}.csv'), index=False)

# Index the table on scientific_name; the cursor returned by execute() is
# the cell's displayed value.
CXN.execute("""CREATE INDEX taxonomy_idx ON taxonomy (scientific_name)""")

<sqlite3.Cursor at 0x7f60a162d030>

In [21]:
# Persist the per-ID rows twice: as a database table (replacing any existing
# one) and as a CSV in the processed-data directory.
name = 'taxon_ids'
ids.to_sql(name=name, con=CXN, if_exists='replace', index=False)
ids.to_csv(PROCESSED_DATA.joinpath(f'{name}.csv'), index=False)

# Index the table on scientific_name; the cursor returned by execute() is
# the cell's displayed value.
CXN.execute("""CREATE INDEX taxon_ids_idx ON taxon_ids (scientific_name)""")

<sqlite3.Cursor at 0x7f60a162d500>

In [22]:
# Persist the loci sheet as a database table (replacing any existing one)
# and as a CSV in the processed-data directory.
name = 'raw_loci'
loci.to_sql(name=name, con=CXN, if_exists='replace', index=False)
loci.to_csv(PROCESSED_DATA.joinpath(f'{name}.csv'), index=False)

In [23]:
# Persist the renamed NitFix 1 expedition data as a database table
# (replacing any existing one) and as a CSV in the processed-data directory.
name = 'raw_nitfix01'
nitfix01.to_sql(name=name, con=CXN, if_exists='replace', index=False)
nitfix01.to_csv(PROCESSED_DATA.joinpath(f'{name}.csv'), index=False)