In [1]:
import os
from glob import glob
from pathlib import Path
from datetime import datetime
import pandas as pd
import lib.db as db
import lib.google as google
import lib.util as util

In [2]:
# Database connection used by the pd.read_sql calls in this notebook.
CXN = db.connect_up()

# Data directories, given relative to the notebook's working directory.
INTERIM_DATA = Path('..', 'data', 'interim')
EXTERNAL_DATA = Path('..', 'data', 'external')

In [3]:
# Build a {scientific_name: id} lookup from the taxon_ids table.
sql = """SELECT scientific_name, id FROM taxon_ids"""
taxon_ids = (
    pd.read_sql(sql, CXN)
    .set_index('scientific_name')['id']
    .to_dict()
)
len(taxon_ids)

12794

In [4]:
def get_scientific_names():
    """Return the taxonomy's scientific names and a synonym lookup.

    Reads the ``taxonomy`` table through the module-level ``CXN`` connection.
    Each row's ``synonyms`` field is treated as a list of names separated by
    ``;`` or ``,`` (with optional surrounding whitespace).

    Returns:
        tuple: ``(sci_names, synonyms)`` where ``sci_names`` is a list of every
        ``scientific_name`` in the table and ``synonyms`` is a dict mapping
        each non-empty synonym to the ``scientific_name`` of its row. If a
        synonym appears more than once, the last occurrence in the melted
        frame wins.
    """
    sql = """SELECT scientific_name, synonyms, family FROM taxonomy"""
    taxonomy = pd.read_sql(sql, CXN)

    # Strip the whole field first so leading/trailing whitespace does not end
    # up inside the first or last synonym; the pattern itself already eats the
    # whitespace around each delimiter. One output column per synonym.
    split = taxonomy.synonyms.str.strip().str.split(
        r'\s*[;,]\s*', expand=True)

    # Go from wide (one column per synonym) to long (one row per synonym).
    taxonomy = pd.concat([taxonomy, split], axis=1)
    long_form = taxonomy.melt(
        id_vars=['scientific_name'],
        value_vars=list(split.columns),
        value_name='synonym')

    # Missing values are padding from rows with fewer synonyms. Empty strings
    # come from blank fields or trailing delimiters and must not become keys.
    is_real = long_form.synonym.notna() & long_form.synonym.ne('')
    synonyms = (
        long_form[is_real]
        .set_index('synonym')
        .scientific_name
        .to_dict()
    )

    sci_names = taxonomy.scientific_name.tolist()

    return sci_names, synonyms

In [5]:
def read_werner_data():
    """Load the Werner et al. 2014 spreadsheet and keep its NFC == 'Yes' rows.

    Reads ``NitFixWernerEtAl2014.xlsx`` from ``EXTERNAL_DATA``, drops the
    likelihood / state columns listed below, and renames the remaining
    headers: ``NFC`` -> ``nfc``, ``Species`` -> ``scientific_name``,
    ``Family`` -> ``family_w``, ``Order`` -> ``order``.

    Returns:
        pandas.DataFrame: an independent copy of the rows whose ``nfc`` value
        is ``'Yes'``. The original row index from the spreadsheet is kept.
    """
    excel_path = EXTERNAL_DATA / 'NitFixWernerEtAl2014.xlsx'
    werner = pd.read_excel(excel_path)

    drops = [
        'Legume',
        'Likelihood_non-precursor',
        'Likelihood_precursor',
        'Likelihood_fixer',
        'Likelihood_stable_fixer',
        'Most_likely_state',
        'Corrected_lik_precursor',
        'Corrected_lik_stable_fixer',
    ]
    renames = {
        'NFC': 'nfc',
        'Species': 'scientific_name',
        'Family': 'family_w',
        'Order': 'order',
    }
    werner = werner.drop(columns=drops).rename(columns=renames)

    is_nfc = werner.nfc == 'Yes'

    # Return an explicit copy: callers add and overwrite columns on the
    # result, which would raise SettingWithCopyWarning on a filtered slice.
    return werner[is_nfc].copy()

In [24]:
# Load the taxonomy name lookups and the NFC rows of the Werner data.
sci_names, synonyms = get_scientific_names()
werner = read_werner_data()

print(werner.shape)

# Flag rows whose name is not a taxonomy scientific_name but is a known
# synonym of one; those are the rows that will be renamed.
is_sci_name = werner['scientific_name'].isin(sci_names)
is_synonym = werner['scientific_name'].isin(synonyms)
update_it = is_synonym & ~is_sci_name
print(update_it.sum())

werner.loc[update_it]

(1709, 7)
38


Unnamed: 0,scientific_name,nfc,order,family_w,Data_fixing,Source,Source2
3,Inga marginata,Yes,Fabales,Fabaceae,Yes,Manual,Roggy_1999
36,Samanea saman,Yes,Fabales,Fabaceae,Yes,Sprent2009,Appendices
49,Parkia roxburghii,Yes,Fabales,Fabaceae,No,GRIN,
58,Lysiloma tergeminum,Yes,Fabales,Fabaceae,Yes,Sprent2009,Appendices
78,Acacia euthycarpa,Yes,Fabales,Fabaceae,Yes,GRIN,
199,Vachellia farnesiana,Yes,Fabales,Fabaceae,Yes,GRIN,
268,Entada rheedei,Yes,Fabales,Fabaceae,Yes,Sprent2009,Appendices
285,Parkinsonia microphylla,Yes,Fabales,Fabaceae,No,GRIN,
296,Gleditsia caspica,Yes,Fabales,Fabaceae,No,GRIN,
298,Gleditsia rolfei,Yes,Fabales,Fabaceae,No,GRIN,


In [25]:
# Keep the name as written in the Werner data for the rows flagged by
# update_it; every other row gets an empty string.
werner['synonym'] = werner['scientific_name'].where(update_it, '')

In [26]:
# Replace each flagged synonym with the scientific name it maps to.
# A name missing from the lookup raises KeyError.
werner.loc[update_it, 'scientific_name'] = (
    werner.loc[update_it, 'scientific_name'].map(synonyms.__getitem__)
)

In [27]:
# Inspect the flagged rows after the rename.
werner.loc[update_it]

Unnamed: 0,scientific_name,nfc,order,family_w,Data_fixing,Source,Source2,synonym
3,Inga semialata,Yes,Fabales,Fabaceae,Yes,Manual,Roggy_1999,Inga marginata
36,Albizia saman,Yes,Fabales,Fabaceae,Yes,Sprent2009,Appendices,Samanea saman
49,Parkia timoriana,Yes,Fabales,Fabaceae,No,GRIN,,Parkia roxburghii
58,Lysiloma tergemina,Yes,Fabales,Fabaceae,Yes,Sprent2009,Appendices,Lysiloma tergeminum
78,Acacia calamifolia,Yes,Fabales,Fabaceae,Yes,GRIN,,Acacia euthycarpa
199,Acacia farnesiana,Yes,Fabales,Fabaceae,Yes,GRIN,,Vachellia farnesiana
268,Entada rheedii,Yes,Fabales,Fabaceae,Yes,Sprent2009,Appendices,Entada rheedei
285,Cercidium microphyllum,Yes,Fabales,Fabaceae,No,GRIN,,Parkinsonia microphylla
296,Gleditsia caspia,Yes,Fabales,Fabaceae,No,GRIN,,Gleditsia caspica
298,Gleditsia fera,Yes,Fabales,Fabaceae,No,GRIN,,Gleditsia rolfei


In [30]:
# Rows whose (possibly renamed) scientific name is in the taxonomy list.
found = werner['scientific_name'].isin(sci_names)

# NOTE: despite its name, `first` is True for every repeat of a name that has
# already appeared, i.e. for all occurrences except the first one.
first = werner['scientific_name'].duplicated(keep='first')

print(found.sum())
print(first.sum())
werner.loc[first]

1675
8


Unnamed: 0,scientific_name,nfc,order,family_w,Data_fixing,Source,Source2,synonym
81,Acacia calamifolia,Yes,Fabales,Fabaceae,Yes,Sprent2009,Appendices,
230,Parkia timoriana,Yes,Fabales,Fabaceae,No,Sprent2001,Table5.4,
300,Gleditsia fera,Yes,Fabales,Fabaceae,No,GRIN,,
795,Amphicarpaea bracteata,Yes,Fabales,Fabaceae,Yes,Sprent2009,Appendices,
1224,Cyclopia falcata,Yes,Fabales,Fabaceae,Yes,Sprent2009,Appendices,
1551,Ziziphus jujuba,Yes,Rosales,Rhamnaceae,No,TRY,Wright,
1570,Coriaria japonica,Yes,Cucurbitales,Coriariaceae,Yes,Manual,Becking_1975,
1580,Allocasuarina verticillata,Yes,Fagales,Casuarinaceae,Yes,Manual,Rodriguez-Barrueco_1969,Casuarina stricta


In [31]:
# Show every row that shares its scientific name with another row,
# including the first occurrence of each name.
dups = werner['scientific_name'].duplicated(keep=False)
werner.loc[dups]

Unnamed: 0,scientific_name,nfc,order,family_w,Data_fixing,Source,Source2,synonym
49,Parkia timoriana,Yes,Fabales,Fabaceae,No,GRIN,,Parkia roxburghii
78,Acacia calamifolia,Yes,Fabales,Fabaceae,Yes,GRIN,,Acacia euthycarpa
81,Acacia calamifolia,Yes,Fabales,Fabaceae,Yes,Sprent2009,Appendices,
230,Parkia timoriana,Yes,Fabales,Fabaceae,No,Sprent2001,Table5.4,
298,Gleditsia fera,Yes,Fabales,Fabaceae,No,GRIN,,Gleditsia rolfei
300,Gleditsia fera,Yes,Fabales,Fabaceae,No,GRIN,,
794,Amphicarpaea bracteata,Yes,Fabales,Fabaceae,Yes,Sprent2009,Appendices,Amphicarpaea edgeworthii
795,Amphicarpaea bracteata,Yes,Fabales,Fabaceae,Yes,Sprent2009,Appendices,
1215,Cyclopia falcata,Yes,Fabales,Fabaceae,Yes,Sprent2009,Appendices,Cyclopia subternata
1224,Cyclopia falcata,Yes,Fabales,Fabaceae,Yes,Sprent2009,Appendices,


In [7]:
# Keep rows that are in the taxonomy, dropping repeated names so that each
# scientific name appears once (its first occurrence).
in_taxo = werner.loc[found & ~first]
print(in_taxo.shape)
in_taxo.head()

1675
8
(1667, 7)


Unnamed: 0,scientific_name,nfc,order,family_w,Data_fixing,Source,Source2
0,Inga acreana,Yes,Fabales,Fabaceae,Yes,Sprent2009,Appendices
1,Inga densiflora,Yes,Fabales,Fabaceae,Yes,GRIN,
2,Inga laurina,Yes,Fabales,Fabaceae,Yes,Manual,De_Faria_2010
3,Inga semialata,Yes,Fabales,Fabaceae,Yes,Manual,Roggy_1999
4,Inga cinnamomea,Yes,Fabales,Fabaceae,Yes,Manual,De_Faria_2010


In [8]:
# A name that is a key of taxon_ids already has an ID; the rest do not.
has_id = in_taxo['scientific_name'].isin(taxon_ids)

# Rows in the taxonomy whose name has no entry in taxon_ids.
should_sample = in_taxo.loc[~has_id]

print(should_sample.shape)
should_sample.head()

(435, 7)


Unnamed: 0,scientific_name,nfc,order,family_w,Data_fixing,Source,Source2
0,Inga acreana,Yes,Fabales,Fabaceae,Yes,Sprent2009,Appendices
5,Inga nobilis,Yes,Fabales,Fabaceae,Yes,Manual,De_Faria_2010
10,Inga pezizifera,Yes,Fabales,Fabaceae,Yes,Manual,De_Faria_2010
11,Inga cayennensis,Yes,Fabales,Fabaceae,Yes,Manual,De_Faria_2010
13,Inga macrophylla,Yes,Fabales,Fabaceae,Yes,Manual,De_Faria_2010


In [9]:
# Save the rows without a taxon ID; the row index is not written.
should_sample.to_csv(
    path_or_buf=INTERIM_DATA / 'werner_not_sampled.csv',
    index=False,
)