In [1]:
import sys
sys.path.append('..')

In [2]:
from pathlib import Path
from ipywidgets import interact
from tqdm.notebook import tqdm
import pandas as pd

In [14]:
# Directories, expressed relative to the notebook's working directory.
VOCAB_DIR = Path('..', 'vertnet', 'vocabulary')
OUTPUT_DIR = Path('..', 'output')

# Vocabulary CSV read into `terms` by a later cell.
TERMS = VOCAB_DIR.joinpath('life_stage.csv')

# Input CSV; the commented-out line is the alternative bats file.
# INPUT = OUTPUT_DIR / 'bats_2020-08-11a.csv'
INPUT = OUTPUT_DIR.joinpath('no_bats_2020-08-12a.csv')

# Destination CSV for the frame after `lifestage_cor` has been added.
# OUTPUT = OUTPUT_DIR / 'bats_2020-08-11b.csv'
OUTPUT = OUTPUT_DIR.joinpath('no_bats_2021-03-18a.csv')

# NOTE(review): later cells reference BATS, BODY_MASS_BATS, TOTAL_LENGTH_BATS,
# BODY_MASS_NO_BATS and TOTAL_LENGTH_NO_BATS, none of which are defined in the
# visible cells -- confirm where they come from before a Restart & Run All.

## Normalize Life Stage

In [4]:
# Load the life-stage vocabulary and replace missing cells with '' so that
# later string comparisons never see NaN.
raw_terms = pd.read_csv(TERMS)
terms = raw_terms.fillna('')

print(terms.shape)
terms.head()

(314, 5)


Unnamed: 0,label,pattern,attr,replace,notes
0,life_stage,[imm],lower,Juvenile,
1,life_stage,1 mo,lower,Juvenile,
2,life_stage,100% ossified,lower,Adult,
3,life_stage,1st winter,lower,Juvenile,
4,life_stage,1st year,lower,Juvenile,


In [5]:
# Remove rows that are exact duplicates across all columns, keeping the
# first occurrence of each.
terms = terms.drop_duplicates(keep='first')

print(terms.shape)
terms.head()

(314, 5)


Unnamed: 0,label,pattern,attr,replace,notes
0,life_stage,[imm],lower,Juvenile,
1,life_stage,1 mo,lower,Juvenile,
2,life_stage,100% ossified,lower,Adult,
3,life_stage,1st winter,lower,Juvenile,
4,life_stage,1st year,lower,Juvenile,


In [6]:
# Discard vocabulary rows whose replacement is 'NS'; the normalization cell
# already uses 'NS' as the fallback for any value missing from the vocabulary.
is_ns = terms['replace'].eq('NS')
terms = terms.loc[~is_ns]

print(terms.shape)
terms.head()

(314, 5)


Unnamed: 0,label,pattern,attr,replace,notes
0,life_stage,[imm],lower,Juvenile,
1,life_stage,1 mo,lower,Juvenile,
2,life_stage,100% ossified,lower,Adult,
3,life_stage,1st winter,lower,Juvenile,
4,life_stage,1st year,lower,Juvenile,


In [7]:
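# Currently disabled: uncommenting this overwrites the vocabulary file at TERMS
# with the de-duplicated, 'NS'-filtered `terms` frame.
# terms.to_csv(TERMS, index=False)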

In [9]:
# Lookup table from raw vocabulary pattern to normalized life stage.
# Built straight from the two columns instead of iterrows(), which creates a
# full Series per row only to read two fields. If a pattern occurs more than
# once, the last row wins, exactly as in a dict comprehension over the rows.
life_stages = dict(zip(terms['pattern'], terms['replace']))
len(life_stages)

314

In [15]:
# Read every column as a string and disable NA detection, so empty cells stay
# as '' -- later cells test fields against '' rather than NaN.
vertnet = pd.read_csv(INPUT, dtype=str, na_filter=False)

In [16]:
# Normalize the raw `lifestage` strings into a new `lifestage_cor` column:
#   * empty lifestage             -> ''
#   * value found in life_stages  -> its replacement
#   * any other non-empty value   -> 'NS'
#
# The column is assigned in one step. Writing into the rows yielded by
# DataFrame.iterrows() is not guaranteed to reach the frame (the row may be a
# copy), so a row-by-row assignment can silently leave the column empty.
vertnet['lifestage_cor'] = [
    life_stages.get(lifestage, 'NS') if lifestage else ''
    for lifestage in vertnet['lifestage']
]

In [17]:
# Spot-check the first 12 normalized life-stage values.
vertnet['lifestage_cor'].head(12)

0             
1             
2             
3             
4             
5        Adult
6           NS
7        Adult
8        Adult
9     Juvenile
10       Adult
11       Adult
Name: lifestage_cor, dtype: object

In [18]:
# Write the frame, now including `lifestage_cor`, to OUTPUT without the
# DataFrame index as a column.
vertnet.to_csv(OUTPUT, index=False)

## Get Bat Binomials

In [20]:
# Load the bat species list and collect its `binomial` column into a set for
# fast membership tests when splitting the records.
# NOTE(review): BATS is not defined in any visible cell -- confirm it is set
# (e.g. in the configuration cell) before running on a fresh kernel.
bb_df = pd.read_csv(BATS, dtype=str, na_filter=False)
bat_binomials = set(bb_df['binomial'])
len(bat_binomials)

1023

## Split Vertnet Dataframe

In [22]:
# Partition the records by whether their binomial is a known bat species.
# The membership mask is computed once and negated with `~` rather than
# evaluated twice and compared against False.
is_bat = vertnet['binomial'].isin(bat_binomials)
bats = vertnet.loc[is_bat, :]
no_bats = vertnet.loc[~is_bat, :]

In [28]:
# Keep the bat records in which at least one of the listed body_mass_1.*
# columns is non-empty, then export them. body_mass_1.location is not part of
# the check.
body_mass_cols = [
    f'body_mass_1.{field}'
    for field in (
        'ambiguous_key', 'estimated_value', 'is_shorthand',
        'units', 'units_inferred', 'value',
    )
]
flag = bats[body_mass_cols].ne('').any(axis=1)

df = bats.loc[flag, :]

df.to_csv(BODY_MASS_BATS, index=False)

df.head()

Unnamed: 0,binomial,body_mass_1.ambiguous_key,body_mass_1.estimated_value,body_mass_1.is_shorthand,body_mass_1.location,body_mass_1.units,body_mass_1.units_inferred,body_mass_1.value,body_mass_2.ambiguous_key,body_mass_2.estimated_value,...,tragus_length_1.estimated_value,tragus_length_1.location,tragus_length_1.units,tragus_length_1.units_inferred,tragus_length_1.value,tragus_length_2.location,tragus_length_2.units,tragus_length_2.units_inferred,tragus_length_2.value,lifestage_cor
1,Macroglossus minimus,,,,"[{'start': 69, 'end': 83, 'field': 'dynamicpro...",,True,12.0,,,...,,,,,,,,,,
3,Hipposideros diadema,,,,"[{'start': 71, 'end': 85, 'field': 'dynamicpro...",,True,41.0,,,...,,,,,,,,,,
4,Harpyionycteris whiteheadi,,,,"[{'start': 70, 'end': 84, 'field': 'dynamicpro...",,True,58.0,,,...,,,,,,,,,,
6,Nyctinomops macrotis,,,,"[{'start': 70, 'end': 84, 'field': 'dynamicpro...",,True,28.9,,,...,,"[{'start': 0, 'end': 13, 'field': 'occurrencer...",mm,False,4.0,,,,,
7,Myotis yumanensis,,,,"[{'start': 69, 'end': 82, 'field': 'dynamicpro...",,True,3.8,,,...,,"[{'start': 0, 'end': 13, 'field': 'occurrencer...",mm,False,6.0,,,,,


In [29]:
# Keep the bat records in which at least one of the listed total_length_1.*
# columns is non-empty, then export them.
total_length_cols = [
    f'total_length_1.{field}'
    for field in (
        'ambiguous_key', 'estimated_value', 'is_shorthand',
        'units', 'units_inferred', 'value',
    )
]
flag = bats[total_length_cols].ne('').any(axis=1)

df = bats.loc[flag, :]

df.to_csv(TOTAL_LENGTH_BATS, index=False)

df.head()

Unnamed: 0,binomial,body_mass_1.ambiguous_key,body_mass_1.estimated_value,body_mass_1.is_shorthand,body_mass_1.location,body_mass_1.units,body_mass_1.units_inferred,body_mass_1.value,body_mass_2.ambiguous_key,body_mass_2.estimated_value,...,tragus_length_1.estimated_value,tragus_length_1.location,tragus_length_1.units,tragus_length_1.units_inferred,tragus_length_1.value,tragus_length_2.location,tragus_length_2.units,tragus_length_2.units_inferred,tragus_length_2.value,lifestage_cor
1,Macroglossus minimus,,,,"[{'start': 69, 'end': 83, 'field': 'dynamicpro...",,True,12.0,,,...,,,,,,,,,,
3,Hipposideros diadema,,,,"[{'start': 71, 'end': 85, 'field': 'dynamicpro...",,True,41.0,,,...,,,,,,,,,,
4,Harpyionycteris whiteheadi,,,,"[{'start': 70, 'end': 84, 'field': 'dynamicpro...",,True,58.0,,,...,,,,,,,,,,
6,Nyctinomops macrotis,,,,"[{'start': 70, 'end': 84, 'field': 'dynamicpro...",,True,28.9,,,...,,"[{'start': 0, 'end': 13, 'field': 'occurrencer...",mm,False,4.0,,,,,
7,Myotis yumanensis,,,,"[{'start': 69, 'end': 82, 'field': 'dynamicpro...",,True,3.8,,,...,,"[{'start': 0, 'end': 13, 'field': 'occurrencer...",mm,False,6.0,,,,,


In [30]:
# Keep the non-bat records in which at least one of the listed body_mass_1.*
# columns is non-empty, then export them. body_mass_1.location is not part of
# the check.
body_mass_cols = [
    f'body_mass_1.{field}'
    for field in (
        'ambiguous_key', 'estimated_value', 'is_shorthand',
        'units', 'units_inferred', 'value',
    )
]
flag = no_bats[body_mass_cols].ne('').any(axis=1)

df = no_bats.loc[flag, :]

df.to_csv(BODY_MASS_NO_BATS, index=False)

In [31]:
# Keep the non-bat records in which at least one of the listed
# total_length_1.* columns is non-empty, then export them.
total_length_cols = [
    f'total_length_1.{field}'
    for field in (
        'ambiguous_key', 'estimated_value', 'is_shorthand',
        'units', 'units_inferred', 'value',
    )
]
flag = no_bats[total_length_cols].ne('').any(axis=1)

df = no_bats.loc[flag, :]

df.to_csv(TOTAL_LENGTH_NO_BATS, index=False)

df.head()

Unnamed: 0,binomial,body_mass_1.ambiguous_key,body_mass_1.estimated_value,body_mass_1.is_shorthand,body_mass_1.location,body_mass_1.units,body_mass_1.units_inferred,body_mass_1.value,body_mass_2.ambiguous_key,body_mass_2.estimated_value,...,tragus_length_1.estimated_value,tragus_length_1.location,tragus_length_1.units,tragus_length_1.units_inferred,tragus_length_1.value,tragus_length_2.location,tragus_length_2.units,tragus_length_2.units_inferred,tragus_length_2.value,lifestage_cor
0,Tamiasciurus douglasii,,,,"[{'start': 72, 'end': 87, 'field': 'dynamicpro...",,True,162.0,,,...,,,,,,,,,,
2,Microtus miurus,,,,"[{'start': 71, 'end': 85, 'field': 'dynamicpro...",,True,16.0,,,...,,,,,,,,,,
5,Crocidura beatus,,,,"[{'start': 70, 'end': 83, 'field': 'dynamicpro...",,True,9.0,,,...,,,,,,,,,,
12,Rattus everetti,,,,"[{'start': 72, 'end': 87, 'field': 'dynamicpro...",,True,148.0,,,...,,,,,,,,,,
15,Crocidura grayi,,,,"[{'start': 70, 'end': 83, 'field': 'dynamicpro...",,True,8.0,,,...,,,,,,,,,,
