# Clean Data Extracted from VertNet

In [1]:
# Add the parent directory to the module search path.
# NOTE(review): presumably this lets the notebook import the project package
# that sits one level up — confirm, since no such import is visible here.
import sys
sys.path.append('..')

In [2]:
import re
from pathlib import Path
from pprint import pp
import json
from collections import defaultdict
from ipywidgets import interact
from pprint import pprint
from tqdm.notebook import tqdm
import pandas as pd

In [3]:
# Directories are relative to the directory this notebook runs in.
DATA_DIR = Path('..') / 'data' / 'output'
ARGS_DIR = Path('..') / 'args'

# BASE_NAME is the stem shared by the input and output files.
# VER is a suffix appended to the stem of the files this notebook writes.
BASE_NAME = 'all_mammals_2021-11-09'
VER = 'a'

# Every file name is derived from BASE_NAME so that switching to another
# extraction only requires editing one constant.
JSONL = DATA_DIR / f'{BASE_NAME}.jsonl'
OUTPUT = DATA_DIR / f'{BASE_NAME}{VER}.csv'
COUNTS = DATA_DIR / f'{BASE_NAME}{VER}_counts.csv'

# Argument file that lists the --trait= and --extra-field= options.
ARGS = ARGS_DIR / 'all_mammals.args'

In [4]:
# Directory holding the vocabulary CSV files.
VOCAB_DIR = Path('..', 'vertnet', 'vocabulary')

# Vocabulary file with the life stage patterns and their replacements.
TERMS = VOCAB_DIR.joinpath('life_stage.csv')

## Get traits and other fields from the arguments

In [5]:
def _flag_values(lines, flag):
    """Return the value part of every line that starts with `flag`.

    Only the leading `flag` is removed, so a value that happens to contain the
    flag text again is kept intact. Surrounding whitespace (including the
    trailing newline) is stripped from each value.
    """
    return [ln.strip()[len(flag):] for ln in lines if ln.startswith(flag)]


# Read the argument file once and close it promptly.
with open(ARGS) as args_file:
    _arg_lines = args_file.readlines()

# Names given with --trait= and --extra-field= in the argument file.
TRAITS = _flag_values(_arg_lines, '--trait=')
OTHERS = _flag_values(_arg_lines, '--extra-field=')

## Read data

In [6]:
# Each non-blank line of the JSONL file is parsed as one JSON record.
# Iterating over the file handle avoids holding a second copy of every line
# in memory, and skipping blank lines keeps a trailing empty line from
# raising a JSONDecodeError.
with open(JSONL) as json_file:
    data = [json.loads(line) for line in json_file if line.strip()]
len(data)

649163

## Just keep the first value of a trait

In [7]:
# Replace each trait's list of values with its first element.
# Only list values are touched, so re-running this cell after the traits have
# already been reduced is a no-op instead of an error. A trait with an empty
# list has nothing to keep, so it is dropped from the record.
for row in data:
    for trait_name in TRAITS:
        values = row.get(trait_name)
        if not isinstance(values, list):
            continue
        if values:
            row[trait_name] = values[0]
        else:
            del row[trait_name]

In [8]:
# Spot-check the first record after each trait was reduced to a single value.
data[0]

{'catalognumber': '28211',
 'continent': 'North America',
 'country': 'Mexico',
 'county': 'Manzanillo',
 'decimallatitude': '19.180133',
 'decimallongitude': '-104.159983',
 'eventdate': '2001-01-10',
 'fieldnotes': 'Yes',
 'geodeticdatum': 'NAD83',
 'georeferenceprotocol': 'Conversion',
 'georeferenceverificationstatus': 'Verified by contributor',
 'locality': '3 km WNW San Jose de Lumbert',
 'occurrenceid': 'urn:catalog:OMNH:Mammals:28211',
 'recordedby': 'Colima Field Expedition 2001',
 'reproductivecondition': 'inactive',
 'sex': 'female',
 'verbatimcoordinates': '19 10.808N 104 09.599W',
 'verbatimcoordinatesystem': 'Degrees decimal minutes',
 'verbatimelevation': '480',
 'verbatimeventdate': '2001-01-10',
 'institutioncode': 'OMNH',
 'collectioncode': 'Mammals',
 'dynamicproperties': '{"totalLengthInmm":"243", "tailLengthInmm":"146", "hindfootLengthInmm":"30", "earLengthInmm":"15", "weightIng":"47.1"}',
 'scientificname': 'Liomys pictus',
 'body_mass': {'start': 100,
  'end': 11

## Normalize Life Stage

In [9]:
# Load the life stage vocabulary, then turn missing cells into empty strings.
terms = pd.read_csv(TERMS)
terms = terms.fillna('')

print(terms.shape)
terms.head()

(314, 5)


Unnamed: 0,label,pattern,attr,replace,notes
0,life_stage,[imm],lower,Juvenile,
1,life_stage,1 mo,lower,Juvenile,
2,life_stage,100% ossified,lower,Adult,
3,life_stage,1st winter,lower,Juvenile,
4,life_stage,1st year,lower,Juvenile,


In [10]:
# Keep the first occurrence of every fully identical vocabulary row.
terms = terms.loc[~terms.duplicated()]

print(terms.shape)
terms.head()

(314, 5)


Unnamed: 0,label,pattern,attr,replace,notes
0,life_stage,[imm],lower,Juvenile,
1,life_stage,1 mo,lower,Juvenile,
2,life_stage,100% ossified,lower,Adult,
3,life_stage,1st winter,lower,Juvenile,
4,life_stage,1st year,lower,Juvenile,


In [11]:
# Discard vocabulary rows whose replacement is the 'NS' marker.
terms = terms.loc[terms['replace'].ne('NS')]

print(terms.shape)
terms.head()

(314, 5)


Unnamed: 0,label,pattern,attr,replace,notes
0,life_stage,[imm],lower,Juvenile,
1,life_stage,1 mo,lower,Juvenile,
2,life_stage,100% ossified,lower,Adult,
3,life_stage,1st winter,lower,Juvenile,
4,life_stage,1st year,lower,Juvenile,


In [12]:
# Map each vocabulary pattern to its replacement; when a pattern occurs more
# than once the last row wins.
life_stages = dict(zip(terms['pattern'], terms['replace']))
len(life_stages)

314

In [13]:
# Add a corrected life stage to every record that has a non-empty lifestage.
# The vocabulary rows shown above have attr == 'lower', so the patterns are
# presumably meant to be compared against lower-cased text (TODO confirm).
# An exact match is tried first so existing results are unchanged; only values
# that previously fell through to 'NS' get the case-insensitive second chance.
for row in data:
    lifestage = row.get('lifestage')
    if not lifestage:
        continue
    if lifestage in life_stages:
        row['lifestage_cor'] = life_stages[lifestage]
    else:
        row['lifestage_cor'] = life_stages.get(str(lifestage).lower(), 'NS')

In [14]:
# Spot-check a record after the life stage normalization step.
data[5]

{'catalognumber': '90252',
 'continent': 'North America',
 'country': 'Canada',
 'county': 'Algoma',
 'decimallatitude': '47.33719',
 'decimallongitude': '-84.56176',
 'eventdate': '1984-07-29',
 'geodeticdatum': 'UTMG NAD27, Decimal Degrees NAD83',
 'georeferencedby': 'Susan M Woodward, Janet Sit (ROM)',
 'georeferenceddate': '20090424',
 'georeferenceprotocol': 'MaNIS/HerpNet/ORNIS Georeferencing Guidelines, GBIF Best Practices',
 'georeferencesources': 'Energy, Mines and Resources Canada, Series A751, Agawa Bay, 41 N/07, 1:50,000, Edition 3, 1995',
 'georeferenceverificationstatus': 'unverified',
 'highergeography': 'North America; Canada; Ontario; Algoma',
 'lifestage': 'adult',
 'locality': "Lake Superior Provincial Park, Near Frater's Creek, Trapline 09",
 'occurrenceid': 'URI:catalog:ROM:Mammals:90252',
 'recordedby': 'Gibson, JS',
 'reproductivecondition': 'no emb; lactating',
 'sex': 'female',
 'verbatimcoordinates': '472000 N;0843400 W',
 'verbatimeventdate': '19840729',
 've

## Get sex notations

In [15]:
def _sex_notation(record):
    """Return the raw sex notation of a record as stored in its 'sex' entry.

    A list is reduced to its first element and a dict to its 'value' entry
    (defaulting to ''); a missing 'sex' key yields ''.
    """
    notation = record.get('sex', '')
    if isinstance(notation, list):
        notation = notation[0]
    if isinstance(notation, dict):
        notation = notation.get('value', '')
    return notation


# Distinct sex notations found across all records.
sexes = {_sex_notation(record) for record in data}

## Update sex fields

In [16]:
# Rewrite every record's 'sex' as a '; '-separated string of distinct values.
# A list is reduced to its first element and a dict to its 'value' entry.
for row in data:
    sex = row.get('sex', '')
    if isinstance(sex, list):
        sex = sex[0] if sex else ''
    if isinstance(sex, dict):
        sex = sex.get('value', '')

    # Split on ';' or ',', trim each piece and de-duplicate.
    parts = {s.strip() for s in re.split(r'[;,]', sex or '')}

    # Empty pieces (e.g. from a trailing ';') carry no information and would
    # otherwise make a single-valued sex look like a multi-valued one.
    parts.discard('')

    # Sort so the joined string does not depend on set iteration order, which
    # can differ from one interpreter run to the next.
    row['sex'] = '; '.join(sorted(parts))

## Remove female traits from males and vice versa

In [17]:
# Traits that are removed from records whose sex starts with 'f'.
MALE = ['testes_size', 'testes_state']

# Traits that are removed from records whose sex starts with 'm'.
# NOTE(review): the counts table later in this notebook also has
# placental_scar_state, vagina_state and nipples_enlarged columns, which are in
# neither list — confirm whether they belong here.
FEMALE = [
    'embryo_count',
    'embryo_length',
    'lactation_state',
    'nipple_count',
    'nipple_state',
    'ovaries_size',
    'ovaries_state',
    'placental_scar_count',
    'pregnancy_state',
]

In [18]:
# Drop male-only traits from female records and female-only traits from male
# records. Records are left untouched when the sex is missing, lists more than
# one value (contains ';'), or does not start with 'f' or 'm'.
for row in data:
    sex = row.get('sex')

    if not sex or ';' in sex:
        continue

    # Compare case-insensitively so notations such as 'Female' or 'M' are
    # treated like 'female' and 'male'.
    initial = sex[0].lower()
    if initial not in ('f', 'm'):
        continue

    remove = MALE if initial == 'f' else FEMALE
    for key in remove:
        row.pop(key, None)

## Keep only reproductive traits parsed from the reproductive condition field

In [19]:
# Sex-specific traits, and every other configured trait.
REPRO = [*MALE, *FEMALE]
BODY = [trait for trait in TRAITS if trait not in REPRO]

# A non-sex-specific trait whose value was taken from the
# 'reproductivecondition' field is removed from the record.
for row in data:
    misplaced = [
        trait for trait in BODY
        if trait in row and row[trait]['field'] == 'reproductivecondition'
    ]
    for trait in misplaced:
        del row[trait]

## Remove empty traits

In [20]:
# Delete every trait entry that is present but falsy (empty).
for row in data:
    for trait in TRAITS:
        # A missing trait defaults to True, so only present-but-empty entries
        # reach the delete.
        if not row.get(trait, True):
            del row[trait]

## Remove empty records

In [21]:
def _has_any_trait(record):
    """Return True when at least one configured trait has a non-empty value."""
    return any(record.get(trait) for trait in TRAITS)


# Report the record count before and after dropping trait-less records.
print(len(data))

data = list(filter(_has_any_trait, data))

print(len(data))

649163
645609


## Separate gonad length and width measurements

In [22]:
GONAD_SIZE = """ testes_size ovaries_size """.split()

# Replace each gonad size 'value' with separate 'length' and 'width' entries:
#   * a list value is unpacked as (length, width);
#   * a single value goes to 'width' when the trait's 'dimension' is 'width',
#     otherwise to 'length'; the other entry is set to ''.
for row in data:
    for name in GONAD_SIZE:
        trait = row.get(name)

        # Skip traits that are absent or were already split, so that
        # re-running this cell is a no-op instead of a KeyError on 'value'.
        if not trait or 'value' not in trait:
            continue

        value = trait['value']
        if isinstance(value, list):
            trait['length'], trait['width'] = value
        elif trait.get('dimension') == 'width':
            trait['length'] = ''
            trait['width'] = value
        else:
            trait['length'] = value
            trait['width'] = ''

        del trait['value']

In [23]:
# Spot-check a record after the gonad size split.
data[2]

{'catalognumber': '32114',
 'continent': 'North America',
 'country': 'United States',
 'county': 'Carroll',
 'decimallatitude': '35.8640556',
 'decimallongitude': '-88.6769167',
 'eventdate': '2003-10-08',
 'fieldnotes': 'Yes',
 'geodeticdatum': 'NAD83',
 'georeferenceprotocol': 'Conversion',
 'georeferencesources': 'MaNIS [http://manisnet.org/gci2.html]',
 'georeferenceverificationstatus': 'Verified by contributor',
 'locality': 'Milan Army Ammunition Plant, Milan',
 'occurrenceid': 'urn:catalog:OMNH:Mammals:32114',
 'recordedby': 'Kennedy, M. L., et al.',
 'reproductivecondition': 'TS=20x11',
 'sex': 'male',
 'verbatimcoordinates': '35 51 50.6N 88 40 36.9W',
 'verbatimcoordinatesystem': 'Degrees minutes seconds',
 'verbatimeventdate': '2003-10-08',
 'institutioncode': 'OMNH',
 'collectioncode': 'Mammals',
 'dynamicproperties': '{"totalLengthInmm":"225", "tailLengthInmm":"88", "hindfootLengthInmm":"32", "earLengthInmm":"18", "weightIng":"78"}',
 'scientificname': 'Sigmodon hispidus',

## Add a binomial to the data

In [24]:
# The binomial is the first two whitespace-separated words of the scientific
# name; a missing or empty name gives ''.
for row in data:
    leading_words = row.get('scientificname', '').split()[:2]
    row['binomial'] = ' '.join(leading_words)

In [25]:
# Spot-check a record after the binomial was added.
data[2]

{'catalognumber': '32114',
 'continent': 'North America',
 'country': 'United States',
 'county': 'Carroll',
 'decimallatitude': '35.8640556',
 'decimallongitude': '-88.6769167',
 'eventdate': '2003-10-08',
 'fieldnotes': 'Yes',
 'geodeticdatum': 'NAD83',
 'georeferenceprotocol': 'Conversion',
 'georeferencesources': 'MaNIS [http://manisnet.org/gci2.html]',
 'georeferenceverificationstatus': 'Verified by contributor',
 'locality': 'Milan Army Ammunition Plant, Milan',
 'occurrenceid': 'urn:catalog:OMNH:Mammals:32114',
 'recordedby': 'Kennedy, M. L., et al.',
 'reproductivecondition': 'TS=20x11',
 'sex': 'male',
 'verbatimcoordinates': '35 51 50.6N 88 40 36.9W',
 'verbatimcoordinatesystem': 'Degrees minutes seconds',
 'verbatimeventdate': '2003-10-08',
 'institutioncode': 'OMNH',
 'collectioncode': 'Mammals',
 'dynamicproperties': '{"totalLengthInmm":"225", "tailLengthInmm":"88", "hindfootLengthInmm":"32", "earLengthInmm":"18", "weightIng":"78"}',
 'scientificname': 'Sigmodon hispidus',

## Count extracts by binomial

In [26]:
# binomial -> trait name -> number of records with a non-empty value for it.
binomials = defaultdict(lambda: defaultdict(int))

for row in data:
    # filter(row.get, TRAITS) yields only the traits with a truthy value.
    for trait_name in filter(row.get, TRAITS):
        binomials[row['binomial']][trait_name] += 1

In [27]:
# Store each binomial inside its own counts dict so it can become a column.
for name in binomials:
    binomials[name]['binomial'] = name

In [28]:
# One row per binomial, one integer column per trait; traits never seen for a
# binomial count as 0. Rows are ordered by binomial.
df = (
    pd.DataFrame(binomials.values())
    .fillna(0)
    .set_index('binomial')
    .astype(int)
    .sort_index()
)

In [29]:
# Write the per-binomial trait counts to the COUNTS file (index included).
df.to_csv(COUNTS)

In [30]:
# Preview the first rows of the counts table.
df.head()

Unnamed: 0_level_0,body_mass,ear_length,hind_foot_length,tail_length,total_length,embryo_count,lactation_state,testes_size,pregnancy_state,embryo_length,...,nipple_state,placental_scar_count,placental_scar_state,vagina_state,nipple_count,nipples_enlarged,ovaries_size,ovaries_state,forearm_length,tragus_length
binomial,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
,6,10,10,10,12,2,0,2,0,0,...,1,0,0,0,0,0,0,0,0,0
(new SW,7,7,7,7,7,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Aarat,1,2,2,2,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Abelmoschomys simpsoni,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
Abeomelomys sevia,3,4,4,4,4,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Lift trait values

In [31]:
# Keys of a trait dict that are not copied into '<trait>.<key>' columns.
LOC = ['start', 'end', 'field']

In [32]:
# Flatten every trait dict into '<trait>.<key>' entries on the record itself.
# Keys listed in LOC are not copied; the 'field' values are instead gathered
# into one sorted, space-separated 'field' entry per record.
for row in data:
    # Assign 'field' before lifting so its position among the keys is fixed.
    sources = row['field'] = set()

    for trait in [t for t in TRAITS if t in row]:
        parse = row[trait]
        if 'field' in parse:
            sources.add(parse['field'])
        row.update({
            f'{trait}.{key}': value
            for key, value in parse.items()
            if key not in LOC
        })
        del row[trait]

    row['field'] = ' '.join(sorted(sources))

In [33]:
# Spot-check a record after the trait values were lifted into flat keys.
data[2]

{'catalognumber': '32114',
 'continent': 'North America',
 'country': 'United States',
 'county': 'Carroll',
 'decimallatitude': '35.8640556',
 'decimallongitude': '-88.6769167',
 'eventdate': '2003-10-08',
 'fieldnotes': 'Yes',
 'geodeticdatum': 'NAD83',
 'georeferenceprotocol': 'Conversion',
 'georeferencesources': 'MaNIS [http://manisnet.org/gci2.html]',
 'georeferenceverificationstatus': 'Verified by contributor',
 'locality': 'Milan Army Ammunition Plant, Milan',
 'occurrenceid': 'urn:catalog:OMNH:Mammals:32114',
 'recordedby': 'Kennedy, M. L., et al.',
 'reproductivecondition': 'TS=20x11',
 'sex': 'male',
 'verbatimcoordinates': '35 51 50.6N 88 40 36.9W',
 'verbatimcoordinatesystem': 'Degrees minutes seconds',
 'verbatimeventdate': '2003-10-08',
 'institutioncode': 'OMNH',
 'collectioncode': 'Mammals',
 'dynamicproperties': '{"totalLengthInmm":"225", "tailLengthInmm":"88", "hindfootLengthInmm":"32", "earLengthInmm":"18", "weightIng":"78"}',
 'scientificname': 'Sigmodon hispidus',

## Convert to a data frame and write the output

In [34]:
# Build one row per record; the columns are the union of all record keys.
# NOTE(review): no de-duplication is performed before the frame is written —
# confirm whether duplicate records should be dropped here.
df = pd.DataFrame(data)
print(df.shape)

(645609, 143)


In [35]:
# Write the cleaned records to the OUTPUT file without the row index.
df.to_csv(OUTPUT, index=False)