# Clean Data Extracted from VertNet

In [1]:
import sys
# Add the parent directory to the module search path.
sys.path.append('..')

In [2]:
import re
from pathlib import Path
from pprint import pp
import json
from collections import defaultdict
from ipywidgets import interact
from pprint import pprint
from tqdm.notebook import tqdm
import pandas as pd

In [3]:
# Input and output directories, given relative to the parent directory
DATA_DIR = Path('..') / 'output'
ARGS_DIR = Path('..') / 'args'

# Every data file name is derived from BASE_NAME so the input and the outputs
# cannot drift apart; VER is appended to the names of the files written here.
BASE_NAME = 'rodent_sex_2021-09-15'
VER = 'a'
JSONL = DATA_DIR / f'{BASE_NAME}.jsonl'
OUTPUT = DATA_DIR / f'{BASE_NAME}{VER}.csv'
COUNTS = DATA_DIR / f'{BASE_NAME}{VER}_counts.csv'

# Argument file listing the --trait= and --extra-field= options
ARGS = ARGS_DIR / 'rodent_sex.args'

In [4]:
# Directory holding the vocabulary CSV files
VOCAB_DIR = Path('..', 'vertnet', 'vocabulary')

# Vocabulary file read below to normalize life stage notations
TERMS = VOCAB_DIR.joinpath('life_stage.csv')

## Get traits and other fields from the arguments

In [5]:
def _arg_values(lines, flag):
    """Return the value of every line in `lines` that starts with `flag`.

    The line is stripped of surrounding whitespace and the flag text is
    removed, leaving only the value that followed it.
    """
    return [ln.strip().replace(flag, '') for ln in lines if ln.startswith(flag)]


# Read the argument file once; the context manager closes the handle.
with open(ARGS) as args_file:
    arg_lines = args_file.readlines()

TRAITS = _arg_values(arg_lines, '--trait=')
OTHERS = _arg_values(arg_lines, '--extra-field=')
# TRAITS
OTHERS

['catalognumber',
 'collectioncode',
 'continent',
 'coordinateprecision',
 'coordinateuncertaintyinmeters',
 'country',
 'countrycode',
 'county',
 'decimallatitude',
 'decimallongitude',
 'eventdate',
 'footprintspatialfit',
 'footprintsrs',
 'footprintwkt',
 'geodeticdatum',
 'geologicalcontextid',
 'georeferencedby',
 'georeferenceddate',
 'georeferenceprotocol',
 'georeferenceremarks',
 'georeferencesources',
 'georeferenceverificationstatus',
 'habitat',
 'highergeography',
 'highergeographyid',
 'highestbiostratigraphiczone',
 'institutioncode',
 'island',
 'islandgroup',
 'lifestage',
 'locality',
 'locationaccordingto',
 'locationid',
 'locationremarks',
 'maximumdepthinmeters',
 'maximumdistanceabovesurfaceinmeters',
 'maximumelevationinmeters',
 'minimumdepthinmeters',
 'minimumdistanceabovesurfaceinmeters',
 'minimumelevationinmeters',
 'municipality',
 'occurrenceid',
 'recordedby',
 'scientificname',
 'sex',
 'verbatimcoordinates',
 'verbatimcoordinatesystem',
 'verbatimd

## Trait and field categories

In [6]:
# Keys of an extraction that say where it was found rather than what was found
LOC = """ start end field """.split()

# Traits that only apply to males
MALE = """ scrotal_state testes_size testes_state """.split()

# Traits that only apply to females
FEMALE = """ embryo_count embryo_length lactation_state
    nipple_count nipple_state nipples_enlarged ovaries_size ovaries_state
    placental_scar_count placental_scar_state pregnancy_state
    vagina_state """.split()

# Every reproductive trait. Any trait in TRAITS that is missing from these
# lists is classified as a body trait below, so keep them in step with the
# trait names in the argument file.
REPRO = MALE + FEMALE

BODY = [t for t in TRAITS if t not in REPRO]

## Read data

In [7]:
# Every line of the file is parsed as one JSON record
with open(JSONL) as jsonl_file:
    data = list(map(json.loads, jsonl_file))
len(data)

186077

In [8]:
# Display one parsed record to inspect its keys and values
data[10]

{'catalognumber': '90224',
 'continent': 'North America',
 'country': 'Canada',
 'county': 'Algoma',
 'day': '07',
 'decimallatitude': '47.37688',
 'decimallongitude': '-84.65266',
 'eventdate': '1984-07-07',
 'geodeticdatum': 'UTMG NAD27, Decimal Degrees NAD83',
 'georeferencedby': 'Susan M Woodward, Janet Sit (ROM)',
 'georeferenceddate': '20090424',
 'georeferenceprotocol': 'MaNIS/HerpNet/ORNIS Georeferencing Guidelines, GBIF Best Practices',
 'georeferencesources': 'Energy, Mines and Resources Canada, Series A751, Agawa Bay, 41 N/07, 1:50,000, Edition 3, 1995',
 'georeferenceverificationstatus': 'unverified',
 'highergeography': 'North America; Canada; Ontario; Algoma',
 'lifestage': 'adult',
 'locality': 'Lake Superior Provincial Park, Near Agawa Bay, Trapline 08',
 'month': '7',
 'occurrenceid': 'URI:catalog:ROM:Mammals:90224',
 'recordedby': 'Gibson, JS',
 'reproductivecondition': 'testes-8X6',
 'sex': 'male',
 'stateprovince': 'Ontario',
 'verbatimcoordinates': '472200 N;084400

## Normalize Life Stage

In [9]:
# Load the vocabulary and turn missing cells into empty strings
terms = pd.read_csv(TERMS)
terms = terms.fillna('')

print(terms.shape)
terms.head()

(314, 5)


Unnamed: 0,label,pattern,attr,replace,notes
0,life_stage,[imm],lower,Juvenile,
1,life_stage,1 mo,lower,Juvenile,
2,life_stage,100% ossified,lower,Adult,
3,life_stage,1st winter,lower,Juvenile,
4,life_stage,1st year,lower,Juvenile,


In [10]:
# Keep only the first occurrence of each fully identical row
terms = terms.loc[~terms.duplicated()]

print(terms.shape)
terms.head()

(314, 5)


Unnamed: 0,label,pattern,attr,replace,notes
0,life_stage,[imm],lower,Juvenile,
1,life_stage,1 mo,lower,Juvenile,
2,life_stage,100% ossified,lower,Adult,
3,life_stage,1st winter,lower,Juvenile,
4,life_stage,1st year,lower,Juvenile,


In [11]:
# Drop vocabulary rows whose replacement is 'NS'
is_specified = terms['replace'].ne('NS')
terms = terms.loc[is_specified]

print(terms.shape)
terms.head()

(314, 5)


Unnamed: 0,label,pattern,attr,replace,notes
0,life_stage,[imm],lower,Juvenile,
1,life_stage,1 mo,lower,Juvenile,
2,life_stage,100% ossified,lower,Adult,
3,life_stage,1st winter,lower,Juvenile,
4,life_stage,1st year,lower,Juvenile,


In [12]:
# terms.to_csv(TERMS, index=False)

In [13]:
# Map each vocabulary pattern to its replacement value
life_stages = dict(zip(terms['pattern'], terms['replace']))
len(life_stages)

314

In [14]:
# Add a corrected life stage to every record that has a non-empty one;
# notations missing from the vocabulary become 'NS'.
for record in data:
    raw_stage = record.get('lifestage')
    if not raw_stage:
        continue
    record['lifestage_cor'] = life_stages.get(raw_stage, 'NS')

## Get sex notations

In [15]:
# Distinct sex notations in the data; a missing sex is recorded as ''
sexes = {record.get('sex', '') for record in data}
sexes

{'',
 '10',
 '40',
 '5',
 'adults',
 'caught',
 'clean',
 'dob',
 'f or m',
 'f phyllotis',
 'f; double check sex on skin if possible',
 'f?',
 'female',
 'female ?',
 'female female',
 'female | male',
 'female,female',
 'female,male',
 'female; female',
 'female; female ?',
 'female; female; female',
 'female; female; female; female',
 'female; female; female; female; female',
 'female; female; female; female; female; female',
 'female; female; female; female; female; female; female',
 'female; female; female; female; female; female; female; female',
 'female; female; female; female; female; female; female; female; female',
 'female; female; female; female; female; female; female; female; female; female',
 'female; female; female; male; female',
 'female; female; male',
 'female; female; male; female; female',
 'female; female; male; female; female; female',
 'female; female; male; male',
 'female; male',
 'female; male; female',
 'female; male; female; female',
 'female; male; femal

## Update sex fields

In [16]:
def _normalize_sex(raw):
    """Reduce a raw sex notation to its distinct parts joined by '; '.

    The notation is split on ';', ',' and '|'. Each part is stripped, empty
    parts (e.g. from a trailing separator) are dropped, and the remaining
    distinct parts are sorted so the same input always gives the same string.
    """
    parts = {part.strip() for part in re.split(r'[;,|]', raw)}
    return '; '.join(sorted(part for part in parts if part))


# Records without a sex get an empty string
for row in data:
    row['sex'] = _normalize_sex(row.get('sex', ''))

## Remove female traits from males and vice versa

In [17]:
for record in data:
    sex_label = record.get('sex')

    # Act only on a single notation that starts with 'f' or 'm'
    is_single_sex = (bool(sex_label)
                     and ';' not in sex_label
                     and sex_label[0] in 'fm')
    if not is_single_sex:
        continue

    # Females lose the male traits and males lose the female traits
    opposite_traits = MALE if sex_label[0] == 'f' else FEMALE
    for trait_name in opposite_traits:
        record.pop(trait_name, None)

## Keep only sex traits extracted from the reproductive condition field

In [18]:
for row in data:
    for key in BODY:

        if key not in row:
            continue

        # Remove body traits found in reproductive condition.
        # The list is rebuilt because `del` on the loop variable would only
        # unbind that name and leave the list unchanged.
        # NOTE(review): BODY is every trait in TRAITS that is not in REPRO, so
        # REPRO must name all reproductive traits or they are filtered here
        # too -- confirm the lists before relying on this cell.
        row[key] = [trait for trait in row[key]
                    if trait['field'] != 'reproductivecondition']

## Get max counts of extracted items

In [19]:
# Largest number of extractions seen for each trait across all records
counts = defaultdict(int)
for row in data:
    for field in TRAITS:
        value = row.get(field)
        if value is None:
            # The trait is absent from this record, so it contributes nothing
            count = 0
        elif isinstance(value, dict):
            # A single extraction stored as a bare dict
            count = 1
        else:
            count = len(value)
        counts[field] = max(counts[field], count)

for key, value in counts.items():
    print(key, value)

nipples_enlarged 7
vagina_state 16
pregnancy_state 8
embryo_length 9
lactation_state 3
placental_scar_state 5
testes_state 10
scrotal_state 11
testes_size 4


## Remove traits with too many values

In [20]:
MAX_COUNT = 5  # a trait list longer than this is dropped from the record
for record in data:
    for name in TRAITS:
        extracts = record.get(name)
        too_many = isinstance(extracts, list) and len(extracts) > MAX_COUNT
        if too_many:
            record.pop(name)

## Remove empty traits

In [21]:
# Delete trait keys whose value is present but empty
for record in data:
    for name in TRAITS:
        if name not in record:
            continue
        if not record[name]:
            del record[name]

## Remove empty records

In [22]:
records_before = len(data)
print(records_before)

# Keep a record only if at least one of its traits has a non-empty value
data = [record for record in data if any(map(record.get, TRAITS))]

print(len(data))

186077
179184


## Get the maximum gonad length and width measurements

In [23]:
# Size traits that are summarized into per-record maxima
GONAD_SIZE = ['testes_size', 'ovaries_size']

In [24]:
for row in data:

    # Start every maximum at 0, which stands for "nothing measured yet"
    row['max_testes_length'] = 0
    row['max_testes_width'] = 0
    row['max_ovaries_length'] = 0
    row['max_ovaries_width'] = 0

    for trait_name in [t for t in GONAD_SIZE if t in row]:
        # 'testes_size' -> 'testes', 'ovaries_size' -> 'ovaries'
        gonad = trait_name.split('_')[0]

        len_key = f'max_{gonad}_length'
        wid_key = f'max_{gonad}_width'

        for trait in row[trait_name]:
            value = trait['value']

            # The trait has both a length and a width
            if isinstance(value, list):
                length, width = value
                row[len_key] = max(row[len_key], length)
                row[wid_key] = max(row[wid_key], width)

            # The trait has only a width: compare this trait's own value, not
            # a width left over from an earlier two-value trait
            elif trait.get('dimension') == 'width':
                row[wid_key] = max(row[wid_key], value)

            # The trait has only a length
            else:
                row[len_key] = max(row[len_key], value)

    # Replace maxima that never received a measurement with an empty string
    for max_key in ('max_testes_length', 'max_testes_width',
                    'max_ovaries_length', 'max_ovaries_width'):
        row[max_key] = row[max_key] or ''

## Combine duplicate extractions

The same trait values may be written in multiple locations or in different fields

In [25]:
for record in data:

    present = [name for name in TRAITS if name in record]
    for name in present:

        # Distinct extractions, keyed by their non-location content
        merged = {}

        for extract in record[name]:

            # Split the extraction into its location part and everything else
            where, what = {}, {}
            for k, v in extract.items():
                (where if k in LOC else what)[k] = v

            # Lists are turned into tuples so the signature is hashable
            signature = tuple(sorted(
                (k, tuple(v) if isinstance(v, list) else v)
                for k, v in what.items()))

            existing = merged.get(signature)
            if existing is None:
                # First time this content is seen: start its location list
                what['location'] = [where]
                merged[signature] = what
            else:
                # Same content again: just record the extra location
                existing['location'].append(where)

        # A lone extraction is stored bare, several are stored as a list
        uniques = list(merged.values())
        record[name] = uniques[0] if len(uniques) == 1 else uniques

## Merge the trait locations into a set

In [26]:
for record in data:
    fields_seen = set()

    present = [name for name in TRAITS if name in record]
    for name in present:
        extracts = record[name]

        # Wrap a bare extraction so every trait holds a list
        if not isinstance(extracts, list):
            extracts = [extracts]
            record[name] = extracts

        for extract in extracts:
            fields_seen.update(where['field'] for where in extract['location'])

    # Space-separated, sorted names of the fields the traits came from
    record['loc'] = ' '.join(sorted(fields_seen))

In [27]:
# Pretty-print the first five records to check the merged traits and 'loc'
pprint(data[:5])

[{'catalognumber': '32114',
  'collectioncode': 'Mammals',
  'continent': 'North America',
  'country': 'United States',
  'county': 'Carroll',
  'day': '8',
  'decimallatitude': '35.8640556',
  'decimallongitude': '-88.6769167',
  'dynamicproperties': '{"totalLengthInmm":"225", "tailLengthInmm":"88", '
                       '"hindfootLengthInmm":"32", "earLengthInmm":"18", '
                       '"weightIng":"78"}',
  'eventdate': '2003-10-08',
  'fieldnotes': 'Yes',
  'geodeticdatum': 'NAD83',
  'georeferenceprotocol': 'Conversion',
  'georeferencesources': 'MaNIS [http://manisnet.org/gci2.html]',
  'georeferenceverificationstatus': 'Verified by contributor',
  'institutioncode': 'OMNH',
  'loc': 'reproductivecondition',
  'locality': 'Milan Army Ammunition Plant, Milan',
  'max_ovaries_length': '',
  'max_ovaries_width': '',
  'max_testes_length': 20.0,
  'max_testes_width': 11.0,
  'month': '10',
  'occurrenceid': 'urn:catalog:OMNH:Mammals:32114',
  'recordedby': 'Kennedy, M. L.

## Remove unwanted columns and values

In [28]:
for record in data:

    present = [name for name in TRAITS if name in record]
    for name in present:

        if name in GONAD_SIZE:
            # Gonad sizes are dropped; only the max_* values are kept
            record.pop(name)
        else:
            # Only the first extraction's value is kept (open question from
            # the author: is keeping just the first one really wanted?)
            first_extract = record[name][0]
            record[name] = first_extract['value']

## Add a binomial to the data

In [29]:
# The binomial is the first two whitespace-separated words of the name
for record in data:
    name_parts = record.get('scientificname', '').split()
    record['binomial'] = ' '.join(name_parts[:2])

## Count extracts by binomial

In [30]:
# binomial -> trait name -> number of records with a non-empty value
binomials = defaultdict(lambda: defaultdict(int))

for record in data:
    extracted = [name for name in TRAITS if record.get(name)]
    for name in extracted:
        binomials[record['binomial']][name] += 1

In [31]:
# Store each binomial inside its own tally so it becomes a column
for species, tally in binomials.items():
    tally['binomial'] = species

In [32]:
# One row per binomial, one integer column per trait, sorted by binomial
df = (
    pd.DataFrame(binomials.values())
    .fillna(0)
    .set_index('binomial')
    .astype(int)
    .sort_index()
)

In [33]:
# Write the per-binomial counts; the binomial index is written as a column
df.to_csv(COUNTS)

In [34]:
# Show the first rows of the per-binomial counts
df.head()

Unnamed: 0_level_0,lactation_state,pregnancy_state,embryo_length,nipples_enlarged,vagina_state,scrotal_state,placental_scar_state,testes_state
binomial,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
,0,0,0,1,1,0,0,0
Abelmoschomys simpsoni,0,0,0,1,0,0,0,0
Abrocoma cinerea,0,0,0,0,8,0,0,0
Abrothrix andinus,8,0,3,1,16,0,0,4
Abrothrix hershkovitzi,0,0,0,0,0,1,0,0


## Convert to data frame and remove duplicate records

In [35]:
# Build a data frame with one row per cleaned record. This rebinds `df`,
# which previously held the per-binomial counts.
df = pd.DataFrame(data)
print(df.shape)

# NOTE(review): duplicate removal and column sorting below are commented out,
# so neither is applied to the frame that gets written.
# df = df.drop_duplicates()
# print(df.shape)

# columns = sorted(df.columns)
# df = df.reindex(columns, axis='columns').fillna('')
# print(df.shape)

(179184, 74)


In [36]:
# Write the cleaned records without the row index
df.to_csv(OUTPUT, index=False)