In [1]:
import sys
# Put the parent directory on the module search path.
# NOTE(review): no project-local import appears in the cells visible here — confirm this is still needed.
sys.path.append('..')

In [2]:
from pathlib import Path
import sqlite3
import csv
import regex
import pandas as pd
import dask.dataframe as dd
from tqdm.notebook import tqdm

In [3]:
# Data directories, rooted at the `data` folder one level above the working directory.
DATA = Path('..', 'data')
RAW = DATA.joinpath('00_raw')
PRUNED = DATA.joinpath('01_pruned')
IN_DIR = DATA.joinpath('input')
OUT_DIR = DATA.joinpath('output')

# SQLite database that the matching cells query for locality hits.
DB = PRUNED.joinpath('gazetteer.db')

# CHUNK is not referenced by any cell visible here.
CHUNK = 10 ** 6

In [4]:
# Normalized locality strings that carry no location information; the matching loops skip them.
unknown = ('unspecified', 'unknown')

# Matches one of  . , ; / ( ) { } " ' [ ] -  only when it is neither preceded nor followed
# by a digit, so punctuation touching a number (e.g. the point in "4.20") is left in place.
# The matching loops replace every match with a space before looking the locality up.
remove = regex.compile(r'(?<!\d)[.,;/(){}"\'\[\]\-](?!\d)')

# Parameterized lookup of `places` rows whose `norm` column equals the normalized locality.
sql = """select * from places where norm = ?"""

# A locality is rejected when the latitude or longitude range of its hits is at or above this value.
# NOTE(review): presumably decimal degrees — confirm against how the gazetteer stores coordinates.
threshold = 0.1
# Out-of-range sentinels used to seed the running min/max of the hit coordinates.
hi = 999.0
lo = -hi

## Get vertnet_mammals_male_traits_gaz

In [7]:
# Input traits table for the male records and the gzip-suffixed CSV this section writes.
in_file = IN_DIR.joinpath('vertnet_mammals_male_traits_gaz.csv')
out_file = OUT_DIR.joinpath('vertnet_mammals_male_traits_gaz_2020-03-18a.csv.gz')

In [8]:
# Read every column as text, then turn missing values into empty strings.
df = pd.read_csv(in_file, dtype=str)
df = df.fillna('')
print(df.columns.tolist())
df.shape

['1st_body_mass', '1st_body_mass_low', '1st_body_mass_high', '1st_body_mass_ambiguous', '1st_body_mass_units_inferred', '1st_body_mass_estimated', '2nd_body_mass', '2nd_body_mass_low', '2nd_body_mass_high', '2nd_body_mass_ambiguous', '2nd_body_mass_units_inferred', '2nd_body_mass_estimated', '3rd_body_mass', '3rd_body_mass_low', '3rd_body_mass_high', '3rd_body_mass_units_inferred', '4th_body_mass', 'catalognumber', 'collectioncode', 'decimallatitude', 'decimallongitude', 'dynamicproperties', '1st_ear_length', '1st_ear_length_low', '1st_ear_length_high', '1st_ear_length_measured_from', '1st_ear_length_ambiguous', '1st_ear_length_units_inferred', '1st_ear_length_estimated', '2nd_ear_length', '2nd_ear_length_measured_from', '2nd_ear_length_ambiguous', '2nd_ear_length_units_inferred', '2nd_ear_length_estimated', '3rd_ear_length', '3rd_ear_length_measured_from', '3rd_ear_length_ambiguous', '3rd_ear_length_units_inferred', '4th_ear_length', '4th_ear_length_measured_from', '4th_ear_length_amb

(27659, 119)

In [9]:
# Fill lat/lng/datum/uncert for each record from the gazetteer row(s) whose `norm`
# equals the record's normalized locality. Records without an acceptable match keep None.
geo_cols = ['lat', 'lng', 'datum', 'uncert']
for col in geo_cols:
    df[col] = None

# `with sqlite3.connect(...)` only wraps a transaction and leaves the connection open,
# so the connection is closed explicitly once the loop finishes or fails.
cxn = sqlite3.connect(DB)
try:
    cxn.row_factory = sqlite3.Row

    # iterrows() is a generator, so tqdm needs `total` to draw a real progress bar.
    for idx, row in tqdm(df.iterrows(), total=len(df)):
        # Fields are tried in order; the first one that yields a match wins.
        for field in ['locality', ]:
            loc = remove.sub(' ', row[field])
            loc = ' '.join(loc.lower().split())
            # A blank locality cannot identify a place, so it is never looked up.
            if not loc or loc in unknown:
                continue

            best = None
            best_rank = None
            min_lat, max_lat = hi, lo
            min_lng, max_lng = hi, lo
            for hit in cxn.execute(sql, (loc, )):
                # Positional columns 1 and 2 of `places` — presumably latitude and
                # longitude; TODO(review): select them by name instead of `select *` order.
                lat = round(hit[1], 4)
                lng = round(hit[2], 4)
                datum = hit['datum']
                uncert = hit['uncert']

                # Lower rank is better: datum and uncertainty, datum only,
                # uncertainty only, neither.
                if datum and uncert:
                    rank = 0
                elif datum:
                    rank = 1
                elif uncert:
                    rank = 2
                else:
                    rank = 3
                # Strict '<' keeps the first hit seen within the best rank.
                if best is None or rank < best_rank:
                    best_rank = rank
                    best = (lat, lng, datum, uncert)

                min_lat = min(min_lat, lat)
                max_lat = max(max_lat, lat)
                min_lng = min(min_lng, lng)
                max_lng = max(max_lng, lng)

            if best is None:
                continue
            # Hits that are spread too far apart make the locality ambiguous.
            if max_lat - min_lat >= threshold:
                continue
            if max_lng - min_lng >= threshold:
                continue

            for col, value in zip(geo_cols, best):
                df.at[idx, col] = value
            break
finally:
    cxn.close()

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [10]:
# Preview each locality next to the coordinates filled in from the gazetteer.
df.head(20)[['locality', 'lat', 'lng', 'datum', 'uncert']]

Unnamed: 0,locality,lat,lng,datum,uncert
0,,,,,
1,"4.20 km N, 0.58 km W Cerro Mellizo Sud, Parque...",,,,
2,"Apeadero Militar Grl. Munoz, km 41 along Hwy 307",,,,
3,"Quebrada de Lopez, San Francisco del Monte de Oro",,,,
4,15 km N Paso del Rey,,,,
5,15 km N Paso del Rey,,,,
6,5 km N Las Higuerillas on Hwy 308,,,,
7,camp Site,,,,
8,5 km N Las Higuerillas on Hwy 308,,,,
9,"3.12 km S, 2.24 km W Cerro Mellizo Sud, Parque...",,,,


In [11]:
# 'infer' is the default: the compression is chosen from the file name's extension.
df.to_csv(out_file, index=False, compression='infer')

In [12]:
# Non-missing values per column; the four gazetteer columns show how many records matched.
df.notna().sum()

1st_body_mass                   27659
1st_body_mass_low               27659
1st_body_mass_high              27659
1st_body_mass_ambiguous         27659
1st_body_mass_units_inferred    27659
                                ...  
4th_total_length                27659
lat                              3042
lng                              3042
datum                            2057
uncert                           3042
Length: 123, dtype: int64

In [14]:
# Percentage of records that received coordinates, computed from the frame itself
# so the figure cannot go stale when the input or the matching changes.
matched = int(df['lat'].count())
round(matched / len(df) * 100.0, 2) if len(df) else 0.0

11.0

## Get vertnet_mammals_female_traits_gaz

In [15]:
# Input traits table for the female records and the gzip-suffixed CSV this section writes.
in_file = IN_DIR.joinpath('vertnet_mammals_female_traits_gaz.csv')
out_file = OUT_DIR.joinpath('vertnet_mammals_female_traits_gaz_2020-03-18a.csv.gz')

In [16]:
# Read every column as text, then turn missing values into empty strings.
df = pd.read_csv(in_file, dtype=str)
df = df.fillna('')
print(df.columns.tolist())
df.shape

['catalognumber', 'collectioncode', 'decimallatitude', 'decimallongitude', 'dynamicproperties', '1st_embryo_count', '1st_embryo_count_female', '1st_embryo_count_left', '1st_embryo_count_male', '1st_embryo_count_right', '2nd_embryo_count', '2nd_embryo_count_left', '2nd_embryo_count_right', '3rd_embryo_count', '1st_embryo_length', '1st_embryo_length_low', '1st_embryo_length_high', '1st_embryo_length_units_inferred', '2nd_embryo_length', '2nd_embryo_length_low', '2nd_embryo_length_high', '2nd_embryo_length_units_inferred', '3rd_embryo_length', '3rd_embryo_length_units_inferred', '4th_embryo_length', '4th_embryo_length_units_inferred', 'eventdate', 'fieldnotes', 'institutioncode', '1st_lactation_state', '2nd_lactation_state', '1st_life_stage', '2nd_life_stage', '3rd_life_stage', '4th_life_stage', 'lifestage', 'locality', 'maximumelevationinmeters', 'minimumelevationinmeters', '1st_nipple_count', '1st_nipple_count_notation', '2nd_nipple_count', '1st_nipple_state', '2nd_nipple_state', 'occur

(38460, 86)

In [17]:
# Fill lat/lng/datum/uncert for each record from the gazetteer row(s) whose `norm`
# equals the record's normalized locality. Records without an acceptable match keep None.
geo_cols = ['lat', 'lng', 'datum', 'uncert']
for col in geo_cols:
    df[col] = None

# `with sqlite3.connect(...)` only wraps a transaction and leaves the connection open,
# so the connection is closed explicitly once the loop finishes or fails.
cxn = sqlite3.connect(DB)
try:
    cxn.row_factory = sqlite3.Row

    # iterrows() is a generator, so tqdm needs `total` to draw a real progress bar.
    for idx, row in tqdm(df.iterrows(), total=len(df)):
        # Fields are tried in order; the first one that yields a match wins.
        for field in ['locality', ]:
            loc = remove.sub(' ', row[field])
            loc = ' '.join(loc.lower().split())
            # A blank locality cannot identify a place, so it is never looked up.
            if not loc or loc in unknown:
                continue

            best = None
            best_rank = None
            min_lat, max_lat = hi, lo
            min_lng, max_lng = hi, lo
            for hit in cxn.execute(sql, (loc, )):
                # Positional columns 1 and 2 of `places` — presumably latitude and
                # longitude; TODO(review): select them by name instead of `select *` order.
                lat = round(hit[1], 4)
                lng = round(hit[2], 4)
                datum = hit['datum']
                uncert = hit['uncert']

                # Lower rank is better: datum and uncertainty, datum only,
                # uncertainty only, neither.
                if datum and uncert:
                    rank = 0
                elif datum:
                    rank = 1
                elif uncert:
                    rank = 2
                else:
                    rank = 3
                # Strict '<' keeps the first hit seen within the best rank.
                if best is None or rank < best_rank:
                    best_rank = rank
                    best = (lat, lng, datum, uncert)

                min_lat = min(min_lat, lat)
                max_lat = max(max_lat, lat)
                min_lng = min(min_lng, lng)
                max_lng = max(max_lng, lng)

            if best is None:
                continue
            # Hits that are spread too far apart make the locality ambiguous.
            if max_lat - min_lat >= threshold:
                continue
            if max_lng - min_lng >= threshold:
                continue

            for col, value in zip(geo_cols, best):
                df.at[idx, col] = value
            break
finally:
    cxn.close()

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [18]:
# Preview each locality next to the coordinates filled in from the gazetteer.
df.head(20)[['locality', 'lat', 'lng', 'datum', 'uncert']]

Unnamed: 0,locality,lat,lng,datum,uncert
0,,,,,
1,"San Francisco del Monte de Oro, 4 km E of down...",,,,
2,"2.1 km N, 1.2 km E Cerro de la Laguna, Parque ...",,,,
3,3 km N Salinas del Diamante R. R. Station,,,,
4,"4 km W Jct Hwy 338 and road to Horco Molle, al...",,,,
5,"near LCTA plot 57, Fort Sill",,,,
6,,,,,
7,Ellenton Bay,,,,
8,"14 mi SE Broken Bow, on Mountain Fork River, O...",,,,
9,"Brock Rd, Fairbanks",,,,


In [19]:
# 'infer' is the default: the compression is chosen from the file name's extension.
df.to_csv(out_file, index=False, compression='infer')

In [20]:
# Non-missing values per column; the four gazetteer columns show how many records matched.
df.notna().sum()

catalognumber        38460
collectioncode       38460
decimallatitude      38460
decimallongitude     38460
dynamicproperties    38460
                     ...  
4th_sex              38460
lat                   3068
lng                   3068
datum                 1847
uncert                3068
Length: 90, dtype: int64

In [21]:
# Percentage of records that received coordinates, computed from the frame itself
# so the figure cannot go stale when the input or the matching changes.
matched = int(df['lat'].count())
round(matched / len(df) * 100.0, 2) if len(df) else 0.0

7.98