In [1]:
import sys
# Put the parent directory on the module search path.
# NOTE(review): presumably this is for importing project-local modules one level
# up, but none are imported in the cells shown here — confirm it is still needed.
sys.path.append('..')

In [2]:
from pathlib import Path
import sqlite3
import pandas as pd

In [3]:
# Directory layout, expressed relative to the parent of the working directory.
DATA = Path('..', 'data')
PRUNED = DATA.joinpath('01_pruned')
INPUT = DATA.joinpath('input')

# SQLite database that is queried for places by locality.
DB = PRUNED.joinpath('gazetteer.db')

# CSV written once the lat/lng/datum/uncert columns have been filled in.
OUTPUT = PRUNED.joinpath('matched.csv')

# Compressed CSV of records that is read into the working data frame.
TARGET = INPUT.joinpath('mammals_no_geo.csv.gz')

In [4]:
# Open the gazetteer database. sqlite3.Row lets result rows be read both by
# position (row[1]) and by column name (row['datum']).
cxn = sqlite3.connect(database=DB)
cxn.row_factory = sqlite3.Row

In [40]:
# Locality values that the matching loop skips outright.
unknown = (
    'unspecified',
    'unknown',
)

# Exact-match lookup of gazetteer rows by locality string.
sql = "select * from places where locality = ?"

# A locality is rejected when its matching latitudes or longitudes span this
# much or more.
threshold = 0.1

# Starting values for the running min/max coordinate searches.
lo, hi = -999.0, 999.0

In [39]:
# Read every column as text, then turn missing cells into empty strings.
df = pd.read_csv(TARGET, dtype=str)
df = df.fillna('')
print(list(df.columns))
df.shape

['binomial', 'X1st_body_mass', 'catalognumber', 'collectioncode', 'decimallatitude', 'decimallongitude', 'dynamicproperties', 'X1st_ear_length', 'X1st_ear_length_low', 'X1st_ear_length_high', 'X1st_ear_length_measured_from', 'X1st_ear_length_ambiguous', 'X1st_ear_length_units_inferred', 'X1st_ear_length_estimated', 'eventdate', 'fieldnotes', 'X1st_hind_foot_length', 'X1st_hind_foot_length_low', 'X1st_hind_foot_length_high', 'X1st_hind_foot_length_includes', 'X1st_hind_foot_length_units_inferred', 'X1st_hind_foot_length_estimated', 'institutioncode', 'X1st_life_stage_notation', 'X2nd_life_stage_notation', 'lifestage', 'locality', 'maximumelevationinmeters', 'minimumelevationinmeters', 'occurrenceid', 'occurrenceremarks', 'recordedby', 'references', 'reproductivecondition', 'scientificname', 'sex', 'X1st_sex_notation', 'X2nd_sex_notation', 'X1st_tail_length', 'X1st_tail_length_low', 'X1st_tail_length_high', 'X1st_tail_length_ambiguous', 'X1st_tail_length_units_inferred', 'X1st_tail_lengt

(76894, 66)

In [44]:
def _pick_best_hit(hits, threshold):
    """Choose one gazetteer row for a locality, or None if no usable match.

    Args:
        hits: iterable of rows for a single locality. Each row is read by
            position (1 and 2, used as latitude and longitude) and by the
            'datum' and 'uncert' keys.
        threshold: the match is rejected when the rounded latitudes, or the
            rounded longitudes, span this much or more.

    Returns:
        A (lat, lng, datum, uncert) tuple, or None when there are no hits or
        the hits are too spread out. Rows with both datum and uncert are
        preferred, then datum only, then uncert only, then neither; within a
        tier the first row returned by the query wins.
    """
    best = None
    best_tier = 4  # worse than any real tier (0..3)
    lats = []
    lngs = []
    for hit in hits:
        lat = round(hit[1], 4)
        lng = round(hit[2], 4)
        datum = hit['datum']
        uncert = hit['uncert']
        lats.append(lat)
        lngs.append(lng)
        # Tier 0: datum and uncert, 1: datum only, 2: uncert only, 3: neither.
        # Truthiness is used, so an uncert of 0 counts as "no uncert".
        tier = (0 if datum else 2) + (0 if uncert else 1)
        if tier < best_tier:  # strict '<' keeps the first row of the best tier
            best = (lat, lng, datum, uncert)
            best_tier = tier

    if best is None:
        return None
    # Same place name with widely scattered coordinates is ambiguous: skip it.
    if max(lats) - min(lats) >= threshold:
        return None
    if max(lngs) - min(lngs) >= threshold:
        return None
    return best


MATCH_COLUMNS = ('lat', 'lng', 'datum', 'uncert')

# Reset the output columns so that re-running this cell starts from scratch.
for col in MATCH_COLUMNS:
    df[col] = None

# Many records share a locality string, so each distinct locality is looked up
# in the database only once and the decision is reused.
best_by_locality = {}

for idx, loc in df['locality'].items():
    # Blank localities (missing values were filled with '') and the explicit
    # "unknown" markers carry no place name to match on.
    if not loc or loc in unknown:
        continue

    if loc not in best_by_locality:
        best_by_locality[loc] = _pick_best_hit(
            cxn.execute(sql, (loc, )), threshold)

    best = best_by_locality[loc]
    if best is None:
        continue

    for col, value in zip(MATCH_COLUMNS, best):
        df.at[idx, col] = value

In [45]:
# Show the locality next to the columns filled in by the matching step.
df[['locality', 'lat', 'lng', 'datum', 'uncert']].head(20)

Unnamed: 0,locality,lat,lng,datum,uncert
0,"3 miles south, 5 miles west Fort Collins 5780'",,,,
1,"Lodi Center, 1.5 mi. S",,,,
2,"5mi E of Ithaca, Pleasant Hill, elevation 1700'",,,,
3,"2 mi SE Walden, Arapaho Natl. Wildlife Refuge,...",,,,
4,"Sherman Township, French Creek Tributary",,,,
5,E Bloomfield,,,,
6,"2 mi SE Walden, Arapaho Natl. Wildlife Refuge,...",,,,
7,French Creek,,,,
8,French Creek,,,,
9,unspecified,,,,


In [46]:
# Save the full frame, including the new match columns, without the row index.
df.to_csv(path_or_buf=OUTPUT, index=False)

In [47]:
# Non-null cells per column; for lat/lng this is the number of matched rows.
df.count(axis=0)

binomial                 76894
X1st_body_mass           76894
catalognumber            76894
collectioncode           76894
decimallatitude          76894
                         ...  
tragus_length_1.value    76894
lat                      10472
lng                      10472
datum                     7669
uncert                   10472
Length: 70, dtype: int64

In [29]:
# Spot check: list every gazetteer row stored for a single locality.
loc = 'Saranac Lake'
hits = cxn.execute(sql, (loc, ))
for hit in hits:
    lat, lng = (round(hit[pos], 4) for pos in (1, 2))
    datum, uncert = hit['datum'], hit['uncert']
    print(loc, lat, lng, datum, uncert)

Saranac Lake 44.3295 -74.1313 None 0
Saranac Lake 44.3294 -74.1317 None 0
Saranac Lake 44.3294 -74.1317 None 10000
Saranac Lake 44.3294 -74.1317 None 0
Saranac Lake 44.3272 -74.1333 WGS84 2652
Saranac Lake 44.3295 -74.1313 None 0
Saranac Lake 44.3295 -74.1313 WGS84 2939
Saranac Lake 44.3294 -74.1311 None 3
Saranac Lake 44.3294 -74.1311 None 20000
Saranac Lake 44.3272 -74.1333 WGS84 2651
