In [1]:
# Add the parent directory to the module search path. Presumably this is so
# project-local packages one level up can be imported; none are imported in
# the cells visible here -- TODO confirm it is still needed.
import sys
sys.path.append('..')

In [10]:
from pathlib import Path
import sqlite3
import regex
from tqdm.notebook import tqdm
import regex
import numpy as np
import pandas as pd
import dask.dataframe as dd

In [18]:
# Directory layout, resolved relative to the parent of the working directory.
DATA = Path('..', 'data')
INPUT = DATA.joinpath('input')
PRUNED = DATA.joinpath('01_pruned')

# Table that the loading cell reads.
TARGET = INPUT.joinpath('mammals_no_geo.csv.gz')

# Files that live in the pruned-data directory: the SQLite gazetteer that the
# lookup cell opens, a raw CSV, and the CSV this notebook writes.
DB = PRUNED.joinpath('gazetteer.db')
RAW = PRUNED.joinpath('raw.csv')
OUTPUT = PRUNED.joinpath('mammals_no_geo_gazetteer_2020-03-09.csv.gz')

# Chunk size (not referenced by the cells visible here).
CHUNK = 10 ** 6

In [11]:
# Normalised locality strings that carry no place information; the lookup
# loop skips any record whose normalised locality equals one of these.
unknown = ('unspecified', 'unknown')

# Matches a single punctuation character from the set . , ; / ( ) { } " ' [ ] -
# but only when it is neither preceded nor followed by a digit, so punctuation
# touching a number (e.g. the dot in "1.5") is kept. The lookup loop replaces
# every match with a space before lower-casing and collapsing whitespace.
remove = regex.compile(r'(?<!\d)[.,;/(){}"\'\[\]\-](?!\d)')

# Parameterised query: every `places` row whose `norm` column equals the
# normalised locality passed as the single bound parameter.
sql = """select * from places where norm = ?"""

# Largest spread allowed between the hits for one locality: if the latitudes
# (or, separately, the longitudes) differ by this much or more, the locality
# is left without coordinates. Units are presumably decimal degrees -- TODO
# confirm against the gazetteer.
threshold = 0.1
# Starting values for the running minimum (`hi`) and maximum (`lo`) of the
# hit coordinates.
hi = 999.0
lo = -hi

In [6]:
# Load the target table with every column as text, then turn missing values
# into empty strings so later string handling never sees NaN.
df = pd.read_csv(TARGET, dtype=str)
df = df.fillna('')

# Show the column names and, as the cell's value, the (rows, columns) shape.
print(list(df.columns))
df.shape

['binomial', 'X1st_body_mass', 'catalognumber', 'collectioncode', 'decimallatitude', 'decimallongitude', 'dynamicproperties', 'X1st_ear_length', 'X1st_ear_length_low', 'X1st_ear_length_high', 'X1st_ear_length_measured_from', 'X1st_ear_length_ambiguous', 'X1st_ear_length_units_inferred', 'X1st_ear_length_estimated', 'eventdate', 'fieldnotes', 'X1st_hind_foot_length', 'X1st_hind_foot_length_low', 'X1st_hind_foot_length_high', 'X1st_hind_foot_length_includes', 'X1st_hind_foot_length_units_inferred', 'X1st_hind_foot_length_estimated', 'institutioncode', 'X1st_life_stage_notation', 'X2nd_life_stage_notation', 'lifestage', 'locality', 'maximumelevationinmeters', 'minimumelevationinmeters', 'occurrenceid', 'occurrenceremarks', 'recordedby', 'references', 'reproductivecondition', 'scientificname', 'sex', 'X1st_sex_notation', 'X2nd_sex_notation', 'X1st_tail_length', 'X1st_tail_length_low', 'X1st_tail_length_high', 'X1st_tail_length_ambiguous', 'X1st_tail_length_units_inferred', 'X1st_tail_lengt

(76894, 66)

In [12]:
# Columns filled in from the gazetteer, in the order used by the hit tuples.
GEO_COLUMNS = ['lat', 'lng', 'datum', 'uncert']


def _normalize_locality(locality):
    """Return `locality` in the form used for the gazetteer lookup.

    Punctuation matched by `remove` is replaced with a space, the text is
    lower-cased, and runs of whitespace are collapsed to single spaces.
    """
    return ' '.join(remove.sub(' ', locality).lower().split())


def _best_hit(hits):
    """Choose one `(lat, lng, datum, uncert)` tuple from gazetteer `hits`.

    Returns None when there are no hits, or when the hits disagree: the
    spread of rounded latitudes or of rounded longitudes is `threshold` or
    more.

    Otherwise the first hit of the best available rank is returned, where
    the ranks are (best first): has datum and uncertainty, has datum only,
    has uncertainty only, has neither. The checks are truthiness-based, so
    an empty datum or an uncertainty of 0 counts as missing.
    """
    best = None
    best_rank = 4  # worse than every real rank (0..3)
    min_lat = min_lng = hi
    max_lat = max_lng = lo

    for hit in hits:
        # Columns 1 and 2 of `places` are read as latitude and longitude;
        # this relies on the table's column order (NOTE(review): confirm, and
        # prefer named access once the column names are known).
        lat = round(hit[1], 4)
        lng = round(hit[2], 4)
        datum = hit['datum']
        uncert = hit['uncert']

        if datum and uncert:
            rank = 0
        elif datum:
            rank = 1
        elif uncert:
            rank = 2
        else:
            rank = 3

        # Strict comparison keeps the FIRST hit seen within a rank.
        if rank < best_rank:
            best, best_rank = (lat, lng, datum, uncert), rank

        min_lat, max_lat = min(min_lat, lat), max(max_lat, lat)
        min_lng, max_lng = min(min_lng, lng), max(max_lng, lng)

    if best is None:
        return None
    if max_lat - min_lat >= threshold or max_lng - min_lng >= threshold:
        return None
    return best


# Start from empty result columns so re-running the cell is idempotent.
for col in GEO_COLUMNS:
    df[col] = None

# Each distinct normalised locality is looked up once; repeated localities
# reuse the cached answer instead of querying the database again.
resolved = {}

# `with sqlite3.connect(...)` only wraps a transaction and does not close the
# connection, so close it explicitly.
cxn = sqlite3.connect(DB)
try:
    cxn.row_factory = sqlite3.Row

    # Only the locality column is needed, so iterate over it directly rather
    # than materialising every full row with iterrows().
    for idx, locality in df['locality'].items():
        loc = _normalize_locality(locality)

        # Skip localities with no usable text: empty strings (missing values
        # were filled with '') and the known placeholder words.
        if not loc or loc in unknown:
            continue

        if loc not in resolved:
            resolved[loc] = _best_hit(cxn.execute(sql, (loc, )))

        best = resolved[loc]
        if best is None:
            continue

        for col, value in zip(GEO_COLUMNS, best):
            df.at[idx, col] = value
finally:
    cxn.close()

In [15]:
# Preview the locality next to the coordinates found for it (first 20 rows).
df[['locality', 'lat', 'lng', 'datum', 'uncert']].head(20)

Unnamed: 0,locality,lat,lng,datum,uncert
0,"3 miles south, 5 miles west Fort Collins 5780'",,,,
1,"Lodi Center, 1.5 mi. S",,,,
2,"5mi E of Ithaca, Pleasant Hill, elevation 1700'",,,,
3,"2 mi SE Walden, Arapaho Natl. Wildlife Refuge,...",40.706,-106.257,NO DISPONIBLE,0.0
4,"Sherman Township, French Creek Tributary",,,,
5,E Bloomfield,,,,
6,"2 mi SE Walden, Arapaho Natl. Wildlife Refuge,...",40.706,-106.257,NO DISPONIBLE,0.0
7,French Creek,,,,
8,French Creek,,,,
9,unspecified,,,,


In [19]:
# Write the augmented table to OUTPUT without the index column. pandas infers
# compression from the file extension by default, so a path ending in .gz is
# written gzip-compressed.
df.to_csv(OUTPUT, index=False)

In [17]:
# Non-null count per column. The original columns were filled with '' on load
# and so always report the full row count; the lat/lng/datum/uncert columns
# show how many records received a value from the gazetteer.
df.count()

binomial                 76894
X1st_body_mass           76894
catalognumber            76894
collectioncode           76894
decimallatitude          76894
                         ...  
tragus_length_1.value    76894
lat                      11994
lng                      11994
datum                     8828
uncert                   11994
Length: 70, dtype: int64

In [20]:
# Percentage of records that received a gazetteer coordinate. Computed from
# the frame itself rather than from numbers copied out of an earlier output,
# so the figure stays correct whenever the input or gazetteer changes.
float(df['lat'].notna().mean()) * 100

15.59809608031836