In [1]:
# Put the parent directory on the module search path so that code located one
# level above this notebook can be imported.
import sys

sys.path.extend(['..'])

In [2]:
from pathlib import Path
import sqlite3
import csv
import regex
import pandas as pd
import dask.dataframe as dd
from tqdm.notebook import tqdm

In [3]:
# Directory layout, all relative to the notebook's working directory.
DATA = Path('..', 'data')
RAW = DATA.joinpath('00_raw')
PRUNED = DATA.joinpath('01_pruned')
IN_DIR = DATA.joinpath('input')
OUT_DIR = DATA.joinpath('output')

# SQLite gazetteer that the matching cells query.
DB = PRUNED.joinpath('gazetteer.db')

# Number of rows per chunk when streaming the large raw files through pandas.
CHUNK = 10 ** 6

## Get insects_idigbio/occurrence_raw data without lat/longs

In [None]:
# Raw iDigBio occurrence dump (input) and the filtered subset written by the
# next cell (output).
in_file = RAW.joinpath('insects_idigbio', 'occurrence_raw.csv.gz')
out_file = IN_DIR.joinpath('insects_idigbio_occurrence_raw_no_geo.csv.gz')

In [None]:
# Stream the raw iDigBio occurrences in chunks and keep only the records that
# have some locality text but no usable latitude/longitude pair.
reader = pd.read_csv(in_file, dtype=str, chunksize=CHUNK)

# Make sure the destination folder exists before the first chunk is written.
out_file.parent.mkdir(parents=True, exist_ok=True)

first_chunk = True
for df in tqdm(reader):
    has_loc = df['dwc:locality'].notna() | df['dwc:verbatimLocality'].notna()

    # Coordinates arrive as strings. Anything missing or non-numeric becomes
    # 9999.9, which lies outside every valid range and so reads as "no value".
    # NOTE: the converted columns replace the originals in `df`, so this
    # sentinel is also what ends up in the output file.
    df['dwc:decimalLatitude'] = (
        pd.to_numeric(df['dwc:decimalLatitude'], errors='coerce')
        .fillna(9999.9).astype(float))
    df['dwc:decimalLongitude'] = (
        pd.to_numeric(df['dwc:decimalLongitude'], errors='coerce')
        .fillna(9999.9).astype(float))

    # A coordinate counts as present when it is in range and not exactly 0.
    # The inner parentheses are required: `&` binds tighter than `!=`, so
    # without them the expression is `(between & column) != 0.0`.
    has_lat = (df['dwc:decimalLatitude'].between(-90.0, 90.0)
               & (df['dwc:decimalLatitude'] != 0.0))
    has_lng = (df['dwc:decimalLongitude'].between(-180.0, 180.0)
               & (df['dwc:decimalLongitude'] != 0.0))

    keep = has_loc & ~(has_lat & has_lng)
    df = df.loc[keep, :]

    # The first chunk creates the file with a header; later chunks append.
    if first_chunk:
        df.to_csv(out_file, index=False)
    else:
        df.to_csv(out_file, index=False, mode='a', header=False)
    first_chunk = False

## Get insects_gbif without lat/longs


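It looks like there is garbage data in this tab-separated file that pandas cannot parse; this can happen when people copy and paste Word documents into a data cell. Such data would normally mean falling back to the Python `csv` library, but the read below instead has pandas skip the malformed lines and report them.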

In [5]:
# Raw GBIF export (input) and the filtered subset written further down (output).
in_file = RAW.joinpath('insects_gbif', '0017955-200127171203522.csv')
out_file = IN_DIR.joinpath('insects_gbif_no_geo.csv')

# Raise the csv module's per-field size cap because some fields in this export
# are very large. The call returns the previous limit, which the cell displays.
csv.field_size_limit(10_000_000)

131072

In [6]:
# Read the column names from the first line of the tab-separated export.
# This is done in plain Python rather than by shelling out to `head`, so the
# cell does not depend on that command being available. Undecodable bytes are
# replaced instead of raising, since the file is known to contain garbage.
with open(in_file, encoding='utf-8', errors='replace') as handle:
    headers = handle.readline().rstrip('\r\n').split('\t')
headers

['gbifID',
 'datasetKey',
 'occurrenceID',
 'kingdom',
 'phylum',
 'class',
 'order',
 'family',
 'genus',
 'species',
 'infraspecificEpithet',
 'taxonRank',
 'scientificName',
 'verbatimScientificName',
 'verbatimScientificNameAuthorship',
 'countryCode',
 'locality',
 'stateProvince',
 'occurrenceStatus',
 'individualCount',
 'publishingOrgKey',
 'decimalLatitude',
 'decimalLongitude',
 'coordinateUncertaintyInMeters',
 'coordinatePrecision',
 'elevation',
 'elevationAccuracy',
 'depth',
 'depthAccuracy',
 'eventDate',
 'day',
 'month',
 'year',
 'taxonKey',
 'speciesKey',
 'basisOfRecord',
 'institutionCode',
 'collectionCode',
 'catalogNumber',
 'recordNumber',
 'identifiedBy',
 'dateIdentified',
 'license',
 'rightsHolder',
 'recordedBy',
 'typeStatus',
 'establishmentMeans',
 'lastInterpreted',
 'mediaType',
 'issue']

In [7]:
# Stream the GBIF export in chunks and keep only the records that have some
# locality text but no usable latitude/longitude pair. Lines with the wrong
# number of fields are skipped and reported instead of aborting the read.
# NOTE: newer pandas releases replace these two flags with
# `on_bad_lines='warn'`.
reader = pd.read_csv(
    in_file, dtype=str, chunksize=CHUNK, sep='\t',
    error_bad_lines=False, warn_bad_lines=True)

# Make sure the destination folder exists before the first chunk is written.
out_file.parent.mkdir(parents=True, exist_ok=True)

first_chunk = True
for df in tqdm(reader):
    # This export has a `locality` column but no `verbatimLocality` column,
    # so only `locality` is checked.
    has_loc = df['locality'].notna()

    # Coordinates arrive as strings. Anything missing or non-numeric becomes
    # 9999.9, which lies outside every valid range and so reads as "no value".
    # NOTE: the converted columns replace the originals in `df`, so this
    # sentinel is also what ends up in the output file.
    df['decimalLatitude'] = (
        pd.to_numeric(df['decimalLatitude'], errors='coerce')
        .fillna(9999.9).astype(float))
    df['decimalLongitude'] = (
        pd.to_numeric(df['decimalLongitude'], errors='coerce')
        .fillna(9999.9).astype(float))

    # A coordinate counts as present when it is in range and not exactly 0.
    # The inner parentheses are required: `&` binds tighter than `!=`, so
    # without them the expression is `(between & column) != 0.0`.
    has_lat = (df['decimalLatitude'].between(-90.0, 90.0)
               & (df['decimalLatitude'] != 0.0))
    has_lng = (df['decimalLongitude'].between(-180.0, 180.0)
               & (df['decimalLongitude'] != 0.0))

    keep = has_loc & ~(has_lat & has_lng)
    df = df.loc[keep, :]

    # The first chunk creates the file with a header; later chunks append.
    if first_chunk:
        df.to_csv(out_file, index=False)
    else:
        df.to_csv(out_file, index=False, mode='a', header=False)
    first_chunk = False

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

b'Skipping line 5639820: expected 50 fields, saw 74\nSkipping line 5640136: expected 50 fields, saw 74\nSkipping line 5640706: expected 50 fields, saw 74\nSkipping line 5641012: expected 50 fields, saw 74\nSkipping line 5641844: expected 50 fields, saw 74\n'
b'Skipping line 35119977: expected 50 fields, saw 72\nSkipping line 35122783: expected 50 fields, saw 75\nSkipping line 35126587: expected 50 fields, saw 52\n'
b'Skipping line 35156846: expected 50 fields, saw 53\n'





## Set up for matching

In [None]:
# Normalized locality strings that carry no information and are never looked up.
unknown = ('unspecified', 'unknown')

# Punctuation that gets blanked out of a locality string. The lookarounds keep
# any mark that touches a digit on either side.
remove = regex.compile(r'(?<!\d)[.,;/(){}"\'\[\]\-](?!\d)')

# Gazetteer lookup keyed on the normalized locality text.
sql = "select * from places where norm = ?"

# Hits for one locality are only used when their latitude spread and their
# longitude spread both stay below this value.
threshold = 0.1

# Sentinels used to seed the running min/max of the hits' coordinates.
lo, hi = -999.0, 999.0

## Match insects_idigbio/occurrence_raw

In [None]:
# Filtered iDigBio records (input) and the gazetteer-annotated result (output).
in_file = IN_DIR.joinpath('insects_idigbio_occurrence_raw_no_geo.csv.gz')
out_file = OUT_DIR.joinpath('insects_idigbio_no_geo_2020-03-10a.csv.gz')

In [None]:
# Load everything as text and turn missing cells into empty strings so the
# locality fields can be handled as plain strings further down.
df = pd.read_csv(in_file, dtype=str)
df = df.fillna('')
print(list(df.columns))
df.shape

In [None]:
# Look up each record's locality text in the gazetteer and copy the
# coordinates over when all gazetteer entries for that text agree closely.
df['lat'] = None
df['lng'] = None
df['datum'] = None
df['uncert'] = None

# `with sqlite3.connect(...)` only manages the transaction and leaves the
# connection open, so the connection is closed explicitly in `finally`.
cxn = sqlite3.connect(DB)
try:
    cxn.row_factory = sqlite3.Row

    for idx, row in tqdm(df.iterrows(), total=len(df)):
        # Try the fields in order and stop at the first one that matches.
        for field in ['dwc:locality', 'dwc:verbatimLocality']:
            # Normalize: blank out punctuation, lower-case, collapse spaces.
            loc = remove.sub(' ', row[field])
            loc = ' '.join(loc.lower().split())

            # Missing cells were filled with '', so skip empty text as well
            # as the known placeholders instead of querying for them.
            if not loc or loc in unknown:
                continue

            # One (lat, lng, datum, uncert) tuple per gazetteer entry.
            # Positional columns 1 and 2 are used as latitude and longitude.
            hits = [
                (round(hit[1], 4), round(hit[2], 4),
                 hit['datum'], hit['uncert'])
                for hit in cxn.execute(sql, (loc, ))
            ]
            if not hits:
                continue

            # Reject the locality when its entries disagree on position.
            lats = [hit[0] for hit in hits]
            lngs = [hit[1] for hit in hits]
            if max(lats) - min(lats) >= threshold:
                continue
            if max(lngs) - min(lngs) >= threshold:
                continue

            # Preference order: datum and uncertainty, datum only,
            # uncertainty only, neither. `max` returns the first of equally
            # ranked entries, so the earliest hit of the best kind is used.
            best = max(hits, key=lambda hit: (bool(hit[2]), bool(hit[3])))
            for column, value in zip(('lat', 'lng', 'datum', 'uncert'), best):
                df.at[idx, column] = value
            break
finally:
    cxn.close()

In [None]:
# Spot-check the first rows: locality text next to whatever was matched.
df[[
    'dwc:locality', 'dwc:verbatimLocality',
    'lat', 'lng', 'datum', 'uncert',
]].head(20)

In [None]:
# Create the output folder if it is missing so the result of the long matching
# run is not lost to a missing directory, then write the annotated records.
out_file.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(out_file, index=False)

In [None]:
# Non-null values per column. For the added lat/lng/datum/uncert columns this
# is the number of records that received a non-null value in the matching loop.
df.count()

In [1]:
# Percentage from hard-coded counts — presumably matched records over total
# records, copied from an earlier `df.count()` run; TODO confirm and derive
# these from `df` instead.
round(10302 / 50020 * 100.0, 2)

20.6

## Match insects_gbif

In [None]:
# Read the file under the name the GBIF filtering step writes it
# (`insects_gbif_no_geo.csv`, uncompressed); the annotated result is gzipped.
in_file = IN_DIR / 'insects_gbif_no_geo.csv'
out_file = OUT_DIR / 'insects_gbif_no_geo_2020-03-10a.csv.gz'

In [None]:
# Load everything as text and turn missing cells into empty strings so the
# locality field can be handled as a plain string further down.
df = pd.read_csv(in_file, dtype=str)
df = df.fillna('')
print(list(df.columns))
df.shape

In [None]:
# Look up each record's locality text in the gazetteer and copy the
# coordinates over when all gazetteer entries for that text agree closely.
df['lat'] = None
df['lng'] = None
df['datum'] = None
df['uncert'] = None

# `with sqlite3.connect(...)` only manages the transaction and leaves the
# connection open, so the connection is closed explicitly in `finally`.
cxn = sqlite3.connect(DB)
try:
    cxn.row_factory = sqlite3.Row

    for idx, row in tqdm(df.iterrows(), total=len(df)):
        # Only one locality field is tried for this data set.
        for field in ['locality', ]:
            # Normalize: blank out punctuation, lower-case, collapse spaces.
            loc = remove.sub(' ', row[field])
            loc = ' '.join(loc.lower().split())

            # Missing cells were filled with '', so skip empty text as well
            # as the known placeholders instead of querying for them.
            if not loc or loc in unknown:
                continue

            # One (lat, lng, datum, uncert) tuple per gazetteer entry.
            # Positional columns 1 and 2 are used as latitude and longitude.
            hits = [
                (round(hit[1], 4), round(hit[2], 4),
                 hit['datum'], hit['uncert'])
                for hit in cxn.execute(sql, (loc, ))
            ]
            if not hits:
                continue

            # Reject the locality when its entries disagree on position.
            lats = [hit[0] for hit in hits]
            lngs = [hit[1] for hit in hits]
            if max(lats) - min(lats) >= threshold:
                continue
            if max(lngs) - min(lngs) >= threshold:
                continue

            # Preference order: datum and uncertainty, datum only,
            # uncertainty only, neither. `max` returns the first of equally
            # ranked entries, so the earliest hit of the best kind is used.
            best = max(hits, key=lambda hit: (bool(hit[2]), bool(hit[3])))
            for column, value in zip(('lat', 'lng', 'datum', 'uncert'), best):
                df.at[idx, column] = value
            break
finally:
    cxn.close()

In [None]:
# Spot-check the first rows: locality text next to whatever was matched.
df[['locality', 'lat', 'lng', 'datum', 'uncert']].head(20)

In [None]:
# Create the output folder if it is missing so the result of the long matching
# run is not lost to a missing directory, then write the annotated records.
out_file.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(out_file, index=False)

In [None]:
# Non-null values per column. For the added lat/lng/datum/uncert columns this
# is the number of records that received a non-null value in the matching loop.
df.count()

In [2]:
# Percentage from hard-coded counts — presumably matched records over total
# records, copied from an earlier `df.count()` run; TODO confirm and derive
# these from `df` instead.
round(968681 / 6016843 * 100.0, 2)

16.1