In [1]:
import sys
# Add '..' (the parent of the current working directory) to the module
# search path.
sys.path.append('..')

In [1]:
import csv
import sqlite3
from pathlib import Path

import dask.dataframe as dd
import pandas as pd
import regex
from tqdm.notebook import tqdm

In [3]:
# Data directories, all rooted at ../data.
DATA = Path('..', 'data')
RAW = DATA.joinpath('00_raw')
PRUNED = DATA.joinpath('01_pruned')
IN_DIR = DATA.joinpath('input')
OUT_DIR = DATA.joinpath('output')

# SQLite database opened by the matching cells.
DB = PRUNED.joinpath('gazetteer.db')

# Number of rows per chunk when streaming the large CSV files.
CHUNK = 10 ** 6

## Get plants_idigbio/occurrence_raw

In [None]:
# NOTE(review): the section heading says plants_idigbio, but these paths point
# at insects_idigbio -- confirm which data set is intended.
in_file = RAW.joinpath('insects_idigbio', 'occurrence_raw.csv.gz')
out_file = IN_DIR.joinpath('insects_idigbio_occurrence_raw_no_geo.csv')

In [None]:
# Stream the raw occurrence file in chunks and keep only the records that
# have a locality description but lack a usable latitude/longitude pair.
reader = pd.read_csv(in_file, dtype=str, chunksize=CHUNK)

first_chunk = True
for df in tqdm(reader):
    has_loc = df['dwc:locality'].notna() | df['dwc:verbatimLocality'].notna()

    # Missing or unparseable coordinates become 9999.9, which falls outside
    # both valid ranges checked below.
    # NOTE(review): these coerced values (not the raw strings) are what get
    # written to out_file.
    df['dwc:decimalLatitude'] = (
        pd.to_numeric(df['dwc:decimalLatitude'], errors='coerce')
        .fillna(9999.9).astype(float))
    df['dwc:decimalLongitude'] = (
        pd.to_numeric(df['dwc:decimalLongitude'], errors='coerce')
        .fillna(9999.9).astype(float))

    # `&` binds more tightly than `!=`, so the non-zero comparison must be
    # parenthesized; otherwise it parses as `(between & column) != 0.0`.
    has_lat = (df['dwc:decimalLatitude'].between(-90.0, 90.0)
               & (df['dwc:decimalLatitude'] != 0.0))
    has_lng = (df['dwc:decimalLongitude'].between(-180.0, 180.0)
               & (df['dwc:decimalLongitude'] != 0.0))

    keep = has_loc & ~(has_lat & has_lng)
    df = df.loc[keep, :]

    # The first chunk creates the file with a header row; later chunks append.
    df.to_csv(out_file, index=False,
              mode='w' if first_chunk else 'a', header=first_chunk)
    first_chunk = False

## Get plants_gbif without lat/longs

It looks like there is garbage data in this CSV file that Pandas cannot handle, so I need to use the Python CSV library. This can happen when people copy and paste Word documents into a data cell.

In [5]:
# Tab-separated GBIF verbatim dump and the filtered CSV produced from it.
in_file = RAW.joinpath('plants_gbif', 'verbatim.txt')
out_file = IN_DIR.joinpath('plants_gbif_verbatim_no_geo.csv')

# csv.field_size_limit(10_000_000)  # There are some big fields

131072

In [6]:
# headers = !zcat "$in_file" | head -1
# Read the first line of the input file through the shell, split it on tabs
# into column names, and display how many columns there are.
headers = !head -1 "$in_file"
headers = headers[0].split('\t')
len(headers)

217

In [8]:
# Stream the tab-separated GBIF file in chunks, silently skipping malformed
# lines, and keep only the records that have a locality but lack a usable
# latitude/longitude pair.
# NOTE(review): `error_bad_lines` / `warn_bad_lines` are deprecated in newer
# pandas releases in favour of `on_bad_lines='skip'`; switch when the
# environment's pandas version is upgraded.
reader = pd.read_csv(
    in_file, dtype=str, chunksize=CHUNK, sep='\t',
    error_bad_lines=False, warn_bad_lines=False)

first_chunk = True
for df in tqdm(reader):
    # has_loc = df['locality'].notna() | df['verbatimLocality'].notna()
    has_loc = df['locality'].notna()

    # Missing or unparseable coordinates become 9999.9, which falls outside
    # both valid ranges checked below.
    # NOTE(review): these coerced values (not the raw strings) are what get
    # written to out_file.
    df['decimalLatitude'] = (
        pd.to_numeric(df['decimalLatitude'], errors='coerce')
        .fillna(9999.9).astype(float))
    df['decimalLongitude'] = (
        pd.to_numeric(df['decimalLongitude'], errors='coerce')
        .fillna(9999.9).astype(float))

    # `&` binds more tightly than `!=`, so the non-zero comparison must be
    # parenthesized; otherwise it parses as `(between & column) != 0.0`.
    has_lat = (df['decimalLatitude'].between(-90.0, 90.0)
               & (df['decimalLatitude'] != 0.0))
    has_lng = (df['decimalLongitude'].between(-180.0, 180.0)
               & (df['decimalLongitude'] != 0.0))

    keep = has_loc & ~(has_lat & has_lng)
    df = df.loc[keep, :]

    # The first chunk creates the file with a header row; later chunks append.
    df.to_csv(out_file, index=False,
              mode='w' if first_chunk else 'a', header=first_chunk)
    first_chunk = False

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




## Setup for matching

In [5]:
# Normalized locality strings that the matching loops skip outright.
unknown = 'unspecified', 'unknown'

# Matches one punctuation character that has no digit directly before it and
# no digit directly after it; punctuation touching a digit is left alone.
remove = regex.compile(r'(?<!\d)[.,;/(){}"\'\[\]\-](?!\d)')

# Gazetteer lookup by normalized place name.
sql = 'select * from places where norm = ?'

# A name is rejected when its hits span this much or more in lat or lng.
threshold = 0.1

# Starting values for the running min/max over the hits' coordinates.
lo, hi = -999.0, 999.0

## Match plants_idigbio/occurrence_raw

In [6]:
# Input and output of the iDigBio gazetteer matching step.
in_file = IN_DIR.joinpath('plants_idigbio_occurrence_raw_no_geo.csv.gz')
out_file = OUT_DIR.joinpath('plants_idigbio_no_geo_2020-03-17a.csv.gz')

In [8]:
# Decompress the input through the shell, take its first line, and split it
# on commas to display the column names.
headers = !zcat "$in_file" | head -1
headers = headers[0].split(',')
headers

['coreid',
 'aec:associatedTaxa',
 'dc:rights',
 'dcterms:accessRights',
 'dcterms:bibliographicCitation',
 'dcterms:language',
 'dcterms:license',
 'dcterms:modified',
 'dcterms:references',
 'dcterms:rights',
 'dcterms:rightsHolder',
 'dcterms:source',
 'dcterms:type',
 'dwc:Identification',
 'dwc:MeasurementOrFact',
 'dwc:ResourceRelationship',
 'dwc:VerbatimEventDate',
 'dwc:acceptedNameUsage',
 'dwc:acceptedNameUsageID',
 'dwc:accessRights',
 'dwc:associatedMedia',
 'dwc:associatedOccurrences',
 'dwc:associatedOrganisms',
 'dwc:associatedReferences',
 'dwc:associatedSequences',
 'dwc:associatedTaxa',
 'dwc:basisOfRecord',
 'dwc:bed',
 'dwc:behavior',
 'dwc:catalogNumber',
 'dwc:class',
 'dwc:classs',
 'dwc:collectionCode',
 'dwc:collectionID',
 'dwc:continent',
 'dwc:coordinatePrecision',
 'dwc:coordinateUncertaintyInMeters',
 'dwc:country',
 'dwc:countryCode',
 'dwc:county',
 'dwc:dataGeneralizations',
 'dwc:datasetID',
 'dwc:datasetName',
 'dwc:dateIdentified',
 'dwc:day',
 'dwc

In [13]:
# Look up each record's locality text in the gazetteer and, when the hits for
# that name agree closely enough, copy one hit's coordinates onto the record.
reader = pd.read_csv(in_file, dtype=str, chunksize=CHUNK)

# Columns filled in from the chosen gazetteer hit, in the same order as the
# (lat, lng, datum, uncert) tuples built below.
geo_cols = ['lat', 'lng', 'datum', 'uncert']

# `with sqlite3.connect(...)` only wraps a transaction and leaves the
# connection open afterwards, so the connection is closed explicitly instead.
cxn = sqlite3.connect(DB)
try:
    cxn.row_factory = sqlite3.Row
    first_chunk = True
    for df in tqdm(reader):
        df = df.fillna('')
        for col in geo_cols:
            df[col] = None
        for idx, row in df.iterrows():
            # Try each locality field in turn; stop at the first one that
            # yields a usable match.
            for field in ['dwc:locality', 'dwc:verbatimLocality']:
                loc = remove.sub(' ', row[field])
                loc = ' '.join(loc.lower().split())
                # Blank localities (missing values were filled with '') give
                # nothing to look up, so skip them along with the placeholders.
                if not loc or loc in unknown:
                    continue

                # Hits bucketed by how much metadata they carry.
                has_datum_uncert = []
                has_datum = []
                has_uncert = []
                neither = []
                min_lat = min_lng = hi
                max_lat = max_lng = lo
                for hit in cxn.execute(sql, (loc, )):
                    lat = round(hit[1], 4)
                    lng = round(hit[2], 4)
                    datum = hit['datum']
                    uncert = hit['uncert']
                    # print(loc, lat, lng, datum, uncert)
                    if datum and uncert:
                        has_datum_uncert.append((lat, lng, datum, uncert))
                    elif datum:
                        has_datum.append((lat, lng, datum, uncert))
                    elif uncert:
                        has_uncert.append((lat, lng, datum, uncert))
                    else:
                        neither.append((lat, lng, datum, uncert))
                    min_lat = min(min_lat, lat)
                    max_lat = max(max_lat, lat)
                    min_lng = min(min_lng, lng)
                    max_lng = max(max_lng, lng)

                # Reject the name when its hits are spread over `threshold`
                # or more in either coordinate.
                if (max_lat - min_lat >= threshold
                        or max_lng - min_lng >= threshold):
                    continue

                # Prefer hits with both datum and uncertainty, then datum
                # only, then uncertainty only, then neither.
                best = has_datum_uncert or has_datum or has_uncert or neither
                if best:
                    for col, value in zip(geo_cols, best[0]):
                        df.at[idx, col] = value
                    break

        # NOTE(review): the "Get results" cells prepend a header line to this
        # output file and refer to it as `headless`, but the first chunk here
        # is written with a header -- confirm the final file does not end up
        # with two header rows.
        df.to_csv(out_file, index=False,
                  mode='w' if first_chunk else 'a', header=first_chunk)
        first_chunk = False
finally:
    cxn.close()

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))





## Match plants_gbif

In [14]:
# Input and output of the GBIF gazetteer matching step.
in_file = IN_DIR.joinpath('plants_gbif_verbatim_no_geo.csv.gz')
out_file = OUT_DIR.joinpath('plants_gbif_verbatim_no_geo_2020-03-17a.csv.gz')

In [15]:
# Decompress the input through the shell, take its first line, and split it
# on commas to display the column names.
headers = !zcat "$in_file" | head -1
headers = headers[0].split(',')
headers

['gbifID',
 'abstract',
 'accessRights',
 'accrualMethod',
 'accrualPeriodicity',
 'accrualPolicy',
 'alternative',
 'audience',
 'available',
 'bibliographicCitation',
 'conformsTo',
 'contributor',
 'coverage',
 'created',
 'creator',
 'date',
 'dateAccepted',
 'dateCopyrighted',
 'dateSubmitted',
 'description',
 'educationLevel',
 'extent',
 'format',
 'hasFormat',
 'hasPart',
 'hasVersion',
 'identifier',
 'instructionalMethod',
 'isFormatOf',
 'isPartOf',
 'isReferencedBy',
 'isReplacedBy',
 'isRequiredBy',
 'isVersionOf',
 'issued',
 'language',
 'license',
 'mediator',
 'medium',
 'modified',
 'provenance',
 'publisher',
 'references',
 'relation',
 'replaces',
 'requires',
 'rights',
 'rightsHolder',
 'source',
 'spatial',
 'subject',
 'tableOfContents',
 'temporal',
 'title',
 'type',
 'valid',
 'institutionID',
 'collectionID',
 'datasetID',
 'institutionCode',
 'collectionCode',
 'datasetName',
 'ownerInstitutionCode',
 'basisOfRecord',
 'informationWithheld',
 'dataGeneral

In [16]:
# Look up each record's locality text in the gazetteer and, when the hits for
# that name agree closely enough, copy one hit's coordinates onto the record.
reader = pd.read_csv(in_file, dtype=str, chunksize=CHUNK)

# Columns filled in from the chosen gazetteer hit, in the same order as the
# (lat, lng, datum, uncert) tuples built below.
geo_cols = ['lat', 'lng', 'datum', 'uncert']

# `with sqlite3.connect(...)` only wraps a transaction and leaves the
# connection open afterwards, so the connection is closed explicitly instead.
cxn = sqlite3.connect(DB)
try:
    cxn.row_factory = sqlite3.Row
    first_chunk = True
    for df in tqdm(reader):
        df = df.fillna('')
        for col in geo_cols:
            df[col] = None
        for idx, row in df.iterrows():
            # Try each locality field in turn; stop at the first one that
            # yields a usable match.
            for field in ['locality', 'verbatimLocality']:
                loc = remove.sub(' ', row[field])
                loc = ' '.join(loc.lower().split())
                # Blank localities (missing values were filled with '') give
                # nothing to look up, so skip them along with the placeholders.
                if not loc or loc in unknown:
                    continue

                # Hits bucketed by how much metadata they carry.
                has_datum_uncert = []
                has_datum = []
                has_uncert = []
                neither = []
                min_lat = min_lng = hi
                max_lat = max_lng = lo
                for hit in cxn.execute(sql, (loc, )):
                    lat = round(hit[1], 4)
                    lng = round(hit[2], 4)
                    datum = hit['datum']
                    uncert = hit['uncert']
                    # print(loc, lat, lng, datum, uncert)
                    if datum and uncert:
                        has_datum_uncert.append((lat, lng, datum, uncert))
                    elif datum:
                        has_datum.append((lat, lng, datum, uncert))
                    elif uncert:
                        has_uncert.append((lat, lng, datum, uncert))
                    else:
                        neither.append((lat, lng, datum, uncert))
                    min_lat = min(min_lat, lat)
                    max_lat = max(max_lat, lat)
                    min_lng = min(min_lng, lng)
                    max_lng = max(max_lng, lng)

                # Reject the name when its hits are spread over `threshold`
                # or more in either coordinate.
                if (max_lat - min_lat >= threshold
                        or max_lng - min_lng >= threshold):
                    continue

                # Prefer hits with both datum and uncertainty, then datum
                # only, then uncertainty only, then neither.
                best = has_datum_uncert or has_datum or has_uncert or neither
                if best:
                    for col, value in zip(geo_cols, best[0]):
                        df.at[idx, col] = value
                    break

        # The output is written without a header row (the "Get results" cells
        # prepend one later). The first chunk truncates the file so that a
        # re-run does not append onto output left over from an earlier run.
        df.to_csv(out_file, index=False,
                  mode='w' if first_chunk else 'a', header=False)
        first_chunk = False
finally:
    cxn.close()

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




## Get results

In [14]:
in_file = IN_DIR / 'plants_idigbio_occurrence_raw_no_geo.csv.gz'
headless = OUT_DIR / 'plants_idigbio_no_geo_2020-03-17a.csv.gz'
out_file = OUT_DIR / 'plants_idigbio_no_geo_2020-03-20a.csv'

# Take the header line from the matching step's input file and append the
# names of the four extra columns; the cell displays the resulting line.
header = !zcat "$in_file" | head -1
header = header[0]
header += ',lat,lng,datum,uncert'
header

'coreid,aec:associatedTaxa,dc:rights,dcterms:accessRights,dcterms:bibliographicCitation,dcterms:language,dcterms:license,dcterms:modified,dcterms:references,dcterms:rights,dcterms:rightsHolder,dcterms:source,dcterms:type,dwc:Identification,dwc:MeasurementOrFact,dwc:ResourceRelationship,dwc:VerbatimEventDate,dwc:acceptedNameUsage,dwc:acceptedNameUsageID,dwc:accessRights,dwc:associatedMedia,dwc:associatedOccurrences,dwc:associatedOrganisms,dwc:associatedReferences,dwc:associatedSequences,dwc:associatedTaxa,dwc:basisOfRecord,dwc:bed,dwc:behavior,dwc:catalogNumber,dwc:class,dwc:classs,dwc:collectionCode,dwc:collectionID,dwc:continent,dwc:coordinatePrecision,dwc:coordinateUncertaintyInMeters,dwc:country,dwc:countryCode,dwc:county,dwc:dataGeneralizations,dwc:datasetID,dwc:datasetName,dwc:dateIdentified,dwc:day,dwc:decimalLatitude,dwc:decimalLongitude,dwc:disposition,dwc:dynamicProperties,dwc:earliestAgeOrLowestStage,dwc:earliestEonOrLowestEonothem,dwc:earliestEpochOrLowestSeries,dwc:earliest

In [16]:
# Start the final CSV with the header line, then append the decompressed
# contents of the `headless` file after it through the shell.
with open(out_file, 'w') as out:
    out.write(header)
    out.write('\n')

!zcat "$headless" >> "$out_file"

In [19]:
in_file = IN_DIR / 'plants_gbif_verbatim_no_geo.csv.gz'
headless = OUT_DIR / 'plants_gbif_verbatim_no_geo_2020-03-17a.csv.gz'
out_file = OUT_DIR / 'plants_gbif_verbatim_no_geo_2020-03-20a.csv'

# Take the header line from the matching step's input file and append the
# names of the four extra columns; the cell displays the resulting line.
header = !zcat "$in_file" | head -1
header = header[0]
header += ',lat,lng,datum,uncert'
header

'gbifID,abstract,accessRights,accrualMethod,accrualPeriodicity,accrualPolicy,alternative,audience,available,bibliographicCitation,conformsTo,contributor,coverage,created,creator,date,dateAccepted,dateCopyrighted,dateSubmitted,description,educationLevel,extent,format,hasFormat,hasPart,hasVersion,identifier,instructionalMethod,isFormatOf,isPartOf,isReferencedBy,isReplacedBy,isRequiredBy,isVersionOf,issued,language,license,mediator,medium,modified,provenance,publisher,references,relation,replaces,requires,rights,rightsHolder,source,spatial,subject,tableOfContents,temporal,title,type,valid,institutionID,collectionID,datasetID,institutionCode,collectionCode,datasetName,ownerInstitutionCode,basisOfRecord,informationWithheld,dataGeneralizations,dynamicProperties,occurrenceID,catalogNumber,recordNumber,recordedBy,individualCount,organismQuantity,organismQuantityType,sex,lifeStage,reproductiveCondition,behavior,establishmentMeans,occurrenceStatus,preparations,disposition,associatedMedia,associa

In [20]:
# Start the final CSV with the header line, then append the decompressed
# contents of the `headless` file after it through the shell.
with open(out_file, 'w') as out:
    out.write(header)
    out.write('\n')

!zcat "$headless" >> "$out_file"

In [21]:
unzipped = OUT_DIR / 'plants_idigbio_no_geo_2020-03-20a.csv'

# Compress the assembled CSV with the shell's gzip.
!gzip "$unzipped"

In [22]:
unzipped = OUT_DIR / 'plants_gbif_verbatim_no_geo_2020-03-20a.csv'

# Compress the assembled CSV with the shell's gzip.
!gzip "$unzipped"

In [28]:
out_file = OUT_DIR / 'plants_idigbio_no_geo_2020-03-20a.csv.gz'
# Load only the two coordinate columns from the compressed result file.
df = pd.read_csv(out_file, usecols=['lat', 'lng'])

In [29]:
# Preview the first rows of the lat/lng columns.
df.head()

Unnamed: 0,lat,lng
0,,
1,,
2,,
3,,
4,,


In [30]:
# (rows, columns) of the loaded lat/lng frame.
df.shape

(20164429, 2)

In [31]:
# Number of non-null values in each column.
df.count()

lat    3205993
lng    3205993
dtype: int64

In [33]:
# Percentage of rows that have a latitude, computed from the loaded frame
# instead of from numbers copied by hand out of earlier cell outputs.
round(int(df['lat'].count()) / len(df) * 100.0, 2)

15.9

In [35]:
out_file = OUT_DIR / 'plants_gbif_verbatim_no_geo_2020-03-20a.csv.gz'
# Load only the two coordinate columns from the compressed result file.
df = pd.read_csv(out_file, usecols=['lat', 'lng'])

In [36]:
# (rows, columns) of the loaded lat/lng frame.
df.shape

(33946224, 2)

In [37]:
# Number of non-null values in each column.
df.count()

lat    4732193
lng    4732193
dtype: int64

In [40]:
# Percentage of rows that have a latitude, computed from the loaded frame
# instead of from numbers copied by hand out of earlier cell outputs.
round(int(df['lat'].count()) / len(df) * 100.0, 2)

13.94