# Build the Gazetteer from iDigBio Data

This is a dump of all iDigBio data, retrieved on 2020-03-30. (No DOI available.)

## Basic Notebook Setup

In [1]:
import os
import io
import sqlite3
import zipfile
from pathlib import Path

import dask.dataframe as dd
import pandas as pd
import regex
from tqdm import tqdm

In [2]:
# Directory layout, relative to the notebook's working directory.
DATA_DIR = Path('..', 'data')
RAW_DIR = DATA_DIR.joinpath('00_raw')
INTERIM_DIR = DATA_DIR.joinpath('01_interim')

# Input archive and the SQLite database this notebook builds.
ZIP_FILE = RAW_DIR.joinpath('idigbio_all_2020-03-30.zip')
DB = INTERIM_DIR.joinpath('gazetteer_idigbio_2020-04-23.db')

# Number of CSV rows read per chunk.
CHUNK = 1_000_000

## Build Gazetteer Related Fields

Gather all of the fields below. Not all of them will make it into the Gazetteer table, but we want all of the geography-related fields available in case we tweak the algorithm later.

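iDigBio prefixes its column names with a namespace (e.g. `dwc:` for Darwin Core terms or `idigbio:` for its own fields), so the dictionaries below map each namespaced column name to its bare field name.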

In [3]:
# Darwin Core location terms, one per line. `verbatimDepth` is kept directly
# after `maximumDepthInMeters` so the resulting dict order is unchanged.
_dwc_terms = """
    continent
    coordinatePrecision
    coordinateUncertaintyInMeters
    country
    countryCode
    county
    decimalLatitude
    decimalLongitude
    footprintSRS
    footprintSpatialFit
    footprintWKT
    geodeticDatum
    georeferenceProtocol
    georeferenceRemarks
    georeferenceSources
    georeferenceVerificationStatus
    georeferencedBy
    georeferencedDate
    higherGeography
    higherGeographyID
    island
    islandGroup
    locality
    locationAccordingTo
    locationID
    locationRemarks
    maximumDepthInMeters
    verbatimDepth
    maximumDistanceAboveSurfaceInMeters
    maximumElevationInMeters
    minimumDepthInMeters
    minimumDistanceAboveSurfaceInMeters
    minimumElevationInMeters
    municipality
    pointRadiusSpatialFit
    stateProvince
    verbatimCoordinateSystem
    verbatimCoordinates
    verbatimElevation
    verbatimLatitude
    verbatimLocality
    verbatimLongitude
    verbatimSRS
    waterBody
""".split()

# Fields that live in the iDigBio namespace.
_idigbio_terms = """
    geoPoint
    isoCountryCode
""".split()

# Each mapping goes from the namespaced CSV column name to the bare field name.
DWC_FIELDS = {f'dwc:{term}': term for term in _dwc_terms}

IDIGBIO_FIELDS = {f'idigbio:{term}': term for term in _idigbio_terms}

OTHER_FIELDS = {'coreid': 'coreid'}  # Need to link data

# Every column of interest, keyed by its name in the CSV header.
FIELDS = {}
FIELDS.update(OTHER_FIELDS)
FIELDS.update(DWC_FIELDS)
FIELDS.update(IDIGBIO_FIELDS)

## Examine Tables

Which files are in the downloaded zip file, and which columns are in its tables?

In [4]:
# List the member files of the downloaded archive.
with zipfile.ZipFile(ZIP_FILE) as archive:
    names = archive.namelist()

names

['occurrence.csv',
 'multimedia_raw.csv',
 'multimedia.csv',
 'occurrence_raw.csv',
 'records.citation.txt',
 'mediarecords.citation.txt',
 'meta.xml']

The occurrence.csv & occurrence_raw.csv files may have what we need. Look at their headers.

### occurrence.csv

In [5]:
# Read only the first line of occurrence.csv; it holds the column names.
with zipfile.ZipFile(ZIP_FILE) as archive, \
        archive.open('occurrence.csv') as csv_file:
    header_line = csv_file.readline()

# Split the raw bytes on commas, order them, then decode and trim each name
# (trimming also drops the line ending attached to the last column).
raw_names = header_line.split(b',')
raw_names.sort()
headers = [raw_name.decode().strip() for raw_name in raw_names]

#### All Headers in occurrence.csv

In [6]:
# Display every column name read from occurrence.csv, in sorted order.
sorted(headers)

['coreid',
 'dwc:basisOfRecord',
 'dwc:bed',
 'dwc:catalogNumber',
 'dwc:class',
 'dwc:collectionCode',
 'dwc:collectionID',
 'dwc:continent',
 'dwc:coordinateUncertaintyInMeters',
 'dwc:country',
 'dwc:county',
 'dwc:earliestAgeOrLowestStage',
 'dwc:earliestEonOrLowestEonothem',
 'dwc:earliestEpochOrLowestSeries',
 'dwc:earliestEraOrLowestErathem',
 'dwc:earliestPeriodOrLowestSystem',
 'dwc:eventDate',
 'dwc:family',
 'dwc:fieldNumber',
 'dwc:formation',
 'dwc:genus',
 'dwc:geologicalContextID',
 'dwc:group',
 'dwc:higherClassification',
 'dwc:highestBiostratigraphicZone',
 'dwc:individualCount',
 'dwc:infraspecificEpithet',
 'dwc:institutionCode',
 'dwc:institutionID',
 'dwc:kingdom',
 'dwc:latestAgeOrHighestStage',
 'dwc:latestEonOrHighestEonothem',
 'dwc:latestEpochOrHighestSeries',
 'dwc:latestEraOrHighestErathem',
 'dwc:latestPeriodOrHighestSystem',
 'dwc:lithostratigraphicTerms',
 'dwc:locality',
 'dwc:lowestBiostratigraphicZone',
 'dwc:maximumDepthInMeters',
 'dwc:maximumElevat

#### Target Headers in occurrence.csv

In [7]:
# Keep only the occurrence.csv columns that are keys of FIELDS.
HEADERS = [name for name in headers if name in FIELDS]
HEADERS.sort()
HEADERS

['coreid',
 'dwc:continent',
 'dwc:coordinateUncertaintyInMeters',
 'dwc:country',
 'dwc:county',
 'dwc:locality',
 'dwc:maximumDepthInMeters',
 'dwc:maximumElevationInMeters',
 'dwc:minimumDepthInMeters',
 'dwc:minimumElevationInMeters',
 'dwc:municipality',
 'dwc:stateProvince',
 'dwc:verbatimLocality',
 'dwc:waterBody',
 'idigbio:geoPoint',
 'idigbio:isoCountryCode']

### occurrence_raw.csv

In [8]:
# Read only the first line of occurrence_raw.csv; it holds the column names.
with zipfile.ZipFile(ZIP_FILE) as archive, \
        archive.open('occurrence_raw.csv') as csv_file:
    header_line = csv_file.readline()

# Split the raw bytes on commas, order them, then decode and trim each name
# (trimming also drops the line ending attached to the last column).
raw_names = header_line.split(b',')
raw_names.sort()
headers = [raw_name.decode().strip() for raw_name in raw_names]

#### All headers in occurrence_raw.csv

In [9]:
# Display every column name read from occurrence_raw.csv, in sorted order.
sorted(headers)

['aec:associatedTaxa',
 'coreid',
 'dc:rights',
 'dcterms:accessRights',
 'dcterms:bibliographicCitation',
 'dcterms:language',
 'dcterms:license',
 'dcterms:modified',
 'dcterms:references',
 'dcterms:rights',
 'dcterms:rightsHolder',
 'dcterms:source',
 'dcterms:type',
 'dwc:Identification',
 'dwc:MeasurementOrFact',
 'dwc:ResourceRelationship',
 'dwc:VerbatimEventDate',
 'dwc:acceptedNameUsage',
 'dwc:acceptedNameUsageID',
 'dwc:accessRights',
 'dwc:associatedMedia',
 'dwc:associatedOccurrences',
 'dwc:associatedOrganisms',
 'dwc:associatedReferences',
 'dwc:associatedSequences',
 'dwc:associatedTaxa',
 'dwc:basisOfRecord',
 'dwc:bed',
 'dwc:behavior',
 'dwc:catalogNumber',
 'dwc:class',
 'dwc:classs',
 'dwc:collectionCode',
 'dwc:collectionID',
 'dwc:continent',
 'dwc:coordinatePrecision',
 'dwc:coordinateUncertaintyInMeters',
 'dwc:country',
 'dwc:countryCode',
 'dwc:county',
 'dwc:dataGeneralizations',
 'dwc:datasetID',
 'dwc:datasetName',
 'dwc:dateIdentified',
 'dwc:day',
 'dwc

#### Target headers in occurrence_raw.csv

In [10]:
# Keep only the occurrence_raw.csv columns that are keys of FIELDS.
RAW_HEADERS = [name for name in headers if name in FIELDS]
RAW_HEADERS.sort()

## Clean the Database

In [11]:
# Start from a clean slate: delete any database left over from an earlier run.
if DB.exists():
    DB.unlink()

## Create Input Tables

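Grab all of the relevant fields from each table, keeping only the records that have coordinates: a non-empty `geoPoint` for `occurrence`, and a numeric `decimalLatitude` and `decimalLongitude` for `occurrence_raw`.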

### Create occurrence Table

In [14]:
# Copy the target columns of occurrence.csv into the `occurrence` table,
# one chunk of CHUNK rows at a time, keeping only rows with a geoPoint.

# Map each namespaced CSV column to the bare field name used in the database.
renames = {h: FIELDS[h] for h in HEADERS}

# `with cxn:` only commits (or rolls back) the transaction; it does not close
# the connection. Close it explicitly so the database handle is released.
cxn = sqlite3.connect(DB)
try:
    with cxn, zipfile.ZipFile(ZIP_FILE) as zippy:
        with zippy.open('occurrence.csv') as in_file:
            # Everything is read as text and empty cells stay '' (not NaN).
            reader = pd.read_csv(
                in_file, dtype=str, keep_default_na=False,
                chunksize=CHUNK, usecols=HEADERS)

            # The first chunk (re)creates the table; later chunks append.
            if_exists = 'replace'

            for df in tqdm(reader):
                df = df.rename(columns=renames)

                has_point = df['geoPoint'] != ''

                df.loc[has_point, :].to_sql(
                    'occurrence', cxn, if_exists=if_exists, index=False)

                if_exists = 'append'
finally:
    cxn.close()

122it [23:40, 11.65s/it]


### Create occurrence_raw Table

In [16]:
# Copy the target columns of occurrence_raw.csv into the `occurrence_raw`
# table, one chunk of CHUNK rows at a time, keeping only rows whose latitude
# and longitude both parse as numbers.

# Map each namespaced CSV column to the bare field name used in the database.
renames = {h: FIELDS[h] for h in RAW_HEADERS}

# `with cxn:` only commits (or rolls back) the transaction; it does not close
# the connection. Close it explicitly so the database handle is released.
cxn = sqlite3.connect(DB)
try:
    with cxn, zipfile.ZipFile(ZIP_FILE) as zippy:
        with zippy.open('occurrence_raw.csv') as in_file:
            # Everything is read as text and empty cells stay '' (not NaN).
            reader = pd.read_csv(
                in_file, dtype=str, keep_default_na=False,
                chunksize=CHUNK, usecols=RAW_HEADERS)

            # The first chunk (re)creates the table; later chunks append.
            if_exists = 'replace'

            for df in tqdm(reader):
                df = df.rename(columns=renames)

                # Unparseable (including empty) coordinates become NaN.
                # NOTE(review): values are not range-checked here, so e.g. a
                # latitude of 999 is kept - confirm that is intended.
                df['decimalLatitude'] = pd.to_numeric(
                    df['decimalLatitude'], errors='coerce')
                df['decimalLongitude'] = pd.to_numeric(
                    df['decimalLongitude'], errors='coerce')

                has_lat = df['decimalLatitude'].notna()
                has_lng = df['decimalLongitude'].notna()

                df.loc[has_lat & has_lng, :].to_sql(
                    'occurrence_raw', cxn, if_exists=if_exists, index=False)

                if_exists = 'append'
finally:
    cxn.close()

122it [32:45, 16.11s/it]
