In [1]:
import sys
# Add the parent directory to the module search path
# (presumably so project-local packages one level up can be imported — confirm).
sys.path.append('..')

In [2]:
import os
from pathlib import Path
import sqlite3
import zipfile
import regex
import numpy as np
import pandas as pd
import dask.dataframe as dd

In [3]:
# Data folder layout, expressed relative to the directory above the working one.
DATA_DIR = Path('..').joinpath('data')
RAW_DIR = DATA_DIR.joinpath('00_raw')
INTERIM_DIR = DATA_DIR.joinpath('01_interim')

# Zip archive stored in the raw-data folder.
ZIP_FILE = RAW_DIR.joinpath('idigbio_all.zip')

In [4]:
# Read only the header row of occurrence_raw.csv straight from the archive.
with zipfile.ZipFile(ZIP_FILE) as zippy:
    with zippy.open('occurrence_raw.csv') as in_file:
        # Zip members are opened in binary mode, so decode the line to text.
        occurrence_raw_header = in_file.readline().decode()

# Strip each column name *before* sorting so that stray whitespace or the
# trailing line terminator cannot influence the resulting order.
# NOTE: the plain comma split assumes no column name contains a quoted comma.
occurrence_raw = sorted(
    name.strip() for name in occurrence_raw_header.split(',')
)
occurrence_raw

['aec:associatedTaxa',
 'coreid',
 'dc:rights',
 'dcterms:accessRights',
 'dcterms:bibliographicCitation',
 'dcterms:language',
 'dcterms:license',
 'dcterms:modified',
 'dcterms:references',
 'dcterms:rights',
 'dcterms:rightsHolder',
 'dcterms:source',
 'dcterms:type',
 'dwc:Identification',
 'dwc:MeasurementOrFact',
 'dwc:ResourceRelationship',
 'dwc:VerbatimEventDate',
 'dwc:acceptedNameUsage',
 'dwc:acceptedNameUsageID',
 'dwc:accessRights',
 'dwc:associatedMedia',
 'dwc:associatedOccurrences',
 'dwc:associatedOrganisms',
 'dwc:associatedReferences',
 'dwc:associatedSequences',
 'dwc:associatedTaxa',
 'dwc:basisOfRecord',
 'dwc:bed',
 'dwc:behavior',
 'dwc:catalogNumber',
 'dwc:class',
 'dwc:classs',
 'dwc:collectionCode',
 'dwc:collectionID',
 'dwc:continent',
 'dwc:coordinatePrecision',
 'dwc:coordinateUncertaintyInMeters',
 'dwc:country',
 'dwc:countryCode',
 'dwc:county',
 'dwc:dataGeneralizations',
 'dwc:datasetID',
 'dwc:datasetName',
 'dwc:dateIdentified',
 'dwc:day',
 'dwc

In [5]:
# Read only the header row of occurrence.csv straight from the archive.
with zipfile.ZipFile(ZIP_FILE) as zippy:
    with zippy.open('occurrence.csv') as in_file:
        # Zip members are opened in binary mode, so decode the line to text.
        occurrence_header = in_file.readline().decode()

# Strip each column name *before* sorting so that stray whitespace or the
# trailing line terminator cannot influence the resulting order.
# NOTE: the plain comma split assumes no column name contains a quoted comma.
occurrence = sorted(
    name.strip() for name in occurrence_header.split(',')
)
occurrence

['coreid',
 'dwc:basisOfRecord',
 'dwc:bed',
 'dwc:catalogNumber',
 'dwc:class',
 'dwc:collectionCode',
 'dwc:collectionID',
 'dwc:continent',
 'dwc:coordinateUncertaintyInMeters',
 'dwc:country',
 'dwc:county',
 'dwc:earliestAgeOrLowestStage',
 'dwc:earliestEonOrLowestEonothem',
 'dwc:earliestEpochOrLowestSeries',
 'dwc:earliestEraOrLowestErathem',
 'dwc:earliestPeriodOrLowestSystem',
 'dwc:eventDate',
 'dwc:family',
 'dwc:fieldNumber',
 'dwc:formation',
 'dwc:genus',
 'dwc:geologicalContextID',
 'dwc:group',
 'dwc:higherClassification',
 'dwc:highestBiostratigraphicZone',
 'dwc:individualCount',
 'dwc:infraspecificEpithet',
 'dwc:institutionCode',
 'dwc:institutionID',
 'dwc:kingdom',
 'dwc:latestAgeOrHighestStage',
 'dwc:latestEonOrHighestEonothem',
 'dwc:latestEpochOrHighestSeries',
 'dwc:latestEraOrHighestErathem',
 'dwc:latestPeriodOrHighestSystem',
 'dwc:lithostratigraphicTerms',
 'dwc:locality',
 'dwc:lowestBiostratigraphicZone',
 'dwc:maximumDepthInMeters',
 'dwc:maximumElevat

In [6]:
# Column names to look for in the CSV headers. Each group of whitespace-separated
# terms is qualified with its namespace prefix ("dwc:" first, then "idigbio:").
fields = [
    f'{namespace}:{term}'
    for namespace, terms in [('dwc', """
    locationID higherGeographyID higherGeography
    continent
    waterBody islandGroup island
    country countryCode stateProvince county municipality
    locality verbatimLocality 
    minimumElevationInMeters maximumElevationInMeters
    verbatimElevation 
    minimumDepthInMeters maximumDepthInMeters verbatimDepth 
    minimumDistanceAboveSurfaceInMeters
    maximumDistanceAboveSurfaceInMeters 
    locationAccordingTo locationRemarks 
    decimalLatitude decimalLongitude geodeticDatum 
    coordinateUncertaintyInMeters coordinatePrecision
    pointRadiusSpatialFit 
    verbatimCoordinates verbatimLatitude verbatimLongitude 
    verbatimCoordinateSystem verbatimSRS 
    footprintWKT footprintSRS footprintSpatialFit 
    georeferencedBy georeferencedDate georeferenceProtocol 
    georeferenceSources georeferenceVerificationStatus 
    georeferenceRemarks 
"""), ('idigbio', """
    isoCountryCode geoPoint
""")]
    for term in terms.split()
]

fields

['dwc:locationID',
 'dwc:higherGeographyID',
 'dwc:higherGeography',
 'dwc:continent',
 'dwc:waterBody',
 'dwc:islandGroup',
 'dwc:island',
 'dwc:country',
 'dwc:countryCode',
 'dwc:stateProvince',
 'dwc:county',
 'dwc:municipality',
 'dwc:locality',
 'dwc:verbatimLocality',
 'dwc:minimumElevationInMeters',
 'dwc:maximumElevationInMeters',
 'dwc:verbatimElevation',
 'dwc:minimumDepthInMeters',
 'dwc:maximumDepthInMeters',
 'dwc:verbatimDepth',
 'dwc:minimumDistanceAboveSurfaceInMeters',
 'dwc:maximumDistanceAboveSurfaceInMeters',
 'dwc:locationAccordingTo',
 'dwc:locationRemarks',
 'dwc:decimalLatitude',
 'dwc:decimalLongitude',
 'dwc:geodeticDatum',
 'dwc:coordinateUncertaintyInMeters',
 'dwc:coordinatePrecision',
 'dwc:pointRadiusSpatialFit',
 'dwc:verbatimCoordinates',
 'dwc:verbatimLatitude',
 'dwc:verbatimLongitude',
 'dwc:verbatimCoordinateSystem',
 'dwc:verbatimSRS',
 'dwc:footprintWKT',
 'dwc:footprintSRS',
 'dwc:footprintSpatialFit',
 'dwc:georeferencedBy',
 'dwc:georeferenced

In [7]:
# Header columns of occurrence.csv that are also listed in `fields`.
[column for column in occurrence if column in fields]

['dwc:continent',
 'dwc:coordinateUncertaintyInMeters',
 'dwc:country',
 'dwc:county',
 'dwc:locality',
 'dwc:maximumDepthInMeters',
 'dwc:maximumElevationInMeters',
 'dwc:minimumDepthInMeters',
 'dwc:minimumElevationInMeters',
 'dwc:municipality',
 'dwc:stateProvince',
 'dwc:verbatimLocality',
 'dwc:waterBody',
 'idigbio:geoPoint',
 'idigbio:isoCountryCode']

In [8]:
# Header columns of occurrence_raw.csv that are also listed in `fields`.
[column for column in occurrence_raw if column in fields]

['dwc:continent',
 'dwc:coordinatePrecision',
 'dwc:coordinateUncertaintyInMeters',
 'dwc:country',
 'dwc:countryCode',
 'dwc:county',
 'dwc:decimalLatitude',
 'dwc:decimalLongitude',
 'dwc:footprintSRS',
 'dwc:footprintSpatialFit',
 'dwc:footprintWKT',
 'dwc:geodeticDatum',
 'dwc:georeferenceProtocol',
 'dwc:georeferenceRemarks',
 'dwc:georeferenceSources',
 'dwc:georeferenceVerificationStatus',
 'dwc:georeferencedBy',
 'dwc:georeferencedDate',
 'dwc:higherGeography',
 'dwc:higherGeographyID',
 'dwc:island',
 'dwc:islandGroup',
 'dwc:locality',
 'dwc:locationAccordingTo',
 'dwc:locationID',
 'dwc:locationRemarks',
 'dwc:maximumDepthInMeters',
 'dwc:maximumElevationInMeters',
 'dwc:minimumDepthInMeters',
 'dwc:minimumElevationInMeters',
 'dwc:municipality',
 'dwc:pointRadiusSpatialFit',
 'dwc:stateProvince',
 'dwc:verbatimCoordinateSystem',
 'dwc:verbatimCoordinates',
 'dwc:verbatimDepth',
 'dwc:verbatimElevation',
 'dwc:verbatimLatitude',
 'dwc:verbatimLocality',
 'dwc:verbatimLongitu