In [1]:
import io
import os
import sqlite3
import zipfile
from contextlib import closing
from pathlib import Path

import pandas as pd
import regex
from tqdm import tqdm

In [2]:
# Directory layout, relative to the notebook's working directory.
DATA_DIR = Path('..', 'data')
RAW_DIR = DATA_DIR.joinpath('idigbio')
INTERIM_DIR = RAW_DIR.joinpath('interim')

# Input archive, the SQLite database built from it, and the glob pattern
# shared by the CSV files exported from that database.
ZIP = RAW_DIR.joinpath('044fae92-60be-4eea-9876-64b15544969c.zip')
DB = INTERIM_DIR.joinpath('gazetteer_10_idigbio_2021-Jan.db')
CSV_PATTERN = 'idigbio_2021-02-11a*.csv'

# Rows per chunk for both the CSV reads and the SQL reads.
CHUNK = 10 ** 6

In [3]:
# Rebuild the database from scratch on every full run.
# sqlite3.connect() can create the database file but not its parent
# directory, so make sure the interim directory exists first.
INTERIM_DIR.mkdir(parents=True, exist_ok=True)

if DB.exists():
    os.remove(DB)

## Columns to use for output

In [4]:
# Bare names of the `dwc:`-prefixed (presumably Darwin Core) columns to keep.
_DWC_TERMS = """
    continent
    coordinatePrecision
    coordinateUncertaintyInMeters
    country
    countryCode
    county
    decimalLatitude
    decimalLongitude
    footprintSRS
    footprintSpatialFit
    footprintWKT
    georeferencedDate
    geodeticDatum
    georeferencedBy
    georeferenceProtocol
    georeferenceRemarks
    georeferenceSources
    georeferenceVerificationStatus
    higherGeography
    higherGeographyID
    island
    islandGroup
    locationAccordingTo
    locality
    locationID
    locationRemarks
    maximumDepthInMeters
    maximumDistanceAboveSurfaceInMeters
    maximumElevationInMeters
    minimumDepthInMeters
    minimumDistanceAboveSurfaceInMeters
    minimumElevationInMeters
    municipality
    pointRadiusSpatialFit
    stateProvince
    verbatimCoordinateSystem
    verbatimCoordinates
    verbatimDepth
    verbatimElevation
    verbatimLatitude
    verbatimLocality
    verbatimLongitude
    verbatimSRS
    waterBody
""".split()

# Bare names of the `idigbio:`-prefixed columns to keep.
_IDIGBIO_TERMS = """
    geoPoint
    isoCountryCode
""".split()

# Each mapping goes from the prefixed CSV header to the bare column name
# used in the database.
DWC_FIELDS = {'dwc:' + term: term for term in _DWC_TERMS}

IDIGBIO_FIELDS = {'idigbio:' + term: term for term in _IDIGBIO_TERMS}

OTHER_FIELDS = dict(coreid='coreid')  # Need to link data

# Combined lookup: link column first, then the dwc: and idigbio: columns.
FIELDS = dict(OTHER_FIELDS)
FIELDS.update(DWC_FIELDS)
FIELDS.update(IDIGBIO_FIELDS)

### Get the headers from the zip file

In [5]:
def get_headers(zip_file):
    """Return the column names of one CSV member of the ZIP archive.

    Only the first line of `zip_file` is read. It is split on raw commas
    (quoted fields get no special treatment), the pieces are sorted as
    bytes, and each piece is then decoded and stripped of surrounding
    whitespace.
    """
    with zipfile.ZipFile(ZIP) as archive, archive.open(zip_file) as member:
        first_line = member.readline()

    raw_names = first_line.split(b',')
    raw_names.sort()
    return [raw.decode().strip() for raw in raw_names]

### Functions for writing a zipped CSV to the database

In [6]:
def insert(zip_file, usecols, columns):
    """Copy selected columns of a zipped CSV into a table in the database.

    The table is named after `zip_file` up to its first '.'. The first chunk
    replaces any existing table of that name; later chunks are appended.

    Args:
        zip_file: Name of the CSV member inside the ZIP archive.
        usecols: CSV column names to read; all are read as strings and
            empty cells stay empty strings rather than becoming NaN.
        columns: Mapping from CSV column name to database column name.
    """
    table = zip_file.split('.')[0]

    # A sqlite3 connection used as a context manager only commits or rolls
    # back; it never closes itself. closing() releases the handle even when
    # a chunk fails part-way through the load.
    with closing(sqlite3.connect(DB)) as cxn:
        with cxn, zipfile.ZipFile(ZIP) as zippy, zippy.open(zip_file) as in_file:

            reader = pd.read_csv(
                in_file, dtype=str, keep_default_na=False,
                chunksize=CHUNK, usecols=usecols)

            for chunk_no, df in enumerate(tqdm(reader)):
                df.rename(columns=columns).to_sql(
                    table, cxn,
                    if_exists='append' if chunk_no else 'replace',
                    index=False)

In [7]:
def wrapper(zip_file):
    """Load the columns of `zip_file` that appear in FIELDS into the database.

    Headers not listed in FIELDS are skipped; the rest are renamed to their
    FIELDS values on the way in.
    """
    wanted = [name for name in get_headers(zip_file) if name in FIELDS]
    renames = {name: FIELDS[name] for name in wanted}

    insert(zip_file, usecols=wanted, columns=renames)

## Write to DB

In [8]:
# Load occurrence.csv into the `occurrence` table (insert() names the table
# after the file name up to its first '.'). Slow: the recorded run below
# took about 42 minutes.
wrapper('occurrence.csv')

127it [41:45, 19.73s/it]


In [9]:
# Load occurrence_raw.csv into the `occurrence_raw` table. Slow: the
# recorded run below took about an hour.
wrapper('occurrence_raw.csv')

127it [1:01:36, 29.10s/it]


## Update the database

In [10]:
# Index the join key on both tables so the export join on coreid is fast.
sql = """
    create index if not exists occ_coreid on occurrence (coreid);
    create index if not exists raw_coreid on occurrence_raw (coreid);
"""

# `with cxn` only commits or rolls back; closing() is what actually releases
# the connection instead of leaving it open in the notebook's global scope.
with closing(sqlite3.connect(DB)) as cxn, cxn:
    cxn.executescript(sql)

In [11]:
# Give the iDigBio country code an explicit name and add two empty columns
# to hold the latitude and longitude extracted from geoPoint.
# This cell cannot be re-run against the same database: after the first run
# `isoCountryCode` no longer exists and the added columns already do.
sql = """
    alter table occurrence rename column isoCountryCode to idigbio_countrycode;
    alter table occurrence add column idigbio_decimallatitude_wgs84;
    alter table occurrence add column idigbio_decimallongitude_wgs84;
"""

# `with cxn` only commits or rolls back; closing() is what actually releases
# the connection instead of leaving it open in the notebook's global scope.
with closing(sqlite3.connect(DB)) as cxn, cxn:
    cxn.executescript(sql)

In [12]:
# Split the geoPoint JSON object into separate latitude/longitude columns,
# skipping rows where geoPoint is missing or empty.
sql = """
    update occurrence
       set idigbio_decimallatitude_wgs84  = json_extract(geoPoint, '$.lat'),
           idigbio_decimallongitude_wgs84 = json_extract(geoPoint, '$.lon')
     where geoPoint is not null
       and geoPoint <> '';
"""

# `with cxn` commits on success and rolls back on error, so no explicit
# commit is needed; closing() then releases the connection.
with closing(sqlite3.connect(DB)) as cxn, cxn:
    cxn.execute(sql)

## Write output to CSV file

In [3]:
# Clear out CSV files left by a previous export so new rows are not
# appended to stale ones.
for stale_csv in INTERIM_DIR.glob(CSV_PATTERN):
    stale_csv.unlink()

In [4]:
# Export query: every occurrence_raw row that
#   * has an iDigBio point that is not (0, 0), and
#   * has at least one non-empty location field.
# The (0, 0) test uses abs() on each coordinate. A plain `lat + lon <> 0`
# would also discard every valid point where lat = -lon (e.g. -15, 15).
sql = """
    select occurrence_raw.*,
           idigbio_countrycode,
           idigbio_decimallatitude_wgs84,
           idigbio_decimallongitude_wgs84
    from occurrence_raw
    left join occurrence using (coreid)
    where idigbio_decimallatitude_wgs84 is not null
      and idigbio_decimallongitude_wgs84 is not null
      and abs(idigbio_decimallatitude_wgs84) + abs(idigbio_decimallongitude_wgs84) <> 0
    and (
        occurrence_raw.continent <> ''
    or  occurrence_raw.coordinatePrecision <> ''
    or  occurrence_raw.coordinateUncertaintyInMeters <> ''
    or  occurrence_raw.country <> ''
    or  occurrence_raw.countryCode <> ''
    or  occurrence_raw.county <> ''
    or  occurrence_raw.decimalLatitude <> ''
    or  occurrence_raw.decimalLongitude <> ''
    or  occurrence_raw.footprintSRS <> ''
    or  occurrence_raw.footprintSpatialFit <> ''
    or  occurrence_raw.footprintWKT <> ''
    or  occurrence_raw.georeferencedDate <> ''
    or  occurrence_raw.geodeticDatum <> ''
    or  occurrence_raw.georeferencedBy <> ''
    or  occurrence_raw.georeferenceProtocol <> ''
    or  occurrence_raw.georeferenceRemarks <> ''
    or  occurrence_raw.georeferenceSources <> ''
    or  occurrence_raw.georeferenceVerificationStatus <> ''
    or  occurrence_raw.higherGeography <> ''
    or  occurrence_raw.higherGeographyID <> ''
    or  occurrence_raw.island <> ''
    or  occurrence_raw.islandGroup <> ''
    or  occurrence_raw.locationAccordingTo <> ''
    or  occurrence_raw.locality <> ''
    or  occurrence_raw.locationID <> ''
    or  occurrence_raw.locationRemarks <> ''
    or  occurrence_raw.maximumDepthInMeters <> ''
    or  occurrence_raw.maximumElevationInMeters <> ''
    or  occurrence_raw.minimumDepthInMeters <> ''
    or  occurrence_raw.minimumElevationInMeters <> ''
    or  occurrence_raw.municipality <> ''
    or  occurrence_raw.pointRadiusSpatialFit <> ''
    or  occurrence_raw.stateProvince <> ''
    or  occurrence_raw.verbatimCoordinateSystem <> ''
    or  occurrence_raw.verbatimCoordinates <> ''
    or  occurrence_raw.verbatimDepth <> ''
    or  occurrence_raw.verbatimElevation <> ''
    or  occurrence_raw.verbatimLatitude <> ''
    or  occurrence_raw.verbatimLocality <> ''
    or  occurrence_raw.verbatimLongitude <> ''
    or  occurrence_raw.verbatimSRS <> ''
    or  occurrence_raw.waterBody <> ''
    );
    """

# NOTE: despite the name, the export loop divides the chunk index by this
# value, so it is the number of CHUNK-sized chunks written to each output
# file, not a cap on the number of files.
max_files = 10

In [None]:
# Stream the export query to CSV in CHUNK-row pieces.
#   * File "0" receives only the header row.
#   * Files "1", "2", ... receive the data rows with no header, `max_files`
#     chunks per file.
# pandas opens the output files itself, which gives it control over newline
# handling and a UTF-8 default encoding on every platform. The first chunk
# of each data file truncates it, so re-running this cell does not append
# duplicate rows.
# closing() releases the connection when the export finishes or fails.
with closing(sqlite3.connect(DB)) as cxn:

    reader = pd.read_sql(sql, cxn, chunksize=CHUNK)

    for chunk_no, df in tqdm(enumerate(reader)):
        if chunk_no == 0:
            header_path = INTERIM_DIR / CSV_PATTERN.replace('*', '0')
            df.head(0).to_csv(header_path, index=False)

        file_no = chunk_no // max_files + 1
        path = INTERIM_DIR / CSV_PATTERN.replace('*', str(file_no))

        mode = 'w' if chunk_no % max_files == 0 else 'a'
        df.to_csv(path, mode=mode, header=False, index=False)

0it [00:00, ?it/s]