In [1]:
import sys
sys.path.append('..')

In [2]:
from pathlib import Path
import sqlite3
import regex
import numpy as np
import pandas as pd
import dask.dataframe as dd
from tqdm.notebook import tqdm

In [3]:
# Directory layout, relative to the notebook's working directory.
DATA = Path('..', 'data')
PRUNED = DATA.joinpath('01_pruned')
DB = PRUNED.joinpath('gazetteer.db')

# One SQLite connection shared by every cell that writes to the database.
CXN = sqlite3.connect(DB)

# Columns (and their order) selected from each frame before writing to `raw`.
COLS = [
    'gbifid', 'coreid', 'locality',
    'lat', 'lng', 'datum', 'uncert',
    'country', 'state', 'county',
]

# Passed as `chunksize` when reading the CSVs that are processed in chunks.
CHUNK = 1_000_000

In [4]:
# List the contents of the pruned-data directory via an IPython shell capture.
# The entry order is whatever `ls` produces on the host system.
FILES = !ls "$PRUNED"
FILES

['gazetteer.db',
 'insects_gbif.csv.gz',
 'insects_idigbio_occurrence_raw.csv.gz',
 'mammals_gbif_verbatim.csv.gz',
 'mammals_idigbio_occurrence_raw.csv.gz',
 'Plants_gbif_idigbio.csv.gz',
 'plants_gbif_verbatim.csv.gz',
 'plants_idigbio_occurrence_raw.csv.gz']

## insects_gbif

In [5]:
# Read every column as text so nothing is coerced before explicit cleaning,
# then display the column names available in this export.
in_file = PRUNED.joinpath('insects_gbif.csv.gz')
df = pd.read_csv(filepath_or_buffer=in_file, dtype=str)
df.columns

Index(['gbifid', 'country', 'locality', 'state', 'lat', 'lng', 'uncert'], dtype='object')

In [6]:
# This export has no coreid/datum/county columns; add them empty so the frame
# can be sliced with COLS.
df['coreid'] = ''
df['datum'] = ''
df['county'] = ''

# Unparseable or missing coordinates become 9999.9, which lies outside the
# valid ranges tested below, so those rows are dropped by the masks.
# Missing uncertainty becomes 0.
df['lat'] = pd.to_numeric(df['lat'], errors='coerce').fillna(9999.9).astype(float)
df['lng'] = pd.to_numeric(df['lng'], errors='coerce').fillna(9999.9).astype(float)
df['uncert'] = pd.to_numeric(df['uncert'], errors='coerce').fillna(0.0).astype(float)

# Round both coordinates to 4 decimals. The original rounded `lat` twice and
# never rounded `lng`.
df['lat'] = df['lat'].round(4)
df['lng'] = df['lng'].round(4)
df['uncert'] = df['uncert'].round().astype(int)

has_locality = df['locality'].notna()
has_lat = df['lat'].between(-90.0, 90.0)
has_lng = df['lng'].between(-180.0, 180.0)

# First load: 'replace' recreates the `raw` table, later cells append to it.
df.loc[has_locality & has_lat & has_lng, COLS].to_sql(
    'raw', CXN, if_exists='replace', index=False)

## insects_idigbio_occurrence_raw

In [7]:
# Read every column as text and display the column names in this export.
in_file = PRUNED.joinpath('insects_idigbio_occurrence_raw.csv.gz')
df = pd.read_csv(filepath_or_buffer=in_file, dtype=str)
df.columns

Index(['coreid', 'uncert', 'country', 'county', 'lat', 'lng', 'datum',
       'locality', 'state', 'locality2'],
      dtype='object')

In [8]:
# Reload from disk so this cell starts from the file's contents no matter
# what other cells have done to `df`.
df = pd.read_csv(in_file, dtype=str)

# This export has no gbifid column; add it empty so the frame can be sliced
# with COLS.
df['gbifid'] = ''

# Unparseable or missing coordinates become 9999.9, which lies outside the
# valid ranges tested below, so those rows are dropped by the masks.
# Missing uncertainty becomes 0.
df['lat'] = pd.to_numeric(df['lat'], errors='coerce').fillna(9999.9).astype(float)
df['lng'] = pd.to_numeric(df['lng'], errors='coerce').fillna(9999.9).astype(float)
df['uncert'] = pd.to_numeric(df['uncert'], errors='coerce').fillna(0.0).astype(float)

# Round both coordinates to 4 decimals. The original rounded `lat` twice and
# never rounded `lng`.
df['lat'] = df['lat'].round(4)
df['lng'] = df['lng'].round(4)
df['uncert'] = df['uncert'].round().astype(int)

has_locality = df['locality'].notna()
has_lat = df['lat'].between(-90.0, 90.0)
has_lng = df['lng'].between(-180.0, 180.0)

df.loc[has_locality & has_lat & has_lng, COLS].to_sql(
    'raw', CXN, if_exists='append', index=False)

In [9]:
# Second pass over the same frame: use `locality2` as the locality text and
# append those rows as well. NOTE: this overwrites df['locality'] in place.
df['locality'] = df['locality2']

has_locality = ~df['locality'].isna()

# `has_lat` and `has_lng` are the masks computed by the previous cell.
df.loc[has_lat & has_lng & has_locality, COLS].to_sql(
    name='raw', con=CXN, if_exists='append', index=False)

## mammals_gbif_verbatim

In [10]:
# Read every column as text and display the column names in this export.
in_file = PRUNED.joinpath('mammals_gbif_verbatim.csv.gz')
df = pd.read_csv(filepath_or_buffer=in_file, dtype=str)
df.columns

Index(['gbifid', 'country', 'state', 'county', 'locality', 'locality2', 'lat',
       'lng', 'datum', 'uncert', 'prec'],
      dtype='object')

In [11]:
# Reload from disk so this cell starts from the file's contents no matter
# what other cells have done to `df`.
df = pd.read_csv(in_file, dtype=str)

# This export has no coreid column; add it empty so the frame can be sliced
# with COLS.
df['coreid'] = ''

# Unparseable or missing coordinates become 9999.9, which lies outside the
# valid ranges tested below, so those rows are dropped by the masks.
# Missing uncertainty becomes 0.
df['lat'] = pd.to_numeric(df['lat'], errors='coerce').fillna(9999.9).astype(float)
df['lng'] = pd.to_numeric(df['lng'], errors='coerce').fillna(9999.9).astype(float)
df['uncert'] = pd.to_numeric(df['uncert'], errors='coerce').fillna(0.0).astype(float)

# Round both coordinates to 4 decimals. The original rounded `lat` twice and
# never rounded `lng`.
df['lat'] = df['lat'].round(4)
df['lng'] = df['lng'].round(4)
df['uncert'] = df['uncert'].round().astype(int)

has_locality = df['locality'].notna()
has_lat = df['lat'].between(-90.0, 90.0)
has_lng = df['lng'].between(-180.0, 180.0)

df.loc[has_locality & has_lat & has_lng, COLS].to_sql(
    'raw', CXN, if_exists='append', index=False)

In [12]:
# Second pass over the same frame: use `locality2` as the locality text and
# append those rows as well. NOTE: this overwrites df['locality'] in place.
df['locality'] = df['locality2']

has_locality = ~df['locality'].isna()

# `has_lat` and `has_lng` are the masks computed by the previous cell.
df.loc[has_lat & has_lng & has_locality, COLS].to_sql(
    name='raw', con=CXN, if_exists='append', index=False)

## mammals_idigbio_occurrence_raw

In [13]:
# Read every column as text and display the column names in this export.
in_file = PRUNED.joinpath('mammals_idigbio_occurrence_raw.csv.gz')
df = pd.read_csv(filepath_or_buffer=in_file, dtype=str)
df.columns

Index(['coreid', 'prec', 'uncert', 'country', 'county', 'lat', 'lng', 'datum',
       'locality', 'state', 'locality2'],
      dtype='object')

In [14]:
# Reload from disk so this cell starts from the file's contents no matter
# what other cells have done to `df`.
df = pd.read_csv(in_file, dtype=str)

# This export has no gbifid column; add it empty so the frame can be sliced
# with COLS.
df['gbifid'] = ''

# Unparseable or missing coordinates become 9999.9, which lies outside the
# valid ranges tested below, so those rows are dropped by the masks.
# Missing uncertainty becomes 0.
df['lat'] = pd.to_numeric(df['lat'], errors='coerce').fillna(9999.9).astype(float)
df['lng'] = pd.to_numeric(df['lng'], errors='coerce').fillna(9999.9).astype(float)
df['uncert'] = pd.to_numeric(df['uncert'], errors='coerce').fillna(0.0).astype(float)

# Round both coordinates to 4 decimals. The original rounded `lat` twice and
# never rounded `lng`.
df['lat'] = df['lat'].round(4)
df['lng'] = df['lng'].round(4)
df['uncert'] = df['uncert'].round().astype(int)

has_locality = df['locality'].notna()
has_lat = df['lat'].between(-90.0, 90.0)
has_lng = df['lng'].between(-180.0, 180.0)

df.loc[has_locality & has_lat & has_lng, COLS].to_sql(
    'raw', CXN, if_exists='append', index=False)

In [15]:
# Second pass over the same frame: use `locality2` as the locality text and
# append those rows as well. NOTE: this overwrites df['locality'] in place.
df['locality'] = df['locality2']

has_locality = ~df['locality'].isna()

# `has_lat` and `has_lng` are the masks computed by the previous cell.
df.loc[has_lat & has_lng & has_locality, COLS].to_sql(
    name='raw', con=CXN, if_exists='append', index=False)

## Plants_gbif_idigbio

In [16]:
# Read every column as text and display the column names in this export.
in_file = PRUNED.joinpath('Plants_gbif_idigbio.csv.gz')
df = pd.read_csv(filepath_or_buffer=in_file, dtype=str)
df.columns

Index(['gbifid', 'occurrenceID', 'country', 'locality', 'state', 'lat', 'lng',
       'uncert'],
      dtype='object')

In [17]:
# Reload from disk so this cell starts from the file's contents no matter
# what other cells have done to `df`.
df = pd.read_csv(in_file, dtype=str)

# This export has no coreid/datum/county columns; add them empty so the frame
# can be sliced with COLS.
df['coreid'] = ''
df['datum'] = ''
df['county'] = ''

# Unparseable or missing coordinates become 9999.9, which lies outside the
# valid ranges tested below, so those rows are dropped by the masks.
# Missing uncertainty becomes 0.
df['lat'] = pd.to_numeric(df['lat'], errors='coerce').fillna(9999.9).astype(float)
df['lng'] = pd.to_numeric(df['lng'], errors='coerce').fillna(9999.9).astype(float)
df['uncert'] = pd.to_numeric(df['uncert'], errors='coerce').fillna(0.0).astype(float)

# Round both coordinates to 4 decimals. The original rounded `lat` twice and
# never rounded `lng`.
df['lat'] = df['lat'].round(4)
df['lng'] = df['lng'].round(4)
df['uncert'] = df['uncert'].round().astype(int)

has_locality = df['locality'].notna()
has_lat = df['lat'].between(-90.0, 90.0)
has_lng = df['lng'].between(-180.0, 180.0)

df.loc[has_locality & has_lat & has_lng, COLS].to_sql(
    'raw', CXN, if_exists='append', index=False)

## plants_gbif_verbatim

In [20]:
# Drop the reference to the previous frame (presumably to free memory before
# the chunked loads below).
df = None
in_file = PRUNED / 'plants_gbif_verbatim.csv.gz'

# Parse only the header row with pandas instead of shelling out to
# `zcat | head` and splitting on commas: this works without a Unix shell and
# handles quoted column names. The result is still a plain list of strings.
headers = pd.read_csv(in_file, dtype=str, nrows=0).columns.tolist()
headers

['gbifid',
 'country',
 'state',
 'county',
 'locality',
 'locality2',
 'lat',
 'lng',
 'datum',
 'uncert',
 'prec']

In [21]:
# Stream the file in CHUNK-row pieces instead of loading it all at once.
reader = pd.read_csv(in_file, dtype=str, chunksize=CHUNK)

for df in tqdm(reader):
    # This export has no coreid column; add it empty so the chunk can be
    # sliced with COLS.
    df['coreid'] = ''

    # Unparseable or missing coordinates become 9999.9, which lies outside
    # the valid ranges tested below, so those rows are dropped by the masks.
    # Missing uncertainty becomes 0.
    df['lat'] = (pd.to_numeric(df['lat'], errors='coerce')
                 .fillna(9999.9).astype(float))
    df['lng'] = (pd.to_numeric(df['lng'], errors='coerce')
                 .fillna(9999.9).astype(float))
    df['uncert'] = (pd.to_numeric(df['uncert'], errors='coerce')
                    .fillna(0.0).astype(float))

    # Round both coordinates to 4 decimals (the original rounded `lat` twice
    # and never rounded `lng`) and store uncertainty as a whole number, as
    # the non-chunked cells do.
    df['lat'] = df['lat'].round(4)
    df['lng'] = df['lng'].round(4)
    df['uncert'] = df['uncert'].round().astype(int)

    has_locality = df['locality'].notna()
    has_lat = df['lat'].between(-90.0, 90.0)
    has_lng = df['lng'].between(-180.0, 180.0)

    df.loc[has_locality & has_lat & has_lng, COLS].to_sql(
        'raw', CXN, if_exists='append', index=False)

    # Second pass over the same chunk, using `locality2` as the locality text.
    df['locality'] = df['locality2']

    has_locality = df['locality'].notna()

    df.loc[has_locality & has_lat & has_lng, COLS].to_sql(
        'raw', CXN, if_exists='append', index=False)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




## plants_idigbio_occurrence_raw

In [22]:
in_file = PRUNED / 'plants_idigbio_occurrence_raw.csv.gz'

# Parse only the header row with pandas instead of shelling out to
# `zcat | head` and splitting on commas: this works without a Unix shell and
# handles quoted column names. The result is still a plain list of strings.
headers = pd.read_csv(in_file, dtype=str, nrows=0).columns.tolist()
headers

['coreid',
 'prec',
 'uncert',
 'country',
 'county',
 'lat',
 'lng',
 'datum',
 'locality',
 'state',
 'locality2']

In [23]:
# Stream the file in CHUNK-row pieces instead of loading it all at once.
reader = pd.read_csv(in_file, dtype=str, chunksize=CHUNK)

for df in tqdm(reader):
    # This export has no gbifid column; add it empty so the chunk can be
    # sliced with COLS.
    df['gbifid'] = ''

    # Unparseable or missing coordinates become 9999.9, which lies outside
    # the valid ranges tested below, so those rows are dropped by the masks.
    # Missing uncertainty becomes 0.
    df['lat'] = (pd.to_numeric(df['lat'], errors='coerce')
                 .fillna(9999.9).astype(float))
    df['lng'] = (pd.to_numeric(df['lng'], errors='coerce')
                 .fillna(9999.9).astype(float))
    df['uncert'] = (pd.to_numeric(df['uncert'], errors='coerce')
                    .fillna(0.0).astype(float))

    # Round both coordinates to 4 decimals (the original rounded `lat` twice
    # and never rounded `lng`). Uncertainty is rounded BEFORE the int cast;
    # the original cast first, which truncated the value and made the later
    # round() a no-op.
    df['lat'] = df['lat'].round(4)
    df['lng'] = df['lng'].round(4)
    df['uncert'] = df['uncert'].round().astype(int)

    has_locality = df['locality'].notna()
    has_lat = df['lat'].between(-90.0, 90.0)
    has_lng = df['lng'].between(-180.0, 180.0)

    df.loc[has_locality & has_lat & has_lng, COLS].to_sql(
        'raw', CXN, if_exists='append', index=False)

    # Second pass over the same chunk, using `locality2` as the locality text.
    df['locality'] = df['locality2']

    has_locality = df['locality'].notna()

    df.loc[has_locality & has_lat & has_lng, COLS].to_sql(
        'raw', CXN, if_exists='append', index=False)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




## Create indexes

In [24]:
# Index the locality text of the `raw` table. IF NOT EXISTS lets the cell be
# re-run without raising "index raw_locality already exists".
sql = 'CREATE INDEX IF NOT EXISTS raw_locality ON raw (locality);'
CXN.executescript(sql)

<sqlite3.Cursor at 0x7efb5b417730>

In [25]:
# Index the coordinate columns of the `raw` table. IF NOT EXISTS lets the
# cell be re-run without raising an "index already exists" error.
sql = """
    CREATE INDEX IF NOT EXISTS raw_lat ON raw (lat);
    CREATE INDEX IF NOT EXISTS raw_lng ON raw (lng);
"""
CXN.executescript(sql)

<sqlite3.Cursor at 0x7efb5b4173b0>