# Adjust Invalid Data

## Basic Setup

In [1]:
import io
import os
import sqlite3
import string
import zipfile
from contextlib import closing
from hashlib import sha256
from pathlib import Path
from pprint import pp

import dask.dataframe as dd
import pandas as pd
import regex
from tqdm import tqdm

In [2]:
# Directory layout, relative to the notebook's working directory.
DATA_DIR = Path('..') / 'data'
INTERIM_DIR = DATA_DIR / '01_interim'

# IN_DB is the gazetteer database the helper functions read from; OUT_DB is
# the database that receives the 'adjustments' table.
# NOTE(review): IN_DB lives under INTERIM_DIR while OUT_DB is written directly
# under DATA_DIR -- confirm that this is intentional.
IN_DB = INTERIM_DIR / 'gazetteer_03_idigbio_2020-03-30.db'
OUT_DB = DATA_DIR / 'gazetteer_04_idigbio_2020-03-30.db'

# NOTE(review): CHUNK is not used in the visible part of the notebook;
# presumably a row-batch size for later cells -- confirm.
CHUNK = 1_000_000

## Helper functions

In [3]:
def display_all(df):
    """Show ``df`` with pandas' row and column display limits lifted.

    The limits are only lifted inside the ``option_context`` block, so the
    session's display options are restored afterwards.

    NOTE(review): ``display`` is not imported in this file; presumably it is
    the function the notebook runtime injects -- confirm.
    """
    no_limits = ('display.max_rows', None, 'display.max_columns', None)
    with pd.option_context(*no_limits):
        display(df)

## Database Setup

In [4]:
# Start from a clean slate: delete an output database left by an earlier run.
if OUT_DB.exists():
    OUT_DB.unlink()

### Create a Table for Logging Adjusted Values

In [5]:
# An empty frame whose columns define the schema of the 'adjustments' log:
# the field being adjusted, then either a literal value and what it becomes,
# or a numeric floor/ceiling.
df = pd.DataFrame(
    columns=['field', 'literal', 'becomes', 'floor', 'ceiling'])

# sqlite3's own context manager only commits or rolls back the transaction;
# it never closes the connection. closing() releases the handle on exit while
# the inner `cxn` context keeps the commit-on-success behaviour.
with closing(sqlite3.connect(OUT_DB)) as cxn, cxn:
    df.to_sql('adjustments', cxn, if_exists='replace', index=False)

### Database Related Functions

Get the list of columns in a table, skipping the columns that require special handling (`hash`, `source`, and `locality`).

In [6]:
def get_columns(table='gazetteer', db=IN_DB):
    """Return the column names of ``table``, minus the special-case columns.

    Args:
        table: Name of the table to inspect. It is interpolated directly into
            the PRAGMA statement, so it must be a trusted identifier.
        db: Path of the SQLite database file that holds the table.

    Returns:
        A list of column names in the order ``PRAGMA table_info`` yields
        them, excluding ``hash``, ``source`` and ``locality``.
    """
    specials = {'hash', 'source', 'locality'}

    sql = f'PRAGMA table_info({table});'

    # The sqlite3 connection context manager does not close the connection,
    # so close it explicitly rather than leaving a handle open per call.
    with closing(sqlite3.connect(db)) as cxn:
        # Row objects allow access by column name instead of a magic index.
        cxn.row_factory = sqlite3.Row
        columns = [
            row['name'] for row in cxn.execute(sql)
            if row['name'] not in specials
        ]

    return columns

Get every distinct value of a text field, together with its row count, for manual filtering.

In [7]:
def text_field(field):
    """Count how often each distinct value of ``field`` occurs in the gazetteer.

    Args:
        field: Name of a column in the ``gazetteer`` table of ``IN_DB``. It is
            interpolated directly into the SQL, so it must be a trusted
            identifier.

    Returns:
        A DataFrame with one row per distinct value: the value in a column
        named after ``field`` and its row count in ``n``, ordered by
        descending count and then by value.
    """
    sql = f"""
        select {field}, count(*) as n
          from gazetteer
      group by {field}
      order by n desc, {field}
    """
    # The sqlite3 connection context manager does not close the connection,
    # so close it explicitly once the query result has been loaded.
    with closing(sqlite3.connect(IN_DB)) as cxn:
        df = pd.read_sql(sql, cxn)
    return df

In [8]:
# Columns of the gazetteer table, without the ones get_columns() treats as
# special (hash, source, locality). The bare name echoes the list as output.
COLUMNS = get_columns()
COLUMNS

['continent',
 'coordinatePrecision',
 'coordinateUncertaintyInMeters',
 'country',
 'countryCode',
 'county',
 'decimalLatitude',
 'decimalLongitude',
 'geodeticDatum',
 'georeferenceSources',
 'higherGeography',
 'island',
 'islandGroup',
 'locationRemarks',
 'maximumDepthInMeters',
 'maximumElevationInMeters',
 'minimumDepthInMeters',
 'minimumElevationInMeters',
 'municipality',
 'stateProvince',
 'verbatimCoordinateSystem',
 'verbatimCoordinates',
 'verbatimDepth',
 'verbatimElevation',
 'waterBody']

## continent

**string**

In [9]:
field = 'continent'

# Distinct values of this field with their row counts, most frequent first.
df = text_field(field)
df.shape  # (number of distinct values, 2)

(246, 2)

In [10]:
# Show the whole frequency table, untruncated, for manual review.
display_all(df)

Unnamed: 0,continent,n
0,,9892329
1,north america,2822231
2,south america,682813
3,europe,674464
4,africa,309587
5,mesoamerica,263126
6,asia,234936
7,southern america,186724
8,africa madagascar,170657
9,south america neotropics,101678


In [11]:
# Literal values treated as invalid for the current field. Each one is logged
# as an adjustment row whose replacement (`becomes`) is None.
df = pd.DataFrame({
    'literal': [
        'na',
        'no higher geography data',
        'no higher geography recorded',
        'not determined',
        'not in specify tree',
        'unassigned',
        'undefined',
        'unknown captive',
        'unknown continent',
        'unplaced',
        'unrecorded',
        'unspecified',
    ],
}).assign(becomes=None, field=field)

In [12]:
# Append the rows in `df` to the 'adjustments' log. sqlite3's own context
# manager only commits or rolls back; closing() also closes the connection.
with closing(sqlite3.connect(OUT_DB)) as cxn, cxn:
    df.to_sql('adjustments', cxn, if_exists='append', index=False)

## coordinatePrecision

**string**

**no changes**

In [13]:
field = 'coordinatePrecision'

# Distinct values of this field with their row counts, most frequent first.
# The values are only inspected; no adjustments are logged for this field.
df = text_field(field)
df.shape  # (number of distinct values, 2)

(4654, 2)

In [14]:
# Show the whole frequency table, untruncated, for manual review.
display_all(df)

Unnamed: 0,coordinatePrecision,n
0,,15483147
1,0.001,233256
2,0.00001,151405
3,0.01667,119489
4,0.0001,72663
5,0.01,37863
6,0.000278,30374
7,500.000,9898
8,0.1,8258
9,1000.000,8130


## coordinateUncertaintyInMeters

**numeric**

In [15]:
field = 'coordinateUncertaintyInMeters'

# A single adjustment row holding the accepted numeric range for this field.
# The ceiling evaluates to 20,000,000.0.
# NOTE(review): presumably 40,000 km approximates Earth's circumference,
# halved and converted to metres -- confirm.
df = pd.DataFrame([
    {'field': field, 'floor': 0, 'ceiling': 40_000 / 2 * 1000},
])

In [16]:
# Append the rows in `df` to the 'adjustments' log. sqlite3's own context
# manager only commits or rolls back; closing() also closes the connection.
with closing(sqlite3.connect(OUT_DB)) as cxn, cxn:
    df.to_sql('adjustments', cxn, if_exists='append', index=False)

## country

**string**

In [17]:
field = 'country'

# Distinct values of this field with their row counts, most frequent first.
df = text_field(field)
df.shape  # (number of distinct values, 2)

(2665, 2)

In [18]:
# Show the whole frequency table, untruncated, for manual review.
display_all(df)

Unnamed: 0,country,n
0,united states,4338919
1,australia,3213394
2,norway,998247
3,usa,942064
4,mexico,818613
5,brasil,645714
6,canada,529853
7,sweden,517138
8,u s a,369551
9,brazil,318996


In [19]:
# Literal values treated as invalid for the current field. Each one is logged
# as an adjustment row whose replacement (`becomes`) is None.
df = pd.DataFrame({
    'literal': [
        '0',
        '5',
        '1971',
        '2008',
        '2012',
        'b',
        'i',
        'no aplica',
        'no data',
        'no disponible',
        'testes-2x1',
        'testes-3x2',
        'testes-4x2',
        'testes-4x3',
        'testes-5x3',
        'testes-5x4',
        'testes-6x4',
        'undefined',
        'unknown',
        'unknown captive',
        'unknown country',
    ],
}).assign(becomes=None, field=field)

In [20]:
# Append the rows in `df` to the 'adjustments' log. sqlite3's own context
# manager only commits or rolls back; closing() also closes the connection.
with closing(sqlite3.connect(OUT_DB)) as cxn, cxn:
    df.to_sql('adjustments', cxn, if_exists='append', index=False)

## countryCode

**string**

**no changes**

In [21]:
field = 'countryCode'

# Distinct values of this field with their row counts, most frequent first.
# The values are only inspected; no adjustments are logged for this field.
df = text_field(field)
df.shape  # (number of distinct values, 2)

(481, 2)

In [22]:
# Show the whole frequency table, untruncated, for manual review.
display_all(df)

Unnamed: 0,countryCode,n
0,,12649744
1,us,1297132
2,mx,541132
3,br,219759
4,mg,142080
5,ca,91209
6,ec,87142
7,cr,84095
8,bo,73453
9,co,54723


## county

**string**